Source code for wayward.specific_term_estimators

#!/usr/bin/env python3

# Copyright 2019 TinQwise Stamkracht, University of Amsterdam
# Author: Alex Olieman

from __future__ import annotations
# TODO: remove redundant typing imports once PEP 585 is finalized

import functools
import logging
from typing import Sequence, Callable

import numpy as np
from wayward.logsum import logsum

# module-level logger, named after this module
logger = logging.getLogger(__name__)

# Call signature shared by the estimators in this module: they take a
# sequence of arrays (the functions below name it
# `document_term_frequencies`) and return a single array.
SpecificTermEstimator = Callable[[Sequence[np.ndarray]], np.ndarray]


class RequiresMultipleDocuments(Exception):
    """Signal that an estimator was called with fewer than two documents."""


def requires_multiple_docs(estimator_func: SpecificTermEstimator):
    """
    Do not let the decorated function be called with fewer than two docs.

    Parameters
    ----------
    estimator_func : SpecificTermEstimator
        The estimator to guard. It receives the document term frequencies
        unchanged whenever at least two documents are passed in.

    Raises
    ------
    RequiresMultipleDocuments
        Raised by the returned wrapper (not by the decorator itself) when
        it is called with fewer than two documents.

    Returns
    -------
    decorated_func : SpecificTermEstimator
        Wrapper that carries the metadata of `estimator_func`
        (via `functools.wraps`).
    """
    @functools.wraps(estimator_func)
    def wrapper_func(document_term_frequencies):
        n_docs = len(document_term_frequencies)
        if n_docs < 2:
            # not every callable has a __name__ (e.g. functools.partial)
            estimator_name = getattr(
                estimator_func, '__name__', repr(estimator_func)
            )
            raise RequiresMultipleDocuments(
                f'{estimator_name} requires at least two documents,'
                f' but got {n_docs}.'
            )

        return estimator_func(document_term_frequencies)

    return wrapper_func


@requires_multiple_docs
def mutual_exclusion(
        document_term_frequencies: Sequence[np.ndarray]
) -> np.ndarray:
    """
    Estimate the fixed specific model with the mutual exclusion method.

    Parameters
    ----------
    document_term_frequencies : Sequence[np.ndarray]
        Term frequencies, one array per document. The arrays are combined
        element-wise, so they presumably share one vocabulary/shape
        (TODO confirm against callers).

    Returns
    -------
    p_specific : np.ndarray
        Log-probabilities per term; entries that would otherwise be NaN
        are set to negative infinity.
    """
    # np.NINF was removed in NumPy 2.0; -np.inf works on every version
    neg_inf = -np.inf

    # per-document term distributions, as log-probabilities
    doc_term_probs = [
        np.log(tf) - np.log(np.sum(tf))
        for tf in document_term_frequencies
    ]
    # complement events: 1 - p
    complements = [
        np.log1p(-np.exp(p_doc))
        for p_doc in doc_term_probs
    ]
    # probability of term to be important in one doc, and not in another:
    # every ordered pair (i, j) of distinct documents contributes one row
    complement_products = np.array([
        dlm + complement
        for i, dlm in enumerate(doc_term_probs)
        for j, complement in enumerate(complements)
        if i != j
    ])
    # marginalize over all documents: combine the rows with logsum and
    # subtract the log of the number of finite rows per term
    p_specific = (
        logsum(complement_products)
        - np.log(
            np.count_nonzero(complement_products > neg_inf, axis=0)
        )
    )
    # prevent NaNs from causing downstream errors
    p_specific[np.isnan(p_specific)] = neg_inf

    return p_specific


@requires_multiple_docs
def inverse_doc_frequency(
        document_term_frequencies: Sequence[np.ndarray]
) -> np.ndarray:
    """
    Estimate the fixed specific model with the inverse doc frequency method.

    Parameters
    ----------
    document_term_frequencies : Sequence[np.ndarray]
        Term frequencies, one array per document.

    Returns
    -------
    p_specific : np.ndarray
        Normalized inverse document frequencies as log-probabilities.
        Terms that occur in no document get negative infinity.
    """
    # Terms without any occurrences produce a division by zero and a
    # log(0) below. Both are expected and handled explicitly, so silence
    # the corresponding NumPy warnings; the computed values are unchanged.
    with np.errstate(divide='ignore'):
        # number of documents in which each term occurs, inverted
        idf = 1 / np.count_nonzero(document_term_frequencies, axis=0)
        idf[~np.isfinite(idf)] = 0.

        # calculate normalized idf as log-probabilities
        p_specific = np.log(idf) - np.log(np.sum(idf))

    return p_specific


def idf_fallback_for_many_docs(
        document_term_frequencies: Sequence[np.ndarray],
        primary_estimator: SpecificTermEstimator,
        fallback_thresh: int
) -> np.ndarray:
    """
    Apply `primary_estimator`, or IDF when there are too many documents.

    Parameters
    ----------
    document_term_frequencies : Sequence[np.ndarray]
        Term frequencies, one array per document.
    primary_estimator : SpecificTermEstimator
        Estimator used for groups of up to `fallback_thresh` documents.
    fallback_thresh : int
        Largest number of documents still handled by `primary_estimator`.
        Larger groups are estimated with `inverse_doc_frequency` and a
        warning is logged.

    Returns
    -------
    p_specific : np.ndarray
        Whatever the selected estimator returns.
    """
    # Inclusive bound: the warning below reports "more than" the threshold,
    # so a group of exactly `fallback_thresh` docs stays with the primary.
    if len(document_term_frequencies) <= fallback_thresh:
        return primary_estimator(document_term_frequencies)

    logger.warning(
        'Estimator got more than %s docs:'
        ' falling back to IDF for the current doc group.',
        fallback_thresh
    )
    return inverse_doc_frequency(document_term_frequencies)


# Mutual exclusion estimator with a built-in IDF fallback for large
# document groups; 40 docs is the fallback threshold.
me_up_to_40_docs = functools.partial(
    idf_fallback_for_many_docs,
    fallback_thresh=40,
    primary_estimator=mutual_exclusion,
)