Source code for wayward.significant_words

#!/usr/bin/env python3

# Copyright 2019 TinQwise Stamkracht, University of Amsterdam
# Author: Alex Olieman

from __future__ import annotations
# TODO: remove redundant typing imports once PEP 585 is finalized

import logging
from heapq import nlargest
from operator import itemgetter
from typing import Iterable, Optional, Sequence, Tuple, List, Dict, cast

import numpy as np

from wayward import ParsimoniousLM
from wayward.logsum import logsum
from wayward.specific_term_estimators import (
    SpecificTermEstimator,
    RequiresMultipleDocuments,
    mutual_exclusion,
)

logger = logging.getLogger(__name__)

InitialLambdas = Tuple[np.floating, np.floating, np.floating]


class SignificantWordsLM(ParsimoniousLM):
    """
    Language model that consists of three sub-models:

    - Corpus model: represents term probabilities in a (large)
      background collection;
    - Group model: parsimonious term probabilities in a group of
      documents;
    - Specific model: represents the same group, but is biased towards
      terms that occur with a high frequency in single docs, and a low
      frequency in others.

    References
    ----------
    M. Dehghani, H. Azarbonyad, J. Kamps, D. Hiemstra, and M. Marx (2016).
    `Luhn Revisited: Significant Words Language Models
    <https://djoerdhiemstra.com/wp-content/uploads/cikm2016.pdf>`_.
    Proc. CIKM'16.

    Parameters
    ----------
    documents : iterable over iterable of str terms
        All documents that should be included in the corpus model.
    lambdas : 3-tuple of float
        Weight of corpus, group, and specific models. Will be normalized
        if the weights in the tuple don't sum to one.
    thresh : int
        Don't include words that occur fewer than `thresh` times.

    Attributes
    ----------
    vocab : dict of term -> int
        Mapping of terms to numeric indices.
    p_corpus : array of float
        Log probability of terms in the background model (indexed by `vocab`).
    p_group : array of float
        Log probability of terms in the last processed group model
        (indexed by `vocab`).
    p_specific : array of float
        Log probability of terms in the last processed specific model
        (indexed by `vocab`).
    lambda_corpus : array of float
        Log probability (weight) of the corpus model for documents.
    lambda_group : array of float
        Log probability (weight) of the group model for documents.
    lambda_specific : array of float
        Log probability (weight) of the specific model for documents.

    See Also
    --------
    wayward.parsimonious.ParsimoniousLM : one-sided parsimonious model
    """

    def __init__(
            self,
            documents: Iterable[Iterable[str]],
            lambdas: InitialLambdas,
            thresh: int = 0
    ):
        """Collect the vocabulary and fit the background model."""
        self.initial_lambdas = self.normalize_lambdas(lambdas)
        super().__init__(documents, self.initial_lambdas[1], thresh=thresh)
        self.lambda_corpus: Optional[np.ndarray] = None
        self.lambda_group: Optional[np.ndarray] = None
        self.lambda_specific: Optional[np.ndarray] = None
        self.p_group: Optional[np.ndarray] = None
        self.p_specific: Optional[np.ndarray] = None
        self.fix_lambdas = False
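
    # Illustrative usage sketch, not part of the original module: the toy
    # corpus below is a hypothetical stand-in for a real collection of
    # pre-tokenized documents.
    #
    # >>> corpus = [
    # ...     ['the', 'cat', 'sat', 'on', 'the', 'mat'],
    # ...     ['the', 'dog', 'chased', 'the', 'cat'],
    # ...     ['the', 'bird', 'flew', 'over', 'the', 'mat'],
    # ... ]
    # >>> swlm = SignificantWordsLM(corpus, lambdas=(0.9, 0.01, 0.09))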

    def group_top(
            self,
            k: int,
            document_group: Iterable[Iterable[str]],
            **kwargs
    ) -> List[Tuple[str, float]]:
        """
        Get the top `k` terms of a `document_group` and their probabilities.

        This is a shortcut to retrieve the top terms found by
        :py:meth:`~.fit_parsimonious_group`.

        Parameters
        ----------
        k : int
            Number of top terms to return.
        document_group : iterable over iterable of str terms
            All documents that should be included in the group model.
        kwargs
            Optional keyword arguments for :py:meth:`~.fit_parsimonious_group`.

        Returns
        -------
        t_p : list of (str, float)
            Terms and their probabilities in the group model.

        See Also
        --------
        SignificantWordsLM.fit_parsimonious_group
        """
        term_probabilities = self.fit_parsimonious_group(document_group, **kwargs)
        return nlargest(k, term_probabilities.items(), itemgetter(1))
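
    # Illustrative sketch, continuing the hypothetical `swlm` and `corpus`
    # from above: retrieve the five most probable group-model terms for a
    # two-document group. The output shown is schematic, not a real run.
    #
    # >>> swlm.group_top(5, corpus[:2], fix_lambdas=True)
    # [('cat', ...), ...]  # (term, probability) pairs, highest first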

    def fit_parsimonious_group(
            self,
            document_group: Iterable[Iterable[str]],
            max_iter: int = 50,
            eps: float = 1e-5,
            lambdas: Optional[InitialLambdas] = None,
            fix_lambdas: bool = False,
            parsimonize_specific: bool = False,
            post_parsimonize: bool = False,
            specific_estimator: SpecificTermEstimator = mutual_exclusion
    ) -> Dict[str, float]:
        """
        Estimate a document group model, and parsimonize it against
        fixed corpus and specific models.

        The documents may be unseen, but any terms that are not in the
        vocabulary will be ignored.

        Parameters
        ----------
        document_group : iterable over iterable of str terms
            All documents that should be included in the group model.
        max_iter : int, optional
            Maximum number of iterations of the EM algorithm to run.
        eps : float, optional
            Epsilon: convergence threshold for the EM algorithm.
        lambdas : 3-tuple of float, optional
            Weight of corpus, group, and specific models. Will be
            normalized if the weights in the tuple don't sum to one.
        fix_lambdas : bool, optional
            Fix the weights of the three sub-models (i.e. don't estimate
            lambdas as part of the M-step).
        parsimonize_specific : bool, optional
            Bias the specific model towards uncommon terms before
            applying the EM algorithm to the group model. This generally
            results in a group model that stands out less from the
            corpus model.
        post_parsimonize : bool, optional
            Bias the group model towards uncommon terms after applying
            the EM algorithm. This may be used to compensate when the
            frequency of common terms varies much between the documents
            in the group.
        specific_estimator : callable, optional
            Function that estimates the specific terms model based on
            the document term frequencies of the doc group.

        Returns
        -------
        t_p_map : dict of term -> float
            Dictionary of terms and their probabilities in the group model.
        """
        if lambdas is None:
            lambdas = self.initial_lambdas
        else:
            lambdas = self.normalize_lambdas(lambdas)

        self.fix_lambdas = fix_lambdas
        document_models = [
            self._document_model(doc)
            for doc in document_group
        ]
        del document_group
        doc_term_frequencies = [tf for tf, _ in document_models]
        group_tf, p_group = self._group_model(
            doc_term_frequencies
        )
        try:
            self.p_specific = specific_estimator(doc_term_frequencies)
        except RequiresMultipleDocuments:
            logger.warning(
                'Cannot calculate `p_specific` for a single document, '
                'using `p_corpus` as replacement.'
            )
            self.p_specific = self.p_corpus

        if parsimonize_specific:
            self.p_specific = self._EM(
                group_tf, self.p_specific,
                cast(np.floating, 1 / 3),
                max_iter, eps
            )

        weights_shape = len(document_models)
        if self.fix_lambdas:
            weights_shape = 1

        general_w, group_w, specific_w = np.log(lambdas)
        self.lambda_corpus = np.full(weights_shape, general_w, dtype=np.double)
        self.lambda_specific = np.full(weights_shape, specific_w, dtype=np.double)
        self.lambda_group = np.full(weights_shape, group_w, dtype=np.double)
        logger.info(
            f'Lambdas initialized to: Corpus={lambdas[0]:.4f}, '
            f'Group={lambdas[1]:.4f}, Specific={lambdas[2]:.4f}'
        )
        self.p_group = self._estimate(
            p_group, self.p_specific,
            doc_term_frequencies, max_iter, eps
        )
        if post_parsimonize:
            self.p_group = self._EM(group_tf, self.p_group, self.w, max_iter, eps)

        if self.fix_lambdas is False:
            logger.info(
                f'Final lambdas (mean): '
                f'Corpus={np.mean(np.exp(self.lambda_corpus)):.4f}, '
                f'Group={np.mean(np.exp(self.lambda_group)):.4f}, '
                f'Specific={np.mean(np.exp(self.lambda_specific)):.4f}'
            )
        return self.get_term_probabilities(self.p_group)
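
    # Illustrative sketch with the hypothetical data from above: supply
    # explicit lambdas, keep them fixed during EM, and rank terms by their
    # group-model probability.
    #
    # >>> t_p_map = swlm.fit_parsimonious_group(
    # ...     corpus[:2], lambdas=(0.8, 0.1, 0.1), fix_lambdas=True
    # ... )
    # >>> top_three = sorted(t_p_map, key=t_p_map.get, reverse=True)[:3]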

    def _estimate(
            self,
            p_group: np.ndarray,
            p_specific: np.ndarray,
            doc_tf: Sequence[np.ndarray],
            max_iter: int,
            eps: float
    ) -> np.ndarray:
        """Apply the Expectation Maximization algorithm."""
        try:
            old_error_settings = np.seterr(divide='ignore')
            log_doc_tf = np.log(doc_tf)
            for i in range(1, 1 + max_iter):
                expectation = self._e_step(p_group, p_specific)
                new_p_group = self._m_step(expectation, log_doc_tf)
                diff = new_p_group - p_group
                p_group = new_p_group
                if (diff[np.isfinite(diff)] < eps).all():
                    logger.info(f'EM: convergence reached after {i} iterations')
                    break
        finally:
            np.seterr(**old_error_settings)

        return p_group

    def _e_step(
            self,
            p_group: np.ndarray,
            p_specific: np.ndarray
    ) -> Dict[str, np.ndarray]:
        """Run an E-step."""
        corpus_numerator = np.add.outer(self.lambda_corpus, self.p_corpus)
        specific_numerator = np.add.outer(self.lambda_specific, p_specific)
        group_numerator = np.add.outer(self.lambda_group, p_group)
        denominator = [
            logsum(np.asarray(doc_numerators))
            for doc_numerators in zip(
                corpus_numerator, specific_numerator, group_numerator
            )
        ]
        out = {
            'corpus': corpus_numerator - denominator,
            'specific': specific_numerator - denominator,
            'group': group_numerator - denominator
        }
        # prevent NaNs from causing downstream errors
        for v in out.values():
            v[np.isnan(v)] = -np.inf

        return out

    def _m_step(
            self,
            expectation: Dict[str, np.ndarray],
            log_doc_tf: Sequence[np.ndarray]
    ) -> np.ndarray:
        """Run an M-step."""
        term_weighted_group = log_doc_tf + expectation['group']
        group_numerator = logsum(term_weighted_group)
        p_group = group_numerator - logsum(group_numerator)
        if self.fix_lambdas is False:
            # estimate lambdas
            corpus_numerator = logsum(
                np.transpose(log_doc_tf + expectation['corpus'])
            )
            specific_numerator = logsum(
                np.transpose(log_doc_tf + expectation['specific'])
            )
            group_numerator = logsum(np.transpose(term_weighted_group))
            denominator = logsum(
                np.asarray([corpus_numerator, specific_numerator, group_numerator])
            )
            self.lambda_corpus = corpus_numerator - denominator
            self.lambda_specific = specific_numerator - denominator
            self.lambda_group = group_numerator - denominator

        return p_group

    @staticmethod
    def _group_model(
            document_term_frequencies: Sequence[np.ndarray]
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Create the initial group model."""
        group_tf = np.array(document_term_frequencies).sum(axis=0)
        try:
            old_error_settings = np.seterr(divide='ignore')
            p_group = np.log(group_tf) - np.log(np.sum(group_tf))
        finally:
            np.seterr(**old_error_settings)

        return group_tf, p_group
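
    # The E-step above computes, in log space, each sub-model's
    # responsibility for every (document, term) pair:
    #
    #     E[m] = log( lambda_m * P(term|m) / sum_m' lambda_m' * P(term|m') )
    #
    # A minimal numeric sketch of that normalization for a single term,
    # with made-up weights and term probabilities, using plain numpy
    # (np.logaddexp.reduce in place of wayward.logsum.logsum):
    #
    # >>> import numpy as np
    # >>> log_num = np.log([0.9 * 0.05, 0.01 * 0.20, 0.09 * 0.10])
    # >>> resp = np.exp(log_num - np.logaddexp.reduce(log_num))
    # >>> bool(np.isclose(resp.sum(), 1.0))
    # True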

    @staticmethod
    def normalize_lambdas(lambdas: InitialLambdas) -> InitialLambdas:
        """
        Check and normalize the initial lambdas of the three sub-models.

        Parameters
        ----------
        lambdas : 3-tuple of float
            Weight of corpus, group, and specific models.

        Returns
        -------
        lambdas : 3-tuple of float
            Normalized probability of corpus, group, and specific models.
        """
        assert len(lambdas) == 3, f'lambdas should be a 3-tuple, not {lambdas}'
        total_weight = sum(lambdas)
        if abs(total_weight - 1) > 1e-10:
            lambdas = cast(
                InitialLambdas,
                tuple(w / total_weight for w in lambdas)
            )
        return lambdas
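
# Illustrative sketch, not part of the original module: weights that
# don't sum to one are rescaled proportionally.
#
# >>> SignificantWordsLM.normalize_lambdas((2.0, 1.0, 1.0))
# (0.5, 0.25, 0.25)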