Source code for lm_polygraph.generation_metrics.model_score

import numpy as np
from typing import List, Dict

from .generation_metric import GenerationMetric


class ModelScoreTokenwiseMetric(GenerationMetric):
    """
    Calculates token-level ModelScore metric between model-generated texts and ground truth texts.
    For each ground-truth text `r` and model-generated text `h`, the metric reports the
    log-probabilities of generation `h` on prompt 'Paraphrase "{r}"'.

    Nothing is scored inside this class: the values are read from the precomputed
    'model_rh' statistic and flattened into a single array.
    """

    def __init__(self):
        # NOTE(review): presumably (required statistics, metric level) -- confirm
        # against the GenerationMetric constructor.
        super().__init__(["model_rh"], "token")

    def __str__(self):
        return "ModelScoreToken-rh"

    def __call__(
        self,
        stats: Dict[str, np.ndarray],
        target_texts: List[str],
    ) -> np.ndarray:
        """
        Returns the token-level ModelScore values stored in stats['model_rh'].

        Parameters:
            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
                * log-probabilities of generation on prompt 'Paraphrase "{target text}"', in 'model_rh'
            target_texts (List[str]): ground-truth texts. Not read by this method; the
                dependency on the target texts is already baked into 'model_rh'.
        Returns:
            np.ndarray: concatenated float Model Scores for each token in each input sample,
                excluding the last entry of every sample.
        """
        # The last entry of every sample is dropped before flattening.
        # NOTE(review): presumably that entry is the end-of-sequence token -- confirm
        # against the code that produces 'model_rh'.
        return np.array([s for sample in stats["model_rh"] for s in sample[:-1]])


class ModelScoreSeqMetric(GenerationMetric):
    """
    Calculates sequence-level ModelScore metric between model-generated texts and ground truth texts.
    For each ground-truth text `r` and model-generated text `h`, the metric aggregates the
    log-probabilities of generation `h` on prompt 'Paraphrase "{r}"' into one score per sample:
    `np.logaddexp.reduce` over the sample's values, divided by the number of values.

    NOTE(review): `np.logaddexp.reduce` yields log(sum(exp(x))), not sum(x). Earlier
    documentation described this score as the length-normalized *sum* of log-probabilities;
    confirm which aggregation is intended before changing the computation.
    """

    def __init__(self):
        # NOTE(review): presumably (required statistics, metric level) -- confirm
        # against the GenerationMetric constructor.
        super().__init__(["model_rh"], "sequence")

    def __str__(self):
        return "ModelScoreSeq-rh"

    def __call__(
        self,
        stats: Dict[str, np.ndarray],
        target_texts: List[str],
    ) -> np.ndarray:
        """
        Returns one sequence-level ModelScore per sample in stats['model_rh'].

        Parameters:
            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
                * log-probabilities of generation on prompt 'Paraphrase "{target text}"', in 'model_rh'
            target_texts (List[str]): ground-truth texts. Not read by this method; the
                dependency on the target texts is already baked into 'model_rh'.
        Returns:
            np.ndarray: float Model Scores for each input sample.
        """
        # Every entry of a sample takes part in the reduction; unlike the token-level
        # metric, the last entry is NOT dropped here.
        # NOTE(review): confirm that this asymmetry is intentional.
        return np.array(
            [np.logaddexp.reduce(sample) / len(sample) for sample in stats["model_rh"]]
        )