Source code for lm_polygraph.generation_metrics.bart_score

import traceback
import logging
from typing import List, Dict
import numpy as np
import torch
import torch.nn as nn
from transformers import BartTokenizer, BartForConditionalGeneration
from .generation_metric import GenerationMetric

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Score-type identifiers accepted by BartScoreSeqMetric's ``score_type`` argument.
# NOTE(review): "rh" presumably names a reference/hypothesis scoring direction
# from the BARTScore paper -- confirm against the caller's intent.
SCORE_TYPES = ["rh"]


class BartScoreSeqMetric(GenerationMetric):
    """
    Calculates BARTScore metric (https://arxiv.org/abs/2106.11520)
    between model-generated texts and ground truth texts.

    The tokenizer and the BART model are created lazily by ``_setup`` the
    first time they are needed, so constructing the metric does not load
    any weights.
    """

    def __init__(
        self,
        score_type: str = "rh",
        device=None,
        max_length=256,
        checkpoint="facebook/bart-large-cnn",
    ):
        """
        Parameters:
            score_type (str): score variant; must be one of ``SCORE_TYPES``.
                It is only used in the metric's name (see ``__str__``).
            device: device the model and input tensors are moved to. If None,
                "cuda:0" is used when CUDA is available, otherwise "cpu".
            max_length (int): maximum tokenized length; longer texts are
                truncated by the tokenizer.
            checkpoint (str): identifier passed to ``from_pretrained`` for both
                the tokenizer and the model.

        Raises:
            ValueError: if ``score_type`` is not in ``SCORE_TYPES``.
        """
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if score_type not in SCORE_TYPES:
            raise ValueError(
                f"Unsupported score_type {score_type!r}; expected one of {SCORE_TYPES}"
            )
        self.score_type = score_type
        self.max_length = max_length
        self.checkpoint = checkpoint
        if device is None:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = device
        # Heavy resources; populated on demand by _setup().
        self.tokenizer = None
        self.model = None
        self.loss_fct = None
        self.lsm = None
        super().__init__(["greedy_texts", "input_texts"], "sequence")

    def __str__(self):
        return "BARTScoreSeq-" + self.score_type

    def _setup(self):
        """Load the tokenizer and model from ``self.checkpoint`` and build the loss modules."""
        self.tokenizer = BartTokenizer.from_pretrained(self.checkpoint)
        self.model = BartForConditionalGeneration.from_pretrained(self.checkpoint)
        self.model.eval()
        self.model.to(self.device)

        # Per-token negative log-likelihood; padding positions contribute zero loss.
        self.loss_fct = nn.NLLLoss(
            reduction="none", ignore_index=self.model.config.pad_token_id
        )
        # Applied to logits flattened to (tokens, vocab), hence dim=1.
        self.lsm = nn.LogSoftmax(dim=1)

    def load(self, path=None):
        """
        Load model from paraphrase finetuning.

        Parameters:
            path (str): path to a saved state dict; defaults to "models/bart.pth".
        """
        if path is None:
            path = "models/bart.pth"
        if self.model is None:
            self._setup()
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from trusted sources.
        self.model.load_state_dict(torch.load(path, map_location=self.device))

    def _encode(self, texts):
        """Tokenize a batch of texts into padded, truncated PyTorch tensors."""
        return self.tokenizer(
            texts,
            max_length=self.max_length,
            truncation=True,
            padding=True,
            return_tensors="pt",
        )

    def _score_batch(self, src_list, tgt_list):
        """Return one score per (source, target) pair of a single batch."""
        with torch.no_grad():
            encoded_src = self._encode(src_list)
            encoded_tgt = self._encode(tgt_list)
            src_tokens = encoded_src["input_ids"].to(self.device)
            src_mask = encoded_src["attention_mask"].to(self.device)
            tgt_tokens = encoded_tgt["input_ids"].to(self.device)
            # Number of non-padding target tokens per example.
            tgt_len = encoded_tgt["attention_mask"].sum(dim=1).to(self.device)

            output = self.model(
                input_ids=src_tokens, attention_mask=src_mask, labels=tgt_tokens
            )
            logits = output.logits.view(-1, self.model.config.vocab_size)
            loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
            loss = loss.view(tgt_tokens.shape[0], -1)
            # Average the token-level loss over the real (non-pad) target tokens.
            loss = loss.sum(dim=1) / tgt_len
            # Score is the negated loss, i.e. mean target-token log-likelihood.
            return [-x.item() for x in loss]

    def score(self, srcs, tgts, batch_size=4):
        """
        Score a batch of examples.

        Parameters:
            srcs: sequence of source texts fed to the encoder.
            tgts: sequence of target texts, aligned with ``srcs``, whose
                likelihood given the source is measured.
            batch_size (int): number of pairs processed per forward pass.

        Returns:
            List[float]: one score per pair (negated length-normalized
            negative log-likelihood of the target tokens).

        Raises:
            RuntimeError: re-raised after logging the offending batch if the
                tokenizer/model computation fails.
        """
        if self.model is None:
            self._setup()
        score_list = []
        for start in range(0, len(srcs), batch_size):
            # list(...) so array-like inputs reach the tokenizer as plain lists.
            src_list = list(srcs[start : start + batch_size])
            tgt_list = list(tgts[start : start + batch_size])
            try:
                score_list += self._score_batch(src_list, tgt_list)
            except RuntimeError:
                # Report the failing batch, then propagate the error rather
                # than terminating the whole process with a success status.
                log.exception("BARTScore computation failed")
                log.error("source: %s", src_list)
                log.error("target: %s", tgt_list)
                raise
        return score_list

    def test(self, batch_size=3):
        """Score a few hard-coded example pairs and log the result."""
        if self.model is None:
            self._setup()
        src_list = [
            "This is a very good idea. Although simple, but very insightful.",
            "Can I take a look?",
            "Do not trust him, he is a liar.",
        ]
        tgt_list = ["That's stupid.", "What's the problem?", "He is trustworthy."]
        log.info(self.score(src_list, tgt_list, batch_size))

    def __call__(
        self,
        stats: Dict[str, np.ndarray],
        target_texts: List[str],
    ) -> np.ndarray:
        """
        Calculates BARTScore (https://arxiv.org/abs/2106.11520) between
        stats['greedy_texts'] and target_texts.

        Parameters:
            stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
                * model-generated texts in 'greedy_texts'
            target_texts (List[str]): ground-truth texts
        Returns:
            np.ndarray: list of BART Scores for each sample in input.
        """
        # score() loads the model on first use, so no explicit setup is needed here.
        scores = self.score(stats["greedy_texts"], target_texts)
        return np.array(scores)