Source code for lmflow.pipeline.utils.rm_trainer
import numpy as np
import torch
import torch.nn as nn
from transformers import Trainer

from .peft_trainer import PeftTrainer


def compute_metrics(eval_pred):
    """Compute pairwise ranking accuracy: the fraction of preference pairs for
    which the chosen response receives a higher reward than the rejected one."""
    result = {}
    pos_predictions_scores = eval_pred.predictions[0]
    neg_predictions_scores = eval_pred.predictions[1]
    # We assume that the first sample is the preferred one in the ground truth
    result['accuracy'] = np.sum(
        pos_predictions_scores > neg_predictions_scores) / len(pos_predictions_scores)
    return result


def rm_loss(model, inputs, return_outputs=False):
    """Pairwise reward-modeling loss. Assumes each batch interleaves preference
    pairs so that even indices hold the chosen samples and odd indices the
    rejected ones; minimizes -log(sigmoid(reward_chosen - reward_rejected))."""
    rewards = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"]
    )[0]
    bsz = rewards.size(0)
    # Even positions: chosen (preferred) samples; odd positions: rejected samples
    jidx = torch.arange(0, bsz, 2)
    kidx = jidx + 1
    rewards_j = rewards[jidx]
    rewards_k = rewards[kidx]
    # Negative log-sigmoid of the reward margin (Bradley-Terry pairwise loss)
    loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean()
    if return_outputs:
        return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}
    return loss


class RewardTrainer(Trainer):
    """Hugging Face Trainer that replaces the default loss with the pairwise
    reward-modeling loss."""

    def compute_loss(self, model, inputs, return_outputs=False):
        return rm_loss(model, inputs, return_outputs)


class PeftRewardTrainer(PeftTrainer):
    """PEFT-enabled variant of RewardTrainer, sharing the same pairwise loss."""

    def compute_loss(self, model, inputs, return_outputs=False):
        return rm_loss(model, inputs, return_outputs)
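

if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Minimal usage sketch (illustrative only; not part of this module's API).
    # It assumes batches are laid out as interleaved preference pairs,
    # [chosen_0, rejected_0, chosen_1, rejected_1, ...], which is the
    # convention rm_loss relies on above. The model name, output directory,
    # and the paired dataset are placeholders, not LMFlow-provided objects.
    # ------------------------------------------------------------------
    from transformers import (
        AutoModelForSequenceClassification,
        AutoTokenizer,
        TrainingArguments,
    )

    # Toy sanity check of the pairing convention used in rm_loss:
    # rewards for [chosen_0, rejected_0, chosen_1, rejected_1]
    toy_rewards = torch.tensor([1.2, 0.3, -0.1, -0.7])
    toy_jidx = torch.arange(0, 4, 2)          # even indices -> chosen
    toy_loss = -nn.functional.logsigmoid(
        toy_rewards[toy_jidx] - toy_rewards[toy_jidx + 1]
    ).mean()
    print(f"toy pairwise loss: {toy_loss.item():.4f}")  # small when chosen > rejected

    # Reward model: a sequence classifier with a single scalar head.
    model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=1)
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

    trainer = RewardTrainer(
        model=model,
        args=TrainingArguments(
            output_dir="reward_model_output",   # placeholder path
            remove_unused_columns=False,        # keep input_ids/attention_mask columns
        ),
        train_dataset=None,  # placeholder: supply an interleaved chosen/rejected dataset
        compute_metrics=compute_metrics,
    )
    # trainer.train()  # uncomment once a paired preference dataset is provided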