Spaces:

chenzihong
/

GraphGen

Running

GraphGen / graphgen /models /evaluator /qa /uni_evaluator.py

github-actions[bot]

Auto-sync from demo at Fri Dec 26 08:29:01 UTC 2025

7566ac3 22 days ago

3.94 kB

	# https://github.com/maszhongming/UniEval/tree/main
	from typing import Optional, List
	from graphgen.bases import BaseEvaluator, QAPair


	class UniEvaluator(BaseEvaluator):
	    """
	    UniEvaluator for single QAPair evaluation across quality dimensions.

	    Each dimension is posed to a seq2seq model as a yes/no question about the
	    answer. The score is the model's preference for "Yes" over "No" at the
	    first decoder position, normalised over just those two tokens, so it
	    always lies in [0, 1].

	    Dimensions: naturalness, coherence, understandability

	    Usage:
	        evaluator = UniEvaluator()
	        pair = QAPair(question="...", answer="...")
	        scores = evaluator.evaluate(pair)
	        # {"naturalness": 0.85, "coherence": 0.92, "understandability": 0.88}
	    """

	    # NOTE(review): the prompts built in `_build_input_text` use a
	    # dialogue-style "response" / "history" template, while this checkpoint
	    # name ends in "-sum". Confirm this is the intended default checkpoint.
	    DEFAULT_MODEL: str = "MingZhong/unieval-sum"
	    DEFAULT_DIMS: List[str] = ["naturalness", "coherence", "understandability"]
	    DEFAULT_MAX_LENGTH: int = 2560

	    def __init__(
	        self,
	        model_name: Optional[str] = None,
	        max_length: Optional[int] = None,
	        device: Optional[str] = None,
	    ):
	        """
	        Load the tokenizer and model and cache the Yes/No token ids.

	        Args:
	            model_name: HuggingFace model name/path; falls back to
	                DEFAULT_MODEL when falsy.
	            max_length: Tokenizer max sequence length; falls back to
	                DEFAULT_MAX_LENGTH when falsy.
	            device: 'cuda', 'cpu', or None for auto-detect.
	        """
	        # Imported lazily so that importing this module does not require
	        # torch/transformers until an evaluator is actually constructed.
	        import torch
	        from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

	        # Keep a handle on the module so other methods can use it without
	        # a module-level import.
	        self.torch = torch

	        self.model_name = model_name or self.DEFAULT_MODEL
	        self.max_length = max_length or self.DEFAULT_MAX_LENGTH
	        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

	        # Load model & tokenizer
	        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
	        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
	        self.model.to(self.device)
	        self.model.eval()

	        # Pre-compute Yes/No token IDs: the first id the tokenizer emits
	        # for each word.
	        self._yes_id = self.tokenizer("Yes")["input_ids"][0]
	        self._no_id = self.tokenizer("No")["input_ids"][0]

	    @staticmethod
	    def _build_input_text(dimension: str, question: str, answer: str) -> str:
	        """
	        Construct the model prompt for the specified dimension.

	        Only the "coherence" prompt includes the question (as history); the
	        other two dimensions are judged from the answer alone.

	        Raises:
	            NotImplementedError: if `dimension` has no prompt template.
	        """
	        if dimension == "naturalness":
	            return f"question: Is this a natural response? </s> response: {answer}"
	        if dimension == "coherence":
	            return f"question: Is this a coherent response? </s> response: {answer} </s> history: {question}"
	        if dimension == "understandability":
	            return f"question: Is this an understandable response? </s> response: {answer}"
	        raise NotImplementedError(f"Unsupported dimension '{dimension}'")

	    def evaluate(
	        self,
	        pair: QAPair,
	        dimensions: Optional[List[str]] = None,
	    ) -> dict[str, float]:
	        """
	        Evaluate a single QAPair across specified dimensions.

	        Args:
	            pair: The QAPair to score; its `question` and `answer`
	                attributes are read.
	            dimensions: Dimensions to score. None or an empty list selects
	                DEFAULT_DIMS. Repeated entries are scored once.

	        Returns:
	            Mapping from dimension name to a score in [0, 1], in the order
	            the dimensions were first requested.

	        Raises:
	            ValueError: if any requested dimension is not in DEFAULT_DIMS.
	        """
	        dimensions = dimensions or self.DEFAULT_DIMS

	        # Validate dimensions up front so no model call is made on bad input.
	        invalid = set(dimensions) - set(self.DEFAULT_DIMS)
	        if invalid:
	            raise ValueError(f"Invalid dimensions: {invalid}. Available: {self.DEFAULT_DIMS}")

	        results: dict[str, float] = {}

	        # A one-token label is passed so the model produces decoder logits
	        # for the first output position (presumably via the transformers
	        # convention of deriving decoder inputs from `labels`).
	        no_token = self.torch.tensor([[self._no_id]], device=self.device)

	        # dict.fromkeys drops duplicates while preserving first-seen order,
	        # avoiding a redundant forward pass per repeated dimension.
	        for dim in dict.fromkeys(dimensions):
	            # Tokenize input
	            src = self.tokenizer(
	                self._build_input_text(dim, pair.question, pair.answer),
	                max_length=self.max_length,
	                truncation=True,
	                return_tensors="pt",
	            )
	            src_tokens = src["input_ids"].to(self.device)
	            src_mask = src["attention_mask"].to(self.device)

	            # Score
	            with self.torch.no_grad():
	                logits = self.model(
	                    input_ids=src_tokens,
	                    attention_mask=src_mask,
	                    labels=no_token,
	                    use_cache=False,
	                ).logits[0, 0, :]  # [vocab_size]

	            # p_yes / (p_yes + p_no) equals sigmoid(logit_yes - logit_no).
	            # Using the logit difference avoids a full-vocabulary softmax
	            # and the 0/0 = NaN case when both probabilities underflow.
	            score = self.torch.sigmoid(logits[self._yes_id] - logits[self._no_id])

	            results[dim] = score.item()

	        return results