Buckets:

leideng
/

QCFuse

Files

xet

leideng/QCFuse / test /longbench_v2 /test_longbench_v2_eval.py

leideng

16 days ago

download

raw

7.35 kB

	"""
	Test cases for LongBench-v2 evaluation utility.
	"""

	import json
	import os
	import tempfile

	from sglang.test.simple_eval_longbench_v2 import (
	LongBenchV2Eval,
	extract_longbench_v2_answer,
	format_longbench_v2_question,
	)


	def test_format_longbench_v2_question():
	"""Test the official LongBench-v2 question formatting."""
	sample_row = {
	"context": "This is a sample context about environmental issues.",
	"question": "What is the main theme?",
	"A": "Technology",
	"B": "Environment",
	"C": "Economics",
	"D": "Politics",
	"answer": "B",
	}

	formatted = format_longbench_v2_question(sample_row)

	# Verify official template structure
	assert "This is a sample context about environmental issues." in formatted
	assert (
	"What is the correct answer to this question: What is the main theme?"
	in formatted
	)
	assert "(A) Technology" in formatted
	assert "(B) Environment" in formatted
	assert "(C) Economics" in formatted
	assert "(D) Politics" in formatted
	assert "The correct answer is" in formatted
	print("✓ Question formatting works correctly")


	def test_extract_longbench_v2_answer():
	"""Test the official LongBench-v2 answer extraction."""

	# Test official format: "The correct answer is (A)"
	response1 = "After analyzing the context, The correct answer is (B)."
	assert extract_longbench_v2_answer(response1) == "B"

	# Test alternative format: "The correct answer is A"
	response2 = "Based on the evidence, The correct answer is C."
	assert extract_longbench_v2_answer(response2) == "C"

	# Test with asterisks
	response3 = "The correct answer is (D)"
	assert extract_longbench_v2_answer(response3) == "D"

	# Test fallback to standard pattern
	response4 = "I think the answer is A."
	assert extract_longbench_v2_answer(response4) == "A"

	# Test no answer
	response5 = "I'm not sure about this."
	assert extract_longbench_v2_answer(response5) is None

	print("✓ Answer extraction works correctly")


	def test_longbench_v2_eval_initialization():
	"""Test LongBench-v2 evaluation class initialization."""

	# Create a temporary JSON file with sample data
	sample_data = [
	{
	"_id": "test_001",
	"domain": "single_document_qa",
	"question": "What is X?",
	"choice_A": "Option A1",
	"choice_B": "Option B1",
	"choice_C": "Option C1",
	"choice_D": "Option D1",
	"answer": "A",
	"context": "Context 1",
	},
	{
	"_id": "test_002",
	"domain": "multi_document_qa",
	"question": "What is Y?",
	"A": "Option A2",
	"B": "Option B2",
	"C": "Option C2",
	"D": "Option D2",
	"answer": "B",
	"context": "Context 2",
	},
	]

	with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
	json.dump(sample_data, f)
	temp_file = f.name

	try:
	# Test initialization with new data_source parameter
	eval_instance = LongBenchV2Eval(data_source=temp_file, num_examples=1)
	assert len(eval_instance.examples) == 1
	first_example = eval_instance.examples[0]
	assert first_example.get("category") in {
	"single_document_qa",
	"multi_document_qa",
	}
	assert first_example.get("A") in {"Option A1", "Option A2"}
	print("✓ Evaluation class initialization works correctly")

	finally:
	os.unlink(temp_file)


	def test_category_filtering():
	"""Ensure category filtering keeps only requested domains."""

	sample_data = [
	{
	"_id": "test_001",
	"domain": "single_document_qa",
	"question": "What is X?",
	"choice_A": "Option A1",
	"choice_B": "Option B1",
	"choice_C": "Option C1",
	"choice_D": "Option D1",
	"answer": "A",
	"context": "Context 1",
	},
	{
	"_id": "test_002",
	"domain": "multi_document_qa",
	"question": "What is Y?",
	"choice_A": "Option A2",
	"choice_B": "Option B2",
	"choice_C": "Option C2",
	"choice_D": "Option D2",
	"answer": "B",
	"context": "Context 2",
	},
	]

	with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
	json.dump(sample_data, f)
	temp_file = f.name

	try:
	eval_instance = LongBenchV2Eval(
	data_source=temp_file,
	categories=["multi_document_qa"],
	)
	assert len(eval_instance.examples) == 1
	assert eval_instance.examples[0]["category"] == "multi_document_qa"
	print("✓ Category filtering works correctly")
	finally:
	os.unlink(temp_file)


	def test_difficulty_metrics():
	"""Validate that difficulty-specific metrics are recorded."""

	sample_data = [
	{
	"_id": "easy_001",
	"domain": "single_document_qa",
	"difficulty": "easy",
	"question": "Easy question?",
	"choice_A": "Correct",
	"choice_B": "Wrong",
	"choice_C": "Wrong",
	"choice_D": "Wrong",
	"answer": "A",
	"context": "Easy context",
	},
	{
	"_id": "hard_001",
	"domain": "single_document_qa",
	"difficulty": "hard",
	"question": "Hard question?",
	"choice_A": "Wrong",
	"choice_B": "Correct",
	"choice_C": "Wrong",
	"choice_D": "Wrong",
	"answer": "B",
	"context": "Hard context",
	},
	]

	with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
	json.dump(sample_data, f)
	temp_file = f.name

	class FixedSampler: # noqa: D401 - simple helper
	"""Mock sampler returning the correct answer based on question text."""

	def _pack_message(self, content: str, role: str):
	return {"content": content, "role": role}

	def __call__(self, messages):
	prompt = messages[0]["content"]
	if "Easy question" in prompt:
	return "The correct answer is (A)"
	return "The correct answer is (B)"

	try:
	eval_instance = LongBenchV2Eval(data_source=temp_file, num_threads=1)
	result = eval_instance(FixedSampler())

	assert result.metrics.get("difficulty_easy") == 1.0
	assert result.metrics.get("difficulty_hard") == 1.0
	print("✓ Difficulty metrics recorded correctly")
	finally:
	os.unlink(temp_file)


	def main():
	"""Run all tests."""
	print("Testing simplified LongBench-v2 evaluation utility...\n")

	test_format_longbench_v2_question()
	test_extract_longbench_v2_answer()
	test_longbench_v2_eval_initialization()
	test_category_filtering()
	test_difficulty_metrics()

	print("\n" + "=" * 50)
	print("✅ ALL TESTS PASSED!")
	print("The simplified implementation follows SGLang patterns")
	print("while maintaining LongBench-v2 compatibility.")
	print("=" * 50)


	if __name__ == "__main__":
	main()

Xet Storage Details

Size:: 7.35 kB
Xet hash:: 1759c24f8754b55b73fd9cfa40c17c7ad9eb2a66a667dc765ad448c9b7f186f9

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.