| """ | |
| Test cases for LongBench-v2 evaluation utility. | |
| """ | |
| import json | |
| import os | |
| import tempfile | |
| from sglang.test.simple_eval_longbench_v2 import ( | |
| LongBenchV2Eval, | |
| extract_longbench_v2_answer, | |
| format_longbench_v2_question, | |
| ) | |
def test_format_longbench_v2_question():
    """Check that a row is rendered using the official LongBench-v2 question layout."""
    row = dict(
        context="This is a sample context about environmental issues.",
        question="What is the main theme?",
        A="Technology",
        B="Environment",
        C="Economics",
        D="Politics",
        answer="B",
    )

    prompt = format_longbench_v2_question(row)

    # Each fragment of the official template must appear verbatim in the prompt:
    # the context, the question lead-in, the four lettered options, and the
    # trailing answer cue.
    expected_fragments = (
        "This is a sample context about environmental issues.",
        "What is the correct answer to this question: What is the main theme?",
        "(A) Technology",
        "(B) Environment",
        "(C) Economics",
        "(D) Politics",
        "The correct answer is",
    )
    for fragment in expected_fragments:
        assert fragment in prompt

    print("✓ Question formatting works correctly")
def test_extract_longbench_v2_answer():
    """Check answer-letter extraction for each response phrasing exercised here."""
    # (model response, expected letter) pairs.
    cases = [
        # Official format: "The correct answer is (A)"
        ("After analyzing the context, The correct answer is (B).", "B"),
        # Alternative format without parentheses: "The correct answer is A"
        ("Based on the evidence, The correct answer is C.", "C"),
        # Response wrapped in asterisks
        ("*The correct answer is (D)*", "D"),
        # Fallback to the standard "answer is X" pattern
        ("I think the answer is A.", "A"),
    ]
    for response, expected_letter in cases:
        assert extract_longbench_v2_answer(response) == expected_letter

    # A response that contains no answer at all must produce None.
    assert extract_longbench_v2_answer("I'm not sure about this.") is None

    print("✓ Answer extraction works correctly")
def test_longbench_v2_eval_initialization():
    """Check that LongBenchV2Eval loads a JSON file and respects ``num_examples``."""
    # Two records with different option-key conventions: the first uses
    # "choice_A".."choice_D", the second uses bare "A".."D".
    records = [
        {
            "_id": "test_001",
            "domain": "single_document_qa",
            "question": "What is X?",
            **{f"choice_{letter}": f"Option {letter}1" for letter in "ABCD"},
            "answer": "A",
            "context": "Context 1",
        },
        {
            "_id": "test_002",
            "domain": "multi_document_qa",
            "question": "What is Y?",
            **{letter: f"Option {letter}2" for letter in "ABCD"},
            "answer": "B",
            "context": "Context 2",
        },
    ]

    # delete=False keeps the file on disk after the handle closes so the
    # evaluator can open it by path; it is removed manually in ``finally``.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".json", delete=False
    ) as handle:
        json.dump(records, handle)
        dataset_path = handle.name

    try:
        # Initialise through the ``data_source`` parameter, keeping one example.
        evaluator = LongBenchV2Eval(data_source=dataset_path, num_examples=1)
        loaded = evaluator.examples
        assert len(loaded) == 1

        sample = loaded[0]
        # The assertions accept values from either record, and expect the
        # loaded example to expose "category" and "A" keys.
        assert sample.get("category") in {
            "single_document_qa",
            "multi_document_qa",
        }
        assert sample.get("A") in {"Option A1", "Option A2"}
        print("✓ Evaluation class initialization works correctly")
    finally:
        os.unlink(dataset_path)
def test_category_filtering():
    """Check that passing ``categories`` keeps only examples from those domains."""

    def build_record(suffix, domain, subject, answer):
        # Both fixture records share this shape and use "choice_X" option keys.
        return {
            "_id": f"test_00{suffix}",
            "domain": domain,
            "question": f"What is {subject}?",
            "choice_A": f"Option A{suffix}",
            "choice_B": f"Option B{suffix}",
            "choice_C": f"Option C{suffix}",
            "choice_D": f"Option D{suffix}",
            "answer": answer,
            "context": f"Context {suffix}",
        }

    records = [
        build_record(1, "single_document_qa", "X", "A"),
        build_record(2, "multi_document_qa", "Y", "B"),
    ]

    # Persist the fixture to a temporary JSON file that outlives its handle;
    # it is deleted explicitly once the assertions have run.
    tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False)
    with tmp:
        json.dump(records, tmp)
    dataset_path = tmp.name

    try:
        evaluator = LongBenchV2Eval(
            data_source=dataset_path,
            categories=["multi_document_qa"],
        )
        kept = evaluator.examples
        # Only the multi-document record should remain after filtering.
        assert len(kept) == 1
        assert kept[0]["category"] == "multi_document_qa"
        print("✓ Category filtering works correctly")
    finally:
        os.remove(dataset_path)
def test_difficulty_metrics():
    """Check that the evaluation result carries per-difficulty metrics."""
    # One easy and one hard record, each with a different correct letter.
    easy_record = {
        "_id": "easy_001",
        "domain": "single_document_qa",
        "difficulty": "easy",
        "question": "Easy question?",
        "choice_A": "Correct",
        "choice_B": "Wrong",
        "choice_C": "Wrong",
        "choice_D": "Wrong",
        "answer": "A",
        "context": "Easy context",
    }
    hard_record = {
        "_id": "hard_001",
        "domain": "single_document_qa",
        "difficulty": "hard",
        "question": "Hard question?",
        "choice_A": "Wrong",
        "choice_B": "Correct",
        "choice_C": "Wrong",
        "choice_D": "Wrong",
        "answer": "B",
        "context": "Hard context",
    }

    # delete=False so the evaluator can reopen the file by path afterwards.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".json", delete=False
    ) as handle:
        json.dump([easy_record, hard_record], handle)
        dataset_path = handle.name

    class FixedSampler:  # noqa: D401 - simple helper
        """Stub sampler that picks its reply from the question text in the prompt."""

        def _pack_message(self, content: str, role: str):
            return dict(content=content, role=role)

        def __call__(self, messages):
            # Only the first message is inspected; the easy record's answer is
            # A and the hard record's answer is B.
            first_prompt = messages[0]["content"]
            return (
                "The correct answer is (A)"
                if "Easy question" in first_prompt
                else "The correct answer is (B)"
            )

    try:
        evaluator = LongBenchV2Eval(data_source=dataset_path, num_threads=1)
        metrics = evaluator(FixedSampler()).metrics
        # The stub answers both questions correctly, so each difficulty
        # bucket is expected to report 1.0.
        for metric_name in ("difficulty_easy", "difficulty_hard"):
            assert metrics.get(metric_name) == 1.0
        print("✓ Difficulty metrics recorded correctly")
    finally:
        os.unlink(dataset_path)
def main():
    """Run every test in this module in order, then print a summary banner."""
    print("Testing simplified LongBench-v2 evaluation utility...\n")

    # Executed sequentially; a failing assertion propagates and stops the run
    # before the success banner is printed.
    test_cases = (
        test_format_longbench_v2_question,
        test_extract_longbench_v2_answer,
        test_longbench_v2_eval_initialization,
        test_category_filtering,
        test_difficulty_metrics,
    )
    for run_test in test_cases:
        run_test()

    rule = "=" * 50
    print("\n" + rule)
    print("✅ ALL TESTS PASSED!")
    print("The simplified implementation follows SGLang patterns")
    print("while maintaining LongBench-v2 compatibility.")
    print(rule)
# Allow the module to be executed directly as a script; importing it only
# defines the tests without running them.
if __name__ == "__main__":
    main()
Xet Storage Details
- Size:
- 7.35 kB
- Xet hash:
- 1759c24f8754b55b73fd9cfa40c17c7ad9eb2a66a667dc765ad448c9b7f186f9
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.