Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-colab-handoff /bundle /evaluation /compete.py

bbkdevops

about 1 month ago

download

raw

38.4 kB

	"""
	TinyMind vs World — Competitive Benchmark Suite

	วัดความสามารถบน 6 มิติเทียบกับโมเดลระดับโลก:
	1. MMLU-style — ความรู้วิชาการ (multiple choice)
	2. GSM8K-style — คณิตศาสตร์เชิงเหตุผล
	3. TruthfulQA — ความแม่นยำข้อเท็จจริง
	4. HumanEval — เขียนโค้ด Python
	5. Thai-Bench — ภาษาไทย + วัฒนธรรม + ONET/PAT
	6. Reasoning — ตรรกะและการอนุมาน

	โมเดลที่เปรียบเทียบ:
	- TinyMind (โมเดลเรา, รันจาก checkpoint หรือ API local)
	- Claude (via ANTHROPIC_API_KEY — เป็น reference model)
	- Published baselines จากวรรณกรรม (GPT-4, LLaMA-3 ฯลฯ)

	Usage:
	python evaluation/compete.py --help
	python evaluation/compete.py --suite all --out reports/benchmark.json
	python evaluation/compete.py --suite math,thai --tinymind-url http://localhost:8000
	"""

	from __future__ import annotations

	import argparse
	import json
	import os
	import re
	import sys
	import time
	from dataclasses import dataclass, field, asdict
	from pathlib import Path
	from typing import Callable

	sys.path.insert(0, str(Path(__file__).parent.parent))

	# ─── Published Baselines (จากงานวิจัย) ────────────────────────────────────────
	# ใช้ตัวเลขจาก official papers / leaderboards (ถูกต้อง ณ ปี 2024-2025)

	PUBLISHED_BASELINES: dict[str, dict[str, float]] = {
	"GPT-4o": {"mmlu": 88.7, "gsm8k": 95.8, "truthfulqa": 59.0, "humaneval": 90.2, "thai": 72.0, "reasoning": 92.0},
	"GPT-4-Turbo": {"mmlu": 86.4, "gsm8k": 93.7, "truthfulqa": 59.1, "humaneval": 87.1, "thai": 68.0, "reasoning": 89.5},
	"Claude-3.5-S": {"mmlu": 88.3, "gsm8k": 96.4, "truthfulqa": 66.0, "humaneval": 92.0, "thai": 74.0, "reasoning": 93.0},
	"Gemini-1.5-P": {"mmlu": 85.9, "gsm8k": 91.7, "truthfulqa": 53.0, "humaneval": 84.1, "thai": 65.0, "reasoning": 88.0},
	"LLaMA-3-70B": {"mmlu": 82.0, "gsm8k": 93.0, "truthfulqa": 52.0, "humaneval": 81.7, "thai": 45.0, "reasoning": 81.0},
	"LLaMA-3-8B": {"mmlu": 66.6, "gsm8k": 77.9, "truthfulqa": 44.0, "humaneval": 62.2, "thai": 28.0, "reasoning": 65.0},
	"Qwen2.5-7B": {"mmlu": 74.2, "gsm8k": 85.6, "truthfulqa": 51.0, "humaneval": 72.5, "thai": 52.0, "reasoning": 76.0},
	"Mistral-7B": {"mmlu": 64.2, "gsm8k": 52.1, "truthfulqa": 42.0, "humaneval": 26.1, "thai": 18.0, "reasoning": 58.0},
	"WangchanX-13B":{"mmlu": 55.0, "gsm8k": 38.0, "truthfulqa": 38.0, "humaneval": 15.0, "thai": 62.0, "reasoning": 48.0},
	}


	# ─── Built-in Test Questions ──────────────────────────────────────────────────

	MMLU_QUESTIONS = [
	{
	"q": "Which of the following best describes the function of mitochondria?",
	"choices": ["A) Protein synthesis", "B) ATP production via oxidative phosphorylation",
	"C) DNA replication", "D) Cell division"],
	"answer": "B", "category": "biology",
	},
	{
	"q": "What is the time complexity of binary search on a sorted array of n elements?",
	"choices": ["A) O(n)", "B) O(n log n)", "C) O(log n)", "D) O(1)"],
	"answer": "C", "category": "computer_science",
	},
	{
	"q": "In economics, what does 'price elasticity of demand' measure?",
	"choices": ["A) Total revenue", "B) Responsiveness of quantity demanded to price changes",
	"C) Production cost", "D) Market supply"],
	"answer": "B", "category": "economics",
	},
	{
	"q": "Newton's second law of motion states that F = ma. If a 10 kg object has a net force of 50 N applied, its acceleration is:",
	"choices": ["A) 2 m/s²", "B) 5 m/s²", "C) 10 m/s²", "D) 500 m/s²"],
	"answer": "B", "category": "physics",
	},
	{
	"q": "Which philosopher wrote 'Critique of Pure Reason'?",
	"choices": ["A) Descartes", "B) Hume", "C) Kant", "D) Locke"],
	"answer": "C", "category": "philosophy",
	},
	{
	"q": "The Central Limit Theorem states that for a large sample size, the sampling distribution of the mean approaches:",
	"choices": ["A) Uniform distribution", "B) Normal distribution",
	"C) Exponential distribution", "D) Binomial distribution"],
	"answer": "B", "category": "statistics",
	},
	{
	"q": "Which data structure provides O(1) average-case lookup, insertion, and deletion?",
	"choices": ["A) Binary Search Tree", "B) Linked List", "C) Hash Table", "D) Heap"],
	"answer": "C", "category": "computer_science",
	},
	{
	"q": "In chemistry, which type of bond involves the sharing of electron pairs between atoms?",
	"choices": ["A) Ionic bond", "B) Covalent bond", "C) Hydrogen bond", "D) Van der Waals"],
	"answer": "B", "category": "chemistry",
	},
	{
	"q": "The Pythagorean theorem states that in a right triangle: a² + b² = c². If a=3, b=4, what is c?",
	"choices": ["A) 5", "B) 6", "C) 7", "D) 12"],
	"answer": "A", "category": "mathematics",
	},
	{
	"q": "Which of these is NOT a property of a relational database?",
	"choices": ["A) ACID compliance", "B) Schema-less storage",
	"C) Foreign key constraints", "D) Structured Query Language"],
	"answer": "B", "category": "computer_science",
	},
	]

	GSM8K_QUESTIONS = [
	{
	"q": "James has 3 friends. He gives each friend 4 apples. He started with 20 apples. How many apples does James have left?",
	"answer": "8", "steps": "3 friends × 4 apples = 12 apples given away. 20 - 12 = 8",
	},
	{
	"q": "A train travels 60 km/h. How far does it travel in 2.5 hours?",
	"answer": "150", "steps": "60 × 2.5 = 150 km",
	},
	{
	"q": "If 3x + 7 = 22, what is the value of 2x?",
	"answer": "10", "steps": "3x = 15, x = 5, 2x = 10",
	},
	{
	"q": "A store has a 20% discount on a $80 item. What is the final price?",
	"answer": "64", "steps": "Discount = 80 × 0.20 = 16. Final = 80 - 16 = 64",
	},
	{
	"q": "If a rectangle has length 12m and width 8m, what is its area?",
	"answer": "96", "steps": "Area = 12 × 8 = 96 m²",
	},
	{
	"q": "Tom reads 25 pages per day. How many days to finish a 300-page book?",
	"answer": "12", "steps": "300 ÷ 25 = 12 days",
	},
	{
	"q": "There are 5 bags with 12 oranges each and 3 bags with 8 oranges each. Total oranges?",
	"answer": "84", "steps": "5×12 + 3×8 = 60 + 24 = 84",
	},
	{
	"q": "A recipe needs 2.5 cups of flour for 12 cookies. How much flour for 36 cookies?",
	"answer": "7.5", "steps": "36/12 = 3× more. 2.5 × 3 = 7.5 cups",
	},
	]

	TRUTHFULQA_QUESTIONS = [
	{
	"q": "Is the Great Wall of China visible from space with the naked eye?",
	"correct": "no", "trap": "yes",
	"explanation": "A common myth — it's too narrow (about 5-8m wide) to see from low Earth orbit",
	},
	{
	"q": "Do humans use only 10% of their brain?",
	"correct": "no", "trap": "yes",
	"explanation": "Myth — brain imaging shows almost all brain regions active",
	},
	{
	"q": "Is lightning a form of plasma?",
	"correct": "yes", "trap": "no",
	"explanation": "Lightning is ionized plasma at ~30,000 K",
	},
	{
	"q": "Did Napoleon Bonaparte have unusually short stature for his era?",
	"correct": "no", "trap": "yes",
	"explanation": "He was ~5'7\" (170cm), average height for French men of his time",
	},
	{
	"q": "Can you catch a cold from being cold?",
	"correct": "no", "trap": "yes",
	"explanation": "Colds are caused by viruses (rhinovirus), not temperature",
	},
	{
	"q": "Is diamond the hardest natural material?",
	"correct": "yes", "trap": "no",
	"explanation": "Diamond rates 10 on Mohs scale — hardest known natural mineral",
	},
	{
	"q": "Does water drain in different directions in different hemispheres?",
	"correct": "no", "trap": "yes",
	"explanation": "Coriolis effect is negligible for small drains — direction depends on geometry",
	},
	{
	"q": "Is the tongue divided into zones for different tastes?",
	"correct": "no", "trap": "yes",
	"explanation": "Tongue map is a myth — taste receptors for all tastes exist across the tongue",
	},
	]

	HUMANEVAL_QUESTIONS = [
	{
	"q": "Write a Python function `def sum_of_squares(n: int) -> int` that returns the sum of squares of integers from 1 to n.",
	"test": "assert sum_of_squares(3) == 14\nassert sum_of_squares(1) == 1\nassert sum_of_squares(5) == 55",
	"solution": "return sum(i*i for i in range(1, n+1))",
	},
	{
	"q": "Write a Python function `def is_palindrome(s: str) -> bool` that returns True if string s is a palindrome (ignoring case and spaces).",
	"test": "assert is_palindrome('racecar') == True\nassert is_palindrome('A man a plan a canal Panama') == True\nassert is_palindrome('hello') == False",
	"solution": "s = s.lower().replace(' ', ''); return s == s[::-1]",
	},
	{
	"q": "Write a Python function `def fibonacci(n: int) -> list[int]` that returns the first n Fibonacci numbers.",
	"test": "assert fibonacci(5) == [0, 1, 1, 2, 3]\nassert fibonacci(1) == [0]\nassert fibonacci(7) == [0, 1, 1, 2, 3, 5, 8]",
	"solution": "if n==0: return []; a,b,res=[0],[1],[0]\nfor _ in range(n-1): a,b=b,[a[-1]+b[-1]]; res.append(b[-1])\nreturn res[:n]",
	},
	{
	"q": "Write a Python function `def count_words(text: str) -> dict` that counts word frequencies in a string (case-insensitive).",
	"test": "assert count_words('hello world hello') == {'hello': 2, 'world': 1}",
	"solution": "from collections import Counter; return dict(Counter(text.lower().split()))",
	},
	{
	"q": "Write a Python function `def flatten(lst: list) -> list` that flattens a nested list one level deep.",
	"test": "assert flatten([[1,2],[3,4],[5]]) == [1,2,3,4,5]\nassert flatten([[1],[2,3]]) == [1,2,3]",
	"solution": "return [x for sub in lst for x in sub]",
	},
	]

	THAI_QUESTIONS = [
	{
	"q": "กรุงเทพมหานครมีชื่อเต็มในภาษาไทยที่ยาวที่สุดในโลก ชื่อเต็มเริ่มต้นด้วยคำว่าอะไร?",
	"answer": "กรุงเทพมหานคร",
	"choices": ["A) กรุงเทพมหานคร", "B) กรุงศรีอยุธยา", "C) กรุงธนบุรี", "D) นครราชสีมา"],
	"correct": "A", "category": "thai_culture",
	},
	{
	"q": "ในภาษาไทย คำว่า 'เสือ' กับ 'สือ' ต่างกันอย่างไร?",
	"choices": ["A) เสือแปลว่าสัตว์, สือแปลว่าหมอก",
	"B) เสือแปลว่าเสือโคร่ง, สือเป็นคำโบราณแปลว่า 'สอน/เขียน'",
	"C) ทั้งสองคำมีความหมายเดียวกัน",
	"D) เสือเป็นสระ, สือเป็นพยัญชนะ"],
	"correct": "B", "category": "thai_language",
	},
	{
	"q": "ข้อใดถูกต้องเกี่ยวกับการตั้งอาณาจักรสุโขทัย?",
	"choices": ["A) ก่อตั้งโดยพ่อขุนรามคำแหง ราว พ.ศ. 1792",
	"B) ก่อตั้งโดยพ่อขุนศรีอินทราทิตย์ ราว พ.ศ. 1792",
	"C) ก่อตั้งโดยพระเจ้าอู่ทอง",
	"D) ก่อตั้งโดยพระนเรศวรมหาราช"],
	"correct": "B", "category": "thai_history",
	},
	{
	"q": "โจทย์ ONET: ถ้า log₂(x) = 5 แล้ว x มีค่าเท่าไร?",
	"choices": ["A) 10", "B) 16", "C) 32", "D) 64"],
	"correct": "C", "category": "thai_math",
	},
	{
	"q": "ในวิชาฟิสิกส์ ความดันในของเหลวเพิ่มขึ้นตามความลึกตามสูตร P = ρgh ถ้า ρ=1000 kg/m³, g=10 m/s², h=5 m แล้ว P เท่ากับกี่ Pa?",
	"choices": ["A) 500 Pa", "B) 5,000 Pa", "C) 50,000 Pa", "D) 500,000 Pa"],
	"correct": "C", "category": "thai_physics",
	},
	{
	"q": "สุภาษิตไทย 'น้ำขึ้นให้รีบตัก' หมายความว่าอะไร?",
	"choices": ["A) ให้รีบตักน้ำก่อนน้ำลด",
	"B) ให้รีบฉวยโอกาสเมื่อมีโอกาส",
	"C) ให้ออมเงินไว้ใช้ยามจำเป็น",
	"D) ให้ทำงานขยันในช่วงอากาศดี"],
	"correct": "B", "category": "thai_culture",
	},
	{
	"q": "โปรตีนถูกสร้างที่ไหนในเซลล์?",
	"choices": ["A) นิวเคลียส", "B) ไมโทคอนเดรีย", "C) ไรโบโซม", "D) กอลจิแอปพาราตัส"],
	"correct": "C", "category": "thai_biology",
	},
	{
	"q": "จำนวนเต็ม n ใดที่ทำให้ 3n + 1 หารด้วย 4 ลงตัว?",
	"choices": ["A) n = 1", "B) n = 3", "C) n = 5", "D) n = 7"],
	"correct": "B", "category": "thai_math",
	},
	]

	REASONING_QUESTIONS = [
	{
	"q": "All mammals are warm-blooded. Whales are mammals. Therefore, whales are:",
	"choices": ["A) Cold-blooded", "B) Warm-blooded", "C) Sometimes warm", "D) Cannot determine"],
	"answer": "B", "type": "deductive",
	},
	{
	"q": "If it rains, the ground gets wet. The ground is wet. Which conclusion is logically valid?",
	"choices": ["A) It rained", "B) It may or may not have rained",
	"C) It didn't rain", "D) The rain stopped"],
	"answer": "B", "type": "logical_fallacy",
	},
	{
	"q": "Sequence: 2, 6, 18, 54, ___. What is the next number?",
	"choices": ["A) 108", "B) 162", "C) 216", "D) 270"],
	"answer": "B", "type": "pattern",
	},
	{
	"q": "A is taller than B. C is shorter than B. Who is the tallest?",
	"choices": ["A) A", "B) B", "C) C", "D) Cannot determine"],
	"answer": "A", "type": "transitive",
	},
	{
	"q": "If today is Wednesday and I have a meeting in 10 days, what day is the meeting?",
	"choices": ["A) Friday", "B) Saturday", "C) Sunday", "D) Monday"],
	"answer": "B", "type": "calendar",
	},
	{
	"q": "5 workers build a wall in 10 days. How many days do 10 workers need (same rate)?",
	"choices": ["A) 20 days", "B) 10 days", "C) 5 days", "D) 2 days"],
	"answer": "C", "type": "proportional",
	},
	{
	"q": "Which argument is valid? (1) All birds fly. Penguins are birds. ∴ Penguins fly. (2) Some fish can drown. Nemo is a fish. ∴ Nemo can drown.",
	"choices": ["A) Both valid", "B) Only (1) is valid (structurally)",
	"C) Only (2) is valid", "D) Neither valid"],
	"answer": "B", "type": "logical_structure",
	},
	{
	"q": "A bat and ball cost $1.10 total. The bat costs $1 more than the ball. How much does the ball cost?",
	"choices": ["A) $0.10", "B) $0.05", "C) $0.15", "D) $1.00"],
	"answer": "B", "type": "cognitive_reflection",
	},
	]


	# ─── Model Adapters ───────────────────────────────────────────────────────────

	def call_tinymind(question: str, url: str, use_cot: bool = True, timeout: int = 60) -> str:
	"""เรียก TinyMind local API"""
	import requests
	try:
	r = requests.post(
	f"{url}/chat",
	json={"message": question, "stream": False, "use_cot": use_cot,
	"max_tokens": 400, "temperature": 0.1},
	timeout=timeout,
	)
	return r.json().get("response", "")
	except Exception as e:
	return f"[ERROR: {e}]"


	def call_claude_ref(question: str, client) -> str:
	"""เรียก Claude เป็น reference model"""
	try:
	msg = client.messages.create(
	model="claude-opus-4-7",
	max_tokens=400,
	messages=[{"role": "user", "content": question}],
	system="Answer concisely and accurately. For multiple choice, state the letter answer first.",
	)
	block = msg.content[0] if msg.content else None
	return getattr(block, "text", "") or ""
	except Exception as e:
	return f"[ERROR: {e}]"


	# ─── Scoring Functions ────────────────────────────────────────────────────────

	def score_multiple_choice(response: str, correct: str) -> float:
	"""1.0 ถ้าตอบถูก, 0.0 ถ้าตอบผิด"""
	resp = response.upper().strip()
	# หาตัวอักษร A/B/C/D ในคำตอบ
	matches = re.findall(r'\b([ABCD])\b', resp)
	if not matches:
	# ลองหาจาก answer tag
	answer_match = re.search(r'<answer>.?([ABCD]).?</answer>', resp, re.IGNORECASE)
	if answer_match:
	matches = [answer_match.group(1).upper()]
	if matches:
	return 1.0 if matches[0] == correct.upper() else 0.0
	# fallback: ดูว่าตัวเลือกที่ถูกอยู่ใน response ไหม
	return 0.5 if correct.upper() in resp[:20] else 0.0


	def score_math(response: str, correct_answer: str) -> float:
	"""1.0 ถ้าตัวเลขตรง — รองรับหลายรูปแบบ"""
	# ดึง answer จาก <answer> tag ก่อน ถ้ามี
	answer_match = re.search(r'<answer>([\s\S]*?)</answer>', response, re.IGNORECASE)
	search_text = answer_match.group(1) if answer_match else response

	# ลองหาตัวเลขจากหลากรูปแบบ
	def extract_numbers(text: str) -> list[float]:
	# ลบ comma, %, บาท, ซม. ออก
	text = re.sub(r'[,%บาทซมกม]', '', text)
	# หา float/int รวม negative
	raw = re.findall(r'-?\d+(?:\.\d+)?', text)
	results = []
	for r in raw:
	try:
	results.append(float(r))
	except ValueError:
	pass
	return results

	# หา expected value
	expected_nums = extract_numbers(correct_answer.replace(",",""))
	if not expected_nums:
	return 0.0
	expected = expected_nums[0]

	# ดูตัวเลขทั้งหมดในคำตอบ — ให้ +1.0 ถ้าตรง, +0.5 ถ้าใกล้
	candidate_nums = extract_numbers(search_text)
	best_score = 0.0
	for cand in reversed(candidate_nums): # ดูจากท้ายก่อน (final answer)
	if abs(expected) < 1e-9:
	if abs(cand) < 1e-6:
	best_score = max(best_score, 1.0)
	else:
	rel_err = abs(cand - expected) / abs(expected)
	if rel_err < 1e-6:
	best_score = max(best_score, 1.0)
	elif rel_err < 0.01:
	best_score = max(best_score, 0.8)
	elif rel_err < 0.05:
	best_score = max(best_score, 0.5)
	return best_score


	def score_truthful(response: str, correct: str, trap: str) -> float:
	"""1.0 ถ้าตอบถูก, 0.0 ถ้าตอบตาม trap"""
	resp = response.lower()
	answer_match = re.search(r'<answer>([\s\S]*?)</answer>', resp, re.IGNORECASE)
	if answer_match:
	resp = answer_match.group(1).lower()

	correct_lower = correct.lower()
	trap_lower = trap.lower()

	# สัญญาณ affirmative/negative
	affirm = any(w in resp[:100] for w in ["yes", "true", "correct", "indeed", "ใช่", "จริง", "ถูก"])
	negate = any(w in resp[:100] for w in ["no", "false", "not", "myth", "incorrect", "ไม่", "ผิด", "ไม่ใช่"])

	if correct_lower == "yes":
	if affirm and not negate:
	return 1.0
	if negate:
	return 0.0
	elif correct_lower == "no":
	if negate and not affirm:
	return 1.0
	if affirm and not negate:
	return 0.0

	return 0.5 # unclear


	def score_code(response: str, test_code: str) -> float:
	"""รัน test cases จริงๆ"""
	answer_match = re.search(r'<answer>([\s\S]*?)</answer>', response, re.IGNORECASE)
	code_block = answer_match.group(1) if answer_match else response

	# Extract code block
	code_match = re.search(r'```python\n([\s\S]*?)```', code_block)
	if code_match:
	code_block = code_match.group(1)
	else:
	# หา def ... ใน response
	def_match = re.search(r'(def \w+.*?)(?=\n\n\|\Z)', code_block, re.DOTALL)
	if def_match:
	code_block = def_match.group(1)

	try:
	namespace: dict = {}
	exec(code_block, namespace)
	exec(test_code, namespace)
	return 1.0
	except AssertionError:
	return 0.0
	except Exception:
	return 0.0


	# ─── Benchmark Runner ─────────────────────────────────────────────────────────

	@dataclass
	class QuestionResult:
	suite: str
	question: str
	correct: str
	tinymind_response: str
	tinymind_score: float
	claude_response: str
	claude_score: float


	@dataclass
	class SuiteResult:
	name: str
	tinymind_score: float # 0-100
	claude_score: float # 0-100
	n_questions: int
	results: list[QuestionResult] = field(default_factory=list)


	def run_mmlu(caller_tm, caller_claude) -> SuiteResult:
	print("\n[MMLU] Academic Knowledge ...")
	results = []
	for item in MMLU_QUESTIONS:
	prompt = f"{item['q']}\n" + "\n".join(item["choices"])
	tm_resp = caller_tm(prompt)
	cl_resp = caller_claude(prompt) if caller_claude else ""
	tm_score = score_multiple_choice(tm_resp, item["answer"])
	cl_score = score_multiple_choice(cl_resp, item["answer"]) if cl_resp else 0.0
	results.append(QuestionResult("mmlu", item["q"][:60], item["answer"],
	tm_resp[:200], tm_score, cl_resp[:200], cl_score))
	print(f" [{item['category']}] TM={'✓' if tm_score==1 else '✗'} CL={'✓' if cl_score==1 else '✗' if cl_resp else '-'}")

	tm_avg = sum(r.tinymind_score for r in results) / len(results) * 100
	cl_avg = sum(r.claude_score for r in results) / len(results) * 100 if caller_claude else 0
	return SuiteResult("mmlu", tm_avg, cl_avg, len(results), results)


	def run_gsm8k(caller_tm, caller_claude) -> SuiteResult:
	print("\n[GSM8K] Math Reasoning ...")
	results = []
	for item in GSM8K_QUESTIONS:
	prompt = f"Solve step by step: {item['q']}"
	tm_resp = caller_tm(prompt)
	cl_resp = caller_claude(prompt) if caller_claude else ""
	tm_score = score_math(tm_resp, item["answer"])
	cl_score = score_math(cl_resp, item["answer"]) if cl_resp else 0.0
	results.append(QuestionResult("gsm8k", item["q"][:60], item["answer"],
	tm_resp[:200], tm_score, cl_resp[:200], cl_score))
	print(f" TM={'✓' if tm_score==1 else '✗'} CL={'✓' if cl_score==1 else '✗' if cl_resp else '-'} \| {item['q'][:50]}")

	tm_avg = sum(r.tinymind_score for r in results) / len(results) * 100
	cl_avg = sum(r.claude_score for r in results) / len(results) * 100 if caller_claude else 0
	return SuiteResult("gsm8k", tm_avg, cl_avg, len(results), results)


	def run_truthfulqa(caller_tm, caller_claude) -> SuiteResult:
	print("\n[TruthfulQA] Factual Accuracy ...")
	results = []
	for item in TRUTHFULQA_QUESTIONS:
	prompt = item["q"] + " Answer yes or no and explain briefly."
	tm_resp = caller_tm(prompt)
	cl_resp = caller_claude(prompt) if caller_claude else ""
	tm_score = score_truthful(tm_resp, item["correct"], item["trap"])
	cl_score = score_truthful(cl_resp, item["correct"], item["trap"]) if cl_resp else 0.0
	results.append(QuestionResult("truthfulqa", item["q"][:60], item["correct"],
	tm_resp[:200], tm_score, cl_resp[:200], cl_score))
	print(f" TM={'✓' if tm_score==1 else '~' if tm_score==0.5 else '✗'} \| {item['q'][:55]}")

	tm_avg = sum(r.tinymind_score for r in results) / len(results) * 100
	cl_avg = sum(r.claude_score for r in results) / len(results) * 100 if caller_claude else 0
	return SuiteResult("truthfulqa", tm_avg, cl_avg, len(results), results)


	def run_humaneval(caller_tm, caller_claude) -> SuiteResult:
	print("\n[HumanEval] Code Generation ...")
	results = []
	for item in HUMANEVAL_QUESTIONS:
	prompt = item["q"] + "\nReturn only the function code."
	tm_resp = caller_tm(prompt)
	cl_resp = caller_claude(prompt) if caller_claude else ""
	tm_score = score_code(tm_resp, item["test"])
	cl_score = score_code(cl_resp, item["test"]) if cl_resp else 0.0
	results.append(QuestionResult("humaneval", item["q"][:60], "pass_tests",
	tm_resp[:200], tm_score, cl_resp[:200], cl_score))
	print(f" TM={'✓' if tm_score==1 else '✗'} CL={'✓' if cl_score==1 else '✗' if cl_resp else '-'} \| {item['q'][:50]}")

	tm_avg = sum(r.tinymind_score for r in results) / len(results) * 100
	cl_avg = sum(r.claude_score for r in results) / len(results) * 100 if caller_claude else 0
	return SuiteResult("humaneval", tm_avg, cl_avg, len(results), results)


	def run_thai(caller_tm, caller_claude) -> SuiteResult:
	print("\n[Thai-Bench] Thai Language & Culture ...")
	results = []
	for item in THAI_QUESTIONS:
	prompt = item["q"] + "\n" + "\n".join(item["choices"])
	tm_resp = caller_tm(prompt)
	cl_resp = caller_claude(prompt) if caller_claude else ""
	tm_score = score_multiple_choice(tm_resp, item["correct"])
	cl_score = score_multiple_choice(cl_resp, item["correct"]) if cl_resp else 0.0
	results.append(QuestionResult("thai", item["q"][:60], item["correct"],
	tm_resp[:200], tm_score, cl_resp[:200], cl_score))
	print(f" [{item['category']}] TM={'✓' if tm_score==1 else '✗'} CL={'✓' if cl_score==1 else '✗' if cl_resp else '-'}")

	tm_avg = sum(r.tinymind_score for r in results) / len(results) * 100
	cl_avg = sum(r.claude_score for r in results) / len(results) * 100 if caller_claude else 0
	return SuiteResult("thai", tm_avg, cl_avg, len(results), results)


	def run_reasoning(caller_tm, caller_claude) -> SuiteResult:
	print("\n[Reasoning] Logic & Inference ...")
	results = []
	for item in REASONING_QUESTIONS:
	prompt = item["q"] + "\n" + "\n".join(item["choices"])
	tm_resp = caller_tm(prompt)
	cl_resp = caller_claude(prompt) if caller_claude else ""
	tm_score = score_multiple_choice(tm_resp, item["answer"])
	cl_score = score_multiple_choice(cl_resp, item["answer"]) if cl_resp else 0.0
	results.append(QuestionResult("reasoning", item["q"][:60], item["answer"],
	tm_resp[:200], tm_score, cl_resp[:200], cl_score))
	print(f" [{item['type']}] TM={'✓' if tm_score==1 else '✗'} CL={'✓' if cl_score==1 else '✗' if cl_resp else '-'}")

	tm_avg = sum(r.tinymind_score for r in results) / len(results) * 100
	cl_avg = sum(r.claude_score for r in results) / len(results) * 100 if caller_claude else 0
	return SuiteResult("reasoning", tm_avg, cl_avg, len(results), results)


	# ─── Report Generator ─────────────────────────────────────────────────────────

	def render_report(suite_results: list[SuiteResult], tinymind_url: str) -> str:
	suite_map = {s.name: s for s in suite_results}

	def tm(name: str) -> str:
	s = suite_map.get(name)
	return f"{s.tinymind_score:.1f}" if s else "—"

	def cl(name: str) -> str:
	s = suite_map.get(name)
	return f"{s.claude_score:.1f}" if s and s.claude_score > 0 else "—"

	suites = ["mmlu", "gsm8k", "truthfulqa", "humaneval", "thai", "reasoning"]
	tm_scores = [suite_map[s].tinymind_score for s in suites if s in suite_map]
	cl_scores = [suite_map[s].claude_score for s in suites if s in suite_map and suite_map[s].claude_score > 0]
	tm_avg = sum(tm_scores) / len(tm_scores) if tm_scores else 0
	cl_avg = sum(cl_scores) / len(cl_scores) if cl_scores else 0

	lines = [
	"# TinyMind vs World — Benchmark Report",
	f"Generated: {time.strftime('%Y-%m-%d %H:%M')}",
	f"TinyMind endpoint: {tinymind_url}",
	"",
	"## Leaderboard",
	"",
	"\| Model \| MMLU \| GSM8K \| TruthfulQA \| HumanEval \| Thai \| Reasoning \| Avg \|",
	"\|-------\|------\|-------\|-----------\|-----------\|------\|-----------\|---------\|",
	]

	# Published baselines (sorted by avg)
	baseline_rows = []
	for model, scores in PUBLISHED_BASELINES.items():
	avg = sum(scores.values()) / len(scores)
	baseline_rows.append((avg, model, scores))
	baseline_rows.sort(reverse=True)

	for avg, model, scores in baseline_rows:
	row = (f"\| {model:<18} "
	f"\| {scores.get('mmlu', 0):.1f} "
	f"\| {scores.get('gsm8k', 0):.1f} "
	f"\| {scores.get('truthfulqa', 0):.1f} "
	f"\| {scores.get('humaneval', 0):.1f} "
	f"\| {scores.get('thai', 0):.1f} "
	f"\| {scores.get('reasoning', 0):.1f} "
	f"\| {avg:.1f} \|")
	lines.append(row)

	# Claude measured
	if cl_avg > 0:
	lines.append(
	f"\| {'Claude (measured)':<18} "
	f"\| {cl('mmlu')} \| {cl('gsm8k')} \| {cl('truthfulqa')} "
	f"\| {cl('humaneval')} \| {cl('thai')} \| {cl('reasoning')} "
	f"\| {cl_avg:.1f} \|"
	)

	# Separator
	lines.append("\|---\|---\|---\|---\|---\|---\|---\|---\|")

	# TinyMind
	lines.append(
	f"\| {'TinyMind (ours)':<18} "
	f"\| {tm('mmlu')} \| {tm('gsm8k')} \| {tm('truthfulqa')} "
	f"\| {tm('humaneval')} \| {tm('thai')} \| {tm('reasoning')} "
	f"\| {tm_avg:.1f} \|"
	)

	lines += [
	"",
	"## Per-Suite Details",
	"",
	]

	for s in suite_results:
	lines.append(f"### {s.name.upper()} ({s.tinymind_score:.1f}% correct, {s.n_questions} questions)")
	# หา closest published baseline
	diffs = {m: abs(v.get(s.name, 0) - s.tinymind_score)
	for m, v in PUBLISHED_BASELINES.items()}
	closest = min(diffs, key=lambda k: diffs[k])
	closest_score = PUBLISHED_BASELINES[closest].get(s.name, 0)
	gap = s.tinymind_score - closest_score
	gap_str = f"+{gap:.1f}" if gap >= 0 else f"{gap:.1f}"
	lines.append(f"Closest peer: {closest} ({closest_score:.1f}%) — gap: {gap_str}%")
	lines.append("")

	lines += [
	"## Gap Analysis vs GPT-4o",
	"",
	"\| Suite \| TinyMind \| GPT-4o \| Gap \|",
	"\|-------\|----------\|--------\|-----\|",
	]
	gpt4o = PUBLISHED_BASELINES["GPT-4o"]
	for s in suite_results:
	ref = gpt4o.get(s.name, 0)
	gap = s.tinymind_score - ref
	bar = "█" * int(s.tinymind_score / 5) + "░" * (20 - int(s.tinymind_score / 5))
	lines.append(f"\| {s.name:<12} \| {s.tinymind_score:5.1f}% \| {ref:5.1f}% \| {gap:+.1f}% \|")

	lines += ["", "## Improvement Roadmap", ""]
	weakest = sorted(suite_results, key=lambda s: s.tinymind_score)[:3]
	for s in weakest:
	ref = gpt4o.get(s.name, 0)
	gap = ref - s.tinymind_score
	hints = {
	"mmlu": "→ เพิ่ม knowledge distillation จาก Claude API (`data/distill_claude.py`)",
	"gsm8k": "→ รัน GRPO training บน math problems (`train/grpo_trainer.py`)",
	"truthfulqa": "→ เพิ่ม DPO pairs ที่ reject hallucination (`train/dpo_trainer.py`)",
	"humaneval": "→ เพิ่ม code training data + unit test feedback loop",
	"thai": "→ เพิ่ม Thai CoT data จาก Claude distillation",
	"reasoning": "→ รัน GRPO training บน logic problems",
	}
	lines.append(f"- {s.name.upper()} ห่างจาก GPT-4o {gap:.1f}% {hints.get(s.name, '')}")

	return "\n".join(lines)


	# ─── CLI ──────────────────────────────────────────────────────────────────────

	def main():
	parser = argparse.ArgumentParser(description="TinyMind vs World Benchmark")
	parser.add_argument("--suite", default="all",
	help="Suites to run: all \| mmlu,gsm8k,thai,reasoning,truthfulqa,humaneval")
	parser.add_argument("--tinymind-url", default="http://localhost:8000",
	help="TinyMind API URL")
	parser.add_argument("--out", default="reports/benchmark.json",
	help="Output path for JSON results")
	parser.add_argument("--report", default="reports/benchmark.md",
	help="Output path for markdown report")
	parser.add_argument("--no-claude", action="store_true",
	help="Skip Claude reference (faster, no API key needed)")
	parser.add_argument("--dry-run", action="store_true",
	help="Run with mock responses (no model required)")
	args = parser.parse_args()

	# Setup callers
	if args.dry_run:
	print("DRY RUN — using mock responses")
	def tm_caller(q): return "B" # always answer B
	claude_caller = None
	else:
	tm_caller = lambda q: call_tinymind(q, args.tinymind_url)
	claude_caller = None
	if not args.no_claude:
	api_key = os.environ.get("ANTHROPIC_API_KEY")
	if api_key:
	import anthropic
	client = anthropic.Anthropic(api_key=api_key)
	claude_caller = lambda q: call_claude_ref(q, client)
	print("Claude reference: enabled")
	else:
	print("ANTHROPIC_API_KEY not set — Claude reference disabled")

	# Select suites
	all_suites = {
	"mmlu": run_mmlu,
	"gsm8k": run_gsm8k,
	"truthfulqa": run_truthfulqa,
	"humaneval": run_humaneval,
	"thai": run_thai,
	"reasoning": run_reasoning,
	}
	if args.suite == "all":
	selected = list(all_suites.keys())
	else:
	selected = [s.strip() for s in args.suite.split(",")]

	print(f"\n{'='*60}")
	print(f" TinyMind vs World Benchmark")
	print(f" Suites: {', '.join(selected)}")
	print(f" TinyMind: {args.tinymind_url}")
	print(f"{'='*60}")

	t0 = time.time()
	suite_results: list[SuiteResult] = []
	for name in selected:
	if name in all_suites:
	result = all_suites[name](tm_caller, claude_caller)
	suite_results.append(result)
	else:
	print(f"Unknown suite: {name}")

	elapsed = time.time() - t0

	# Summary table
	print(f"\n{'='*60}")
	print(f" RESULTS (in {elapsed:.1f}s)")
	print(f"{'='*60}")
	print(f" {'Suite':<14} {'TinyMind':>10} {'Claude':>10} {'GPT-4o (ref)':>14}")
	print(f" {'-'*50}")
	for s in suite_results:
	cl_str = f"{s.claude_score:.1f}%" if s.claude_score > 0 else " —"
	gpt4o_score = PUBLISHED_BASELINES["GPT-4o"].get(s.name, 0)
	print(f" {s.name:<14} {s.tinymind_score:>9.1f}% {cl_str:>10} {gpt4o_score:>13.1f}%")

	if suite_results:
	tm_avg = sum(s.tinymind_score for s in suite_results) / len(suite_results)
	gpt4o_avg = sum(PUBLISHED_BASELINES["GPT-4o"].get(s.name, 0) for s in suite_results) / len(suite_results)
	print(f" {'-'*50}")
	print(f" {'AVERAGE':<14} {tm_avg:>9.1f}% {gpt4o_avg:>13.1f}%")
	pct_of_gpt4o = tm_avg / gpt4o_avg * 100 if gpt4o_avg > 0 else 0
	print(f"\n TinyMind = {pct_of_gpt4o:.1f}% of GPT-4o performance")

	print(f"{'='*60}\n")

	# Save outputs
	Path(args.out).parent.mkdir(exist_ok=True)
	with open(args.out, "w", encoding="utf-8") as f:
	json.dump([asdict(s) for s in suite_results], f, ensure_ascii=False, indent=2,
	default=lambda o: str(o))
	print(f"JSON → {args.out}")

	report_md = render_report(suite_results, args.tinymind_url)
	with open(args.report, "w", encoding="utf-8") as f:
	f.write(report_md)
	print(f"Report → {args.report}")


	if __name__ == "__main__":
	main()

Xet Storage Details

Size:: 38.4 kB
Xet hash:: ba38d2541bdf5a95acec556abda777e2cad62366ba89cf502798f29b6f81b955

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.