Marketing-Memory-Routing-8B / training /final_benchmark.py

Upload folder using huggingface_hub

685d968 verified about 1 month ago

32 kB

	"""
	Final Benchmark: Memory Routing Model Comparison

	Compares:
	1. Our RL-trained Llama-8B model
	2. Base Llama-8B (untrained) — listed as a baseline, but main() does not currently evaluate it
	3. Cohere Command-R-Plus (teacher model used for data generation)

	All scenarios are marketing-specific and challenging.
	"""

	import asyncio
	import json
	import os
	import time
	from datetime import datetime
	from dotenv import load_dotenv
	load_dotenv()

	import cohere
	import tinker
	from tinker import types
	from tinker_cookbook import renderers
	from tinker_cookbook.tokenizer_utils import get_tokenizer

	# Closed set of routing labels accepted by the benchmark. parse_prediction
	# discards any piece of model output that is not a member of this set.
	VALID_CATEGORIES = {
	    # Company-scoped memory
	    "company.brand_core",
	    "company.strategic_signatures",
	    "company.knowledge_artifacts",
	    "company.business_priorities",
	    "company.tools_config",
	    "company.performance_context",
	    # User-scoped memory
	    "user.communication_style",
	    "user.strategic_approach",
	    "user.role_context",
	    "user.workflow_patterns",
	    "user.session_history",
	    "user.interaction_preferences",
	    # Explicit "nothing to store" label
	    "none",
	}

	# Routing instructions shared by both evaluators: eval_tinker_model sends this
	# text as the system message, and eval_cohere_model places it at the top of its
	# single user prompt. The category names listed here are the same 13 labels as
	# VALID_CATEGORIES; keep the two in sync when adding or renaming a category.
	SYSTEM_PROMPT = """You route marketing conversations into structured memory categories.

	Available categories:
	- company.brand_core: Voice, values, positioning, identity anchors
	- company.strategic_signatures: Decision frameworks, strategic heuristics
	- company.knowledge_artifacts: Docs, style guides, playbooks
	- company.business_priorities: Quarterly/seasonal goals, active campaigns
	- company.tools_config: Integrations, API keys, workflow settings
	- company.performance_context: Campaign metrics, retrospectives, learnings
	- user.communication_style: Tone, verbosity, format expectations
	- user.strategic_approach: Personal priorities, success definitions
	- user.role_context: Title, scope, decision authority
	- user.workflow_patterns: Review cadence, collaboration norms
	- user.session_history: Immediate context, recent asks
	- user.interaction_preferences: Coaching style, feedback expectations
	- none: Irrelevant, vague, or transactional content

	Respond with comma-separated categories. Use 'none' only if no other category applies."""

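	# 50 challenging marketing scenarios.
	# Each entry is a dict with the keys:
	#   "id"             - unique scenario identifier (e.g. "brand_1")
	#   "conversation"   - short USER/ASSISTANT transcript shown to the model
	#   "expected"       - list of gold category names (members of VALID_CATEGORIES)
	#   "difficulty"     - "easy", "medium", or "hard"; used for the per-difficulty summary
	#   "category_focus" - short human-readable note on what the scenario probes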
	MARKETING_BENCHMARK = [
	# ========== BRAND & POSITIONING (1-8) ==========
	{
	"id": "brand_1",
	"conversation": """USER: Our brand DNA is 'accessible luxury'. We're not Chanel, but we're definitely not Target either. Think Everlane meets Aritzia.
	ASSISTANT: So premium quality at attainable price points, with a focus on transparency?
	USER: Exactly. Every piece of content should feel elevated but not pretentious.""",
	"expected": ["company.brand_core"],
	"difficulty": "medium",
	"category_focus": "Brand positioning with comparisons"
	},
	{
	"id": "brand_2",
	"conversation": """USER: We've been using 'Innovation for Everyone' as our tagline for 5 years. It's core to who we are.
	ASSISTANT: That's a strong positioning statement. Does it extend to your visual identity too?
	USER: Yes, our whole design system is built around democratizing technology.""",
	"expected": ["company.brand_core"],
	"difficulty": "easy",
	"category_focus": "Tagline and identity"
	},
	{
	"id": "brand_3",
	"conversation": """USER: I hate when brands try too hard to be funny. Our voice should be warm and helpful, never sarcastic.
	ASSISTANT: So authentic helpfulness over trendy humor?
	USER: Right. We're the trusted friend, not the class clown.""",
	"expected": ["company.brand_core"],
	"difficulty": "medium",
	"category_focus": "Voice and tone definition"
	},
	{
	"id": "brand_4",
	"conversation": """USER: Our competitors use fear tactics - 'Don't miss out!' We never do that. We believe in empowerment.
	ASSISTANT: Positive framing over scarcity messaging.
	USER: Always. Even our sales emails should feel like opportunities, not threats.""",
	"expected": ["company.brand_core", "company.strategic_signatures"],
	"difficulty": "hard",
	"category_focus": "Brand values + decision framework"
	},
	{
	"id": "brand_5",
	"conversation": """USER: What font are we using for the Q4 campaign headers?
	ASSISTANT: The brand guidelines specify Montserrat for headers.
	USER: Thanks, just needed to confirm.""",
	"expected": ["none"],
	"difficulty": "easy",
	"category_focus": "Transactional brand query"
	},
	{
	"id": "brand_6",
	"conversation": """USER: We position ourselves as the 'anti-agency agency'. No jargon, no BS, just results.
	ASSISTANT: Transparency and authenticity as differentiators.
	USER: And we back it up with our pricing model - no hidden fees, ever.""",
	"expected": ["company.brand_core"],
	"difficulty": "medium",
	"category_focus": "Agency positioning"
	},
	{
	"id": "brand_7",
	"conversation": """USER: Our brand is built on three pillars: sustainability, transparency, and community.
	ASSISTANT: Those are strong values. How do they manifest in your marketing?
	USER: Every campaign must touch at least one pillar. It's non-negotiable.""",
	"expected": ["company.brand_core", "company.strategic_signatures"],
	"difficulty": "hard",
	"category_focus": "Brand pillars as decision framework"
	},
	{
	"id": "brand_8",
	"conversation": """USER: We're rebranding next quarter but I can't share details yet.
	ASSISTANT: Understood. Let me know when you're ready to discuss.
	USER: Will do.""",
	"expected": ["none"],
	"difficulty": "medium",
	"category_focus": "Vague future reference"
	},

	# ========== STRATEGIC DECISIONS (9-16) ==========
	{
	"id": "strategy_1",
	"conversation": """USER: We never launch a campaign without at least 3 weeks of testing. It's our golden rule.
	ASSISTANT: So testing is a non-negotiable gate in your process?
	USER: Absolutely. Even if stakeholders push back, we hold the line.""",
	"expected": ["company.strategic_signatures"],
	"difficulty": "easy",
	"category_focus": "Decision framework"
	},
	{
	"id": "strategy_2",
	"conversation": """USER: Our philosophy is 'measure twice, cut once'. We'd rather delay a launch than ship something half-baked.
	ASSISTANT: Quality over speed.
	USER: Every time. Our reputation depends on it.""",
	"expected": ["company.strategic_signatures"],
	"difficulty": "medium",
	"category_focus": "Strategic heuristic"
	},
	{
	"id": "strategy_3",
	"conversation": """USER: When in doubt, we default to the customer's perspective. What would they want?
	ASSISTANT: Customer-centric decision making.
	USER: It's saved us from many internal-focused mistakes.""",
	"expected": ["company.strategic_signatures"],
	"difficulty": "medium",
	"category_focus": "Decision principle"
	},
	{
	"id": "strategy_4",
	"conversation": """USER: I personally believe in testing everything. Data over gut instinct.
	ASSISTANT: A data-driven approach.
	USER: It's how I've always operated. Even small decisions get A/B tested.""",
	"expected": ["user.strategic_approach"],
	"difficulty": "medium",
	"category_focus": "Personal vs company strategy"
	},
	{
	"id": "strategy_5",
	"conversation": """USER: The company prioritizes brand safety above all. But I think we're too conservative sometimes.
	ASSISTANT: So there's tension between company policy and your personal view?
	USER: Yes. I'd take more calculated risks if it were up to me.""",
	"expected": ["company.strategic_signatures", "user.strategic_approach"],
	"difficulty": "hard",
	"category_focus": "Company vs personal strategy conflict"
	},
	{
	"id": "strategy_6",
	"conversation": """USER: We have a 'no surprises' policy with clients. They hear bad news from us first.
	ASSISTANT: Proactive communication as a core principle.
	USER: It's built trust with every client we've worked with.""",
	"expected": ["company.strategic_signatures"],
	"difficulty": "medium",
	"category_focus": "Client relationship principle"
	},
	{
	"id": "strategy_7",
	"conversation": """USER: Should we go with vendor A or vendor B for the email platform?
	ASSISTANT: What are the key differences you're weighing?
	USER: Just cost really. They're pretty similar otherwise.""",
	"expected": ["none"],
	"difficulty": "easy",
	"category_focus": "Transactional decision"
	},
	{
	"id": "strategy_8",
	"conversation": """USER: Our rule is simple: if we can't explain it to a 10-year-old, we don't publish it.
	ASSISTANT: Clarity as a content standard.
	USER: It keeps our messaging accessible to everyone.""",
	"expected": ["company.strategic_signatures", "company.brand_core"],
	"difficulty": "hard",
	"category_focus": "Content principle overlapping brand"
	},

	# ========== PERFORMANCE & METRICS (17-24) ==========
	{
	"id": "perf_1",
	"conversation": """USER: Last quarter's email campaign hit 42% open rate and 8% CTR. Best we've ever done.
	ASSISTANT: Those are strong numbers. What drove the improvement?
	USER: We switched to personalized subject lines based on past behavior.""",
	"expected": ["company.performance_context"],
	"difficulty": "easy",
	"category_focus": "Campaign metrics"
	},
	{
	"id": "perf_2",
	"conversation": """USER: Our CAC has been climbing - $180 last month, up from $120 in Q1.
	ASSISTANT: That's a significant increase. Any hypothesis on the cause?
	USER: iOS privacy changes are killing our Facebook targeting.""",
	"expected": ["company.performance_context"],
	"difficulty": "medium",
	"category_focus": "Acquisition metrics with context"
	},
	{
	"id": "perf_3",
	"conversation": """USER: The influencer campaign flopped. 2M impressions but only 50 conversions.
	ASSISTANT: That's a 0.0025% conversion rate. What went wrong?
	USER: Wrong audience. The influencer's followers weren't our target demo.""",
	"expected": ["company.performance_context"],
	"difficulty": "medium",
	"category_focus": "Campaign failure analysis"
	},
	{
	"id": "perf_4",
	"conversation": """USER: We learned that video content outperforms static by 3x on engagement.
	ASSISTANT: That's a significant insight. Are you shifting your content mix?
	USER: Yes, Q4 will be 70% video. It's a major strategic shift.""",
	"expected": ["company.performance_context", "company.business_priorities"],
	"difficulty": "hard",
	"category_focus": "Learning leading to priority"
	},
	{
	"id": "perf_5",
	"conversation": """USER: What was our email open rate last week?
	ASSISTANT: I don't have access to that data in real-time.
	USER: I'll check the dashboard.""",
	"expected": ["none"],
	"difficulty": "easy",
	"category_focus": "Data lookup request"
	},
	{
	"id": "perf_6",
	"conversation": """USER: Our NPS dropped from 72 to 58 after the website redesign.
	ASSISTANT: That's a significant drop. Customer feedback?
	USER: They hate the new navigation. We're rolling back next week.""",
	"expected": ["company.performance_context"],
	"difficulty": "medium",
	"category_focus": "Customer satisfaction metrics"
	},
	{
	"id": "perf_7",
	"conversation": """USER: The podcast is our best performing channel - 40% of qualified leads come from it.
	ASSISTANT: That's impressive attribution. How are you tracking it?
	USER: Custom UTM parameters and post-conversion surveys.""",
	"expected": ["company.performance_context"],
	"difficulty": "medium",
	"category_focus": "Channel performance"
	},
	{
	"id": "perf_8",
	"conversation": """USER: We've tested 47 different landing page variants this year.
	ASSISTANT: That's rigorous testing. What's been the biggest winner?
	USER: Social proof above the fold. Increased conversions by 34%.""",
	"expected": ["company.performance_context", "company.knowledge_artifacts"],
	"difficulty": "hard",
	"category_focus": "Testing insights becoming knowledge"
	},

	# ========== TOOLS & INTEGRATIONS (25-30) ==========
	{
	"id": "tools_1",
	"conversation": """USER: We use HubSpot for CRM, Klaviyo for email, and Shopify for commerce. Everything needs to sync.
	ASSISTANT: What's your current integration setup?
	USER: Zapier connects them, but we're moving to native integrations.""",
	"expected": ["company.tools_config"],
	"difficulty": "easy",
	"category_focus": "Tech stack overview"
	},
	{
	"id": "tools_2",
	"conversation": """USER: Our API key for the analytics platform is expiring next month.
	ASSISTANT: I'll note that for renewal.
	USER: Thanks. It's critical - everything breaks without it.""",
	"expected": ["company.tools_config"],
	"difficulty": "medium",
	"category_focus": "API configuration"
	},
	{
	"id": "tools_3",
	"conversation": """USER: We just migrated from Mailchimp to Klaviyo. The segmentation is so much better.
	ASSISTANT: What segments are you using?
	USER: RFM-based: high-value, at-risk, and dormant customers.""",
	"expected": ["company.tools_config"],
	"difficulty": "medium",
	"category_focus": "Tool migration and usage"
	},
	{
	"id": "tools_4",
	"conversation": """USER: Can you remind me how to export data from Google Analytics?
	ASSISTANT: Go to Reports, select your date range, then click Export.
	USER: Got it, thanks.""",
	"expected": ["none"],
	"difficulty": "easy",
	"category_focus": "How-to question"
	},
	{
	"id": "tools_5",
	"conversation": """USER: Our Slack is integrated with everything - Asana, HubSpot, even our custom dashboards.
	ASSISTANT: Centralized notifications.
	USER: It's our command center. All marketing alerts go there.""",
	"expected": ["company.tools_config"],
	"difficulty": "medium",
	"category_focus": "Workflow integrations"
	},
	{
	"id": "tools_6",
	"conversation": """USER: We're evaluating Notion vs Confluence for our marketing wiki.
	ASSISTANT: What are your key requirements?
	USER: Easy editing and good search. Our current docs are a mess.""",
	"expected": ["none"],
	"difficulty": "medium",
	"category_focus": "Tool evaluation (not decided)"
	},

	# ========== USER PREFERENCES & ROLE (31-40) ==========
	{
	"id": "user_1",
	"conversation": """USER: I'm the VP of Growth, reporting to the CMO. I own all acquisition channels.
	ASSISTANT: That's a broad scope. What's your team size?
	USER: 12 people across paid, organic, and partnerships.""",
	"expected": ["user.role_context"],
	"difficulty": "easy",
	"category_focus": "Role and scope"
	},
	{
	"id": "user_2",
	"conversation": """USER: I prefer bullet points over paragraphs. Get to the point quickly.
	ASSISTANT: Noted. I'll keep responses concise and scannable.
	USER: Perfect. Time is my scarcest resource.""",
	"expected": ["user.communication_style"],
	"difficulty": "easy",
	"category_focus": "Format preference"
	},
	{
	"id": "user_3",
	"conversation": """USER: Don't sugarcoat things. If something's not working, tell me directly.
	ASSISTANT: Direct feedback, even when it's uncomfortable.
	USER: Especially when it's uncomfortable. That's when I need it most.""",
	"expected": ["user.interaction_preferences"],
	"difficulty": "medium",
	"category_focus": "Feedback style"
	},
	{
	"id": "user_4",
	"conversation": """USER: I review all campaigns on Friday afternoons. That's when I have focus time.
	ASSISTANT: I'll schedule campaign reviews for Fridays then.
	USER: And send me a summary Thursday evening so I can prep.""",
	"expected": ["user.workflow_patterns"],
	"difficulty": "medium",
	"category_focus": "Review cadence"
	},
	{
	"id": "user_5",
	"conversation": """USER: Yesterday we discussed the influencer strategy. Can you pull up those notes?
	ASSISTANT: Yes, we covered micro-influencer targeting and budget allocation.
	USER: Right. Let's continue from the budget discussion.""",
	"expected": ["user.session_history"],
	"difficulty": "easy",
	"category_focus": "Session continuity"
	},
	{
	"id": "user_6",
	"conversation": """USER: I believe in 'strong opinions, loosely held'. I'll argue a point but change my mind with good data.
	ASSISTANT: Intellectually flexible but not wishy-washy.
	USER: Exactly. Challenge me when you think I'm wrong.""",
	"expected": ["user.strategic_approach", "user.interaction_preferences"],
	"difficulty": "hard",
	"category_focus": "Personal philosophy + interaction style"
	},
	{
	"id": "user_7",
	"conversation": """USER: How's your day going?
	ASSISTANT: Going well, thanks! How can I help you today?
	USER: Just checking in before our meeting.""",
	"expected": ["none"],
	"difficulty": "easy",
	"category_focus": "Small talk"
	},
	{
	"id": "user_8",
	"conversation": """USER: I'm a visual thinker. Show me charts and diagrams whenever possible.
	ASSISTANT: I'll prioritize visual representations in my responses.
	USER: And color-code things by priority if you can.""",
	"expected": ["user.communication_style"],
	"difficulty": "medium",
	"category_focus": "Visual preference"
	},
	{
	"id": "user_9",
	"conversation": """USER: I've been in marketing for 15 years, so skip the basics.
	ASSISTANT: I'll assume expert-level knowledge.
	USER: Good. I hate when tools explain obvious concepts.""",
	"expected": ["user.interaction_preferences", "user.role_context"],
	"difficulty": "hard",
	"category_focus": "Experience level + interaction"
	},
	{
	"id": "user_10",
	"conversation": """USER: My success metric is pipeline contribution. Everything else is vanity.
	ASSISTANT: Revenue-focused measurement.
	USER: If it doesn't move pipeline, I don't care about it.""",
	"expected": ["user.strategic_approach"],
	"difficulty": "medium",
	"category_focus": "Personal success definition"
	},

	# ========== BUSINESS PRIORITIES (41-46) ==========
	{
	"id": "priority_1",
	"conversation": """USER: Q4 is all about Black Friday and Cyber Monday. Everything else takes a back seat.
	ASSISTANT: So BFCM is the top priority through year-end?
	USER: 60% of our annual revenue happens in those two weeks.""",
	"expected": ["company.business_priorities"],
	"difficulty": "easy",
	"category_focus": "Seasonal priority"
	},
	{
	"id": "priority_2",
	"conversation": """USER: We're launching a new product line next month. All marketing resources are shifting to support it.
	ASSISTANT: What's the launch timeline?
	USER: Soft launch March 1st, full campaign March 15th.""",
	"expected": ["company.business_priorities"],
	"difficulty": "medium",
	"category_focus": "Product launch priority"
	},
	{
	"id": "priority_3",
	"conversation": """USER: The board wants us to focus on profitability over growth this year.
	ASSISTANT: A shift from growth-at-all-costs.
	USER: Yes. CAC payback under 12 months is now mandatory.""",
	"expected": ["company.business_priorities", "company.strategic_signatures"],
	"difficulty": "hard",
	"category_focus": "Strategic priority shift"
	},
	{
	"id": "priority_4",
	"conversation": """USER: We need to hit 10,000 email subscribers by end of quarter.
	ASSISTANT: What's the current count?
	USER: 6,500. So we need about 120 new subs per day.""",
	"expected": ["company.business_priorities"],
	"difficulty": "medium",
	"category_focus": "Specific goal"
	},
	{
	"id": "priority_5",
	"conversation": """USER: Retention is our #1 focus this year. We've been leaking customers.
	ASSISTANT: Churn reduction over acquisition.
	USER: We're pausing all acquisition campaigns until we fix the retention problem.""",
	"expected": ["company.business_priorities"],
	"difficulty": "medium",
	"category_focus": "Strategic focus"
	},
	{
	"id": "priority_6",
	"conversation": """USER: What should we prioritize next quarter?
	ASSISTANT: That depends on your goals. What's most important to the business?
	USER: I'm not sure yet. Still figuring it out.""",
	"expected": ["none"],
	"difficulty": "medium",
	"category_focus": "Undefined priority"
	},

	# ========== KNOWLEDGE ARTIFACTS (47-50) ==========
	{
	"id": "knowledge_1",
	"conversation": """USER: We have a 50-page brand book that every agency partner must follow.
	ASSISTANT: Comprehensive brand guidelines.
	USER: It covers everything from logo usage to photography style.""",
	"expected": ["company.knowledge_artifacts"],
	"difficulty": "easy",
	"category_focus": "Brand documentation"
	},
	{
	"id": "knowledge_2",
	"conversation": """USER: Our content playbook defines the exact process from ideation to publication.
	ASSISTANT: A documented workflow.
	USER: Every piece of content goes through the same 7-step process.""",
	"expected": ["company.knowledge_artifacts"],
	"difficulty": "medium",
	"category_focus": "Process documentation"
	},
	{
	"id": "knowledge_3",
	"conversation": """USER: We have templates for every email type - welcome, abandoned cart, win-back, you name it.
	ASSISTANT: A comprehensive email template library.
	USER: It's saved us hundreds of hours. New team members can start producing immediately.""",
	"expected": ["company.knowledge_artifacts"],
	"difficulty": "medium",
	"category_focus": "Template library"
	},
	{
	"id": "knowledge_4",
	"conversation": """USER: Our style guide says we never use exclamation marks in headlines.
	ASSISTANT: A specific editorial rule.
	USER: It's part of our understated brand voice.""",
	"expected": ["company.knowledge_artifacts", "company.brand_core"],
	"difficulty": "hard",
	"category_focus": "Style guide overlapping brand"
	},
	]


	def parse_prediction(text, valid_categories=None):
	    """Parse raw model output into a set of recognised category names.

	    The output is lower-cased, a known leading label such as ``categories:``
	    is removed, and the remainder is split into candidate names. Commas,
	    newlines and semicolons are all treated as separators, and surrounding
	    quotes, brackets, bullets and trailing periods are stripped, so answers
	    such as ``"company.brand_core."`` or a bulleted list are still scored
	    instead of being silently dropped.

	    Args:
	        text: Raw text returned by the model. ``None`` or blank text yields
	            an empty set.
	        valid_categories: Optional collection of accepted category names.
	            Defaults to the module-level ``VALID_CATEGORIES``.

	    Returns:
	        The set of candidate names that are members of the accepted
	        categories. Unrecognised candidates are ignored.
	    """
	    if not text or not text.strip():
	        return set()
	    allowed = VALID_CATEGORIES if valid_categories is None else valid_categories

	    cleaned = text.lower().strip()
	    # Remove common leading labels the models sometimes emit.
	    for prefix in ("categories:", "category:", "the categories are:", "answer:"):
	        if cleaned.startswith(prefix):
	            cleaned = cleaned[len(prefix):].strip()

	    # Category names never contain newlines or semicolons, so it is safe to
	    # treat them as additional separators.
	    for separator in ("\n", ";"):
	        cleaned = cleaned.replace(separator, ",")

	    # Characters that may wrap a name but never start or end a valid one.
	    wrapper_chars = " \t\r'\"`.[](){}*-•"
	    candidates = (piece.strip(wrapper_chars) for piece in cleaned.split(","))
	    return {name for name in candidates if name in allowed}


	def compute_metrics(predicted, gold):
	    """Score one predicted category set against its gold category set.

	    Args:
	        predicted: Set of category names produced by the model.
	        gold: Set of expected category names.

	    Returns:
	        A 5-tuple ``(f1, precision, recall, any_match, exact_match)``.
	        The first three are floats; ``any_match`` is True when at least one
	        predicted category is in ``gold``; ``exact_match`` is True when the
	        two sets are equal. Two empty sets count as a perfect match; exactly
	        one empty set scores zero on every measure.
	    """
	    if not predicted and not gold:
	        return 1.0, 1.0, 1.0, True, True
	    if not predicted or not gold:
	        return 0.0, 0.0, 0.0, False, False

	    tp = len(predicted & gold)
	    precision = tp / len(predicted)
	    recall = tp / len(gold)
	    # With no overlap both precision and recall are zero, so the harmonic mean
	    # is undefined; report 0.0 (a float, like every other branch).
	    f1 = 2 * precision * recall / (precision + recall) if tp else 0.0

	    return f1, precision, recall, tp > 0, predicted == gold


	async def eval_tinker_model(name, checkpoint, model_name, renderer_name):
	    """Run every benchmark scenario through a Tinker sampling client.

	    Args:
	        name: Display name printed in the progress banner.
	        checkpoint: Model path handed to ``create_sampling_client``.
	        model_name: Name used to load the tokenizer.
	        renderer_name: Name of the chat renderer that builds prompts and
	            parses responses.

	    Returns:
	        A list with one dict per scenario containing ``id``, ``predicted``,
	        ``gold``, ``f1``, ``any_match``, ``exact_match`` and ``difficulty``.
	    """
	    print(f"\n{'='*60}", flush=True)
	    print(f"Evaluating: {name}", flush=True)
	    print(f"{'='*60}", flush=True)

	    sampler = tinker.ServiceClient().create_sampling_client(model_path=checkpoint)
	    chat_renderer = renderers.get_renderer(
	        name=renderer_name,
	        tokenizer=get_tokenizer(model_name),
	    )
	    sampling = types.SamplingParams(
	        max_tokens=100,
	        temperature=0.1,
	        stop=chat_renderer.get_stop_sequences(),
	    )

	    scored = []
	    for position, scenario in enumerate(MARKETING_BENCHMARK, start=1):
	        user_turn = f"Analyze this conversation and determine which memory categories apply:\n\n{scenario['conversation']}"
	        conversation = [
	            {"role": "system", "content": SYSTEM_PROMPT},
	            {"role": "user", "content": user_turn},
	        ]

	        model_input = chat_renderer.build_generation_prompt(conversation)
	        # .result() waits for the sample before moving to the next scenario.
	        completion = sampler.sample(
	            prompt=model_input,
	            sampling_params=sampling,
	            num_samples=1,
	        ).result()
	        message, _ = chat_renderer.parse_response(completion.sequences[0].tokens)

	        predicted = parse_prediction(message["content"])
	        gold = set(scenario["expected"])
	        f1, _precision, _recall, any_match, exact = compute_metrics(predicted, gold)

	        scored.append({
	            "id": scenario["id"],
	            "predicted": list(predicted),
	            "gold": list(gold),
	            "f1": f1,
	            "any_match": any_match,
	            "exact_match": exact,
	            "difficulty": scenario["difficulty"],
	        })

	        marker = "✓" if any_match else "✗"
	        print(f"[{position:2d}] {marker} {scenario['id']:<15} F1={f1:.2f}", flush=True)

	    return scored


	async def eval_cohere_model():
	    """Evaluate Cohere Command-R-Plus (teacher model) on every benchmark scenario.

	    Each scenario is sent as a single user message to
	    ``command-r-plus-08-2024``. The API key is read from the
	    ``COHERE_API_KEY`` environment variable.

	    A failure on one scenario is deliberately non-fatal: it is printed,
	    recorded as a zero-score result with an ``error`` field, and the run
	    continues. The 0.5 s rate-limit pause is applied after every scenario,
	    including failed ones, so an API error is never followed by an
	    immediate retry-like burst of requests.

	    Returns:
	        A list with one dict per scenario containing ``id``, ``predicted``,
	        ``gold``, ``f1``, ``any_match``, ``exact_match`` and ``difficulty``
	        (plus ``error`` when the scenario failed).
	    """
	    print(f"\n{'='*60}", flush=True)
	    print("Evaluating: Cohere Command-R-Plus (Teacher)", flush=True)
	    print(f"{'='*60}", flush=True)

	    client = cohere.ClientV2(api_key=os.getenv("COHERE_API_KEY"))

	    results = []

	    for i, test in enumerate(MARKETING_BENCHMARK):
	        prompt = f"""{SYSTEM_PROMPT}

	Analyze this conversation and determine which memory categories apply:

	{test['conversation']}

	Respond with comma-separated categories only. No explanation."""

	        try:
	            response = client.chat(
	                model="command-r-plus-08-2024",
	                messages=[{"role": "user", "content": prompt}],
	                temperature=0.1,
	                max_tokens=100,
	            )

	            # Use the first content block that carries text; fall back to ""
	            # when the message has no content or no text block.
	            content_blocks = getattr(response.message, "content", None) or []
	            response_text = next(
	                (block.text for block in content_blocks if hasattr(block, "text")),
	                "",
	            )

	            predicted = parse_prediction(response_text)
	            gold = set(test["expected"])

	            f1, _, _, any_match, exact = compute_metrics(predicted, gold)

	            results.append({
	                "id": test["id"],
	                "predicted": list(predicted),
	                "gold": list(gold),
	                "f1": f1,
	                "any_match": any_match,
	                "exact_match": exact,
	                "difficulty": test["difficulty"],
	            })

	            status = "✓" if any_match else "✗"
	            print(f"[{i+1:2d}] {status} {test['id']:<15} F1={f1:.2f}", flush=True)

	        except Exception as e:
	            # Best-effort by design: keep the benchmark running and make the
	            # failure visible in both the console and the saved results.
	            print(f"[{i+1:2d}] ERROR {test['id']}: {e}", flush=True)
	            results.append({
	                "id": test["id"],
	                "predicted": [],
	                "gold": list(test["expected"]),
	                "f1": 0.0,
	                "any_match": False,
	                "exact_match": False,
	                "difficulty": test["difficulty"],
	                "error": str(e),
	            })

	        finally:
	            # Rate limiting. In `finally` so it also runs after an error.
	            await asyncio.sleep(0.5)

	    return results


	def compute_summary(results, name):
	    """Aggregate per-scenario results into overall and per-difficulty scores.

	    Args:
	        results: List of result dicts, each with ``f1``, ``any_match``,
	            ``exact_match`` and ``difficulty`` keys.
	        name: Display name of the evaluated model.

	    Returns:
	        A dict with ``name``, ``total``, ``avg_f1``, ``any_match``,
	        ``exact_match`` and ``by_difficulty``. The match values are fractions
	        in [0, 1]. ``by_difficulty`` has an entry (``count``, ``f1``,
	        ``any_match``, ``exact_match``) for each of "easy", "medium" and
	        "hard" that has at least one result. An empty ``results`` list yields
	        zero scores and an empty ``by_difficulty`` instead of raising
	        ``ZeroDivisionError``.
	    """
	    def _rates(rows):
	        # Mean F1 and hit fractions for a non-empty list of result dicts.
	        count = len(rows)
	        return {
	            "f1": sum(r["f1"] for r in rows) / count,
	            "any_match": sum(1 for r in rows if r["any_match"]) / count,
	            "exact_match": sum(1 for r in rows if r["exact_match"]) / count,
	        }

	    total = len(results)
	    if total:
	        overall = _rates(results)
	    else:
	        overall = {"f1": 0.0, "any_match": 0.0, "exact_match": 0.0}

	    # By difficulty
	    by_diff = {}
	    for diff in ("easy", "medium", "hard"):
	        subset = [r for r in results if r["difficulty"] == diff]
	        if subset:
	            by_diff[diff] = {"count": len(subset), **_rates(subset)}

	    return {
	        "name": name,
	        "total": total,
	        "avg_f1": overall["f1"],
	        "any_match": overall["any_match"],
	        "exact_match": overall["exact_match"],
	        "by_difficulty": by_diff,
	    }


	async def main():
	    """Run the benchmark end to end and report the comparison.

	    Evaluates the RL-trained Tinker checkpoint and the Cohere teacher model
	    on ``MARKETING_BENCHMARK``, prints overall and per-difficulty tables,
	    writes summaries plus per-scenario results to a timestamped JSON file
	    under ``training/benchmarks/`` (relative to the working directory), and
	    prints a short "key findings" comparison of average F1.
	    """
	    print("=" * 70, flush=True)
	    print("FINAL BENCHMARK: Memory Routing Model Comparison", flush=True)
	    print("50 Challenging Marketing Scenarios", flush=True)
	    print("=" * 70, flush=True)

	    all_results = {}

	    # 1. Our RL-trained model
	    rl_results = await eval_tinker_model(
	        name="Llama-8B + LoRA + RL (Ours)",
	        checkpoint="tinker://4f4bae1f-5a95-5f53-a55a-a14f2872825c:train:0/sampler_weights/rl_iter_012",
	        model_name="meta-llama/Llama-3.1-8B",
	        renderer_name="llama3",
	    )
	    all_results["rl_model"] = rl_results

	    # 2. Cohere Command-R-Plus (Teacher)
	    cohere_results = await eval_cohere_model()
	    all_results["cohere"] = cohere_results

	    # Compute summaries
	    summaries = {
	        "rl_model": compute_summary(rl_results, "Llama-8B + LoRA + RL (Ours)"),
	        "cohere": compute_summary(cohere_results, "Cohere Command-R-Plus (104B)"),
	    }

	    # Print comparison
	    print("\n" + "=" * 70, flush=True)
	    print("BENCHMARK RESULTS", flush=True)
	    print("=" * 70, flush=True)

	    print(f"\n{'Model':<35} {'Any Match':<12} {'Exact':<12} {'Avg F1':<10}", flush=True)
	    print("-" * 70, flush=True)

	    for summary in summaries.values():
	        print(f"{summary['name']:<35} {summary['any_match']:<12.0%} {summary['exact_match']:<12.0%} {summary['avg_f1']:<10.2f}", flush=True)

	    print("\n" + "-" * 70, flush=True)
	    print("RESULTS BY DIFFICULTY", flush=True)
	    print("-" * 70, flush=True)

	    for diff in ["easy", "medium", "hard"]:
	        print(f"\n{diff.upper()}:", flush=True)
	        for summary in summaries.values():
	            if diff in summary["by_difficulty"]:
	                d = summary["by_difficulty"][diff]
	                print(f" {summary['name']:<33} Any={d['any_match']:.0%} Exact={d['exact_match']:.0%} F1={d['f1']:.2f} (n={d['count']})", flush=True)

	    # Save results. One timestamp is taken so the date stored inside the file
	    # and the date in the file name always refer to the same instant.
	    run_time = datetime.now()
	    output = {
	        "benchmark_date": run_time.isoformat(),
	        "num_scenarios": len(MARKETING_BENCHMARK),
	        "summaries": summaries,
	        "detailed_results": all_results,
	    }

	    os.makedirs("training/benchmarks", exist_ok=True)
	    output_path = f"training/benchmarks/final_benchmark_{run_time.strftime('%Y%m%d_%H%M%S')}.json"

	    with open(output_path, "w", encoding="utf-8") as f:
	        json.dump(output, f, indent=2, default=str)

	    print(f"\nResults saved to: {output_path}", flush=True)

	    # Key findings
	    print("\n" + "=" * 70, flush=True)
	    print("KEY FINDINGS", flush=True)
	    print("=" * 70, flush=True)

	    rl_f1 = summaries["rl_model"]["avg_f1"]
	    cohere_f1 = summaries["cohere"]["avg_f1"]

	    if cohere_f1 == 0:
	        # The teacher scores 0.0 when every Cohere request failed (for example
	        # a missing API key). A relative percentage would divide by zero, so
	        # report the raw numbers instead of crashing after the run finished.
	        print(f" Teacher F1 is {cohere_f1:.2f} (ours: {rl_f1:.2f}); relative comparison is undefined", flush=True)
	    elif rl_f1 > cohere_f1:
	        improvement = ((rl_f1 - cohere_f1) / cohere_f1) * 100
	        print(f"✓ Our 8B model OUTPERFORMS the 104B teacher by {improvement:.1f}% on F1", flush=True)
	    else:
	        gap = ((cohere_f1 - rl_f1) / cohere_f1) * 100
	        print(f" Our 8B model is within {gap:.1f}% of the 104B teacher on F1", flush=True)

	    print("\nModel Sizes:", flush=True)
	    print(" - Llama-8B + LoRA: ~8B parameters (LoRA adds ~0.1B)", flush=True)
	    print(" - Cohere Command-R-Plus: ~104B parameters", flush=True)
	    print(" - Size ratio: 13x smaller", flush=True)


	# Script entry point: when the file is executed directly, run the async
	# benchmark driver main() to completion on a fresh event loop.
	if __name__ == "__main__":
	asyncio.run(main())