""" Final Benchmark: Memory Routing Model Comparison Compares: 1. Our RL-trained Llama-8B model 2. Base Llama-8B (untrained) 3. Cohere Command-R-Plus (teacher model used for data generation) All scenarios are marketing-specific and challenging. """ import asyncio import json import os import time from datetime import datetime from dotenv import load_dotenv load_dotenv() import cohere import tinker from tinker import types from tinker_cookbook import renderers from tinker_cookbook.tokenizer_utils import get_tokenizer VALID_CATEGORIES = { "company.brand_core", "company.strategic_signatures", "company.knowledge_artifacts", "company.business_priorities", "company.tools_config", "company.performance_context", "user.communication_style", "user.strategic_approach", "user.role_context", "user.workflow_patterns", "user.session_history", "user.interaction_preferences", "none" } SYSTEM_PROMPT = """You route marketing conversations into structured memory categories. Available categories: - company.brand_core: Voice, values, positioning, identity anchors - company.strategic_signatures: Decision frameworks, strategic heuristics - company.knowledge_artifacts: Docs, style guides, playbooks - company.business_priorities: Quarterly/seasonal goals, active campaigns - company.tools_config: Integrations, API keys, workflow settings - company.performance_context: Campaign metrics, retrospectives, learnings - user.communication_style: Tone, verbosity, format expectations - user.strategic_approach: Personal priorities, success definitions - user.role_context: Title, scope, decision authority - user.workflow_patterns: Review cadence, collaboration norms - user.session_history: Immediate context, recent asks - user.interaction_preferences: Coaching style, feedback expectations - none: Irrelevant, vague, or transactional content Respond with comma-separated categories. Use 'none' only if no other category applies.""" # 50 Challenging Marketing Scenarios MARKETING_BENCHMARK = [ # ========== BRAND & POSITIONING (1-8) ========== { "id": "brand_1", "conversation": """USER: Our brand DNA is 'accessible luxury'. We're not Chanel, but we're definitely not Target either. Think Everlane meets Aritzia. ASSISTANT: So premium quality at attainable price points, with a focus on transparency? USER: Exactly. Every piece of content should feel elevated but not pretentious.""", "expected": ["company.brand_core"], "difficulty": "medium", "category_focus": "Brand positioning with comparisons" }, { "id": "brand_2", "conversation": """USER: We've been using 'Innovation for Everyone' as our tagline for 5 years. It's core to who we are. ASSISTANT: That's a strong positioning statement. Does it extend to your visual identity too? USER: Yes, our whole design system is built around democratizing technology.""", "expected": ["company.brand_core"], "difficulty": "easy", "category_focus": "Tagline and identity" }, { "id": "brand_3", "conversation": """USER: I hate when brands try too hard to be funny. Our voice should be warm and helpful, never sarcastic. ASSISTANT: So authentic helpfulness over trendy humor? USER: Right. We're the trusted friend, not the class clown.""", "expected": ["company.brand_core"], "difficulty": "medium", "category_focus": "Voice and tone definition" }, { "id": "brand_4", "conversation": """USER: Our competitors use fear tactics - 'Don't miss out!' We never do that. We believe in empowerment. ASSISTANT: Positive framing over scarcity messaging. USER: Always. Even our sales emails should feel like opportunities, not threats.""", "expected": ["company.brand_core", "company.strategic_signatures"], "difficulty": "hard", "category_focus": "Brand values + decision framework" }, { "id": "brand_5", "conversation": """USER: What font are we using for the Q4 campaign headers? ASSISTANT: The brand guidelines specify Montserrat for headers. USER: Thanks, just needed to confirm.""", "expected": ["none"], "difficulty": "easy", "category_focus": "Transactional brand query" }, { "id": "brand_6", "conversation": """USER: We position ourselves as the 'anti-agency agency'. No jargon, no BS, just results. ASSISTANT: Transparency and authenticity as differentiators. USER: And we back it up with our pricing model - no hidden fees, ever.""", "expected": ["company.brand_core"], "difficulty": "medium", "category_focus": "Agency positioning" }, { "id": "brand_7", "conversation": """USER: Our brand is built on three pillars: sustainability, transparency, and community. ASSISTANT: Those are strong values. How do they manifest in your marketing? USER: Every campaign must touch at least one pillar. It's non-negotiable.""", "expected": ["company.brand_core", "company.strategic_signatures"], "difficulty": "hard", "category_focus": "Brand pillars as decision framework" }, { "id": "brand_8", "conversation": """USER: We're rebranding next quarter but I can't share details yet. ASSISTANT: Understood. Let me know when you're ready to discuss. USER: Will do.""", "expected": ["none"], "difficulty": "medium", "category_focus": "Vague future reference" }, # ========== STRATEGIC DECISIONS (9-16) ========== { "id": "strategy_1", "conversation": """USER: We never launch a campaign without at least 3 weeks of testing. It's our golden rule. ASSISTANT: So testing is a non-negotiable gate in your process? USER: Absolutely. Even if stakeholders push back, we hold the line.""", "expected": ["company.strategic_signatures"], "difficulty": "easy", "category_focus": "Decision framework" }, { "id": "strategy_2", "conversation": """USER: Our philosophy is 'measure twice, cut once'. We'd rather delay a launch than ship something half-baked. ASSISTANT: Quality over speed. USER: Every time. Our reputation depends on it.""", "expected": ["company.strategic_signatures"], "difficulty": "medium", "category_focus": "Strategic heuristic" }, { "id": "strategy_3", "conversation": """USER: When in doubt, we default to the customer's perspective. What would they want? ASSISTANT: Customer-centric decision making. USER: It's saved us from many internal-focused mistakes.""", "expected": ["company.strategic_signatures"], "difficulty": "medium", "category_focus": "Decision principle" }, { "id": "strategy_4", "conversation": """USER: I personally believe in testing everything. Data over gut instinct. ASSISTANT: A data-driven approach. USER: It's how I've always operated. Even small decisions get A/B tested.""", "expected": ["user.strategic_approach"], "difficulty": "medium", "category_focus": "Personal vs company strategy" }, { "id": "strategy_5", "conversation": """USER: The company prioritizes brand safety above all. But I think we're too conservative sometimes. ASSISTANT: So there's tension between company policy and your personal view? USER: Yes. I'd take more calculated risks if it were up to me.""", "expected": ["company.strategic_signatures", "user.strategic_approach"], "difficulty": "hard", "category_focus": "Company vs personal strategy conflict" }, { "id": "strategy_6", "conversation": """USER: We have a 'no surprises' policy with clients. They hear bad news from us first. ASSISTANT: Proactive communication as a core principle. USER: It's built trust with every client we've worked with.""", "expected": ["company.strategic_signatures"], "difficulty": "medium", "category_focus": "Client relationship principle" }, { "id": "strategy_7", "conversation": """USER: Should we go with vendor A or vendor B for the email platform? ASSISTANT: What are the key differences you're weighing? USER: Just cost really. They're pretty similar otherwise.""", "expected": ["none"], "difficulty": "easy", "category_focus": "Transactional decision" }, { "id": "strategy_8", "conversation": """USER: Our rule is simple: if we can't explain it to a 10-year-old, we don't publish it. ASSISTANT: Clarity as a content standard. USER: It keeps our messaging accessible to everyone.""", "expected": ["company.strategic_signatures", "company.brand_core"], "difficulty": "hard", "category_focus": "Content principle overlapping brand" }, # ========== PERFORMANCE & METRICS (17-24) ========== { "id": "perf_1", "conversation": """USER: Last quarter's email campaign hit 42% open rate and 8% CTR. Best we've ever done. ASSISTANT: Those are strong numbers. What drove the improvement? USER: We switched to personalized subject lines based on past behavior.""", "expected": ["company.performance_context"], "difficulty": "easy", "category_focus": "Campaign metrics" }, { "id": "perf_2", "conversation": """USER: Our CAC has been climbing - $180 last month, up from $120 in Q1. ASSISTANT: That's a significant increase. Any hypothesis on the cause? USER: iOS privacy changes are killing our Facebook targeting.""", "expected": ["company.performance_context"], "difficulty": "medium", "category_focus": "Acquisition metrics with context" }, { "id": "perf_3", "conversation": """USER: The influencer campaign flopped. 2M impressions but only 50 conversions. ASSISTANT: That's a 0.0025% conversion rate. What went wrong? USER: Wrong audience. The influencer's followers weren't our target demo.""", "expected": ["company.performance_context"], "difficulty": "medium", "category_focus": "Campaign failure analysis" }, { "id": "perf_4", "conversation": """USER: We learned that video content outperforms static by 3x on engagement. ASSISTANT: That's a significant insight. Are you shifting your content mix? USER: Yes, Q4 will be 70% video. It's a major strategic shift.""", "expected": ["company.performance_context", "company.business_priorities"], "difficulty": "hard", "category_focus": "Learning leading to priority" }, { "id": "perf_5", "conversation": """USER: What was our email open rate last week? ASSISTANT: I don't have access to that data in real-time. USER: I'll check the dashboard.""", "expected": ["none"], "difficulty": "easy", "category_focus": "Data lookup request" }, { "id": "perf_6", "conversation": """USER: Our NPS dropped from 72 to 58 after the website redesign. ASSISTANT: That's a significant drop. Customer feedback? USER: They hate the new navigation. We're rolling back next week.""", "expected": ["company.performance_context"], "difficulty": "medium", "category_focus": "Customer satisfaction metrics" }, { "id": "perf_7", "conversation": """USER: The podcast is our best performing channel - 40% of qualified leads come from it. ASSISTANT: That's impressive attribution. How are you tracking it? USER: Custom UTM parameters and post-conversion surveys.""", "expected": ["company.performance_context"], "difficulty": "medium", "category_focus": "Channel performance" }, { "id": "perf_8", "conversation": """USER: We've tested 47 different landing page variants this year. ASSISTANT: That's rigorous testing. What's been the biggest winner? USER: Social proof above the fold. Increased conversions by 34%.""", "expected": ["company.performance_context", "company.knowledge_artifacts"], "difficulty": "hard", "category_focus": "Testing insights becoming knowledge" }, # ========== TOOLS & INTEGRATIONS (25-30) ========== { "id": "tools_1", "conversation": """USER: We use HubSpot for CRM, Klaviyo for email, and Shopify for commerce. Everything needs to sync. ASSISTANT: What's your current integration setup? USER: Zapier connects them, but we're moving to native integrations.""", "expected": ["company.tools_config"], "difficulty": "easy", "category_focus": "Tech stack overview" }, { "id": "tools_2", "conversation": """USER: Our API key for the analytics platform is expiring next month. ASSISTANT: I'll note that for renewal. USER: Thanks. It's critical - everything breaks without it.""", "expected": ["company.tools_config"], "difficulty": "medium", "category_focus": "API configuration" }, { "id": "tools_3", "conversation": """USER: We just migrated from Mailchimp to Klaviyo. The segmentation is so much better. ASSISTANT: What segments are you using? USER: RFM-based: high-value, at-risk, and dormant customers.""", "expected": ["company.tools_config"], "difficulty": "medium", "category_focus": "Tool migration and usage" }, { "id": "tools_4", "conversation": """USER: Can you remind me how to export data from Google Analytics? ASSISTANT: Go to Reports, select your date range, then click Export. USER: Got it, thanks.""", "expected": ["none"], "difficulty": "easy", "category_focus": "How-to question" }, { "id": "tools_5", "conversation": """USER: Our Slack is integrated with everything - Asana, HubSpot, even our custom dashboards. ASSISTANT: Centralized notifications. USER: It's our command center. All marketing alerts go there.""", "expected": ["company.tools_config"], "difficulty": "medium", "category_focus": "Workflow integrations" }, { "id": "tools_6", "conversation": """USER: We're evaluating Notion vs Confluence for our marketing wiki. ASSISTANT: What are your key requirements? USER: Easy editing and good search. Our current docs are a mess.""", "expected": ["none"], "difficulty": "medium", "category_focus": "Tool evaluation (not decided)" }, # ========== USER PREFERENCES & ROLE (31-40) ========== { "id": "user_1", "conversation": """USER: I'm the VP of Growth, reporting to the CMO. I own all acquisition channels. ASSISTANT: That's a broad scope. What's your team size? USER: 12 people across paid, organic, and partnerships.""", "expected": ["user.role_context"], "difficulty": "easy", "category_focus": "Role and scope" }, { "id": "user_2", "conversation": """USER: I prefer bullet points over paragraphs. Get to the point quickly. ASSISTANT: Noted. I'll keep responses concise and scannable. USER: Perfect. Time is my scarcest resource.""", "expected": ["user.communication_style"], "difficulty": "easy", "category_focus": "Format preference" }, { "id": "user_3", "conversation": """USER: Don't sugarcoat things. If something's not working, tell me directly. ASSISTANT: Direct feedback, even when it's uncomfortable. USER: Especially when it's uncomfortable. That's when I need it most.""", "expected": ["user.interaction_preferences"], "difficulty": "medium", "category_focus": "Feedback style" }, { "id": "user_4", "conversation": """USER: I review all campaigns on Friday afternoons. That's when I have focus time. ASSISTANT: I'll schedule campaign reviews for Fridays then. USER: And send me a summary Thursday evening so I can prep.""", "expected": ["user.workflow_patterns"], "difficulty": "medium", "category_focus": "Review cadence" }, { "id": "user_5", "conversation": """USER: Yesterday we discussed the influencer strategy. Can you pull up those notes? ASSISTANT: Yes, we covered micro-influencer targeting and budget allocation. USER: Right. Let's continue from the budget discussion.""", "expected": ["user.session_history"], "difficulty": "easy", "category_focus": "Session continuity" }, { "id": "user_6", "conversation": """USER: I believe in 'strong opinions, loosely held'. I'll argue a point but change my mind with good data. ASSISTANT: Intellectually flexible but not wishy-washy. USER: Exactly. Challenge me when you think I'm wrong.""", "expected": ["user.strategic_approach", "user.interaction_preferences"], "difficulty": "hard", "category_focus": "Personal philosophy + interaction style" }, { "id": "user_7", "conversation": """USER: How's your day going? ASSISTANT: Going well, thanks! How can I help you today? USER: Just checking in before our meeting.""", "expected": ["none"], "difficulty": "easy", "category_focus": "Small talk" }, { "id": "user_8", "conversation": """USER: I'm a visual thinker. Show me charts and diagrams whenever possible. ASSISTANT: I'll prioritize visual representations in my responses. USER: And color-code things by priority if you can.""", "expected": ["user.communication_style"], "difficulty": "medium", "category_focus": "Visual preference" }, { "id": "user_9", "conversation": """USER: I've been in marketing for 15 years, so skip the basics. ASSISTANT: I'll assume expert-level knowledge. USER: Good. I hate when tools explain obvious concepts.""", "expected": ["user.interaction_preferences", "user.role_context"], "difficulty": "hard", "category_focus": "Experience level + interaction" }, { "id": "user_10", "conversation": """USER: My success metric is pipeline contribution. Everything else is vanity. ASSISTANT: Revenue-focused measurement. USER: If it doesn't move pipeline, I don't care about it.""", "expected": ["user.strategic_approach"], "difficulty": "medium", "category_focus": "Personal success definition" }, # ========== BUSINESS PRIORITIES (41-46) ========== { "id": "priority_1", "conversation": """USER: Q4 is all about Black Friday and Cyber Monday. Everything else takes a back seat. ASSISTANT: So BFCM is the top priority through year-end? USER: 60% of our annual revenue happens in those two weeks.""", "expected": ["company.business_priorities"], "difficulty": "easy", "category_focus": "Seasonal priority" }, { "id": "priority_2", "conversation": """USER: We're launching a new product line next month. All marketing resources are shifting to support it. ASSISTANT: What's the launch timeline? USER: Soft launch March 1st, full campaign March 15th.""", "expected": ["company.business_priorities"], "difficulty": "medium", "category_focus": "Product launch priority" }, { "id": "priority_3", "conversation": """USER: The board wants us to focus on profitability over growth this year. ASSISTANT: A shift from growth-at-all-costs. USER: Yes. CAC payback under 12 months is now mandatory.""", "expected": ["company.business_priorities", "company.strategic_signatures"], "difficulty": "hard", "category_focus": "Strategic priority shift" }, { "id": "priority_4", "conversation": """USER: We need to hit 10,000 email subscribers by end of quarter. ASSISTANT: What's the current count? USER: 6,500. So we need about 120 new subs per day.""", "expected": ["company.business_priorities"], "difficulty": "medium", "category_focus": "Specific goal" }, { "id": "priority_5", "conversation": """USER: Retention is our #1 focus this year. We've been leaking customers. ASSISTANT: Churn reduction over acquisition. USER: We're pausing all acquisition campaigns until we fix the retention problem.""", "expected": ["company.business_priorities"], "difficulty": "medium", "category_focus": "Strategic focus" }, { "id": "priority_6", "conversation": """USER: What should we prioritize next quarter? ASSISTANT: That depends on your goals. What's most important to the business? USER: I'm not sure yet. Still figuring it out.""", "expected": ["none"], "difficulty": "medium", "category_focus": "Undefined priority" }, # ========== KNOWLEDGE ARTIFACTS (47-50) ========== { "id": "knowledge_1", "conversation": """USER: We have a 50-page brand book that every agency partner must follow. ASSISTANT: Comprehensive brand guidelines. USER: It covers everything from logo usage to photography style.""", "expected": ["company.knowledge_artifacts"], "difficulty": "easy", "category_focus": "Brand documentation" }, { "id": "knowledge_2", "conversation": """USER: Our content playbook defines the exact process from ideation to publication. ASSISTANT: A documented workflow. USER: Every piece of content goes through the same 7-step process.""", "expected": ["company.knowledge_artifacts"], "difficulty": "medium", "category_focus": "Process documentation" }, { "id": "knowledge_3", "conversation": """USER: We have templates for every email type - welcome, abandoned cart, win-back, you name it. ASSISTANT: A comprehensive email template library. USER: It's saved us hundreds of hours. New team members can start producing immediately.""", "expected": ["company.knowledge_artifacts"], "difficulty": "medium", "category_focus": "Template library" }, { "id": "knowledge_4", "conversation": """USER: Our style guide says we never use exclamation marks in headlines. ASSISTANT: A specific editorial rule. USER: It's part of our understated brand voice.""", "expected": ["company.knowledge_artifacts", "company.brand_core"], "difficulty": "hard", "category_focus": "Style guide overlapping brand" }, ] def parse_prediction(text): """Parse model output into category set.""" if not text or not text.strip(): return set() # Handle various formats text = text.lower().strip() # Remove common prefixes for prefix in ["categories:", "category:", "the categories are:", "answer:"]: if text.startswith(prefix): text = text[len(prefix):].strip() cats = [c.strip() for c in text.split(",")] return {c for c in cats if c in VALID_CATEGORIES} def compute_metrics(predicted, gold): """Compute F1, precision, recall.""" if not predicted and not gold: return 1.0, 1.0, 1.0, True, True if not predicted or not gold: return 0.0, 0.0, 0.0, False, False tp = len(predicted & gold) precision = tp / len(predicted) recall = tp / len(gold) f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0 any_match = tp > 0 exact_match = predicted == gold return f1, precision, recall, any_match, exact_match async def eval_tinker_model(name, checkpoint, model_name, renderer_name): """Evaluate a Tinker model.""" print(f"\n{'='*60}", flush=True) print(f"Evaluating: {name}", flush=True) print(f"{'='*60}", flush=True) service_client = tinker.ServiceClient() sampling_client = service_client.create_sampling_client(model_path=checkpoint) tokenizer = get_tokenizer(model_name) renderer = renderers.get_renderer(name=renderer_name, tokenizer=tokenizer) stop = renderer.get_stop_sequences() params = types.SamplingParams(max_tokens=100, temperature=0.1, stop=stop) results = [] for i, test in enumerate(MARKETING_BENCHMARK): messages = [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": f"Analyze this conversation and determine which memory categories apply:\n\n{test['conversation']}"} ] prompt = renderer.build_generation_prompt(messages) result = sampling_client.sample(prompt=prompt, sampling_params=params, num_samples=1).result() response, _ = renderer.parse_response(result.sequences[0].tokens) predicted = parse_prediction(response["content"]) gold = set(test["expected"]) f1, prec, rec, any_match, exact = compute_metrics(predicted, gold) results.append({ "id": test["id"], "predicted": list(predicted), "gold": list(gold), "f1": f1, "any_match": any_match, "exact_match": exact, "difficulty": test["difficulty"] }) status = "✓" if any_match else "✗" print(f"[{i+1:2d}] {status} {test['id']:<15} F1={f1:.2f}", flush=True) return results async def eval_cohere_model(): """Evaluate Cohere Command-R-Plus (teacher model).""" print(f"\n{'='*60}", flush=True) print(f"Evaluating: Cohere Command-R-Plus (Teacher)", flush=True) print(f"{'='*60}", flush=True) client = cohere.ClientV2(api_key=os.getenv("COHERE_API_KEY")) results = [] for i, test in enumerate(MARKETING_BENCHMARK): prompt = f"""{SYSTEM_PROMPT} Analyze this conversation and determine which memory categories apply: {test['conversation']} Respond with comma-separated categories only. No explanation.""" try: response = client.chat( model="command-r-plus-08-2024", messages=[{"role": "user", "content": prompt}], temperature=0.1, max_tokens=100 ) # Extract text from response response_text = "" if hasattr(response.message, 'content') and response.message.content: for block in response.message.content: if hasattr(block, 'text'): response_text = block.text break predicted = parse_prediction(response_text) gold = set(test["expected"]) f1, prec, rec, any_match, exact = compute_metrics(predicted, gold) results.append({ "id": test["id"], "predicted": list(predicted), "gold": list(gold), "f1": f1, "any_match": any_match, "exact_match": exact, "difficulty": test["difficulty"] }) status = "✓" if any_match else "✗" print(f"[{i+1:2d}] {status} {test['id']:<15} F1={f1:.2f}", flush=True) # Rate limiting await asyncio.sleep(0.5) except Exception as e: print(f"[{i+1:2d}] ERROR {test['id']}: {e}", flush=True) results.append({ "id": test["id"], "predicted": [], "gold": list(test["expected"]), "f1": 0.0, "any_match": False, "exact_match": False, "difficulty": test["difficulty"], "error": str(e) }) return results def compute_summary(results, name): """Compute summary statistics.""" n = len(results) avg_f1 = sum(r["f1"] for r in results) / n any_match = sum(1 for r in results if r["any_match"]) / n exact_match = sum(1 for r in results if r["exact_match"]) / n # By difficulty by_diff = {} for diff in ["easy", "medium", "hard"]: subset = [r for r in results if r["difficulty"] == diff] if subset: by_diff[diff] = { "count": len(subset), "f1": sum(r["f1"] for r in subset) / len(subset), "any_match": sum(1 for r in subset if r["any_match"]) / len(subset), "exact_match": sum(1 for r in subset if r["exact_match"]) / len(subset) } return { "name": name, "total": n, "avg_f1": avg_f1, "any_match": any_match, "exact_match": exact_match, "by_difficulty": by_diff } async def main(): print("=" * 70, flush=True) print("FINAL BENCHMARK: Memory Routing Model Comparison", flush=True) print("50 Challenging Marketing Scenarios", flush=True) print("=" * 70, flush=True) all_results = {} # 1. Our RL-trained model rl_results = await eval_tinker_model( name="Llama-8B + LoRA + RL (Ours)", checkpoint="tinker://4f4bae1f-5a95-5f53-a55a-a14f2872825c:train:0/sampler_weights/rl_iter_012", model_name="meta-llama/Llama-3.1-8B", renderer_name="llama3" ) all_results["rl_model"] = rl_results # 2. Cohere Command-R-Plus (Teacher) cohere_results = await eval_cohere_model() all_results["cohere"] = cohere_results # Compute summaries summaries = { "rl_model": compute_summary(rl_results, "Llama-8B + LoRA + RL (Ours)"), "cohere": compute_summary(cohere_results, "Cohere Command-R-Plus (104B)") } # Print comparison print("\n" + "=" * 70, flush=True) print("BENCHMARK RESULTS", flush=True) print("=" * 70, flush=True) print(f"\n{'Model':<35} {'Any Match':<12} {'Exact':<12} {'Avg F1':<10}", flush=True) print("-" * 70, flush=True) for key, summary in summaries.items(): print(f"{summary['name']:<35} {summary['any_match']:<12.0%} {summary['exact_match']:<12.0%} {summary['avg_f1']:<10.2f}", flush=True) print("\n" + "-" * 70, flush=True) print("RESULTS BY DIFFICULTY", flush=True) print("-" * 70, flush=True) for diff in ["easy", "medium", "hard"]: print(f"\n{diff.upper()}:", flush=True) for key, summary in summaries.items(): if diff in summary["by_difficulty"]: d = summary["by_difficulty"][diff] print(f" {summary['name']:<33} Any={d['any_match']:.0%} Exact={d['exact_match']:.0%} F1={d['f1']:.2f} (n={d['count']})", flush=True) # Save results output = { "benchmark_date": datetime.now().isoformat(), "num_scenarios": len(MARKETING_BENCHMARK), "summaries": summaries, "detailed_results": all_results } os.makedirs("training/benchmarks", exist_ok=True) output_path = f"training/benchmarks/final_benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" with open(output_path, "w") as f: json.dump(output, f, indent=2, default=str) print(f"\nResults saved to: {output_path}", flush=True) # Key findings print("\n" + "=" * 70, flush=True) print("KEY FINDINGS", flush=True) print("=" * 70, flush=True) rl_f1 = summaries["rl_model"]["avg_f1"] cohere_f1 = summaries["cohere"]["avg_f1"] if rl_f1 > cohere_f1: improvement = ((rl_f1 - cohere_f1) / cohere_f1) * 100 print(f"✓ Our 8B model OUTPERFORMS the 104B teacher by {improvement:.1f}% on F1", flush=True) else: gap = ((cohere_f1 - rl_f1) / cohere_f1) * 100 print(f" Our 8B model is within {gap:.1f}% of the 104B teacher on F1", flush=True) print(f"\nModel Sizes:", flush=True) print(f" - Llama-8B + LoRA: ~8B parameters (LoRA adds ~0.1B)", flush=True) print(f" - Cohere Command-R-Plus: ~104B parameters", flush=True) print(f" - Size ratio: 13x smaller", flush=True) if __name__ == "__main__": asyncio.run(main())