MuratcanKoylan's picture
Upload folder using huggingface_hub
685d968 verified
"""
Final Benchmark: Memory Routing Model Comparison
Compares:
1. Our RL-trained Llama-8B model
2. Base Llama-8B (untrained) -- note: main() in this script does not currently run this baseline
3. Cohere Command-R-Plus (teacher model used for data generation)
All scenarios are marketing-specific and challenging.
"""
import asyncio
import json
import os
import time
from datetime import datetime
from dotenv import load_dotenv
load_dotenv()
import cohere
import tinker
from tinker import types
from tinker_cookbook import renderers
from tinker_cookbook.tokenizer_utils import get_tokenizer
# Closed set of category labels accepted from a model response.
# parse_prediction() discards any token that is not in this set. The labels
# mirror the list spelled out in SYSTEM_PROMPT, plus the explicit "none" label.
VALID_CATEGORIES = {
    "company.brand_core", "company.strategic_signatures", "company.knowledge_artifacts",
    "company.business_priorities", "company.tools_config", "company.performance_context",
    "user.communication_style", "user.strategic_approach", "user.role_context",
    "user.workflow_patterns", "user.session_history", "user.interaction_preferences",
    "none"
}
# Routing instructions given to every evaluated model.
# eval_tinker_model() sends this as the "system" message; eval_cohere_model()
# embeds it at the top of its single user message. The text is part of the
# benchmark definition: changing it changes what is being measured.
SYSTEM_PROMPT = """You route marketing conversations into structured memory categories.
Available categories:
- company.brand_core: Voice, values, positioning, identity anchors
- company.strategic_signatures: Decision frameworks, strategic heuristics
- company.knowledge_artifacts: Docs, style guides, playbooks
- company.business_priorities: Quarterly/seasonal goals, active campaigns
- company.tools_config: Integrations, API keys, workflow settings
- company.performance_context: Campaign metrics, retrospectives, learnings
- user.communication_style: Tone, verbosity, format expectations
- user.strategic_approach: Personal priorities, success definitions
- user.role_context: Title, scope, decision authority
- user.workflow_patterns: Review cadence, collaboration norms
- user.session_history: Immediate context, recent asks
- user.interaction_preferences: Coaching style, feedback expectations
- none: Irrelevant, vague, or transactional content
Respond with comma-separated categories. Use 'none' only if no other category applies."""
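# 50 challenging marketing scenarios.
# Each entry is a dict with:
#   "id"             - unique scenario identifier
#   "conversation"   - a short USER/ASSISTANT transcript to be routed
#   "expected"       - list of gold category labels (["none"] when nothing should be stored)
#   "difficulty"     - "easy", "medium", or "hard"
#   "category_focus" - short note on what the scenario is probing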
MARKETING_BENCHMARK = [
# ========== BRAND & POSITIONING (1-8) ==========
{
"id": "brand_1",
"conversation": """USER: Our brand DNA is 'accessible luxury'. We're not Chanel, but we're definitely not Target either. Think Everlane meets Aritzia.
ASSISTANT: So premium quality at attainable price points, with a focus on transparency?
USER: Exactly. Every piece of content should feel elevated but not pretentious.""",
"expected": ["company.brand_core"],
"difficulty": "medium",
"category_focus": "Brand positioning with comparisons"
},
{
"id": "brand_2",
"conversation": """USER: We've been using 'Innovation for Everyone' as our tagline for 5 years. It's core to who we are.
ASSISTANT: That's a strong positioning statement. Does it extend to your visual identity too?
USER: Yes, our whole design system is built around democratizing technology.""",
"expected": ["company.brand_core"],
"difficulty": "easy",
"category_focus": "Tagline and identity"
},
{
"id": "brand_3",
"conversation": """USER: I hate when brands try too hard to be funny. Our voice should be warm and helpful, never sarcastic.
ASSISTANT: So authentic helpfulness over trendy humor?
USER: Right. We're the trusted friend, not the class clown.""",
"expected": ["company.brand_core"],
"difficulty": "medium",
"category_focus": "Voice and tone definition"
},
{
"id": "brand_4",
"conversation": """USER: Our competitors use fear tactics - 'Don't miss out!' We never do that. We believe in empowerment.
ASSISTANT: Positive framing over scarcity messaging.
USER: Always. Even our sales emails should feel like opportunities, not threats.""",
"expected": ["company.brand_core", "company.strategic_signatures"],
"difficulty": "hard",
"category_focus": "Brand values + decision framework"
},
{
"id": "brand_5",
"conversation": """USER: What font are we using for the Q4 campaign headers?
ASSISTANT: The brand guidelines specify Montserrat for headers.
USER: Thanks, just needed to confirm.""",
"expected": ["none"],
"difficulty": "easy",
"category_focus": "Transactional brand query"
},
{
"id": "brand_6",
"conversation": """USER: We position ourselves as the 'anti-agency agency'. No jargon, no BS, just results.
ASSISTANT: Transparency and authenticity as differentiators.
USER: And we back it up with our pricing model - no hidden fees, ever.""",
"expected": ["company.brand_core"],
"difficulty": "medium",
"category_focus": "Agency positioning"
},
{
"id": "brand_7",
"conversation": """USER: Our brand is built on three pillars: sustainability, transparency, and community.
ASSISTANT: Those are strong values. How do they manifest in your marketing?
USER: Every campaign must touch at least one pillar. It's non-negotiable.""",
"expected": ["company.brand_core", "company.strategic_signatures"],
"difficulty": "hard",
"category_focus": "Brand pillars as decision framework"
},
{
"id": "brand_8",
"conversation": """USER: We're rebranding next quarter but I can't share details yet.
ASSISTANT: Understood. Let me know when you're ready to discuss.
USER: Will do.""",
"expected": ["none"],
"difficulty": "medium",
"category_focus": "Vague future reference"
},
# ========== STRATEGIC DECISIONS (9-16) ==========
{
"id": "strategy_1",
"conversation": """USER: We never launch a campaign without at least 3 weeks of testing. It's our golden rule.
ASSISTANT: So testing is a non-negotiable gate in your process?
USER: Absolutely. Even if stakeholders push back, we hold the line.""",
"expected": ["company.strategic_signatures"],
"difficulty": "easy",
"category_focus": "Decision framework"
},
{
"id": "strategy_2",
"conversation": """USER: Our philosophy is 'measure twice, cut once'. We'd rather delay a launch than ship something half-baked.
ASSISTANT: Quality over speed.
USER: Every time. Our reputation depends on it.""",
"expected": ["company.strategic_signatures"],
"difficulty": "medium",
"category_focus": "Strategic heuristic"
},
{
"id": "strategy_3",
"conversation": """USER: When in doubt, we default to the customer's perspective. What would they want?
ASSISTANT: Customer-centric decision making.
USER: It's saved us from many internal-focused mistakes.""",
"expected": ["company.strategic_signatures"],
"difficulty": "medium",
"category_focus": "Decision principle"
},
{
"id": "strategy_4",
"conversation": """USER: I personally believe in testing everything. Data over gut instinct.
ASSISTANT: A data-driven approach.
USER: It's how I've always operated. Even small decisions get A/B tested.""",
"expected": ["user.strategic_approach"],
"difficulty": "medium",
"category_focus": "Personal vs company strategy"
},
{
"id": "strategy_5",
"conversation": """USER: The company prioritizes brand safety above all. But I think we're too conservative sometimes.
ASSISTANT: So there's tension between company policy and your personal view?
USER: Yes. I'd take more calculated risks if it were up to me.""",
"expected": ["company.strategic_signatures", "user.strategic_approach"],
"difficulty": "hard",
"category_focus": "Company vs personal strategy conflict"
},
{
"id": "strategy_6",
"conversation": """USER: We have a 'no surprises' policy with clients. They hear bad news from us first.
ASSISTANT: Proactive communication as a core principle.
USER: It's built trust with every client we've worked with.""",
"expected": ["company.strategic_signatures"],
"difficulty": "medium",
"category_focus": "Client relationship principle"
},
{
"id": "strategy_7",
"conversation": """USER: Should we go with vendor A or vendor B for the email platform?
ASSISTANT: What are the key differences you're weighing?
USER: Just cost really. They're pretty similar otherwise.""",
"expected": ["none"],
"difficulty": "easy",
"category_focus": "Transactional decision"
},
{
"id": "strategy_8",
"conversation": """USER: Our rule is simple: if we can't explain it to a 10-year-old, we don't publish it.
ASSISTANT: Clarity as a content standard.
USER: It keeps our messaging accessible to everyone.""",
"expected": ["company.strategic_signatures", "company.brand_core"],
"difficulty": "hard",
"category_focus": "Content principle overlapping brand"
},
# ========== PERFORMANCE & METRICS (17-24) ==========
{
"id": "perf_1",
"conversation": """USER: Last quarter's email campaign hit 42% open rate and 8% CTR. Best we've ever done.
ASSISTANT: Those are strong numbers. What drove the improvement?
USER: We switched to personalized subject lines based on past behavior.""",
"expected": ["company.performance_context"],
"difficulty": "easy",
"category_focus": "Campaign metrics"
},
{
"id": "perf_2",
"conversation": """USER: Our CAC has been climbing - $180 last month, up from $120 in Q1.
ASSISTANT: That's a significant increase. Any hypothesis on the cause?
USER: iOS privacy changes are killing our Facebook targeting.""",
"expected": ["company.performance_context"],
"difficulty": "medium",
"category_focus": "Acquisition metrics with context"
},
{
"id": "perf_3",
"conversation": """USER: The influencer campaign flopped. 2M impressions but only 50 conversions.
ASSISTANT: That's a 0.0025% conversion rate. What went wrong?
USER: Wrong audience. The influencer's followers weren't our target demo.""",
"expected": ["company.performance_context"],
"difficulty": "medium",
"category_focus": "Campaign failure analysis"
},
{
"id": "perf_4",
"conversation": """USER: We learned that video content outperforms static by 3x on engagement.
ASSISTANT: That's a significant insight. Are you shifting your content mix?
USER: Yes, Q4 will be 70% video. It's a major strategic shift.""",
"expected": ["company.performance_context", "company.business_priorities"],
"difficulty": "hard",
"category_focus": "Learning leading to priority"
},
{
"id": "perf_5",
"conversation": """USER: What was our email open rate last week?
ASSISTANT: I don't have access to that data in real-time.
USER: I'll check the dashboard.""",
"expected": ["none"],
"difficulty": "easy",
"category_focus": "Data lookup request"
},
{
"id": "perf_6",
"conversation": """USER: Our NPS dropped from 72 to 58 after the website redesign.
ASSISTANT: That's a significant drop. Customer feedback?
USER: They hate the new navigation. We're rolling back next week.""",
"expected": ["company.performance_context"],
"difficulty": "medium",
"category_focus": "Customer satisfaction metrics"
},
{
"id": "perf_7",
"conversation": """USER: The podcast is our best performing channel - 40% of qualified leads come from it.
ASSISTANT: That's impressive attribution. How are you tracking it?
USER: Custom UTM parameters and post-conversion surveys.""",
"expected": ["company.performance_context"],
"difficulty": "medium",
"category_focus": "Channel performance"
},
{
"id": "perf_8",
"conversation": """USER: We've tested 47 different landing page variants this year.
ASSISTANT: That's rigorous testing. What's been the biggest winner?
USER: Social proof above the fold. Increased conversions by 34%.""",
"expected": ["company.performance_context", "company.knowledge_artifacts"],
"difficulty": "hard",
"category_focus": "Testing insights becoming knowledge"
},
# ========== TOOLS & INTEGRATIONS (25-30) ==========
{
"id": "tools_1",
"conversation": """USER: We use HubSpot for CRM, Klaviyo for email, and Shopify for commerce. Everything needs to sync.
ASSISTANT: What's your current integration setup?
USER: Zapier connects them, but we're moving to native integrations.""",
"expected": ["company.tools_config"],
"difficulty": "easy",
"category_focus": "Tech stack overview"
},
{
"id": "tools_2",
"conversation": """USER: Our API key for the analytics platform is expiring next month.
ASSISTANT: I'll note that for renewal.
USER: Thanks. It's critical - everything breaks without it.""",
"expected": ["company.tools_config"],
"difficulty": "medium",
"category_focus": "API configuration"
},
{
"id": "tools_3",
"conversation": """USER: We just migrated from Mailchimp to Klaviyo. The segmentation is so much better.
ASSISTANT: What segments are you using?
USER: RFM-based: high-value, at-risk, and dormant customers.""",
"expected": ["company.tools_config"],
"difficulty": "medium",
"category_focus": "Tool migration and usage"
},
{
"id": "tools_4",
"conversation": """USER: Can you remind me how to export data from Google Analytics?
ASSISTANT: Go to Reports, select your date range, then click Export.
USER: Got it, thanks.""",
"expected": ["none"],
"difficulty": "easy",
"category_focus": "How-to question"
},
{
"id": "tools_5",
"conversation": """USER: Our Slack is integrated with everything - Asana, HubSpot, even our custom dashboards.
ASSISTANT: Centralized notifications.
USER: It's our command center. All marketing alerts go there.""",
"expected": ["company.tools_config"],
"difficulty": "medium",
"category_focus": "Workflow integrations"
},
{
"id": "tools_6",
"conversation": """USER: We're evaluating Notion vs Confluence for our marketing wiki.
ASSISTANT: What are your key requirements?
USER: Easy editing and good search. Our current docs are a mess.""",
"expected": ["none"],
"difficulty": "medium",
"category_focus": "Tool evaluation (not decided)"
},
# ========== USER PREFERENCES & ROLE (31-40) ==========
{
"id": "user_1",
"conversation": """USER: I'm the VP of Growth, reporting to the CMO. I own all acquisition channels.
ASSISTANT: That's a broad scope. What's your team size?
USER: 12 people across paid, organic, and partnerships.""",
"expected": ["user.role_context"],
"difficulty": "easy",
"category_focus": "Role and scope"
},
{
"id": "user_2",
"conversation": """USER: I prefer bullet points over paragraphs. Get to the point quickly.
ASSISTANT: Noted. I'll keep responses concise and scannable.
USER: Perfect. Time is my scarcest resource.""",
"expected": ["user.communication_style"],
"difficulty": "easy",
"category_focus": "Format preference"
},
{
"id": "user_3",
"conversation": """USER: Don't sugarcoat things. If something's not working, tell me directly.
ASSISTANT: Direct feedback, even when it's uncomfortable.
USER: Especially when it's uncomfortable. That's when I need it most.""",
"expected": ["user.interaction_preferences"],
"difficulty": "medium",
"category_focus": "Feedback style"
},
{
"id": "user_4",
"conversation": """USER: I review all campaigns on Friday afternoons. That's when I have focus time.
ASSISTANT: I'll schedule campaign reviews for Fridays then.
USER: And send me a summary Thursday evening so I can prep.""",
"expected": ["user.workflow_patterns"],
"difficulty": "medium",
"category_focus": "Review cadence"
},
{
"id": "user_5",
"conversation": """USER: Yesterday we discussed the influencer strategy. Can you pull up those notes?
ASSISTANT: Yes, we covered micro-influencer targeting and budget allocation.
USER: Right. Let's continue from the budget discussion.""",
"expected": ["user.session_history"],
"difficulty": "easy",
"category_focus": "Session continuity"
},
{
"id": "user_6",
"conversation": """USER: I believe in 'strong opinions, loosely held'. I'll argue a point but change my mind with good data.
ASSISTANT: Intellectually flexible but not wishy-washy.
USER: Exactly. Challenge me when you think I'm wrong.""",
"expected": ["user.strategic_approach", "user.interaction_preferences"],
"difficulty": "hard",
"category_focus": "Personal philosophy + interaction style"
},
{
"id": "user_7",
"conversation": """USER: How's your day going?
ASSISTANT: Going well, thanks! How can I help you today?
USER: Just checking in before our meeting.""",
"expected": ["none"],
"difficulty": "easy",
"category_focus": "Small talk"
},
{
"id": "user_8",
"conversation": """USER: I'm a visual thinker. Show me charts and diagrams whenever possible.
ASSISTANT: I'll prioritize visual representations in my responses.
USER: And color-code things by priority if you can.""",
"expected": ["user.communication_style"],
"difficulty": "medium",
"category_focus": "Visual preference"
},
{
"id": "user_9",
"conversation": """USER: I've been in marketing for 15 years, so skip the basics.
ASSISTANT: I'll assume expert-level knowledge.
USER: Good. I hate when tools explain obvious concepts.""",
"expected": ["user.interaction_preferences", "user.role_context"],
"difficulty": "hard",
"category_focus": "Experience level + interaction"
},
{
"id": "user_10",
"conversation": """USER: My success metric is pipeline contribution. Everything else is vanity.
ASSISTANT: Revenue-focused measurement.
USER: If it doesn't move pipeline, I don't care about it.""",
"expected": ["user.strategic_approach"],
"difficulty": "medium",
"category_focus": "Personal success definition"
},
# ========== BUSINESS PRIORITIES (41-46) ==========
{
"id": "priority_1",
"conversation": """USER: Q4 is all about Black Friday and Cyber Monday. Everything else takes a back seat.
ASSISTANT: So BFCM is the top priority through year-end?
USER: 60% of our annual revenue happens in those two weeks.""",
"expected": ["company.business_priorities"],
"difficulty": "easy",
"category_focus": "Seasonal priority"
},
{
"id": "priority_2",
"conversation": """USER: We're launching a new product line next month. All marketing resources are shifting to support it.
ASSISTANT: What's the launch timeline?
USER: Soft launch March 1st, full campaign March 15th.""",
"expected": ["company.business_priorities"],
"difficulty": "medium",
"category_focus": "Product launch priority"
},
{
"id": "priority_3",
"conversation": """USER: The board wants us to focus on profitability over growth this year.
ASSISTANT: A shift from growth-at-all-costs.
USER: Yes. CAC payback under 12 months is now mandatory.""",
"expected": ["company.business_priorities", "company.strategic_signatures"],
"difficulty": "hard",
"category_focus": "Strategic priority shift"
},
{
"id": "priority_4",
"conversation": """USER: We need to hit 10,000 email subscribers by end of quarter.
ASSISTANT: What's the current count?
USER: 6,500. So we need about 120 new subs per day.""",
"expected": ["company.business_priorities"],
"difficulty": "medium",
"category_focus": "Specific goal"
},
{
"id": "priority_5",
"conversation": """USER: Retention is our #1 focus this year. We've been leaking customers.
ASSISTANT: Churn reduction over acquisition.
USER: We're pausing all acquisition campaigns until we fix the retention problem.""",
"expected": ["company.business_priorities"],
"difficulty": "medium",
"category_focus": "Strategic focus"
},
{
"id": "priority_6",
"conversation": """USER: What should we prioritize next quarter?
ASSISTANT: That depends on your goals. What's most important to the business?
USER: I'm not sure yet. Still figuring it out.""",
"expected": ["none"],
"difficulty": "medium",
"category_focus": "Undefined priority"
},
# ========== KNOWLEDGE ARTIFACTS (47-50) ==========
{
"id": "knowledge_1",
"conversation": """USER: We have a 50-page brand book that every agency partner must follow.
ASSISTANT: Comprehensive brand guidelines.
USER: It covers everything from logo usage to photography style.""",
"expected": ["company.knowledge_artifacts"],
"difficulty": "easy",
"category_focus": "Brand documentation"
},
{
"id": "knowledge_2",
"conversation": """USER: Our content playbook defines the exact process from ideation to publication.
ASSISTANT: A documented workflow.
USER: Every piece of content goes through the same 7-step process.""",
"expected": ["company.knowledge_artifacts"],
"difficulty": "medium",
"category_focus": "Process documentation"
},
{
"id": "knowledge_3",
"conversation": """USER: We have templates for every email type - welcome, abandoned cart, win-back, you name it.
ASSISTANT: A comprehensive email template library.
USER: It's saved us hundreds of hours. New team members can start producing immediately.""",
"expected": ["company.knowledge_artifacts"],
"difficulty": "medium",
"category_focus": "Template library"
},
{
"id": "knowledge_4",
"conversation": """USER: Our style guide says we never use exclamation marks in headlines.
ASSISTANT: A specific editorial rule.
USER: It's part of our understated brand voice.""",
"expected": ["company.knowledge_artifacts", "company.brand_core"],
"difficulty": "hard",
"category_focus": "Style guide overlapping brand"
},
]
def parse_prediction(text):
    """Parse raw model output into a set of valid category labels.

    The text is lower-cased, a known lead-in such as ``"categories:"`` is
    removed, and the remainder is split into candidate labels. Commas,
    semicolons and newlines all act as separators, and decoration around a
    label (quotes, backticks, brackets, list bullets, a trailing period) is
    stripped, so answers like ``"company.brand_core."`` or a bulleted list
    are still recognised instead of being silently scored as a miss.

    Args:
        text: The model's response text. May be ``None`` or empty.

    Returns:
        The set of candidate labels that are members of ``VALID_CATEGORIES``.
        Unrecognised tokens are dropped; an empty/blank input yields ``set()``.
    """
    if not text or not text.strip():
        return set()

    cleaned = text.lower().strip()

    # Drop common lead-ins that models put in front of the actual list.
    for prefix in ("categories:", "category:", "the categories are:", "answer:"):
        if cleaned.startswith(prefix):
            cleaned = cleaned[len(prefix):].strip()

    # Treat newlines and semicolons like commas so one-label-per-line answers
    # are split correctly. No valid label contains any of these characters.
    for separator in ("\n", ";"):
        cleaned = cleaned.replace(separator, ",")

    # Characters that may wrap a label but never start or end a valid one.
    # Dots inside a label (e.g. "company.brand_core") are untouched because
    # str.strip only removes characters from the two ends.
    wrappers = " \t\r\"'`.*-[]()"
    candidates = (piece.strip(wrappers) for piece in cleaned.split(","))
    return {label for label in candidates if label in VALID_CATEGORIES}
def compute_metrics(predicted, gold):
    """Score one predicted label set against the gold label set.

    Args:
        predicted: Set of labels produced by the model.
        gold: Set of expected labels.

    Returns:
        A 5-tuple ``(f1, precision, recall, any_match, exact_match)``:
        the three scores are floats in ``[0.0, 1.0]``; ``any_match`` is True
        when at least one label overlaps; ``exact_match`` is True when the
        two sets are equal.

        Two empty sets count as a perfect match. If exactly one side is
        empty, every score is 0.0 and both flags are False.
    """
    if not predicted and not gold:
        return 1.0, 1.0, 1.0, True, True
    if not predicted or not gold:
        return 0.0, 0.0, 0.0, False, False

    overlap = len(predicted & gold)
    precision = overlap / len(predicted)
    recall = overlap / len(gold)

    # With no overlap both precision and recall are 0; return a float 0.0
    # (not int 0) so the F1 value has a consistent type in every branch.
    score_sum = precision + recall
    f1 = 2 * precision * recall / score_sum if score_sum > 0 else 0.0

    return f1, precision, recall, overlap > 0, predicted == gold
async def eval_tinker_model(name, checkpoint, model_name, renderer_name):
    """Run every benchmark scenario through a Tinker-hosted model.

    Args:
        name: Display name printed in the section banner.
        checkpoint: Model path handed to ``create_sampling_client``.
        model_name: Name used to load the tokenizer.
        renderer_name: Name of the renderer used to build prompts and parse
            responses.

    Returns:
        A list with one dict per scenario in ``MARKETING_BENCHMARK`` holding
        ``id``, ``predicted``, ``gold``, ``f1``, ``any_match``,
        ``exact_match`` and ``difficulty``.

    Note:
        Scenarios are processed one at a time and nothing is awaited inside
        this coroutine. Errors from sampling or parsing are not caught here.
    """
    banner = "=" * 60
    print(f"\n{banner}", flush=True)
    print(f"Evaluating: {name}", flush=True)
    print(f"{banner}", flush=True)

    # Client, tokenizer and renderer are set up once and shared by all scenarios.
    service_client = tinker.ServiceClient()
    sampling_client = service_client.create_sampling_client(model_path=checkpoint)
    tokenizer = get_tokenizer(model_name)
    renderer = renderers.get_renderer(name=renderer_name, tokenizer=tokenizer)
    stop_sequences = renderer.get_stop_sequences()
    params = types.SamplingParams(max_tokens=100, temperature=0.1, stop=stop_sequences)

    records = []
    for position, scenario in enumerate(MARKETING_BENCHMARK, start=1):
        user_turn = f"Analyze this conversation and determine which memory categories apply:\n\n{scenario['conversation']}"
        chat = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_turn},
        ]

        model_input = renderer.build_generation_prompt(chat)
        pending = sampling_client.sample(prompt=model_input, sampling_params=params, num_samples=1)
        sampled = pending.result()
        message, _ = renderer.parse_response(sampled.sequences[0].tokens)

        predicted = parse_prediction(message["content"])
        gold = set(scenario["expected"])
        f1, _precision, _recall, any_match, exact = compute_metrics(predicted, gold)

        records.append({
            "id": scenario["id"],
            "predicted": list(predicted),
            "gold": list(gold),
            "f1": f1,
            "any_match": any_match,
            "exact_match": exact,
            "difficulty": scenario["difficulty"]
        })

        # The per-scenario mark reflects "any label overlaps", not an exact match.
        mark = "✓" if any_match else "✗"
        print(f"[{position:2d}] {mark} {scenario['id']:<15} F1={f1:.2f}", flush=True)

    return records
async def eval_cohere_model():
    """Run every benchmark scenario through Cohere Command-R-Plus (teacher).

    The API key is read from the ``COHERE_API_KEY`` environment variable.
    For each scenario a single user message is sent that contains
    ``SYSTEM_PROMPT``, the conversation, and a reminder to answer with
    comma-separated categories only.

    A failure on one scenario does not abort the run: the error is printed
    and the scenario is recorded with an empty prediction, zero scores and an
    extra ``"error"`` field holding the exception text.

    Returns:
        A list with one dict per scenario in ``MARKETING_BENCHMARK`` holding
        ``id``, ``predicted``, ``gold``, ``f1``, ``any_match``,
        ``exact_match`` and ``difficulty`` (plus ``error`` on failure).
    """
    print(f"\n{'='*60}", flush=True)
    print(f"Evaluating: Cohere Command-R-Plus (Teacher)", flush=True)
    print(f"{'='*60}", flush=True)

    client = cohere.ClientV2(api_key=os.getenv("COHERE_API_KEY"))
    results = []

    for i, test in enumerate(MARKETING_BENCHMARK):
        prompt = f"""{SYSTEM_PROMPT}
Analyze this conversation and determine which memory categories apply:
{test['conversation']}
Respond with comma-separated categories only. No explanation."""

        try:
            response = client.chat(
                model="command-r-plus-08-2024",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=100
            )

            # Use the text of the first content block that has one; fall back
            # to an empty string when the message carries no text at all.
            blocks = getattr(response.message, "content", None) or []
            response_text = next(
                (block.text for block in blocks if hasattr(block, "text")),
                "",
            )

            predicted = parse_prediction(response_text)
            gold = set(test["expected"])
            f1, _precision, _recall, any_match, exact = compute_metrics(predicted, gold)

            results.append({
                "id": test["id"],
                "predicted": list(predicted),
                "gold": list(gold),
                "f1": f1,
                "any_match": any_match,
                "exact_match": exact,
                "difficulty": test["difficulty"]
            })

            status = "✓" if any_match else "✗"
            print(f"[{i+1:2d}] {status} {test['id']:<15} F1={f1:.2f}", flush=True)
        except Exception as e:
            # Deliberate best-effort boundary: keep the benchmark going and
            # make the failure visible in both the console and the results.
            print(f"[{i+1:2d}] ERROR {test['id']}: {e}", flush=True)
            results.append({
                "id": test["id"],
                "predicted": [],
                "gold": list(test["expected"]),
                "f1": 0.0,
                "any_match": False,
                "exact_match": False,
                "difficulty": test["difficulty"],
                "error": str(e)
            })

        # Rate limiting. Pause after failures as well as successes so that an
        # error (e.g. a rate-limit rejection) is not followed by an immediate
        # retry-like burst of further requests.
        await asyncio.sleep(0.5)

    return results
def _score_rates(rows):
    """Return (mean f1, any-match rate, exact-match rate) for a non-empty list of result rows."""
    count = len(rows)
    mean_f1 = sum(row["f1"] for row in rows) / count
    any_rate = sum(1 for row in rows if row["any_match"]) / count
    exact_rate = sum(1 for row in rows if row["exact_match"]) / count
    return mean_f1, any_rate, exact_rate


def compute_summary(results, name):
    """Aggregate per-scenario results into overall and per-difficulty scores.

    Args:
        results: List of per-scenario dicts, each with ``f1``, ``any_match``,
            ``exact_match`` and ``difficulty`` keys.
        name: Display name of the evaluated model; copied into the summary.

    Returns:
        A dict with ``name``, ``total`` (number of results), ``avg_f1``,
        ``any_match`` and ``exact_match`` (rates in ``[0, 1]``), and
        ``by_difficulty``: a mapping from ``"easy"``/``"medium"``/``"hard"``
        to ``{"count", "f1", "any_match", "exact_match"}``. Difficulty levels
        with no results are omitted. An empty ``results`` list produces a
        summary with zero scores instead of raising ``ZeroDivisionError``.
    """
    total = len(results)
    if total == 0:
        # Nothing was evaluated; report zeros rather than dividing by zero.
        return {
            "name": name,
            "total": 0,
            "avg_f1": 0.0,
            "any_match": 0.0,
            "exact_match": 0.0,
            "by_difficulty": {}
        }

    avg_f1, any_match, exact_match = _score_rates(results)

    # By difficulty
    by_diff = {}
    for diff in ("easy", "medium", "hard"):
        subset = [row for row in results if row["difficulty"] == diff]
        if not subset:
            continue
        subset_f1, subset_any, subset_exact = _score_rates(subset)
        by_diff[diff] = {
            "count": len(subset),
            "f1": subset_f1,
            "any_match": subset_any,
            "exact_match": subset_exact
        }

    return {
        "name": name,
        "total": total,
        "avg_f1": avg_f1,
        "any_match": any_match,
        "exact_match": exact_match,
        "by_difficulty": by_diff
    }
async def main():
    """Run the benchmark end to end and report the comparison.

    Steps:
      1. Evaluate the RL-trained model (hard-coded Tinker checkpoint).
      2. Evaluate Cohere Command-R-Plus.
      3. Print an overall table and a per-difficulty breakdown.
      4. Write summaries and detailed results to a timestamped JSON file
         under ``training/benchmarks/``.
      5. Print the relative F1 comparison between the two models.
    """
    print("=" * 70, flush=True)
    print("FINAL BENCHMARK: Memory Routing Model Comparison", flush=True)
    print("50 Challenging Marketing Scenarios", flush=True)
    print("=" * 70, flush=True)

    all_results = {}
    rl_name = "Llama-8B + LoRA + RL (Ours)"

    # 1. Our RL-trained model
    rl_results = await eval_tinker_model(
        name=rl_name,
        checkpoint="tinker://4f4bae1f-5a95-5f53-a55a-a14f2872825c:train:0/sampler_weights/rl_iter_012",
        model_name="meta-llama/Llama-3.1-8B",
        renderer_name="llama3"
    )
    all_results["rl_model"] = rl_results

    # 2. Cohere Command-R-Plus (Teacher)
    cohere_results = await eval_cohere_model()
    all_results["cohere"] = cohere_results

    # Compute summaries
    summaries = {
        "rl_model": compute_summary(rl_results, rl_name),
        "cohere": compute_summary(cohere_results, "Cohere Command-R-Plus (104B)")
    }

    # Print comparison
    print("\n" + "=" * 70, flush=True)
    print("BENCHMARK RESULTS", flush=True)
    print("=" * 70, flush=True)
    print(f"\n{'Model':<35} {'Any Match':<12} {'Exact':<12} {'Avg F1':<10}", flush=True)
    print("-" * 70, flush=True)
    for summary in summaries.values():
        print(f"{summary['name']:<35} {summary['any_match']:<12.0%} {summary['exact_match']:<12.0%} {summary['avg_f1']:<10.2f}", flush=True)

    print("\n" + "-" * 70, flush=True)
    print("RESULTS BY DIFFICULTY", flush=True)
    print("-" * 70, flush=True)
    for diff in ["easy", "medium", "hard"]:
        print(f"\n{diff.upper()}:", flush=True)
        for summary in summaries.values():
            if diff in summary["by_difficulty"]:
                d = summary["by_difficulty"][diff]
                print(f" {summary['name']:<33} Any={d['any_match']:.0%} Exact={d['exact_match']:.0%} F1={d['f1']:.2f} (n={d['count']})", flush=True)

    # Save results. A single timestamp is taken so the date stored inside the
    # file and the one embedded in the file name always agree.
    finished_at = datetime.now()
    output = {
        "benchmark_date": finished_at.isoformat(),
        "num_scenarios": len(MARKETING_BENCHMARK),
        "summaries": summaries,
        "detailed_results": all_results
    }
    os.makedirs("training/benchmarks", exist_ok=True)
    output_path = f"training/benchmarks/final_benchmark_{finished_at.strftime('%Y%m%d_%H%M%S')}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, default=str)
    print(f"\nResults saved to: {output_path}", flush=True)

    # Key findings
    print("\n" + "=" * 70, flush=True)
    print("KEY FINDINGS", flush=True)
    print("=" * 70, flush=True)
    rl_f1 = summaries["rl_model"]["avg_f1"]
    cohere_f1 = summaries["cohere"]["avg_f1"]
    if cohere_f1 <= 0:
        # A teacher F1 of 0 makes the relative comparison undefined. This
        # happens when every Cohere request failed, because failures are
        # recorded with f1=0.0. Report it instead of dividing by zero.
        if rl_f1 > 0:
            print("✓ Our 8B model OUTPERFORMS the 104B teacher on F1 (teacher F1 is 0, so no relative figure; check for Cohere errors above)", flush=True)
        else:
            print(" Both models scored 0 on F1; no relative comparison is possible", flush=True)
    elif rl_f1 > cohere_f1:
        improvement = ((rl_f1 - cohere_f1) / cohere_f1) * 100
        print(f"✓ Our 8B model OUTPERFORMS the 104B teacher by {improvement:.1f}% on F1", flush=True)
    else:
        gap = ((cohere_f1 - rl_f1) / cohere_f1) * 100
        print(f" Our 8B model is within {gap:.1f}% of the 104B teacher on F1", flush=True)

    print(f"\nModel Sizes:", flush=True)
    print(f" - Llama-8B + LoRA: ~8B parameters (LoRA adds ~0.1B)", flush=True)
    print(f" - Cohere Command-R-Plus: ~104B parameters", flush=True)
    print(f" - Size ratio: 13x smaller", flush=True)
# Script entry point: run the async benchmark driver only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())