#!/usr/bin/env python3
"""
Upload model card, training scripts, and inference helper to
rafiakedir/tenacious-bench-adapter on HuggingFace.

Does NOT re-upload the safetensors weights — those are already there.
"""

from pathlib import Path

from huggingface_hub import HfApi, CommitOperationAdd

ROOT = Path(__file__).parent
REPO_ID = "rafiakedir/tenacious-bench-adapter"

# ── Model Card ────────────────────────────────────────────────────────────────
# Uploaded verbatim as README.md. Inner triple quotes are escaped because the
# card itself contains a Python snippet that uses them.

MODEL_CARD = """\
---
license: cc-by-4.0
language:
- en
base_model: unsloth/Qwen3.5-0.8B
tags:
- judge
- b2b-sales
- orpo
- preference-learning
- tenacious-bench
- evaluation
- qwen3
- unsloth
datasets:
- rafiakedir/tenacious-bench-v0.1
---

# Tenacious-Bench Judge — ORPO Fine-Tuned Qwen3.5-0.8B

A rubric-aware scoring judge for B2B outbound sales emails, trained with ORPO on [Tenacious-Bench v0.1](https://huggingface.co/datasets/rafiakedir/tenacious-bench-v0.1) preference pairs.

Deployed as a **rejection-sampling gate** in the Tenacious Conversion Engine: the generator (DeepSeek V3.2) produces a candidate email; this judge scores it on five rubric dimensions; outputs below threshold are rejected and regenerated.

**Base model:** `unsloth/Qwen3.5-0.8B`  
**Training algorithm:** ORPO (no reference model — single forward pass)  
**Weights:** Merged (full model, not a LoRA adapter)  
**Precision:** BF16 · ~873M parameters · ~1.75 GB  
**Context length:** 262,144 tokens  
**Training data:** 94 ORPO preference pairs from `rafiakedir/tenacious-bench-v0.1` (train split)

---

## What It Scores

| Dimension | Trigger Rate (Week 10 probes) | Risk if Missed |
|---|---|---|
| `signal_grounding_fidelity` | 35% | CTO credibility loss |
| `competitor_gap_honesty` | 45% | Irreversible brand damage |
| `icp_segment_appropriateness` | 20% | ~$480K ACV per error |
| `tone_preservation` | 15% | Brand voice violation |
| `bench_commitment_honesty` | 5% | SOW-breach / delivery failure |

---

## Quick Start — Inference

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_id = "rafiakedir/tenacious-bench-adapter"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

SYSTEM = \"\"\"You are a rubric-aware judge for B2B outbound sales emails.
Score the candidate output on the following dimension.

Dimension: signal_grounding_fidelity
Rubric: Every factual claim must resolve to a field in the hiring_signal_brief
with confidence >= 0.60, or be phrased as a question.

Respond with a JSON object: {"score": <0.0-1.0>, "reasoning": "<one sentence>"}\"\"\"

USER = \"\"\"Hiring signal brief:
{
  "company_name": "Acme Corp",
  "open_roles": 3,
  "confidence": "low",
  "domain": "fintech"
}

Candidate email:
"Hi Alex — noticed Acme Corp is aggressively scaling its engineering team with 3 open roles. We staff specialized capability-gap squads for fintech teams at your growth stage. Would a 30-minute scoping conversation make sense this week?"

Score this output.\"\"\"

messages = [
    {"role": "system", "content": SYSTEM},
    {"role": "user", "content": USER},
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(text, return_tensors="pt").to(model.device)

with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=128, temperature=0.1, do_sample=True)

response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(response)
# Expected: {"score": 0.4, "reasoning": "Claims 'aggressively scaling' but brief confidence is low — should be phrased as a question."}
```

---

## Training Details

### Why ORPO

ORPO (Hong et al., 2024) eliminates the reference model by computing the preference signal from the log-odds ratio of chosen vs. rejected completions in a single forward pass. This reduces peak VRAM by ~40% vs. DPO, making 3-epoch training feasible on a 16GB T4 without gradient checkpointing hacks.

For a discriminative judge (score calibration rather than generation quality), the preference signal should be stronger. We ran `beta=0.1` per the paper's recommendation but note that `beta=0.2`–`0.3` may better calibrate the preference margin for rubric-based scoring.

### Preference Pair Construction

| Source | Count |
|---|---|
| Failing tasks → generated chosen (DeepSeek V3.2) | ~111 attempted |
| Passing tasks → generated rejected (DeepSeek V3.2) | ~41 attempted |
| **Final pairs after filtering** | **94** |

Filter: chosen score ≥ threshold AND rejected score < threshold AND TF-IDF cosine < 0.92. Main rejection causes: chosen output still scoring below threshold (phrasing-mode sensitivity), and ICP segment tasks with key mismatch making pass threshold structurally unachievable.

**Preference leakage prevention (Li et al., 2025):** Generator (DeepSeek V3.2) ≠ judge family (Claude Sonnet 4.6 / `scoring_evaluator.py`). All generation decisions logged in the dataset repo at `training_data/generation_log.jsonl`.

### Hyperparameters

| Parameter | Value |
|---|---|
| Base model | `unsloth/Qwen3.5-0.8B` |
| LoRA rank | 16 |
| LoRA alpha | 32 |
| Target modules | q_proj, v_proj |
| LoRA dropout | 0.05 |
| Learning rate | 8e-6 |
| Batch size (per device) | 2 |
| Gradient accumulation | 4 (effective batch 8) |
| Epochs | 3 |
| Warmup ratio | 0.1 |
| LR scheduler | cosine |
| ORPO beta | 0.1 |
| Max sequence length | 1024 |
| Precision | BF16 (T4) |
| Seed | 42 |

Training notebook: see `run_on_colab.ipynb` in this repo.

---

## Evaluation Results

Evaluated on 59 held-out tasks from `rafiakedir/tenacious-bench-v0.1`. Paired bootstrap significance test: 10,000 iterations, seed 42.

| Condition | Mean Score | vs. Baseline |
|---|---|---|
| Baseline (`scoring_evaluator.py` only) | 0.458 | — |
| **This model (ORPO Qwen3.5-0.8B)** | **0.483** | Δ=+0.025, p=0.189, not significant |
| Prompt-only (Qwen3-30B, zero-shot) | 0.504 | Δ=−0.021 vs. trained, p=0.978 |

**Delta A** (trained vs. baseline): Δ=+0.025, 95% CI [−0.032, +0.081], p=0.189 — **not statistically significant**.

**Delta B** (trained vs. prompt-only): not significant. Finding: `prompt_engineering_sufficient` — the Qwen3-30B zero-shot condition is a viable lower-cost alternative at this scale of training data.

Note: Delta B compares a 0.8B trained model against a 30B zero-shot model — this conflates backbone capacity with training benefit. A rigorous Delta B requires re-running the prompt-only condition on `Qwen3.5-0.8B-Instruct` (no fine-tuning).

**Deployment recommendation for this run:** DO NOT DEPLOY as primary gate. Continue using `scoring_evaluator.py` deterministically. Retrain with ≥150 pairs covering all 5 dimensions before re-evaluating.

Full numbers: `ablation_results.json` in the dataset repo.

---

## Known Limitations

**1. Dimension coverage gap (critical).** The preference pairs contain 0 examples for `bench_commitment_honesty` and only 4 examples for `icp_segment_appropriateness`, due to a scoring function key mismatch that made it impossible to generate valid chosen outputs for these dimensions. The model received zero gradient signal on bench commitment honesty — the highest SOW-breach-risk dimension. It cannot be trusted to gate bench-commitment outputs.

**2. Delta A not significant at v0.1 scale.** The +0.025 lift over the deterministic baseline is within the noise band (p=0.189). The model does not reliably outperform `scoring_evaluator.py` on held-out tasks.

**3. Backbone below Prometheus-2 threshold.** Prometheus-2 (Kim et al., 2024) demonstrated rubric-matching at 7B parameters. Qwen3.5-0.8B is below that threshold. Capacity may be insufficient for simultaneous multi-dimension rubric generalization.

**4. Synthetic training distribution.** All preference pairs derive from synthetic prospect briefs and LLM-generated emails. The model may not generalize to real prospect data with industry-specific jargon or edge cases outside the training distribution.

**5. Static bench_summary.** The judge was trained on snapshot bench capacities. In production the bench changes weekly — calibration for `bench_commitment_honesty` will drift over time.

---

## Files in This Repo

| File | Description |
|---|---|
| `model.safetensors-*` | Merged model weights (BF16) |
| `config.json` | Model architecture config |
| `tokenizer.json`, `tokenizer_config.json` | Tokenizer (ChatML format) |
| `train_judge.py` | Full ORPO training script |
| `hyperparams.json` | All hyperparameters (pinned) |
| `run_on_colab.ipynb` | End-to-end training notebook for T4 |
| `requirements_training.txt` | Python dependencies for training |
| `inference_example.py` | Inference helper with prompt templates |

Training data and preference pairs: [rafiakedir/tenacious-bench-v0.1](https://huggingface.co/datasets/rafiakedir/tenacious-bench-v0.1)

---

## Environmental Impact

- **Compute:** ~60–90 min on a single T4 GPU (3 epochs, 94 preference pairs)
- **CO₂e:** ~0.04 kg (T4 at 70 W × 1.5 h × US grid 0.42 kg CO₂/kWh ÷ 1000)
- **Infrastructure:** Google Colab free tier

---

## Citation

```bibtex
@misc{tenacious-bench-adapter-2026,
  title = {Tenacious-Bench Judge: ORPO Fine-Tuned Qwen3.5-0.8B for B2B Sales Evaluation},
  author = {Kedir, Rafia},
  year = {2026},
  howpublished = {HuggingFace Model Hub},
  url = {https://huggingface.co/rafiakedir/tenacious-bench-adapter}
}

@misc{tenacious-bench-v01-2026,
  title = {Tenacious-Bench v0.1: B2B Sales Evaluation Benchmark},
  author = {Kedir, Rafia},
  year = {2026},
  howpublished = {HuggingFace Datasets Hub},
  url = {https://huggingface.co/datasets/rafiakedir/tenacious-bench-v0.1}
}
```
"""

# ── Inference Example ─────────────────────────────────────────────────────────
# Uploaded as inference_example.py. This is a NON-raw ''' string, so "\\n" below
# becomes "\n" in the uploaded script, "\'" becomes "'", and a trailing
# backslash-newline is consumed here rather than in the uploaded script.

INFERENCE_EXAMPLE = '''\
#!/usr/bin/env python3
"""
Inference helper for rafiakedir/tenacious-bench-adapter.

Scores a candidate B2B sales email on one or all five Tenacious rubric dimensions.
"""

import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "rafiakedir/tenacious-bench-adapter"

DIMENSION_PROMPTS = {
    "signal_grounding_fidelity": (
        "Dimension: signal_grounding_fidelity\\n"
        "Rubric: Every factual claim about the prospect must be supported by the hiring_signal_brief "
        "with confidence >= 0.60, or phrased as a question. Claims about growth, scale, or ML strategy "
        "without a high/medium-confidence signal in the brief must be recast as questions."
    ),
    "bench_commitment_honesty": (
        "Dimension: bench_commitment_honesty\\n"
        "Rubric: The email must not promise or imply a number of engineers that exceeds "
        "the total available in the bench_summary. Any staffing commitment must stay within capacity."
    ),
    "icp_segment_appropriateness": (
        "Dimension: icp_segment_appropriateness\\n"
        "Rubric: The email's language and pitch angle must match the correct ICP segment "
        "(Segment 1=growth-scale, Segment 2=cost-restructuring, Segment 3=consolidation, "
        "ABSTAIN=insufficient signal). A growth pitch to a post-layoff company is a mismatch."
    ),
    "competitor_gap_honesty": (
        "Dimension: competitor_gap_honesty\\n"
        "Rubric: Any assertion about a competitor gap must be grounded in the competitor_gap_brief. "
        "The email must not assert that competitors have capabilities the prospect lacks "
        "unless the brief explicitly documents this gap."
    ),
    "tone_preservation": (
        "Dimension: tone_preservation\\n"
        "Rubric: No re-engagement clichés ('just wanted to circle back', 'touching base', "
        "'following up'). No over-apologetic exits ('sorry for taking your time'). "
        "Calendar CTA required. Confident but not pushy."
    ),
}

# Filled with str.format, so literal JSON braces are doubled.
SYSTEM_TEMPLATE = """\
You are a rubric-aware judge for B2B outbound sales emails written by Tenacious Consulting.

{dimension_prompt}

Respond with a JSON object only:
{{"score": <0.0-1.0>, "reasoning": "<one sentence>"}}
"""

USER_TEMPLATE = """\
Context:
{context_json}

Candidate email:
{candidate_output}

Score this output on the dimension above.\
"""


def load_model(model_id: str = MODEL_ID):
    """Load the tokenizer and the BF16 judge model, switched to eval mode."""
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    model.eval()
    return tokenizer, model


def score(
    tokenizer,
    model,
    task_input: dict,
    candidate_output: str,
    dimension: str,
    max_new_tokens: int = 150,
) -> dict:
    """
    Score a single candidate output on one rubric dimension.

    Args:
        task_input: dict with keys like 'hiring_signal_brief', 'bench_summary', etc.
        candidate_output: the email text to score
        dimension: one of the five Tenacious rubric dimensions

    Returns:
        dict with 'score' (float) and 'reasoning' (str). If the model output
        cannot be parsed, 'score' is 0.5 and 'reasoning' starts with 'parse_error:'.

    Raises:
        ValueError: if dimension is not a key of DIMENSION_PROMPTS.
    """
    if dimension not in DIMENSION_PROMPTS:
        raise ValueError(f"Unknown dimension: {dimension}. Choose from {list(DIMENSION_PROMPTS)}")

    context_json = json.dumps(task_input, indent=2)
    system = SYSTEM_TEMPLATE.format(dimension_prompt=DIMENSION_PROMPTS[dimension])
    user = USER_TEMPLATE.format(
        context_json=context_json,
        candidate_output=candidate_output,
    )

    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    response = tokenizer.decode(
        out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    ).strip()

    # Parse the span from the first "{" through the last "}" as JSON.
    try:
        start = response.find("{")
        end = response.rfind("}") + 1
        result = json.loads(response[start:end])
        return {"score": float(result["score"]), "reasoning": result.get("reasoning", "")}
    except (ValueError, KeyError, TypeError):
        # Malformed JSON, a missing/non-numeric score, or a non-object payload.
        return {"score": 0.5, "reasoning": f"parse_error: {response[:200]}"}


def score_all_dimensions(tokenizer, model, task_input: dict, candidate_output: str) -> dict:
    """Score a candidate output on all five dimensions."""
    results = {}
    for dim in DIMENSION_PROMPTS:
        results[dim] = score(tokenizer, model, task_input, candidate_output, dim)
    # The right-hand side is evaluated before "mean_score" is inserted,
    # so only the per-dimension entries are averaged.
    results["mean_score"] = sum(r["score"] for r in results.values()) / len(results)
    return results


# ── Demo ──────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    print(f"Loading {MODEL_ID}...")
    tokenizer, model = load_model()

    demo_input = {
        "hiring_signal_brief": {
            "company_name": "Acme Corp",
            "domain": "fintech",
            "open_roles": 3,
            "confidence": "low",
            "stage": "Series B",
        },
        "bench_summary": {
            "total_available": 8,
            "specializations": ["Python", "Go", "ML Engineering"],
        },
    }
    demo_email = (
        "Hi Alex — noticed Acme Corp is aggressively scaling its engineering team "
        "with 3 open roles. We staff specialized capability-gap squads for fintech "
        "teams at your growth stage. Would a 30-minute scoping conversation make sense this week?"
    )

    print("\\nScoring on signal_grounding_fidelity...")
    result = score(tokenizer, model, demo_input, demo_email, "signal_grounding_fidelity")
    print(f" Score: {result[\'score\']:.2f}")
    print(f" Reasoning: {result[\'reasoning\']}")

    print("\\nScoring all dimensions...")
    all_results = score_all_dimensions(tokenizer, model, demo_input, demo_email)
    for dim, r in all_results.items():
        if dim == "mean_score":
            print(f" MEAN: {r:.3f}")
        else:
            print(f" {dim}: {r[\'score\']:.2f} — {r[\'reasoning\'][:80]}")
'''


def main():
    """Queue the model card, inference helper, and training files, then push one commit.

    Raises:
        FileNotFoundError: if any expected file under ``ROOT / "training"`` is
            missing. All files are checked before anything is queued.
    """
    training_dir = ROOT / "training"
    # Local file name == path in the repo for every training artifact.
    training_files = (
        "train_judge.py",
        "hyperparams.json",
        "run_on_colab.ipynb",
        "requirements_training.txt",
    )

    # Fail fast with the full list instead of tripping over the first stat() call.
    missing = [training_dir / name for name in training_files
               if not (training_dir / name).is_file()]
    if missing:
        raise FileNotFoundError(
            "Missing training file(s): " + ", ".join(str(path) for path in missing)
        )

    api = HfApi()
    operations = []

    def add_bytes(content: bytes, repo_path: str, label: str = ""):
        """Queue in-memory content for upload at repo_path."""
        lbl = label or repo_path
        print(f" queuing {lbl} ({len(content):,} bytes)")
        operations.append(CommitOperationAdd(
            path_in_repo=repo_path,
            path_or_fileobj=content,
        ))

    def add_file(local_path: Path, repo_path: str):
        """Queue a file on disk for upload at repo_path."""
        print(f" queuing {repo_path} ({local_path.stat().st_size:,} bytes)")
        operations.append(CommitOperationAdd(
            path_in_repo=repo_path,
            path_or_fileobj=str(local_path),
        ))

    # Model card
    add_bytes(MODEL_CARD.encode("utf-8"), "README.md", "README.md (model card)")

    # Inference example
    add_bytes(INFERENCE_EXAMPLE.encode("utf-8"), "inference_example.py")

    # Training scripts
    for name in training_files:
        add_file(training_dir / name, name)

    print(f"\nCommitting {len(operations)} files to {REPO_ID}...")
    url = api.create_commit(
        repo_id=REPO_ID,
        repo_type="model",
        operations=operations,
        commit_message=(
            "feat: add model card, inference example, and training scripts\n\n"
            "- Proper model card with YAML frontmatter (base_model, tags, datasets)\n"
            "- Honest eval results: Delta A p=0.189 not significant, DO NOT DEPLOY verdict\n"
            "- Dimension coverage gap documented (bench_commitment_honesty=0 pairs)\n"
            "- inference_example.py with per-dimension and all-dimensions scoring\n"
            "- Training scripts: train_judge.py, hyperparams.json, run_on_colab.ipynb, "
            "requirements_training.txt"
        ),
    )
    print(f"\nDone. Commit URL: {url}")
    # Derive the link from REPO_ID so the two cannot drift apart.
    print(f"Model: https://huggingface.co/{REPO_ID}")


if __name__ == "__main__":
    main()