#!/usr/bin/env python3
"""
Evaluate Model CLI

Run comprehensive evaluation suite on the CEO voice model.
Measures voice authenticity, factual accuracy, and generates human eval sets.

Usage:
    python scripts/evaluate_model.py --model username/model --test-set test.jsonl

Environment:
    HF_TOKEN - Hugging Face token for loading models
"""

import argparse
import json
import sys
from pathlib import Path

# Add the repo root to sys.path so the `src.*` imports below resolve when
# this script is run directly from scripts/.
sys.path.insert(0, str(Path(__file__).parent.parent))

from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn

console = Console()


def main():
    """Run the model-evaluation CLI.

    Parses command-line options, loads (or generates) model responses,
    then runs whichever evaluation passes the flags request:

    * voice metrics (needs ``--style-profile``),
    * factual accuracy (needs ``--posts``),
    * human-evaluation set generation (``--generate-human-eval``).

    All artifacts are written under the ``--output`` directory.

    Returns:
        int: process exit code — 0 on success, 1 when the primary voice
        model cannot be loaded.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate the CEO voice model",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Full evaluation
  python scripts/evaluate_model.py \\
      --model username/ceo-voice-model \\
      --test-set data/training/validation.jsonl \\
      --style-profile data/processed/style_profile.json \\
      --posts data/processed/posts.json

  # Voice metrics only
  python scripts/evaluate_model.py \\
      --responses outputs.json \\
      --style-profile data/processed/style_profile.json \\
      --voice-only

  # Generate human eval set
  python scripts/evaluate_model.py \\
      --model username/model-a \\
      --model-b username/model-b \\
      --test-set test.jsonl \\
      --generate-human-eval \\
      --output evaluation/

Environment:
  HF_TOKEN - Hugging Face token
        """,
    )

    # Model options
    parser.add_argument("--model", help="Voice model ID or path")
    parser.add_argument("--model-b", help="Second model for A/B comparison")
    parser.add_argument("--adapter", help="LoRA adapter path (if separate)")

    # Input options
    parser.add_argument("--test-set", help="Test set JSONL file")
    parser.add_argument("--responses", help="Pre-generated responses JSON")
    parser.add_argument("--num-samples", type=int, default=50, help="Number of samples")

    # Reference data
    parser.add_argument("--style-profile", help="Style profile JSON")
    parser.add_argument("--posts", help="Parsed posts JSON for factual check")
    parser.add_argument("--segments", help="Segments JSON for embeddings")

    # Evaluation options
    parser.add_argument("--voice-only", action="store_true", help="Voice metrics only")
    parser.add_argument("--factual-only", action="store_true", help="Factual check only")
    parser.add_argument("--generate-human-eval", action="store_true",
                        help="Generate human eval set")

    # Output options
    parser.add_argument("--output", default="evaluation/", help="Output directory")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

    # Model loading options
    parser.add_argument("--no-4bit", action="store_true", help="Disable 4-bit quantization")
    parser.add_argument("--no-refinement", action="store_true", help="Skip refinement model")

    args = parser.parse_args()

    console.print("\n[bold blue]AI Executive - Model Evaluation[/bold blue]")
    console.print("=" * 50)

    # Create output directory
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    results = {}

    # Load test data.  Questions and reference answers are collected into
    # parallel lists: user turns feed generation, assistant turns are kept
    # as references (not otherwise consumed in this script).
    test_questions = []
    reference_answers = []
    if args.test_set:
        console.print(f"\n[yellow]Loading test set:[/yellow] {args.test_set}")
        # encoding pinned to UTF-8: JSONL fixtures must not depend on the
        # platform locale (was a latent bug on Windows).
        with open(args.test_set, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    item = json.loads(line)
                    messages = item.get("messages", [])
                    for msg in messages:
                        if msg["role"] == "user":
                            test_questions.append(msg["content"])
                        elif msg["role"] == "assistant":
                            reference_answers.append(msg["content"])

        # Limit samples
        if args.num_samples and len(test_questions) > args.num_samples:
            test_questions = test_questions[:args.num_samples]
            reference_answers = reference_answers[:args.num_samples]

        console.print(f"  Loaded {len(test_questions)} test questions")

    # Load pre-generated responses or generate new ones.
    responses = []
    responses_b = []

    if args.responses:
        console.print(f"\n[yellow]Loading responses:[/yellow] {args.responses}")
        with open(args.responses, "r", encoding="utf-8") as f:
            data = json.load(f)
            # Accept either a list of strings or a list of {"response": ...}
            # dicts (the format this script itself saves below).
            responses = [d["response"] if isinstance(d, dict) else d for d in data]
        console.print(f"  Loaded {len(responses)} responses")

    elif args.model and test_questions:
        console.print(f"\n[yellow]Generating responses with:[/yellow] {args.model}")
        try:
            from src.inference.dual_llm_pipeline import DualLLMPipeline

            # Load pipeline
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                console=console,
            ) as progress:
                task = progress.add_task("Loading model...", total=None)
                pipeline = DualLLMPipeline.from_hub(
                    voice_model_id=args.model,
                    voice_adapter_id=args.adapter,
                    load_in_4bit=not args.no_4bit,
                    enable_refinement=not args.no_refinement,
                )

            # Generate responses
            console.print(f"  Generating {len(test_questions)} responses...")
            for i, q in enumerate(test_questions):
                if i % 10 == 0:
                    console.print(f"    Progress: {i}/{len(test_questions)}")
                result = pipeline.generate(q)
                responses.append(result.final_response)

            # Save responses so a later run can reuse them via --responses.
            responses_path = output_dir / "generated_responses.json"
            with open(responses_path, "w", encoding="utf-8") as f:
                json.dump([{"question": q, "response": r}
                           for q, r in zip(test_questions, responses)], f, indent=2)
            console.print(f"  Saved to: {responses_path}")

        except ImportError as e:
            # Missing optional inference deps is fatal for generation.
            console.print(f"[red]Error loading model:[/red] {e}")
            return 1

    # Generate responses for model B if A/B comparison
    if args.model_b and test_questions:
        console.print(f"\n[yellow]Generating responses with model B:[/yellow] {args.model_b}")
        try:
            from src.inference.dual_llm_pipeline import DualLLMPipeline

            pipeline_b = DualLLMPipeline.from_hub(
                voice_model_id=args.model_b,
                load_in_4bit=not args.no_4bit,
                enable_refinement=not args.no_refinement,
            )

            for i, q in enumerate(test_questions):
                if i % 10 == 0:
                    console.print(f"    Progress: {i}/{len(test_questions)}")
                result = pipeline_b.generate(q)
                responses_b.append(result.final_response)

        except ImportError as e:
            # Model B is optional: warn and continue without the A/B arm.
            console.print(f"[red]Error loading model B:[/red] {e}")

    # Run voice metrics evaluation
    if args.style_profile and responses and not args.factual_only:
        console.print("\n[yellow]Running voice metrics evaluation...[/yellow]")
        try:
            from src.evaluation.voice_metrics import VoiceMetrics

            metrics = VoiceMetrics.from_style_profile(
                args.style_profile,
                blog_segments_path=args.segments,
            )

            voice_results = metrics.evaluate_batch(responses)
            results["voice_metrics"] = voice_results

            # Display results
            table = Table(title="Voice Metrics")
            table.add_column("Metric", style="cyan")
            table.add_column("Value", style="white")
            for key, value in voice_results.items():
                if isinstance(value, float):
                    table.add_row(key, f"{value:.4f}")
                else:
                    table.add_row(key, str(value))
            console.print(table)

            # Evaluate individual responses if verbose (capped at 5 to keep
            # the console output readable).
            if args.verbose and len(responses) <= 10:
                console.print("\n[dim]Individual response scores:[/dim]")
                for i, resp in enumerate(responses[:5]):
                    result = metrics.evaluate_response(resp)
                    console.print(
                        f"  [{i}] Voice: {result.overall_voice_score:.2f}, "
                        f"Style: {result.style_match_score:.2f}"
                    )

        except ImportError as e:
            console.print(f"[red]Voice metrics unavailable:[/red] {e}")

    # Run factual accuracy evaluation
    if args.posts and responses and not args.voice_only:
        console.print("\n[yellow]Running factual accuracy check...[/yellow]")
        try:
            from src.evaluation.factual_accuracy import FactualAccuracyChecker

            checker = FactualAccuracyChecker.from_blogs(args.posts)
            factual_results = checker.check_batch(responses)
            results["factual_accuracy"] = factual_results

            # Display results
            table = Table(title="Factual Accuracy")
            table.add_column("Metric", style="cyan")
            table.add_column("Value", style="white")
            for key, value in factual_results.items():
                if isinstance(value, float):
                    table.add_row(key, f"{value:.4f}")
                else:
                    table.add_row(key, str(value))
            console.print(table)

        except ImportError as e:
            console.print(f"[red]Factual check unavailable:[/red] {e}")

    # Generate human evaluation set
    if args.generate_human_eval and test_questions and (responses or responses_b):
        console.print("\n[yellow]Generating human evaluation set...[/yellow]")
        try:
            from src.evaluation.human_eval_generator import HumanEvalGenerator

            generator = HumanEvalGenerator()

            if responses_b:
                # A/B comparison
                items = generator.generate_blind_test(
                    questions=test_questions,
                    responses_a=responses,
                    responses_b=responses_b,
                    source_a_name=args.model or "Model A",
                    source_b_name=args.model_b or "Model B",
                )
            else:
                # Rating test
                items = generator.generate_rating_test(
                    questions=test_questions,
                    responses=responses,
                    source_name=args.model or "Model",
                )

            files = generator.save_evaluation_set(items, output_dir)
            console.print(f"  Generated {len(items)} evaluation items")
            for name, path in files.items():
                console.print(f"    {name}: {path}")

            results["human_eval"] = {"num_items": len(items), "files": files}

        except ImportError as e:
            console.print(f"[red]Human eval generation unavailable:[/red] {e}")

    # Save all results
    if results:
        results_path = output_dir / "evaluation_results.json"
        with open(results_path, "w", encoding="utf-8") as f:
            # default=str: the human-eval "files" values presumably come back
            # as Path objects from save_evaluation_set, which json cannot
            # serialize natively — TODO confirm against the generator.
            json.dump(results, f, indent=2, default=str)
        console.print(f"\n[green]Results saved to:[/green] {results_path}")

    # Summary
    console.print("\n" + "=" * 50)
    console.print("[bold green]Evaluation complete![/bold green]")

    if "voice_metrics" in results:
        vm = results["voice_metrics"]
        console.print(f"\nVoice Score: {vm.get('avg_overall_voice_score', 0):.2%}")
        console.print(f"Vocabulary Overlap: {vm.get('avg_vocabulary_overlap', 0):.2%}")

    if "factual_accuracy" in results:
        fa = results["factual_accuracy"]
        console.print(f"\nFactual Accuracy: {fa.get('avg_accuracy', 0):.2%}")
        console.print(f"Hallucinations: {fa.get('total_hallucinations', 0)}")

    return 0


if __name__ == "__main__":
    # sys.exit (not the site-module `exit` helper, which is absent under
    # `python -S` and in frozen builds) propagates the return code.
    sys.exit(main())