#!/usr/bin/env python3
"""
Evaluate Model CLI

Run comprehensive evaluation suite on the CEO voice model.
Measures voice authenticity, factual accuracy, and generates human eval sets.

Usage:
    python scripts/evaluate_model.py --model username/model --test-set test.jsonl

Environment:
    HF_TOKEN - Hugging Face token for loading models
"""
import argparse
import json
import sys
from pathlib import Path

# Make the repository root importable so the `from src....` imports used
# inside main() resolve when this script is run from anywhere.
sys.path.insert(0, str(Path(__file__).parent.parent))

from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.table import Table

# Single shared console for all CLI output.
console = Console()
def _load_test_set(path, num_samples):
    """Read a chat-format JSONL test set.

    Each line is a JSON object with a ``messages`` list; ``user`` contents
    become questions and ``assistant`` contents become reference answers.

    NOTE(review): user and assistant messages are collected into two parallel
    lists, which assumes strictly alternating single-turn conversations --
    confirm against the data format before relying on index alignment.

    Args:
        path: Path to the JSONL file.
        num_samples: If truthy, truncate both lists to this many items.

    Returns:
        Tuple of (questions, reference_answers).
    """
    questions = []
    answers = []
    with open(path, "r") as f:
        for line in f:
            if not line.strip():
                continue
            item = json.loads(line)
            for msg in item.get("messages", []):
                if msg["role"] == "user":
                    questions.append(msg["content"])
                elif msg["role"] == "assistant":
                    answers.append(msg["content"])
    # Limit samples
    if num_samples and len(questions) > num_samples:
        questions = questions[:num_samples]
        answers = answers[:num_samples]
    return questions, answers


def _generate_with_pipeline(pipeline, questions):
    """Generate one response per question, printing progress every 10 items.

    Shared by the model-A and model-B generation paths (the original code
    duplicated this loop).
    """
    out = []
    for i, q in enumerate(questions):
        if i % 10 == 0:
            console.print(f"  Progress: {i}/{len(questions)}")
        result = pipeline.generate(q)
        out.append(result.final_response)
    return out


def _print_metrics_table(title, metrics):
    """Render a metric/value rich table; floats are shown to 4 decimal places.

    Shared by the voice-metrics and factual-accuracy sections (the original
    code duplicated this rendering).
    """
    table = Table(title=title)
    table.add_column("Metric", style="cyan")
    table.add_column("Value", style="white")
    for key, value in metrics.items():
        if isinstance(value, float):
            table.add_row(key, f"{value:.4f}")
        else:
            table.add_row(key, str(value))
    console.print(table)


def main():
    """Parse CLI arguments, run the requested evaluations, and write results.

    Returns:
        Process exit code: 0 on success, 1 if the primary voice model
        cannot be loaded.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate the CEO voice model",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Full evaluation
  python scripts/evaluate_model.py \\
      --model username/ceo-voice-model \\
      --test-set data/training/validation.jsonl \\
      --style-profile data/processed/style_profile.json \\
      --posts data/processed/posts.json

  # Voice metrics only
  python scripts/evaluate_model.py \\
      --responses outputs.json \\
      --style-profile data/processed/style_profile.json \\
      --voice-only

  # Generate human eval set
  python scripts/evaluate_model.py \\
      --model username/model-a \\
      --model-b username/model-b \\
      --test-set test.jsonl \\
      --generate-human-eval \\
      --output evaluation/

Environment:
  HF_TOKEN - Hugging Face token
""",
    )
    # Model options
    parser.add_argument("--model", help="Voice model ID or path")
    parser.add_argument("--model-b", help="Second model for A/B comparison")
    parser.add_argument("--adapter", help="LoRA adapter path (if separate)")
    # Input options
    parser.add_argument("--test-set", help="Test set JSONL file")
    parser.add_argument("--responses", help="Pre-generated responses JSON")
    parser.add_argument("--num-samples", type=int, default=50, help="Number of samples")
    # Reference data
    parser.add_argument("--style-profile", help="Style profile JSON")
    parser.add_argument("--posts", help="Parsed posts JSON for factual check")
    parser.add_argument("--segments", help="Segments JSON for embeddings")
    # Evaluation options
    parser.add_argument("--voice-only", action="store_true", help="Voice metrics only")
    parser.add_argument("--factual-only", action="store_true", help="Factual check only")
    parser.add_argument("--generate-human-eval", action="store_true", help="Generate human eval set")
    # Output options
    parser.add_argument("--output", default="evaluation/", help="Output directory")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    # Model loading options
    parser.add_argument("--no-4bit", action="store_true", help="Disable 4-bit quantization")
    parser.add_argument("--no-refinement", action="store_true", help="Skip refinement model")
    args = parser.parse_args()

    console.print("\n[bold blue]AI Executive - Model Evaluation[/bold blue]")
    console.print("=" * 50)

    # Create output directory
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    results = {}

    # Load test data.
    # NOTE(review): reference_answers is loaded but never consumed below --
    # kept for interface stability; confirm whether a reference-based metric
    # was intended.
    test_questions = []
    reference_answers = []
    if args.test_set:
        console.print(f"\n[yellow]Loading test set:[/yellow] {args.test_set}")
        test_questions, reference_answers = _load_test_set(args.test_set, args.num_samples)
        console.print(f"  Loaded {len(test_questions)} test questions")

    # Load pre-generated responses or generate new ones
    responses = []
    responses_b = []
    if args.responses:
        console.print(f"\n[yellow]Loading responses:[/yellow] {args.responses}")
        with open(args.responses, "r") as f:
            data = json.load(f)
        # Accept either [{"response": ...}, ...] or a bare list of strings.
        responses = [d["response"] if isinstance(d, dict) else d for d in data]
        console.print(f"  Loaded {len(responses)} responses")
    elif args.model and test_questions:
        console.print(f"\n[yellow]Generating responses with:[/yellow] {args.model}")
        try:
            from src.inference.dual_llm_pipeline import DualLLMPipeline

            # Show a spinner while the (potentially large) model loads.
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                console=console,
            ) as progress:
                progress.add_task("Loading model...", total=None)
                pipeline = DualLLMPipeline.from_hub(
                    voice_model_id=args.model,
                    voice_adapter_id=args.adapter,
                    load_in_4bit=not args.no_4bit,
                    enable_refinement=not args.no_refinement,
                )

            console.print(f"  Generating {len(test_questions)} responses...")
            responses = _generate_with_pipeline(pipeline, test_questions)

            # Persist generated responses alongside their prompts for reuse.
            responses_path = output_dir / "generated_responses.json"
            with open(responses_path, "w") as f:
                json.dump(
                    [{"question": q, "response": r} for q, r in zip(test_questions, responses)],
                    f,
                    indent=2,
                )
            console.print(f"  Saved to: {responses_path}")
        except ImportError as e:
            # Primary model is required: abort with a non-zero exit code.
            console.print(f"[red]Error loading model:[/red] {e}")
            return 1

    # Generate responses for model B if A/B comparison
    if args.model_b and test_questions:
        console.print(f"\n[yellow]Generating responses with model B:[/yellow] {args.model_b}")
        try:
            from src.inference.dual_llm_pipeline import DualLLMPipeline

            pipeline_b = DualLLMPipeline.from_hub(
                voice_model_id=args.model_b,
                load_in_4bit=not args.no_4bit,
                enable_refinement=not args.no_refinement,
            )
            responses_b = _generate_with_pipeline(pipeline_b, test_questions)
        except ImportError as e:
            # NOTE(review): unlike model A, a failure here is non-fatal and
            # evaluation continues without B -- confirm this is intended.
            console.print(f"[red]Error loading model B:[/red] {e}")

    # Run voice metrics evaluation
    if args.style_profile and responses and not args.factual_only:
        console.print("\n[yellow]Running voice metrics evaluation...[/yellow]")
        try:
            from src.evaluation.voice_metrics import VoiceMetrics

            metrics = VoiceMetrics.from_style_profile(
                args.style_profile,
                blog_segments_path=args.segments,
            )
            voice_results = metrics.evaluate_batch(responses)
            results["voice_metrics"] = voice_results
            _print_metrics_table("Voice Metrics", voice_results)

            # In verbose mode, show per-response scores (first 5 only).
            if args.verbose and len(responses) <= 10:
                console.print("\n[dim]Individual response scores:[/dim]")
                for i, resp in enumerate(responses[:5]):
                    result = metrics.evaluate_response(resp)
                    console.print(
                        f"  [{i}] Voice: {result.overall_voice_score:.2f}, "
                        f"Style: {result.style_match_score:.2f}"
                    )
        except ImportError as e:
            console.print(f"[red]Voice metrics unavailable:[/red] {e}")

    # Run factual accuracy evaluation
    if args.posts and responses and not args.voice_only:
        console.print("\n[yellow]Running factual accuracy check...[/yellow]")
        try:
            from src.evaluation.factual_accuracy import FactualAccuracyChecker

            checker = FactualAccuracyChecker.from_blogs(args.posts)
            factual_results = checker.check_batch(responses)
            results["factual_accuracy"] = factual_results
            _print_metrics_table("Factual Accuracy", factual_results)
        except ImportError as e:
            console.print(f"[red]Factual check unavailable:[/red] {e}")

    # Generate human evaluation set
    if args.generate_human_eval and test_questions and (responses or responses_b):
        console.print("\n[yellow]Generating human evaluation set...[/yellow]")
        try:
            from src.evaluation.human_eval_generator import HumanEvalGenerator

            generator = HumanEvalGenerator()
            if responses_b:
                # A/B comparison: blind test between the two models.
                items = generator.generate_blind_test(
                    questions=test_questions,
                    responses_a=responses,
                    responses_b=responses_b,
                    source_a_name=args.model or "Model A",
                    source_b_name=args.model_b or "Model B",
                )
            else:
                # Single model: rating test.
                items = generator.generate_rating_test(
                    questions=test_questions,
                    responses=responses,
                    source_name=args.model or "Model",
                )
            files = generator.save_evaluation_set(items, output_dir)
            console.print(f"  Generated {len(items)} evaluation items")
            for name, path in files.items():
                console.print(f"    {name}: {path}")
            results["human_eval"] = {"num_items": len(items), "files": files}
        except ImportError as e:
            console.print(f"[red]Human eval generation unavailable:[/red] {e}")

    # Save all results
    if results:
        results_path = output_dir / "evaluation_results.json"
        with open(results_path, "w") as f:
            json.dump(results, f, indent=2)
        console.print(f"\n[green]Results saved to:[/green] {results_path}")

    # Summary
    console.print("\n" + "=" * 50)
    console.print("[bold green]Evaluation complete![/bold green]")
    if "voice_metrics" in results:
        vm = results["voice_metrics"]
        console.print(f"\nVoice Score: {vm.get('avg_overall_voice_score', 0):.2%}")
        console.print(f"Vocabulary Overlap: {vm.get('avg_vocabulary_overlap', 0):.2%}")
    if "factual_accuracy" in results:
        fa = results["factual_accuracy"]
        console.print(f"\nFactual Accuracy: {fa.get('avg_accuracy', 0):.2%}")
        console.print(f"Hallucinations: {fa.get('total_hallucinations', 0)}")

    return 0
if __name__ == "__main__":
    # Use sys.exit rather than the site-module builtin exit(): the builtin is
    # not guaranteed to exist (e.g. under `python -S`), while sys is imported
    # at the top of this file. Propagates main()'s return value as the
    # process exit status.
    sys.exit(main())