# ai_exec/scripts/evaluate_model.py
# (Hugging Face upload metadata — uploader Chaitanya-aitf, "Upload 38 files",
#  commit 45ee481 verified — converted to comments so the file parses.)
#!/usr/bin/env python3
"""
Evaluate Model CLI
Run comprehensive evaluation suite on the CEO voice model.
Measures voice authenticity, factual accuracy, and generates human eval sets.
Usage:
python scripts/evaluate_model.py --model username/model --test-set test.jsonl
Environment:
HF_TOKEN - Hugging Face token for loading models
"""
import argparse
import json
import sys
from pathlib import Path

# Make the repository root importable so the `src.*` packages used below
# resolve when this script is executed directly (scripts/ is one level down).
sys.path.insert(0, str(Path(__file__).parent.parent))

from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn

# Single shared rich console for all CLI output in this script.
console = Console()
def main():
    """Run the CEO voice-model evaluation suite.

    Workflow (each stage runs only when its inputs were supplied):
      1. Load test questions / reference answers from a chat-format JSONL set.
      2. Load pre-generated responses (``--responses``) or generate them with
         ``--model`` (and optionally ``--model-b`` for an A/B comparison).
      3. Score responses for voice authenticity and factual accuracy.
      4. Optionally emit a blind human-evaluation set.
      5. Persist collected metrics to ``<output>/evaluation_results.json``.

    Returns:
        int: process exit code — 0 on success, 1 when the primary voice
        model cannot be imported/loaded.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate the CEO voice model",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Full evaluation
  python scripts/evaluate_model.py \\
      --model username/ceo-voice-model \\
      --test-set data/training/validation.jsonl \\
      --style-profile data/processed/style_profile.json \\
      --posts data/processed/posts.json

  # Voice metrics only
  python scripts/evaluate_model.py \\
      --responses outputs.json \\
      --style-profile data/processed/style_profile.json \\
      --voice-only

  # Generate human eval set
  python scripts/evaluate_model.py \\
      --model username/model-a \\
      --model-b username/model-b \\
      --test-set test.jsonl \\
      --generate-human-eval \\
      --output evaluation/

Environment:
  HF_TOKEN - Hugging Face token
""",
    )

    # Model options
    parser.add_argument("--model", help="Voice model ID or path")
    parser.add_argument("--model-b", help="Second model for A/B comparison")
    parser.add_argument("--adapter", help="LoRA adapter path (if separate)")
    # Input options
    parser.add_argument("--test-set", help="Test set JSONL file")
    parser.add_argument("--responses", help="Pre-generated responses JSON")
    parser.add_argument("--num-samples", type=int, default=50, help="Number of samples")
    # Reference data
    parser.add_argument("--style-profile", help="Style profile JSON")
    parser.add_argument("--posts", help="Parsed posts JSON for factual check")
    parser.add_argument("--segments", help="Segments JSON for embeddings")
    # Evaluation options
    parser.add_argument("--voice-only", action="store_true", help="Voice metrics only")
    parser.add_argument("--factual-only", action="store_true", help="Factual check only")
    parser.add_argument("--generate-human-eval", action="store_true", help="Generate human eval set")
    # Output options
    parser.add_argument("--output", default="evaluation/", help="Output directory")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    # Model loading options
    parser.add_argument("--no-4bit", action="store_true", help="Disable 4-bit quantization")
    parser.add_argument("--no-refinement", action="store_true", help="Skip refinement model")
    args = parser.parse_args()

    console.print("\n[bold blue]AI Executive - Model Evaluation[/bold blue]")
    console.print("=" * 50)

    # Create output directory up front; later stages write into it.
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Accumulates every stage's metrics; dumped to JSON at the end.
    results = {}

    # ---- Load test data -------------------------------------------------
    test_questions = []
    reference_answers = []
    if args.test_set:
        console.print(f"\n[yellow]Loading test set:[/yellow] {args.test_set}")
        # encoding pinned to UTF-8 so JSONL decoding is platform-independent
        with open(args.test_set, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    item = json.loads(line)
                    # NOTE(review): user and assistant turns are collected into
                    # two parallel lists; multi-turn or assistant-less records
                    # would desync them — confirm the data is strictly one
                    # user + one assistant message per line.
                    messages = item.get("messages", [])
                    for msg in messages:
                        if msg["role"] == "user":
                            test_questions.append(msg["content"])
                        elif msg["role"] == "assistant":
                            reference_answers.append(msg["content"])
        # Cap the evaluation set size.
        if args.num_samples and len(test_questions) > args.num_samples:
            test_questions = test_questions[:args.num_samples]
            reference_answers = reference_answers[:args.num_samples]
        console.print(f" Loaded {len(test_questions)} test questions")

    # ---- Obtain responses (pre-generated takes precedence) --------------
    responses = []
    responses_b = []
    if args.responses:
        console.print(f"\n[yellow]Loading responses:[/yellow] {args.responses}")
        with open(args.responses, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Accept either a bare list of strings or [{"response": ...}, ...].
        responses = [d["response"] if isinstance(d, dict) else d for d in data]
        console.print(f" Loaded {len(responses)} responses")
    elif args.model and test_questions:
        console.print(f"\n[yellow]Generating responses with:[/yellow] {args.model}")
        try:
            from src.inference.dual_llm_pipeline import DualLLMPipeline

            # Spinner covers only the (slow) model load.
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                console=console,
            ) as progress:
                task = progress.add_task("Loading model...", total=None)
                pipeline = DualLLMPipeline.from_hub(
                    voice_model_id=args.model,
                    voice_adapter_id=args.adapter,
                    load_in_4bit=not args.no_4bit,
                    enable_refinement=not args.no_refinement,
                )
            console.print(f" Generating {len(test_questions)} responses...")
            for i, q in enumerate(test_questions):
                if i % 10 == 0:
                    console.print(f" Progress: {i}/{len(test_questions)}")
                result = pipeline.generate(q)
                responses.append(result.final_response)
            # Persist generations so reruns can use --responses instead.
            responses_path = output_dir / "generated_responses.json"
            with open(responses_path, "w", encoding="utf-8") as f:
                json.dump([{"question": q, "response": r} for q, r in zip(test_questions, responses)], f, indent=2)
            console.print(f" Saved to: {responses_path}")
        except ImportError as e:
            # Only import failures are caught; generation errors propagate.
            console.print(f"[red]Error loading model:[/red] {e}")
            return 1

    # ---- Model B responses for A/B comparison ---------------------------
    if args.model_b and test_questions:
        console.print(f"\n[yellow]Generating responses with model B:[/yellow] {args.model_b}")
        try:
            from src.inference.dual_llm_pipeline import DualLLMPipeline

            pipeline_b = DualLLMPipeline.from_hub(
                voice_model_id=args.model_b,
                load_in_4bit=not args.no_4bit,
                enable_refinement=not args.no_refinement,
            )
            for i, q in enumerate(test_questions):
                if i % 10 == 0:
                    console.print(f" Progress: {i}/{len(test_questions)}")
                result = pipeline_b.generate(q)
                responses_b.append(result.final_response)
        except ImportError as e:
            # Model B is optional: log and continue with single-model metrics.
            console.print(f"[red]Error loading model B:[/red] {e}")

    # ---- Voice metrics --------------------------------------------------
    if args.style_profile and responses and not args.factual_only:
        console.print("\n[yellow]Running voice metrics evaluation...[/yellow]")
        try:
            from src.evaluation.voice_metrics import VoiceMetrics

            metrics = VoiceMetrics.from_style_profile(
                args.style_profile,
                blog_segments_path=args.segments,
            )
            voice_results = metrics.evaluate_batch(responses)
            results["voice_metrics"] = voice_results

            table = Table(title="Voice Metrics")
            table.add_column("Metric", style="cyan")
            table.add_column("Value", style="white")
            for key, value in voice_results.items():
                if isinstance(value, float):
                    table.add_row(key, f"{value:.4f}")
                else:
                    table.add_row(key, str(value))
            console.print(table)

            # Per-response breakdown for small sets when --verbose is given.
            if args.verbose and len(responses) <= 10:
                console.print("\n[dim]Individual response scores:[/dim]")
                for i, resp in enumerate(responses[:5]):
                    result = metrics.evaluate_response(resp)
                    console.print(f" [{i}] Voice: {result.overall_voice_score:.2f}, Style: {result.style_match_score:.2f}")
        except ImportError as e:
            console.print(f"[red]Voice metrics unavailable:[/red] {e}")

    # ---- Factual accuracy -----------------------------------------------
    if args.posts and responses and not args.voice_only:
        console.print("\n[yellow]Running factual accuracy check...[/yellow]")
        try:
            from src.evaluation.factual_accuracy import FactualAccuracyChecker

            checker = FactualAccuracyChecker.from_blogs(args.posts)
            factual_results = checker.check_batch(responses)
            results["factual_accuracy"] = factual_results

            table = Table(title="Factual Accuracy")
            table.add_column("Metric", style="cyan")
            table.add_column("Value", style="white")
            for key, value in factual_results.items():
                if isinstance(value, float):
                    table.add_row(key, f"{value:.4f}")
                else:
                    table.add_row(key, str(value))
            console.print(table)
        except ImportError as e:
            console.print(f"[red]Factual check unavailable:[/red] {e}")

    # ---- Human evaluation set -------------------------------------------
    if args.generate_human_eval and test_questions and (responses or responses_b):
        console.print("\n[yellow]Generating human evaluation set...[/yellow]")
        try:
            from src.evaluation.human_eval_generator import HumanEvalGenerator

            generator = HumanEvalGenerator()
            if responses_b:
                # Blind A/B comparison between the two models.
                items = generator.generate_blind_test(
                    questions=test_questions,
                    responses_a=responses,
                    responses_b=responses_b,
                    source_a_name=args.model or "Model A",
                    source_b_name=args.model_b or "Model B",
                )
            else:
                # Single-model rating test.
                items = generator.generate_rating_test(
                    questions=test_questions,
                    responses=responses,
                    source_name=args.model or "Model",
                )
            files = generator.save_evaluation_set(items, output_dir)
            console.print(f" Generated {len(items)} evaluation items")
            for name, path in files.items():
                console.print(f" {name}: {path}")
            # Stringify paths so the results dict stays JSON-serializable
            # (json.dump cannot encode pathlib.Path values).
            results["human_eval"] = {
                "num_items": len(items),
                "files": {name: str(path) for name, path in files.items()},
            }
        except ImportError as e:
            console.print(f"[red]Human eval generation unavailable:[/red] {e}")

    # ---- Persist results and print summary ------------------------------
    if results:
        results_path = output_dir / "evaluation_results.json"
        with open(results_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)
        console.print(f"\n[green]Results saved to:[/green] {results_path}")

    console.print("\n" + "=" * 50)
    console.print("[bold green]Evaluation complete![/bold green]")
    if "voice_metrics" in results:
        vm = results["voice_metrics"]
        console.print(f"\nVoice Score: {vm.get('avg_overall_voice_score', 0):.2%}")
        console.print(f"Vocabulary Overlap: {vm.get('avg_vocabulary_overlap', 0):.2%}")
    if "factual_accuracy" in results:
        fa = results["factual_accuracy"]
        console.print(f"\nFactual Accuracy: {fa.get('avg_accuracy', 0):.2%}")
        console.print(f"Hallucinations: {fa.get('total_hallucinations', 0)}")
    return 0
if __name__ == "__main__":
    # Use sys.exit: the bare `exit` builtin is injected by the `site` module
    # for interactive use and is absent under `python -S` or frozen builds.
    sys.exit(main())