# ai_exec/scripts/evaluate_model.py
# (Hugging Face upload metadata — uploader Chaitanya-aitf, "Upload 38 files",
#  commit 45ee481 verified — converted to comments so the file parses.)
#!/usr/bin/env python3
"""
Evaluate Model CLI
Run comprehensive evaluation suite on the CEO voice model.
Measures voice authenticity, factual accuracy, and generates human eval sets.
Usage:
python scripts/evaluate_model.py --model username/model --test-set test.jsonl
Environment:
HF_TOKEN - Hugging Face token for loading models
"""
import argparse
import json
import sys
from pathlib import Path

# Make the repository root importable so the `src.*` packages used below
# resolve when this script is executed directly (scripts/ is one level down).
sys.path.insert(0, str(Path(__file__).parent.parent))

from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn

# Single shared rich console for all CLI output in this script.
console = Console()
def main():
    """Run the CEO voice-model evaluation suite.

    Workflow (each stage runs only when its inputs were supplied):
      1. Load test questions / reference answers from a chat-format JSONL set.
      2. Load pre-generated responses (``--responses``) or generate them with
         ``--model`` (and optionally ``--model-b`` for an A/B comparison).
      3. Score responses for voice authenticity and factual accuracy.
      4. Optionally emit a blind human-evaluation set.
      5. Persist collected metrics to ``<output>/evaluation_results.json``.

    Returns:
        int: process exit code — 0 on success, 1 when the primary voice
        model cannot be imported/loaded.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate the CEO voice model",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Full evaluation
  python scripts/evaluate_model.py \\
      --model username/ceo-voice-model \\
      --test-set data/training/validation.jsonl \\
      --style-profile data/processed/style_profile.json \\
      --posts data/processed/posts.json

  # Voice metrics only
  python scripts/evaluate_model.py \\
      --responses outputs.json \\
      --style-profile data/processed/style_profile.json \\
      --voice-only

  # Generate human eval set
  python scripts/evaluate_model.py \\
      --model username/model-a \\
      --model-b username/model-b \\
      --test-set test.jsonl \\
      --generate-human-eval \\
      --output evaluation/

Environment:
  HF_TOKEN - Hugging Face token
""",
    )

    # Model options
    parser.add_argument("--model", help="Voice model ID or path")
    parser.add_argument("--model-b", help="Second model for A/B comparison")
    parser.add_argument("--adapter", help="LoRA adapter path (if separate)")
    # Input options
    parser.add_argument("--test-set", help="Test set JSONL file")
    parser.add_argument("--responses", help="Pre-generated responses JSON")
    parser.add_argument("--num-samples", type=int, default=50, help="Number of samples")
    # Reference data
    parser.add_argument("--style-profile", help="Style profile JSON")
    parser.add_argument("--posts", help="Parsed posts JSON for factual check")
    parser.add_argument("--segments", help="Segments JSON for embeddings")
    # Evaluation options
    parser.add_argument("--voice-only", action="store_true", help="Voice metrics only")
    parser.add_argument("--factual-only", action="store_true", help="Factual check only")
    parser.add_argument("--generate-human-eval", action="store_true", help="Generate human eval set")
    # Output options
    parser.add_argument("--output", default="evaluation/", help="Output directory")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    # Model loading options
    parser.add_argument("--no-4bit", action="store_true", help="Disable 4-bit quantization")
    parser.add_argument("--no-refinement", action="store_true", help="Skip refinement model")
    args = parser.parse_args()

    console.print("\n[bold blue]AI Executive - Model Evaluation[/bold blue]")
    console.print("=" * 50)

    # Create output directory up front; later stages write into it.
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Accumulates every stage's metrics; dumped to JSON at the end.
    results = {}

    # ---- Load test data -------------------------------------------------
    test_questions = []
    reference_answers = []
    if args.test_set:
        console.print(f"\n[yellow]Loading test set:[/yellow] {args.test_set}")
        # encoding pinned to UTF-8 so JSONL decoding is platform-independent
        with open(args.test_set, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    item = json.loads(line)
                    # NOTE(review): user and assistant turns are collected into
                    # two parallel lists; multi-turn or assistant-less records
                    # would desync them — confirm the data is strictly one
                    # user + one assistant message per line.
                    messages = item.get("messages", [])
                    for msg in messages:
                        if msg["role"] == "user":
                            test_questions.append(msg["content"])
                        elif msg["role"] == "assistant":
                            reference_answers.append(msg["content"])
        # Cap the evaluation set size.
        if args.num_samples and len(test_questions) > args.num_samples:
            test_questions = test_questions[:args.num_samples]
            reference_answers = reference_answers[:args.num_samples]
        console.print(f" Loaded {len(test_questions)} test questions")

    # ---- Obtain responses (pre-generated takes precedence) --------------
    responses = []
    responses_b = []
    if args.responses:
        console.print(f"\n[yellow]Loading responses:[/yellow] {args.responses}")
        with open(args.responses, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Accept either a bare list of strings or [{"response": ...}, ...].
        responses = [d["response"] if isinstance(d, dict) else d for d in data]
        console.print(f" Loaded {len(responses)} responses")
    elif args.model and test_questions:
        console.print(f"\n[yellow]Generating responses with:[/yellow] {args.model}")
        try:
            from src.inference.dual_llm_pipeline import DualLLMPipeline

            # Spinner covers only the (slow) model load.
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                console=console,
            ) as progress:
                task = progress.add_task("Loading model...", total=None)
                pipeline = DualLLMPipeline.from_hub(
                    voice_model_id=args.model,
                    voice_adapter_id=args.adapter,
                    load_in_4bit=not args.no_4bit,
                    enable_refinement=not args.no_refinement,
                )
            console.print(f" Generating {len(test_questions)} responses...")
            for i, q in enumerate(test_questions):
                if i % 10 == 0:
                    console.print(f" Progress: {i}/{len(test_questions)}")
                result = pipeline.generate(q)
                responses.append(result.final_response)
            # Persist generations so reruns can use --responses instead.
            responses_path = output_dir / "generated_responses.json"
            with open(responses_path, "w", encoding="utf-8") as f:
                json.dump([{"question": q, "response": r} for q, r in zip(test_questions, responses)], f, indent=2)
            console.print(f" Saved to: {responses_path}")
        except ImportError as e:
            # Only import failures are caught; generation errors propagate.
            console.print(f"[red]Error loading model:[/red] {e}")
            return 1

    # ---- Model B responses for A/B comparison ---------------------------
    if args.model_b and test_questions:
        console.print(f"\n[yellow]Generating responses with model B:[/yellow] {args.model_b}")
        try:
            from src.inference.dual_llm_pipeline import DualLLMPipeline

            pipeline_b = DualLLMPipeline.from_hub(
                voice_model_id=args.model_b,
                load_in_4bit=not args.no_4bit,
                enable_refinement=not args.no_refinement,
            )
            for i, q in enumerate(test_questions):
                if i % 10 == 0:
                    console.print(f" Progress: {i}/{len(test_questions)}")
                result = pipeline_b.generate(q)
                responses_b.append(result.final_response)
        except ImportError as e:
            # Model B is optional: log and continue with single-model metrics.
            console.print(f"[red]Error loading model B:[/red] {e}")

    # ---- Voice metrics --------------------------------------------------
    if args.style_profile and responses and not args.factual_only:
        console.print("\n[yellow]Running voice metrics evaluation...[/yellow]")
        try:
            from src.evaluation.voice_metrics import VoiceMetrics

            metrics = VoiceMetrics.from_style_profile(
                args.style_profile,
                blog_segments_path=args.segments,
            )
            voice_results = metrics.evaluate_batch(responses)
            results["voice_metrics"] = voice_results

            table = Table(title="Voice Metrics")
            table.add_column("Metric", style="cyan")
            table.add_column("Value", style="white")
            for key, value in voice_results.items():
                if isinstance(value, float):
                    table.add_row(key, f"{value:.4f}")
                else:
                    table.add_row(key, str(value))
            console.print(table)

            # Per-response breakdown for small sets when --verbose is given.
            if args.verbose and len(responses) <= 10:
                console.print("\n[dim]Individual response scores:[/dim]")
                for i, resp in enumerate(responses[:5]):
                    result = metrics.evaluate_response(resp)
                    console.print(f" [{i}] Voice: {result.overall_voice_score:.2f}, Style: {result.style_match_score:.2f}")
        except ImportError as e:
            console.print(f"[red]Voice metrics unavailable:[/red] {e}")

    # ---- Factual accuracy -----------------------------------------------
    if args.posts and responses and not args.voice_only:
        console.print("\n[yellow]Running factual accuracy check...[/yellow]")
        try:
            from src.evaluation.factual_accuracy import FactualAccuracyChecker

            checker = FactualAccuracyChecker.from_blogs(args.posts)
            factual_results = checker.check_batch(responses)
            results["factual_accuracy"] = factual_results

            table = Table(title="Factual Accuracy")
            table.add_column("Metric", style="cyan")
            table.add_column("Value", style="white")
            for key, value in factual_results.items():
                if isinstance(value, float):
                    table.add_row(key, f"{value:.4f}")
                else:
                    table.add_row(key, str(value))
            console.print(table)
        except ImportError as e:
            console.print(f"[red]Factual check unavailable:[/red] {e}")

    # ---- Human evaluation set -------------------------------------------
    if args.generate_human_eval and test_questions and (responses or responses_b):
        console.print("\n[yellow]Generating human evaluation set...[/yellow]")
        try:
            from src.evaluation.human_eval_generator import HumanEvalGenerator

            generator = HumanEvalGenerator()
            if responses_b:
                # Blind A/B comparison between the two models.
                items = generator.generate_blind_test(
                    questions=test_questions,
                    responses_a=responses,
                    responses_b=responses_b,
                    source_a_name=args.model or "Model A",
                    source_b_name=args.model_b or "Model B",
                )
            else:
                # Single-model rating test.
                items = generator.generate_rating_test(
                    questions=test_questions,
                    responses=responses,
                    source_name=args.model or "Model",
                )
            files = generator.save_evaluation_set(items, output_dir)
            console.print(f" Generated {len(items)} evaluation items")
            for name, path in files.items():
                console.print(f" {name}: {path}")
            # Stringify paths so the results dict stays JSON-serializable
            # (json.dump cannot encode pathlib.Path values).
            results["human_eval"] = {
                "num_items": len(items),
                "files": {name: str(path) for name, path in files.items()},
            }
        except ImportError as e:
            console.print(f"[red]Human eval generation unavailable:[/red] {e}")

    # ---- Persist results and print summary ------------------------------
    if results:
        results_path = output_dir / "evaluation_results.json"
        with open(results_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)
        console.print(f"\n[green]Results saved to:[/green] {results_path}")

    console.print("\n" + "=" * 50)
    console.print("[bold green]Evaluation complete![/bold green]")
    if "voice_metrics" in results:
        vm = results["voice_metrics"]
        console.print(f"\nVoice Score: {vm.get('avg_overall_voice_score', 0):.2%}")
        console.print(f"Vocabulary Overlap: {vm.get('avg_vocabulary_overlap', 0):.2%}")
    if "factual_accuracy" in results:
        fa = results["factual_accuracy"]
        console.print(f"\nFactual Accuracy: {fa.get('avg_accuracy', 0):.2%}")
        console.print(f"Hallucinations: {fa.get('total_hallucinations', 0)}")
    return 0
if __name__ == "__main__":
    # Use sys.exit: the bare `exit` builtin is injected by the `site` module
    # for interactive use and is absent under `python -S` or frozen builds.
    sys.exit(main())