File size: 12,030 Bytes
45ee481
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#!/usr/bin/env python3
"""
Evaluate Model CLI

Run comprehensive evaluation suite on the CEO voice model.
Measures voice authenticity, factual accuracy, and generates human eval sets.

Usage:
    python scripts/evaluate_model.py --model username/model --test-set test.jsonl

Environment:
    HF_TOKEN - Hugging Face token for loading models
"""

import argparse
import json
import sys
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn

console = Console()


def _build_parser():
    """Build the CLI argument parser (separated out so main() stays readable)."""
    parser = argparse.ArgumentParser(
        description="Evaluate the CEO voice model",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Full evaluation
    python scripts/evaluate_model.py \\
        --model username/ceo-voice-model \\
        --test-set data/training/validation.jsonl \\
        --style-profile data/processed/style_profile.json \\
        --posts data/processed/posts.json

    # Voice metrics only
    python scripts/evaluate_model.py \\
        --responses outputs.json \\
        --style-profile data/processed/style_profile.json \\
        --voice-only

    # Generate human eval set
    python scripts/evaluate_model.py \\
        --model username/model-a \\
        --model-b username/model-b \\
        --test-set test.jsonl \\
        --generate-human-eval \\
        --output evaluation/

Environment:
    HF_TOKEN - Hugging Face token
        """,
    )

    # Model options
    parser.add_argument("--model", help="Voice model ID or path")
    parser.add_argument("--model-b", help="Second model for A/B comparison")
    parser.add_argument("--adapter", help="LoRA adapter path (if separate)")

    # Input options
    parser.add_argument("--test-set", help="Test set JSONL file")
    parser.add_argument("--responses", help="Pre-generated responses JSON")
    parser.add_argument("--num-samples", type=int, default=50, help="Number of samples")

    # Reference data
    parser.add_argument("--style-profile", help="Style profile JSON")
    parser.add_argument("--posts", help="Parsed posts JSON for factual check")
    parser.add_argument("--segments", help="Segments JSON for embeddings")

    # Evaluation options
    parser.add_argument("--voice-only", action="store_true", help="Voice metrics only")
    parser.add_argument("--factual-only", action="store_true", help="Factual check only")
    parser.add_argument("--generate-human-eval", action="store_true", help="Generate human eval set")

    # Output options
    parser.add_argument("--output", default="evaluation/", help="Output directory")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

    # Model loading options
    parser.add_argument("--no-4bit", action="store_true", help="Disable 4-bit quantization")
    parser.add_argument("--no-refinement", action="store_true", help="Skip refinement model")

    return parser


def _load_test_set(path, num_samples):
    """Load (questions, reference_answers) from a chat-format JSONL test set.

    Each line is a JSON object with a "messages" list; user turns become
    questions and assistant turns become reference answers. Both lists are
    truncated to *num_samples* when that limit is set and exceeded.
    """
    questions = []
    references = []
    # Encoding pinned: the locale default (e.g. cp1252 on Windows) can fail
    # on UTF-8 training data.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            item = json.loads(line)
            for msg in item.get("messages", []):
                if msg["role"] == "user":
                    questions.append(msg["content"])
                elif msg["role"] == "assistant":
                    references.append(msg["content"])

    if num_samples and len(questions) > num_samples:
        questions = questions[:num_samples]
        references = references[:num_samples]

    return questions, references


def _generate_responses(model_id, questions, adapter=None, load_in_4bit=True,
                        enable_refinement=True, show_spinner=False):
    """Generate one response per question with the dual-LLM pipeline.

    Raises ImportError when the pipeline module is unavailable; the caller
    decides whether that is fatal (model A) or skippable (model B).
    """
    from src.inference.dual_llm_pipeline import DualLLMPipeline

    # NOTE(review): voice_adapter_id is always forwarded; assumes from_hub
    # treats None as "no adapter" (matches the original model-A call path).
    if show_spinner:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console,
        ) as progress:
            progress.add_task("Loading model...", total=None)
            pipeline = DualLLMPipeline.from_hub(
                voice_model_id=model_id,
                voice_adapter_id=adapter,
                load_in_4bit=load_in_4bit,
                enable_refinement=enable_refinement,
            )
    else:
        pipeline = DualLLMPipeline.from_hub(
            voice_model_id=model_id,
            voice_adapter_id=adapter,
            load_in_4bit=load_in_4bit,
            enable_refinement=enable_refinement,
        )

    console.print(f"  Generating {len(questions)} responses...")
    outputs = []
    for i, q in enumerate(questions):
        if i % 10 == 0:
            console.print(f"  Progress: {i}/{len(questions)}")
        result = pipeline.generate(q)
        outputs.append(result.final_response)
    return outputs


def _print_metrics_table(title, metrics):
    """Render a name/value table for a metrics dict; floats get 4 decimals."""
    table = Table(title=title)
    table.add_column("Metric", style="cyan")
    table.add_column("Value", style="white")
    for key, value in metrics.items():
        if isinstance(value, float):
            table.add_row(key, f"{value:.4f}")
        else:
            table.add_row(key, str(value))
    console.print(table)


def main():
    """CLI entry point: run the requested evaluations and save results.

    Returns a process exit code: 0 on success, 1 when the primary model
    cannot be loaded.
    """
    args = _build_parser().parse_args()

    console.print("\n[bold blue]AI Executive - Model Evaluation[/bold blue]")
    console.print("=" * 50)

    # Create output directory
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    results = {}

    # Load test data. reference_answers is collected alongside questions for
    # parity/truncation but is not consumed further below.
    test_questions = []
    reference_answers = []

    if args.test_set:
        console.print(f"\n[yellow]Loading test set:[/yellow] {args.test_set}")
        test_questions, reference_answers = _load_test_set(args.test_set, args.num_samples)
        console.print(f"  Loaded {len(test_questions)} test questions")

    # Load pre-generated responses or generate new ones
    responses = []
    responses_b = []

    if args.responses:
        console.print(f"\n[yellow]Loading responses:[/yellow] {args.responses}")
        with open(args.responses, "r", encoding="utf-8") as f:
            data = json.load(f)
        responses = [d["response"] if isinstance(d, dict) else d for d in data]
        console.print(f"  Loaded {len(responses)} responses")

    elif args.model and test_questions:
        console.print(f"\n[yellow]Generating responses with:[/yellow] {args.model}")

        try:
            responses = _generate_responses(
                args.model,
                test_questions,
                adapter=args.adapter,
                load_in_4bit=not args.no_4bit,
                enable_refinement=not args.no_refinement,
                show_spinner=True,
            )

            # Persist raw generations so metrics can be re-run without a GPU.
            responses_path = output_dir / "generated_responses.json"
            with open(responses_path, "w", encoding="utf-8") as f:
                json.dump([{"question": q, "response": r} for q, r in zip(test_questions, responses)], f, indent=2)
            console.print(f"  Saved to: {responses_path}")

        except ImportError as e:
            # The primary model is required for everything downstream: fatal.
            console.print(f"[red]Error loading model:[/red] {e}")
            return 1

    # Generate responses for model B if A/B comparison
    if args.model_b and test_questions:
        console.print(f"\n[yellow]Generating responses with model B:[/yellow] {args.model_b}")

        try:
            responses_b = _generate_responses(
                args.model_b,
                test_questions,
                load_in_4bit=not args.no_4bit,
                enable_refinement=not args.no_refinement,
            )
        except ImportError as e:
            # Model B is optional (A/B mode); degrade to single-model eval.
            console.print(f"[red]Error loading model B:[/red] {e}")

    # Run voice metrics evaluation
    if args.style_profile and responses and not args.factual_only:
        console.print("\n[yellow]Running voice metrics evaluation...[/yellow]")

        try:
            from src.evaluation.voice_metrics import VoiceMetrics

            metrics = VoiceMetrics.from_style_profile(
                args.style_profile,
                blog_segments_path=args.segments,
            )

            voice_results = metrics.evaluate_batch(responses)
            results["voice_metrics"] = voice_results
            _print_metrics_table("Voice Metrics", voice_results)

            # Per-response detail only for small, verbose runs
            if args.verbose and len(responses) <= 10:
                console.print("\n[dim]Individual response scores:[/dim]")
                for i, resp in enumerate(responses[:5]):
                    result = metrics.evaluate_response(resp)
                    console.print(f"  [{i}] Voice: {result.overall_voice_score:.2f}, Style: {result.style_match_score:.2f}")

        except ImportError as e:
            console.print(f"[red]Voice metrics unavailable:[/red] {e}")

    # Run factual accuracy evaluation
    if args.posts and responses and not args.voice_only:
        console.print("\n[yellow]Running factual accuracy check...[/yellow]")

        try:
            from src.evaluation.factual_accuracy import FactualAccuracyChecker

            checker = FactualAccuracyChecker.from_blogs(args.posts)
            factual_results = checker.check_batch(responses)
            results["factual_accuracy"] = factual_results
            _print_metrics_table("Factual Accuracy", factual_results)

        except ImportError as e:
            console.print(f"[red]Factual check unavailable:[/red] {e}")

    # Generate human evaluation set
    if args.generate_human_eval and test_questions and (responses or responses_b):
        console.print("\n[yellow]Generating human evaluation set...[/yellow]")

        try:
            from src.evaluation.human_eval_generator import HumanEvalGenerator

            generator = HumanEvalGenerator()

            if responses_b:
                # A/B comparison: blind pairwise test between the two models
                items = generator.generate_blind_test(
                    questions=test_questions,
                    responses_a=responses,
                    responses_b=responses_b,
                    source_a_name=args.model or "Model A",
                    source_b_name=args.model_b or "Model B",
                )
            else:
                # Rating test: score a single model's responses
                items = generator.generate_rating_test(
                    questions=test_questions,
                    responses=responses,
                    source_name=args.model or "Model",
                )

            files = generator.save_evaluation_set(items, output_dir)

            console.print(f"  Generated {len(items)} evaluation items")
            for name, path in files.items():
                console.print(f"  {name}: {path}")

            results["human_eval"] = {"num_items": len(items), "files": files}

        except ImportError as e:
            console.print(f"[red]Human eval generation unavailable:[/red] {e}")

    # Save all results
    if results:
        results_path = output_dir / "evaluation_results.json"
        with open(results_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)
        console.print(f"\n[green]Results saved to:[/green] {results_path}")

    # Summary
    console.print("\n" + "=" * 50)
    console.print("[bold green]Evaluation complete![/bold green]")

    if "voice_metrics" in results:
        vm = results["voice_metrics"]
        console.print(f"\nVoice Score: {vm.get('avg_overall_voice_score', 0):.2%}")
        console.print(f"Vocabulary Overlap: {vm.get('avg_vocabulary_overlap', 0):.2%}")

    if "factual_accuracy" in results:
        fa = results["factual_accuracy"]
        console.print(f"\nFactual Accuracy: {fa.get('avg_accuracy', 0):.2%}")
        console.print(f"Hallucinations: {fa.get('total_hallucinations', 0)}")

    return 0


if __name__ == "__main__":
    # sys.exit is the reliable form for scripts: the builtin exit() is a
    # site-module convenience and may be absent (python -S, frozen builds).
    sys.exit(main())