| | |
| | """ |
| | Test script to evaluate fine-tuned CodeLlama model on training and test samples |
| | """ |
| |
|
| | import json |
| | import sys |
| | import os |
| | from pathlib import Path |
| |
|
| | |
| | sys.path.insert(0, str(Path(__file__).parent / "scripts" / "inference")) |
| |
|
| | from inference_codellama import load_local_model, generate_with_local_model |
| |
|
def load_samples(dataset_path, num_samples=2):
    """Load up to *num_samples* JSON records from a JSONL dataset file.

    Args:
        dataset_path: Path to a JSON-lines file (one JSON object per line).
        num_samples: Maximum number of records to return.

    Returns:
        A list of parsed JSON objects (possibly fewer than *num_samples*
        if the file is short).

    Note: blank lines are skipped and no longer count toward the sample
    budget (the previous version broke on the line index, so a blank line
    silently reduced the number of samples returned).
    """
    samples = []
    with open(dataset_path, 'r', encoding='utf-8') as f:
        for line in f:
            if len(samples) >= num_samples:
                break
            if line.strip():
                samples.append(json.loads(line))
    return samples
| |
|
def extract_instruction_prompt(instruction_text):
    """Return the task portion of *instruction_text*.

    Currently a pass-through: the instruction is forwarded unchanged
    (system-prompt stripping, if ever needed, would live here).
    """
    return instruction_text
| |
|
def extract_code_from_response(text):
    """Pull Verilog source out of a markdown fenced code block.

    Preference order: a closed ```verilog fence first, then the first
    closed generic ``` fence (its info line is dropped).  If no complete
    fenced block exists, the stripped input is returned; falsy input is
    returned untouched.
    """
    if not text:
        return text

    # A closed ```verilog fence takes priority.
    verilog_tag = '```verilog'
    tag_pos = text.find(verilog_tag)
    if tag_pos != -1:
        body_start = tag_pos + len(verilog_tag)
        closing = text.find('```', body_start)
        if closing != -1:
            return text[body_start:closing].strip()

    # Otherwise accept the first closed generic fence, skipping past the
    # opening line (or just past the backticks when no newline follows).
    fence = text.find('```')
    if fence != -1:
        newline = text.find('\n', fence)
        body_start = fence + 3 if newline == -1 else newline + 1
        closing = text.find('```', body_start)
        if closing != -1:
            return text[body_start:closing].strip()

    return text.strip()
| |
|
def compare_code(expected, generated):
    """Compare two code strings ignoring spaces, tabs and newlines.

    Returns a ``(similarity_percent, description)`` tuple.  Similarity is
    the number of positionally matching characters (up to the shorter
    cleaned length) divided by the longer cleaned length; an exact
    whitespace-insensitive match scores 100.0.
    """
    strip_ws = str.maketrans('', '', ' \n\t')
    lhs = expected.strip().translate(strip_ws)
    rhs = generated.strip().translate(strip_ws)

    if lhs == rhs:
        return 100.0, "Perfect match"

    # Count character-by-character agreement over the overlapping prefix.
    matching = sum(a == b for a, b in zip(lhs, rhs))
    longest = max(len(lhs), len(rhs))
    similarity = (matching / longest) * 100 if longest > 0 else 0

    return similarity, f"{matching}/{longest} characters match"
| |
|
def _evaluate_sample(model, tokenizer, sample, sample_num, label):
    """Run inference on one dataset sample and return a result record.

    Prints progress as it goes.  On inference failure the error is printed
    and a record containing only ``sample_num`` and ``error`` is returned
    instead of raising, so one bad sample does not abort the run.
    """
    print(f"\n{'='*80}")
    print(f"{label} SAMPLE {sample_num}/2")
    print(f"{'='*80}")

    instruction = sample.get("instruction", "")
    expected_response = sample.get("response", "")
    expected_code = extract_code_from_response(expected_response)

    print("\n📝 Instruction:")
    print(f"{instruction[:200]}..." if len(instruction) > 200 else instruction)

    print("\n🎯 Expected Code (first 300 chars):")
    print(expected_code[:300] + "..." if len(expected_code) > 300 else expected_code)

    print("\n🤖 Generating response...")
    try:
        generated_response = generate_with_local_model(
            model,
            tokenizer,
            instruction,
            max_new_tokens=800,
            temperature=0.3,
            stream=False,
        )

        generated_code = extract_code_from_response(generated_response)

        print("\n✅ Generated Code (first 300 chars):")
        print(generated_code[:300] + "..." if len(generated_code) > 300 else generated_code)

        similarity, match_info = compare_code(expected_code, generated_code)

        print("\n📊 Comparison:")
        print(f"   Similarity: {similarity:.2f}%")
        print(f"   Match Info: {match_info}")

        return {
            "sample_num": sample_num,
            "instruction": instruction[:100] + "..." if len(instruction) > 100 else instruction,
            "expected_code_length": len(expected_code),
            "generated_code_length": len(generated_code),
            "similarity": similarity,
            "match_info": match_info,
            "expected_code": expected_code,
            "generated_code": generated_code,
            "generated_full_response": generated_response,
        }

    except Exception as e:
        print(f"❌ Error during inference: {e}")
        return {"sample_num": sample_num, "error": str(e)}


def _average_similarity(records):
    """Mean similarity over records that completed without error.

    Returns 0 when no record carries a similarity score.  (The previous
    inline computation divided by the count of *successful* records while
    only guarding on the list being non-empty, so a run in which every
    sample errored raised ZeroDivisionError.)
    """
    sims = [r["similarity"] for r in records if "similarity" in r]
    return sum(sims) / len(sims) if sims else 0


def main():
    """Evaluate the fine-tuned CodeLlama model on a few training and test
    samples, print a summary, and save detailed results to JSON.

    Returns the full results dict (also written to evaluation_results.json).
    """
    script_dir = Path(__file__).parent
    model_path = script_dir / "training-outputs" / "codellama-fifo-v1"
    base_model_path = script_dir / "models" / "base-models" / "CodeLlama-7B-Instruct"
    train_dataset = script_dir / "datasets" / "processed" / "split" / "train.jsonl"
    test_dataset = script_dir / "datasets" / "processed" / "split" / "test.jsonl"

    print("=" * 80)
    print("🧪 CODELLAMA FINE-TUNED MODEL EVALUATION")
    print("=" * 80)
    print(f"Model: {model_path}")
    print(f"Base Model: {base_model_path}")
    print("=" * 80)
    print()

    print("📦 Loading model...")
    model, tokenizer = load_local_model(
        str(model_path),
        str(base_model_path) if base_model_path.exists() else None,
        use_quantization=None,
        merge_weights=False,
    )
    print("✅ Model loaded successfully!\n")

    results = {
        "training_samples": [],
        "test_samples": [],
    }

    # Evaluate a handful of samples the model was trained on (memorization
    # check) and held-out samples (generalization check) with shared logic.
    print("=" * 80)
    print("📚 TESTING TRAINING SAMPLES")
    print("=" * 80)
    for i, sample in enumerate(load_samples(train_dataset, num_samples=2), 1):
        results["training_samples"].append(
            _evaluate_sample(model, tokenizer, sample, i, "TRAINING")
        )

    print("\n\n" + "=" * 80)
    print("📚 TESTING TEST SAMPLES")
    print("=" * 80)
    for i, sample in enumerate(load_samples(test_dataset, num_samples=2), 1):
        results["test_samples"].append(
            _evaluate_sample(model, tokenizer, sample, i, "TEST")
        )

    print("\n\n" + "=" * 80)
    print("📊 EVALUATION SUMMARY")
    print("=" * 80)

    train_avg_similarity = _average_similarity(results["training_samples"])
    test_avg_similarity = _average_similarity(results["test_samples"])

    print("\n📚 Training Samples:")
    print(f"   Average Similarity: {train_avg_similarity:.2f}%")
    print(f"   Samples Tested: {len(results['training_samples'])}")

    print("\n📚 Test Samples:")
    print(f"   Average Similarity: {test_avg_similarity:.2f}%")
    print(f"   Samples Tested: {len(results['test_samples'])}")

    # A split whose average is zero is excluded from the overall figure
    # (preserves the original reporting behavior).
    if train_avg_similarity > 0 and test_avg_similarity > 0:
        overall_avg = (train_avg_similarity + test_avg_similarity) / 2
    else:
        overall_avg = train_avg_similarity if train_avg_similarity > 0 else test_avg_similarity
    print(f"\n🏆 Overall Average Similarity: {overall_avg:.2f}%")

    output_file = script_dir / "evaluation_results.json"
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\n💾 Detailed results saved to: {output_file}")
    print("=" * 80)

    return results
| |
|
| | if __name__ == "__main__": |
| | main() |
| |
|
| |
|