#!/usr/bin/env python3
"""
Test script to evaluate fine-tuned CodeLlama model on training and test samples
"""
import json
import sys
import os
from pathlib import Path

# Add scripts to path
sys.path.insert(0, str(Path(__file__).parent / "scripts" / "inference"))

from inference_codellama import load_local_model, generate_with_local_model


def load_samples(dataset_path, num_samples=2):
    """Load the first ``num_samples`` non-blank JSONL records from ``dataset_path``.

    Args:
        dataset_path: Path to a .jsonl file (one JSON object per line).
        num_samples: Maximum number of records to read.

    Returns:
        List of parsed dicts (may be shorter than ``num_samples`` if the
        file has fewer lines).
    """
    samples = []
    with open(dataset_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            # Stop by line index, not by collected count, matching the
            # original behavior (blank lines still consume an index).
            if i >= num_samples:
                break
            if line.strip():
                samples.append(json.loads(line))
    return samples


def extract_instruction_prompt(instruction_text):
    """Extract just the task part from instruction (remove system prompt if needed).

    The instruction already contains the system prompt + task, so it is
    returned unchanged for CodeLlama.
    """
    return instruction_text


def extract_code_from_response(text):
    """Extract Verilog code from markdown code blocks.

    Prefers a ```verilog fenced block; falls back to the first generic
    ``` fence; otherwise returns the stripped text as-is. Falsy input is
    returned unchanged.
    """
    if not text:
        return text

    # Check for verilog code block
    if '```verilog' in text:
        start = text.find('```verilog') + len('```verilog')
        end = text.find('```', start)
        if end != -1:
            return text[start:end].strip()

    # Check for generic code block
    if '```' in text:
        start = text.find('```')
        if start != -1:
            # Skip past the opening fence line (language tag, if any).
            start_marker = text.find('\n', start)
            if start_marker == -1:
                start_marker = start + 3
            else:
                start_marker += 1
            end = text.find('```', start_marker)
            if end != -1:
                return text[start_marker:end].strip()

    return text.strip()


def compare_code(expected, generated):
    """Compare two code strings, ignoring all whitespace.

    Returns:
        (similarity_percent, human-readable match description).

    NOTE: similarity is a naive position-by-position character match after
    whitespace removal, so a single inserted character early on collapses
    the score — it is a rough signal, not an edit distance.
    """
    expected_clean = expected.strip().replace(' ', '').replace('\n', '').replace('\t', '')
    generated_clean = generated.strip().replace(' ', '').replace('\n', '').replace('\t', '')

    if expected_clean == generated_clean:
        return 100.0, "Perfect match"

    # Calculate similarity (simple positional match over the shorter string)
    matches = 0
    min_len = min(len(expected_clean), len(generated_clean))
    for i in range(min_len):
        if expected_clean[i] == generated_clean[i]:
            matches += 1

    longest = max(len(expected_clean), len(generated_clean))
    similarity = (matches / longest) * 100 if longest > 0 else 0
    return similarity, f"{matches}/{longest} characters match"


def _truncate(text, limit):
    """Return ``text`` cut to ``limit`` chars with a '...' suffix when truncated."""
    return text[:limit] + "..." if len(text) > limit else text


def _evaluate_samples(model, tokenizer, samples, label):
    """Run inference on each sample and score it against the reference code.

    Args:
        model, tokenizer: Loaded model pair from ``load_local_model``.
        samples: List of dataset records with "instruction"/"response" keys.
        label: Banner prefix ("TRAINING" or "TEST") interpolated into the
            per-sample progress headers.

    Returns:
        List of per-sample result dicts; a sample that raises during
        inference yields a ``{"sample_num": ..., "error": ...}`` entry.
    """
    section_results = []
    for i, sample in enumerate(samples, 1):
        print(f"\n{'='*80}")
        print(f"{label} SAMPLE {i}/{len(samples)}")
        print(f"{'='*80}")

        instruction = sample.get("instruction", "")
        expected_response = sample.get("response", "")
        expected_code = extract_code_from_response(expected_response)

        print("\n๐Ÿ“ Instruction:")
        print(_truncate(instruction, 200))
        print("\n๐ŸŽฏ Expected Code (first 300 chars):")
        print(_truncate(expected_code, 300))

        print("\n๐Ÿค– Generating response...")
        try:
            generated_response = generate_with_local_model(
                model, tokenizer, instruction,
                max_new_tokens=800,
                temperature=0.3,
                stream=False
            )
            generated_code = extract_code_from_response(generated_response)
            print("\nโœ… Generated Code (first 300 chars):")
            print(_truncate(generated_code, 300))

            # Compare generated output against the reference implementation.
            similarity, match_info = compare_code(expected_code, generated_code)
            print("\n๐Ÿ“Š Comparison:")
            print(f" Similarity: {similarity:.2f}%")
            print(f" Match Info: {match_info}")

            section_results.append({
                "sample_num": i,
                "instruction": _truncate(instruction, 100),
                "expected_code_length": len(expected_code),
                "generated_code_length": len(generated_code),
                "similarity": similarity,
                "match_info": match_info,
                "expected_code": expected_code,
                "generated_code": generated_code,
                "generated_full_response": generated_response
            })
        except Exception as e:
            # Record the failure but keep evaluating remaining samples.
            print(f"โŒ Error during inference: {e}")
            section_results.append({
                "sample_num": i,
                "error": str(e)
            })
    return section_results


def _average_similarity(section_results):
    """Mean similarity over the entries that actually produced a score.

    Returns 0 when no entry has a "similarity" key. (The original guarded
    only on the result list being non-empty while dividing by the count of
    *scored* entries, raising ZeroDivisionError when every sample errored.)
    """
    scores = [s["similarity"] for s in section_results if "similarity" in s]
    return sum(scores) / len(scores) if scores else 0


def main():
    """Evaluate the fine-tuned model on a few train/test samples and save a report."""
    # Paths
    script_dir = Path(__file__).parent
    model_path = script_dir / "training-outputs" / "codellama-fifo-v1"
    base_model_path = script_dir / "models" / "base-models" / "CodeLlama-7B-Instruct"
    train_dataset = script_dir / "datasets" / "processed" / "split" / "train.jsonl"
    test_dataset = script_dir / "datasets" / "processed" / "split" / "test.jsonl"

    print("=" * 80)
    print("๐Ÿงช CODELLAMA FINE-TUNED MODEL EVALUATION")
    print("=" * 80)
    print(f"Model: {model_path}")
    print(f"Base Model: {base_model_path}")
    print("=" * 80)
    print()

    # Load model
    print("๐Ÿ“ฆ Loading model...")
    model, tokenizer = load_local_model(
        str(model_path),
        str(base_model_path) if base_model_path.exists() else None,
        use_quantization=None,  # Auto-detect
        merge_weights=False
    )
    print("โœ… Model loaded successfully!\n")

    results = {
        "training_samples": [],
        "test_samples": []
    }

    # Test training samples
    print("=" * 80)
    print("๐Ÿ“š TESTING TRAINING SAMPLES")
    print("=" * 80)
    train_samples = load_samples(train_dataset, num_samples=2)
    results["training_samples"] = _evaluate_samples(
        model, tokenizer, train_samples, "TRAINING")

    # Test test samples
    print("\n\n" + "=" * 80)
    print("๐Ÿ“š TESTING TEST SAMPLES")
    print("=" * 80)
    test_samples = load_samples(test_dataset, num_samples=2)
    results["test_samples"] = _evaluate_samples(
        model, tokenizer, test_samples, "TEST")

    # Summary
    print("\n\n" + "=" * 80)
    print("๐Ÿ“Š EVALUATION SUMMARY")
    print("=" * 80)

    train_avg_similarity = _average_similarity(results["training_samples"])
    test_avg_similarity = _average_similarity(results["test_samples"])

    print("\n๐Ÿ“ˆ Training Samples:")
    print(f" Average Similarity: {train_avg_similarity:.2f}%")
    print(f" Samples Tested: {len(results['training_samples'])}")
    print("\n๐Ÿ“ˆ Test Samples:")
    print(f" Average Similarity: {test_avg_similarity:.2f}%")
    print(f" Samples Tested: {len(results['test_samples'])}")

    # Overall average ignores a split whose average is 0 (e.g. all errors).
    if train_avg_similarity > 0 and test_avg_similarity > 0:
        overall_avg = (train_avg_similarity + test_avg_similarity) / 2
    else:
        overall_avg = train_avg_similarity if train_avg_similarity > 0 else test_avg_similarity
    print(f"\n๐Ÿ“Š Overall Average Similarity: {overall_avg:.2f}%")

    # Save results
    output_file = script_dir / "evaluation_results.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    print(f"\n๐Ÿ’พ Detailed results saved to: {output_file}")
    print("=" * 80)

    return results


if __name__ == "__main__":
    main()