|
|
|
|
|
""" |
|
|
Test script to evaluate fine-tuned CodeLlama model on training and test samples |
|
|
""" |
|
|
|
|
|
import json |
|
|
import sys |
|
|
import os |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent / "scripts" / "inference")) |
|
|
|
|
|
from inference_codellama import load_local_model, generate_with_local_model |
|
|
|
|
|
def load_samples(dataset_path, num_samples=2): |
|
|
"""Load N samples from dataset""" |
|
|
samples = [] |
|
|
with open(dataset_path, 'r', encoding='utf-8') as f: |
|
|
for i, line in enumerate(f): |
|
|
if i >= num_samples: |
|
|
break |
|
|
if line.strip(): |
|
|
samples.append(json.loads(line)) |
|
|
return samples |
|
|
|
|
|
def extract_instruction_prompt(instruction_text):
    """Return the instruction text unchanged.

    Placeholder hook for stripping a system prompt from the raw
    instruction before inference; currently a pure pass-through.
    """
    return instruction_text
|
|
|
|
|
def extract_code_from_response(text):
    """Pull Verilog source out of a markdown-formatted response.

    Preference order: a closed ```verilog fenced block, then any closed
    generic fenced block, then the whole text stripped of surrounding
    whitespace. Falsy input (None / empty string) is returned untouched.
    """
    if not text:
        return text

    # A fenced block explicitly tagged as verilog.
    tag = '```verilog'
    tag_pos = text.find(tag)
    if tag_pos != -1:
        body_start = tag_pos + len(tag)
        closing = text.find('```', body_start)
        if closing != -1:
            return text[body_start:closing].strip()

    # Any generic fenced block; skip the fence's info line when present.
    fence = text.find('```')
    if fence != -1:
        newline = text.find('\n', fence)
        body_start = fence + 3 if newline == -1 else newline + 1
        closing = text.find('```', body_start)
        if closing != -1:
            return text[body_start:closing].strip()

    # No usable fences: treat the whole response as code.
    return text.strip()
|
|
|
|
|
def compare_code(expected, generated):
    """Compare two code strings, ignoring all whitespace.

    Returns a ``(similarity_percent, description)`` tuple. Similarity is
    the fraction of positions at which the whitespace-stripped strings
    agree character-for-character, relative to the longer string; an
    exact match short-circuits to 100.0.
    """
    def _squash(code):
        # Drop spaces, newlines and tabs so formatting differences don't count.
        return code.strip().translate(str.maketrans('', '', ' \n\t'))

    left = _squash(expected)
    right = _squash(generated)

    if left == right:
        return 100.0, "Perfect match"

    # Positional agreement over the shared prefix (zip stops at the
    # shorter string), normalized by the longer string's length.
    matches = sum(a == b for a, b in zip(left, right))
    longest = max(len(left), len(right))
    similarity = (matches / longest) * 100 if longest > 0 else 0
    return similarity, f"{matches}/{longest} characters match"
|
|
|
|
|
def _average_similarity(records):
    """Return the mean similarity across records that produced a score.

    Returns 0 when no record carries a "similarity" key (e.g. every
    sample errored during inference) — the previous inline expression
    only guarded against an empty record list and could divide by zero.
    """
    scores = [r["similarity"] for r in records if "similarity" in r]
    return sum(scores) / len(scores) if scores else 0


def _evaluate_samples(model, tokenizer, samples, label):
    """Generate and score a response for each sample.

    Args:
        model, tokenizer: Loaded model pair used for generation.
        samples: Parsed dataset records with "instruction"/"response".
        label: Console tag for this split ("TRAINING" or "TEST").

    Returns:
        A list of result records; a failed inference yields a record
        containing only ``sample_num`` and ``error``.
    """
    records = []
    for i, sample in enumerate(samples, 1):
        print(f"\n{'='*80}")
        print(f"{label} SAMPLE {i}/{len(samples)}")
        print(f"{'='*80}")

        instruction = sample.get("instruction", "")
        expected_response = sample.get("response", "")
        expected_code = extract_code_from_response(expected_response)

        print(f"\n📝 Instruction:")
        print(f"{instruction[:200]}..." if len(instruction) > 200 else instruction)

        print(f"\n🎯 Expected Code (first 300 chars):")
        print(expected_code[:300] + "..." if len(expected_code) > 300 else expected_code)

        print(f"\n🤖 Generating response...")
        try:
            generated_response = generate_with_local_model(
                model,
                tokenizer,
                instruction,
                max_new_tokens=800,
                temperature=0.3,
                stream=False
            )

            generated_code = extract_code_from_response(generated_response)

            print(f"\n✅ Generated Code (first 300 chars):")
            print(generated_code[:300] + "..." if len(generated_code) > 300 else generated_code)

            similarity, match_info = compare_code(expected_code, generated_code)

            print(f"\n📊 Comparison:")
            print(f"   Similarity: {similarity:.2f}%")
            print(f"   Match Info: {match_info}")

            records.append({
                "sample_num": i,
                "instruction": instruction[:100] + "..." if len(instruction) > 100 else instruction,
                "expected_code_length": len(expected_code),
                "generated_code_length": len(generated_code),
                "similarity": similarity,
                "match_info": match_info,
                "expected_code": expected_code,
                "generated_code": generated_code,
                "generated_full_response": generated_response,
            })
        except Exception as e:
            print(f"❌ Error during inference: {e}")
            records.append({"sample_num": i, "error": str(e)})
    return records


def main():
    """Evaluate the fine-tuned CodeLlama model on train and test samples.

    Loads the fine-tuned model (with its base model when available),
    runs inference on the first two samples of each dataset split,
    scores the generated code against the reference, prints a summary,
    and writes detailed results to evaluation_results.json next to this
    script.

    Returns:
        The results dict with "training_samples" and "test_samples".
    """
    script_dir = Path(__file__).parent
    model_path = script_dir / "training-outputs" / "codellama-fifo-v1"
    base_model_path = script_dir / "models" / "base-models" / "CodeLlama-7B-Instruct"
    train_dataset = script_dir / "datasets" / "processed" / "split" / "train.jsonl"
    test_dataset = script_dir / "datasets" / "processed" / "split" / "test.jsonl"

    print("=" * 80)
    print("🧪 CODELLAMA FINE-TUNED MODEL EVALUATION")
    print("=" * 80)
    print(f"Model: {model_path}")
    print(f"Base Model: {base_model_path}")
    print("=" * 80)
    print()

    print("📦 Loading model...")
    model, tokenizer = load_local_model(
        str(model_path),
        # Fall back to adapter-only loading when the base model is absent.
        str(base_model_path) if base_model_path.exists() else None,
        use_quantization=None,
        merge_weights=False
    )
    print("✅ Model loaded successfully!\n")

    results = {}

    # The two splits were previously evaluated by two near-identical
    # ~70-line loops; both now share _evaluate_samples.
    print("=" * 80)
    print("📚 TESTING TRAINING SAMPLES")
    print("=" * 80)
    results["training_samples"] = _evaluate_samples(
        model, tokenizer, load_samples(train_dataset, num_samples=2), "TRAINING")

    print("\n\n" + "=" * 80)
    print("🧪 TESTING TEST SAMPLES")
    print("=" * 80)
    results["test_samples"] = _evaluate_samples(
        model, tokenizer, load_samples(test_dataset, num_samples=2), "TEST")

    print("\n\n" + "=" * 80)
    print("📊 EVALUATION SUMMARY")
    print("=" * 80)

    train_avg_similarity = _average_similarity(results["training_samples"])
    test_avg_similarity = _average_similarity(results["test_samples"])

    print(f"\n📚 Training Samples:")
    print(f"   Average Similarity: {train_avg_similarity:.2f}%")
    print(f"   Samples Tested: {len(results['training_samples'])}")

    print(f"\n🧪 Test Samples:")
    print(f"   Average Similarity: {test_avg_similarity:.2f}%")
    print(f"   Samples Tested: {len(results['test_samples'])}")

    # Average only over splits that produced a non-zero score.
    if train_avg_similarity > 0 and test_avg_similarity > 0:
        overall_avg = (train_avg_similarity + test_avg_similarity) / 2
    else:
        overall_avg = train_avg_similarity or test_avg_similarity
    print(f"\n🏆 Overall Average Similarity: {overall_avg:.2f}%")

    output_file = script_dir / "evaluation_results.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    print(f"\n💾 Detailed results saved to: {output_file}")
    print("=" * 80)

    return results


if __name__ == "__main__":
    main()
|
|
|
|
|
|