# codellama-fine-tuning / test_samples.py
# Uploaded by Prithvik-1 via huggingface_hub (commit 884c533, verified)
#!/usr/bin/env python3
"""
Test script to evaluate fine-tuned CodeLlama model on training and test samples
"""
import json
import sys
import os
from pathlib import Path
# Add scripts to path
sys.path.insert(0, str(Path(__file__).parent / "scripts" / "inference"))
from inference_codellama import load_local_model, generate_with_local_model
def load_samples(dataset_path, num_samples=2):
    """Read up to *num_samples* leading lines of a JSONL file as records.

    Blank lines are skipped but still count toward the line budget, so a
    blank among the first *num_samples* lines reduces the records returned.
    """
    records = []
    with open(dataset_path, 'r', encoding='utf-8') as handle:
        for index, raw_line in enumerate(handle):
            if index >= num_samples:
                break
            stripped = raw_line.strip()
            if stripped:
                records.append(json.loads(stripped))
    return records
def extract_instruction_prompt(instruction_text):
    """Return the prompt to feed the model, unchanged.

    The dataset's instruction field already bundles the system prompt with
    the task, which is exactly what CodeLlama expects, so this is a
    deliberate passthrough kept as an extension point.
    """
    return instruction_text
def extract_code_from_response(text):
    """Extract Verilog code from markdown fenced code blocks.

    Prefers a ```verilog fence; otherwise falls back to the first generic
    fence (skipping any language tag on the opening line); otherwise
    returns the stripped text unchanged.

    Fix over the original: an *unterminated* fence — common when the model
    hits its max_new_tokens limit mid-block — is now extracted up to the
    end of the text instead of being returned with the fence markers still
    attached. Also removes a dead `start != -1` check (always true after
    the `'```' in text` membership test).
    """
    if not text:
        return text
    marker = '```verilog'
    if marker in text:
        start = text.find(marker) + len(marker)
        end = text.find('```', start)
        # end == -1 means the closing fence was cut off by truncation.
        return (text[start:end] if end != -1 else text[start:]).strip()
    if '```' in text:
        fence = text.find('```')
        # Code begins after the opening fence line (which may carry a
        # language tag); if there is no newline, right after the backticks.
        newline = text.find('\n', fence)
        body_start = fence + 3 if newline == -1 else newline + 1
        end = text.find('```', body_start)
        return (text[body_start:end] if end != -1 else text[body_start:]).strip()
    return text.strip()
def compare_code(expected, generated):
    """Whitespace-insensitive comparison of two code strings.

    Returns (similarity_percent, description). Similarity is the count of
    positionally-matching characters divided by the longer string's length
    — a deliberately simple heuristic, not an edit distance.
    """
    def _squash(code):
        # Remove all whitespace so formatting differences don't count.
        return code.strip().replace(' ', '').replace('\n', '').replace('\t', '')

    left = _squash(expected)
    right = _squash(generated)
    if left == right:
        return 100.0, "Perfect match"
    # zip stops at the shorter string, i.e. only aligned positions compare.
    matched = sum(1 for a, b in zip(left, right) if a == b)
    longest = max(len(left), len(right))
    similarity = (matched / longest) * 100 if longest > 0 else 0
    return similarity, f"{matched}/{longest} characters match"
def _avg_similarity(records):
    """Mean similarity over records that produced a score (errors excluded).

    Fix over the original: the inline average divided by the count of
    scored records while only guarding against an empty list, so a run
    where every sample errored raised ZeroDivisionError.
    """
    scores = [r["similarity"] for r in records if "similarity" in r]
    return sum(scores) / len(scores) if scores else 0


def _evaluate_samples(model, tokenizer, samples, label):
    """Run inference on each sample, print a report, return per-sample dicts.

    label: banner text, e.g. "TRAINING" or "TEST".
    A sample that raises during inference is recorded as
    {"sample_num": ..., "error": ...} instead of aborting the run.
    """
    records = []
    total = len(samples)
    for i, sample in enumerate(samples, 1):
        print(f"\n{'=' * 80}")
        print(f"{label} SAMPLE {i}/{total}")
        print(f"{'=' * 80}")
        instruction = sample.get("instruction", "")
        expected_code = extract_code_from_response(sample.get("response", ""))
        print("\n📝 Instruction:")
        print(f"{instruction[:200]}..." if len(instruction) > 200 else instruction)
        print("\n🎯 Expected Code (first 300 chars):")
        print(expected_code[:300] + "..." if len(expected_code) > 300 else expected_code)
        print("\n🤖 Generating response...")
        try:
            generated_response = generate_with_local_model(
                model,
                tokenizer,
                instruction,
                max_new_tokens=800,
                temperature=0.3,
                stream=False,
            )
            generated_code = extract_code_from_response(generated_response)
            print("\n✅ Generated Code (first 300 chars):")
            print(generated_code[:300] + "..." if len(generated_code) > 300 else generated_code)
            similarity, match_info = compare_code(expected_code, generated_code)
            print("\n📊 Comparison:")
            print(f"   Similarity: {similarity:.2f}%")
            print(f"   Match Info: {match_info}")
            records.append({
                "sample_num": i,
                "instruction": instruction[:100] + "..." if len(instruction) > 100 else instruction,
                "expected_code_length": len(expected_code),
                "generated_code_length": len(generated_code),
                "similarity": similarity,
                "match_info": match_info,
                "expected_code": expected_code,
                "generated_code": generated_code,
                "generated_full_response": generated_response,
            })
        except Exception as e:  # keep the run alive; record the failure
            print(f"❌ Error during inference: {e}")
            records.append({"sample_num": i, "error": str(e)})
    return records


def main():
    """Evaluate the fine-tuned model on leading train/test samples.

    Loads the LoRA-tuned model, runs inference on two samples from each
    split, compares generated code against the references, prints a
    summary, and writes evaluation_results.json next to this script.

    NOTE: the original duplicated the per-sample loop verbatim for the two
    splits; it is now shared via _evaluate_samples. Mojibake emoji from a
    Latin-1 mis-decode (e.g. "πŸ§ͺ") have been restored to the intended
    UTF-8 characters.
    """
    # Paths (all relative to this script's directory)
    script_dir = Path(__file__).parent
    model_path = script_dir / "training-outputs" / "codellama-fifo-v1"
    base_model_path = script_dir / "models" / "base-models" / "CodeLlama-7B-Instruct"
    train_dataset = script_dir / "datasets" / "processed" / "split" / "train.jsonl"
    test_dataset = script_dir / "datasets" / "processed" / "split" / "test.jsonl"

    print("=" * 80)
    print("🧪 CODELLAMA FINE-TUNED MODEL EVALUATION")
    print("=" * 80)
    print(f"Model: {model_path}")
    print(f"Base Model: {base_model_path}")
    print("=" * 80)
    print()

    # Load model (base path is optional: only passed when it exists locally)
    print("📦 Loading model...")
    model, tokenizer = load_local_model(
        str(model_path),
        str(base_model_path) if base_model_path.exists() else None,
        use_quantization=None,  # Auto-detect
        merge_weights=False,
    )
    print("✅ Model loaded successfully!\n")

    results = {}

    print("=" * 80)
    print("📚 TESTING TRAINING SAMPLES")
    print("=" * 80)
    results["training_samples"] = _evaluate_samples(
        model, tokenizer, load_samples(train_dataset, num_samples=2), "TRAINING")

    print("\n\n" + "=" * 80)
    print("📚 TESTING TEST SAMPLES")
    print("=" * 80)
    results["test_samples"] = _evaluate_samples(
        model, tokenizer, load_samples(test_dataset, num_samples=2), "TEST")

    # Summary
    print("\n\n" + "=" * 80)
    print("📊 EVALUATION SUMMARY")
    print("=" * 80)
    train_avg_similarity = _avg_similarity(results["training_samples"])
    test_avg_similarity = _avg_similarity(results["test_samples"])
    print("\n📈 Training Samples:")
    print(f"   Average Similarity: {train_avg_similarity:.2f}%")
    print(f"   Samples Tested: {len(results['training_samples'])}")
    print("\n📈 Test Samples:")
    print(f"   Average Similarity: {test_avg_similarity:.2f}%")
    print(f"   Samples Tested: {len(results['test_samples'])}")
    # Quirk preserved from the original: when only one split scored, report
    # that split's average rather than halving it against a zero.
    if train_avg_similarity > 0 and test_avg_similarity > 0:
        overall_avg = (train_avg_similarity + test_avg_similarity) / 2
    else:
        overall_avg = train_avg_similarity if train_avg_similarity > 0 else test_avg_similarity
    print(f"\n📊 Overall Average Similarity: {overall_avg:.2f}%")

    # Save results (explicit encoding so the write is platform-independent)
    output_file = script_dir / "evaluation_results.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    print(f"\n💾 Detailed results saved to: {output_file}")
    print("=" * 80)
    return results
# Run the evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    main()