# codellama-fine-tuning / test_samples.py
# Uploaded by Prithvik-1 via huggingface_hub (commit 884c533, verified)
#!/usr/bin/env python3
"""
Test script to evaluate fine-tuned CodeLlama model on training and test samples
"""
import json
import sys
import os
from pathlib import Path
# Add scripts to path
sys.path.insert(0, str(Path(__file__).parent / "scripts" / "inference"))
from inference_codellama import load_local_model, generate_with_local_model
def load_samples(dataset_path, num_samples=2):
    """Read up to *num_samples* leading lines of a JSONL file as records.

    Blank lines are skipped but still count toward the line budget, so a
    blank among the first *num_samples* lines reduces the records returned.
    """
    records = []
    with open(dataset_path, 'r', encoding='utf-8') as handle:
        for index, raw_line in enumerate(handle):
            if index >= num_samples:
                break
            stripped = raw_line.strip()
            if stripped:
                records.append(json.loads(stripped))
    return records
def extract_instruction_prompt(instruction_text):
    """Return the prompt to feed the model, unchanged.

    The dataset's instruction field already bundles the system prompt with
    the task, which is exactly what CodeLlama expects, so this is a
    deliberate passthrough kept as an extension point.
    """
    return instruction_text
def extract_code_from_response(text):
    """Extract Verilog code from markdown fenced code blocks.

    Prefers a ```verilog fence; otherwise falls back to the first generic
    fence (skipping any language tag on the opening line); otherwise
    returns the stripped text unchanged.

    Fix over the original: an *unterminated* fence — common when the model
    hits its max_new_tokens limit mid-block — is now extracted up to the
    end of the text instead of being returned with the fence markers still
    attached. Also removes a dead `start != -1` check (always true after
    the `'```' in text` membership test).
    """
    if not text:
        return text
    marker = '```verilog'
    if marker in text:
        start = text.find(marker) + len(marker)
        end = text.find('```', start)
        # end == -1 means the closing fence was cut off by truncation.
        return (text[start:end] if end != -1 else text[start:]).strip()
    if '```' in text:
        fence = text.find('```')
        # Code begins after the opening fence line (which may carry a
        # language tag); if there is no newline, right after the backticks.
        newline = text.find('\n', fence)
        body_start = fence + 3 if newline == -1 else newline + 1
        end = text.find('```', body_start)
        return (text[body_start:end] if end != -1 else text[body_start:]).strip()
    return text.strip()
def compare_code(expected, generated):
    """Whitespace-insensitive comparison of two code strings.

    Returns (similarity_percent, description). Similarity is the count of
    positionally-matching characters divided by the longer string's length
    — a deliberately simple heuristic, not an edit distance.
    """
    def _squash(code):
        # Remove all whitespace so formatting differences don't count.
        return code.strip().replace(' ', '').replace('\n', '').replace('\t', '')

    left = _squash(expected)
    right = _squash(generated)
    if left == right:
        return 100.0, "Perfect match"
    # zip stops at the shorter string, i.e. only aligned positions compare.
    matched = sum(1 for a, b in zip(left, right) if a == b)
    longest = max(len(left), len(right))
    similarity = (matched / longest) * 100 if longest > 0 else 0
    return similarity, f"{matched}/{longest} characters match"
def _avg_similarity(records):
    """Mean similarity over records that produced a score (errors excluded).

    Fix over the original: the inline average divided by the count of
    scored records while only guarding against an empty list, so a run
    where every sample errored raised ZeroDivisionError.
    """
    scores = [r["similarity"] for r in records if "similarity" in r]
    return sum(scores) / len(scores) if scores else 0


def _evaluate_samples(model, tokenizer, samples, label):
    """Run inference on each sample, print a report, return per-sample dicts.

    label: banner text, e.g. "TRAINING" or "TEST".
    A sample that raises during inference is recorded as
    {"sample_num": ..., "error": ...} instead of aborting the run.
    """
    records = []
    total = len(samples)
    for i, sample in enumerate(samples, 1):
        print(f"\n{'=' * 80}")
        print(f"{label} SAMPLE {i}/{total}")
        print(f"{'=' * 80}")
        instruction = sample.get("instruction", "")
        expected_code = extract_code_from_response(sample.get("response", ""))
        print("\n📝 Instruction:")
        print(f"{instruction[:200]}..." if len(instruction) > 200 else instruction)
        print("\n🎯 Expected Code (first 300 chars):")
        print(expected_code[:300] + "..." if len(expected_code) > 300 else expected_code)
        print("\n🤖 Generating response...")
        try:
            generated_response = generate_with_local_model(
                model,
                tokenizer,
                instruction,
                max_new_tokens=800,
                temperature=0.3,
                stream=False,
            )
            generated_code = extract_code_from_response(generated_response)
            print("\n✅ Generated Code (first 300 chars):")
            print(generated_code[:300] + "..." if len(generated_code) > 300 else generated_code)
            similarity, match_info = compare_code(expected_code, generated_code)
            print("\n📊 Comparison:")
            print(f"   Similarity: {similarity:.2f}%")
            print(f"   Match Info: {match_info}")
            records.append({
                "sample_num": i,
                "instruction": instruction[:100] + "..." if len(instruction) > 100 else instruction,
                "expected_code_length": len(expected_code),
                "generated_code_length": len(generated_code),
                "similarity": similarity,
                "match_info": match_info,
                "expected_code": expected_code,
                "generated_code": generated_code,
                "generated_full_response": generated_response,
            })
        except Exception as e:  # keep the run alive; record the failure
            print(f"❌ Error during inference: {e}")
            records.append({"sample_num": i, "error": str(e)})
    return records


def main():
    """Evaluate the fine-tuned model on leading train/test samples.

    Loads the LoRA-tuned model, runs inference on two samples from each
    split, compares generated code against the references, prints a
    summary, and writes evaluation_results.json next to this script.

    NOTE: the original duplicated the per-sample loop verbatim for the two
    splits; it is now shared via _evaluate_samples. Mojibake emoji from a
    Latin-1 mis-decode (e.g. "πŸ§ͺ") have been restored to the intended
    UTF-8 characters.
    """
    # Paths (all relative to this script's directory)
    script_dir = Path(__file__).parent
    model_path = script_dir / "training-outputs" / "codellama-fifo-v1"
    base_model_path = script_dir / "models" / "base-models" / "CodeLlama-7B-Instruct"
    train_dataset = script_dir / "datasets" / "processed" / "split" / "train.jsonl"
    test_dataset = script_dir / "datasets" / "processed" / "split" / "test.jsonl"

    print("=" * 80)
    print("🧪 CODELLAMA FINE-TUNED MODEL EVALUATION")
    print("=" * 80)
    print(f"Model: {model_path}")
    print(f"Base Model: {base_model_path}")
    print("=" * 80)
    print()

    # Load model (base path is optional: only passed when it exists locally)
    print("📦 Loading model...")
    model, tokenizer = load_local_model(
        str(model_path),
        str(base_model_path) if base_model_path.exists() else None,
        use_quantization=None,  # Auto-detect
        merge_weights=False,
    )
    print("✅ Model loaded successfully!\n")

    results = {}

    print("=" * 80)
    print("📚 TESTING TRAINING SAMPLES")
    print("=" * 80)
    results["training_samples"] = _evaluate_samples(
        model, tokenizer, load_samples(train_dataset, num_samples=2), "TRAINING")

    print("\n\n" + "=" * 80)
    print("📚 TESTING TEST SAMPLES")
    print("=" * 80)
    results["test_samples"] = _evaluate_samples(
        model, tokenizer, load_samples(test_dataset, num_samples=2), "TEST")

    # Summary
    print("\n\n" + "=" * 80)
    print("📊 EVALUATION SUMMARY")
    print("=" * 80)
    train_avg_similarity = _avg_similarity(results["training_samples"])
    test_avg_similarity = _avg_similarity(results["test_samples"])
    print("\n📈 Training Samples:")
    print(f"   Average Similarity: {train_avg_similarity:.2f}%")
    print(f"   Samples Tested: {len(results['training_samples'])}")
    print("\n📈 Test Samples:")
    print(f"   Average Similarity: {test_avg_similarity:.2f}%")
    print(f"   Samples Tested: {len(results['test_samples'])}")
    # Quirk preserved from the original: when only one split scored, report
    # that split's average rather than halving it against a zero.
    if train_avg_similarity > 0 and test_avg_similarity > 0:
        overall_avg = (train_avg_similarity + test_avg_similarity) / 2
    else:
        overall_avg = train_avg_similarity if train_avg_similarity > 0 else test_avg_similarity
    print(f"\n📊 Overall Average Similarity: {overall_avg:.2f}%")

    # Save results (explicit encoding so the write is platform-independent)
    output_file = script_dir / "evaluation_results.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    print(f"\n💾 Detailed results saved to: {output_file}")
    print("=" * 80)
    return results
# Run the evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    main()