Prithvik-1 committed on
Commit
884c533
Β·
verified Β·
1 Parent(s): 6045a16

Upload test_samples.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. test_samples.py +272 -0
test_samples.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script to evaluate fine-tuned CodeLlama model on training and test samples
4
+ """
5
+
6
+ import json
7
+ import sys
8
+ import os
9
+ from pathlib import Path
10
+
11
+ # Add scripts to path
12
+ sys.path.insert(0, str(Path(__file__).parent / "scripts" / "inference"))
13
+
14
+ from inference_codellama import load_local_model, generate_with_local_model
15
+
16
def load_samples(dataset_path, num_samples=2):
    """Load up to *num_samples* JSON records from a JSONL dataset file.

    Args:
        dataset_path: Path to a .jsonl file (one JSON object per line).
        num_samples: Maximum number of records to return.

    Returns:
        A list of parsed dicts, at most ``num_samples`` long.
    """
    samples = []
    with open(dataset_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Stop once enough real records are collected. The original
            # counted every physical line (via enumerate), so blank lines
            # consumed the quota and too few samples were returned from
            # files with leading or interior blank lines.
            if len(samples) >= num_samples:
                break
            if line.strip():
                samples.append(json.loads(line))
    return samples
26
+
27
def extract_instruction_prompt(instruction_text):
    """Return the instruction text unchanged.

    The dataset's ``instruction`` field already bundles the system prompt
    together with the task description, which is the format CodeLlama
    expects — so no stripping or reformatting is performed here. The
    function exists as a seam in case prompt extraction is needed later.
    """
    return instruction_text
32
+
33
def extract_code_from_response(text):
    """Extract Verilog code from markdown fenced code blocks.

    Prefers a complete ```verilog-tagged fence; falls back to the first
    generic ``` fence pair; returns the stripped text as-is when no
    complete fence pair exists. Empty or None input is returned unchanged.

    Args:
        text: Model response text (may be None or empty).

    Returns:
        The extracted code (stripped), or the stripped original text.
    """
    if not text:
        return text

    # Preferred: an explicitly verilog-tagged fence with a closing fence.
    if '```verilog' in text:
        start = text.find('```verilog') + len('```verilog')
        end = text.find('```', start)
        if end != -1:
            return text[start:end].strip()

    # Fallback: first generic fence pair. (The original guarded this with
    # a redundant `if start != -1` — unreachable-false, since the
    # membership test above guarantees find() succeeds; removed.)
    if '```' in text:
        start = text.find('```')
        body_start = text.find('\n', start)
        if body_start == -1:
            body_start = start + 3   # fence with no newline: skip the backticks only
        else:
            body_start += 1          # code begins on the line after the fence
        end = text.find('```', body_start)
        if end != -1:
            return text[body_start:end].strip()

    # No complete fence pair found: return the whole text, trimmed.
    return text.strip()
62
+
63
def compare_code(expected, generated):
    """Compare two code strings, ignoring spaces, tabs, and newlines.

    Returns a ``(similarity_percent, description)`` tuple. Identical
    normalized strings score 100.0; otherwise similarity is the fraction
    of positionally matching characters over the longer string's length.
    """
    def normalize(code):
        # Drop spaces, newlines, and tabs so pure formatting differences
        # don't affect the score.
        flat = code.strip()
        for ws in (' ', '\n', '\t'):
            flat = flat.replace(ws, '')
        return flat

    left = normalize(expected)
    right = normalize(generated)

    if left == right:
        return 100.0, "Perfect match"

    # Position-by-position character comparison (simple, not alignment-aware).
    matches = sum(1 for a, b in zip(left, right) if a == b)
    longest = max(len(left), len(right))
    similarity = (matches / longest) * 100 if longest > 0 else 0

    return similarity, f"{matches}/{longest} characters match"
81
+
82
def _evaluate_samples(model, tokenizer, samples, label):
    """Generate code for each sample and score it against the reference.

    Prints per-sample progress; returns one result dict per sample (a
    ``{"sample_num", "error"}`` dict when inference raises). *label* is
    the section name used in the printed headers ("TRAINING" / "TEST").
    """
    entries = []
    total = len(samples)
    for i, sample in enumerate(samples, 1):
        print(f"\n{'='*80}")
        print(f"{label} SAMPLE {i}/{total}")
        print(f"{'='*80}")

        instruction = sample.get("instruction", "")
        expected_response = sample.get("response", "")
        expected_code = extract_code_from_response(expected_response)

        print(f"\nπŸ“ Instruction:")
        print(f"{instruction[:200]}..." if len(instruction) > 200 else instruction)

        print(f"\n🎯 Expected Code (first 300 chars):")
        print(expected_code[:300] + "..." if len(expected_code) > 300 else expected_code)

        print(f"\nπŸ€– Generating response...")
        try:
            generated_response = generate_with_local_model(
                model,
                tokenizer,
                instruction,
                max_new_tokens=800,
                temperature=0.3,
                stream=False
            )
            generated_code = extract_code_from_response(generated_response)

            print(f"\nβœ… Generated Code (first 300 chars):")
            print(generated_code[:300] + "..." if len(generated_code) > 300 else generated_code)

            similarity, match_info = compare_code(expected_code, generated_code)

            print(f"\nπŸ“Š Comparison:")
            print(f"   Similarity: {similarity:.2f}%")
            print(f"   Match Info: {match_info}")

            entries.append({
                "sample_num": i,
                "instruction": instruction[:100] + "..." if len(instruction) > 100 else instruction,
                "expected_code_length": len(expected_code),
                "generated_code_length": len(generated_code),
                "similarity": similarity,
                "match_info": match_info,
                "expected_code": expected_code,
                "generated_code": generated_code,
                "generated_full_response": generated_response
            })
        except Exception as e:
            # Record the failure but keep evaluating the remaining samples.
            print(f"❌ Error during inference: {e}")
            entries.append({
                "sample_num": i,
                "error": str(e)
            })
    return entries


def _average_similarity(entries):
    """Mean of the 'similarity' field over scored entries; 0 when none.

    Guards against ZeroDivisionError when every sample errored: the
    original checked only that the results list was non-empty but then
    divided by the count of entries that actually carried a similarity.
    """
    scored = [s["similarity"] for s in entries if "similarity" in s]
    return sum(scored) / len(scored) if scored else 0


def main():
    """Evaluate the fine-tuned CodeLlama model on a few samples.

    Loads the fine-tuned model (plus base model when present), generates
    responses for the first two training and first two test samples,
    scores each against the reference code with a character-level
    similarity metric, prints a summary, and writes detailed results to
    ``evaluation_results.json`` next to this script.

    Returns:
        The results dict with ``training_samples`` and ``test_samples``.
    """
    # Paths (all relative to this script's directory).
    script_dir = Path(__file__).parent
    model_path = script_dir / "training-outputs" / "codellama-fifo-v1"
    base_model_path = script_dir / "models" / "base-models" / "CodeLlama-7B-Instruct"
    train_dataset = script_dir / "datasets" / "processed" / "split" / "train.jsonl"
    test_dataset = script_dir / "datasets" / "processed" / "split" / "test.jsonl"

    print("=" * 80)
    print("πŸ§ͺ CODELLAMA FINE-TUNED MODEL EVALUATION")
    print("=" * 80)
    print(f"Model: {model_path}")
    print(f"Base Model: {base_model_path}")
    print("=" * 80)
    print()

    # Load model — the base model path is optional and only passed when
    # it exists on disk.
    print("πŸ“¦ Loading model...")
    model, tokenizer = load_local_model(
        str(model_path),
        str(base_model_path) if base_model_path.exists() else None,
        use_quantization=None,  # Auto-detect
        merge_weights=False
    )
    print("βœ… Model loaded successfully!\n")

    results = {
        "training_samples": [],
        "test_samples": []
    }

    # Training samples: a sanity check — the model has seen these.
    print("=" * 80)
    print("πŸ“š TESTING TRAINING SAMPLES")
    print("=" * 80)
    train_samples = load_samples(train_dataset, num_samples=2)
    results["training_samples"] = _evaluate_samples(
        model, tokenizer, train_samples, "TRAINING")

    # Held-out test samples: a generalization check.
    print("\n\n" + "=" * 80)
    print("πŸ“š TESTING TEST SAMPLES")
    print("=" * 80)
    test_samples = load_samples(test_dataset, num_samples=2)
    results["test_samples"] = _evaluate_samples(
        model, tokenizer, test_samples, "TEST")

    # Summary
    print("\n\n" + "=" * 80)
    print("πŸ“Š EVALUATION SUMMARY")
    print("=" * 80)

    train_avg_similarity = _average_similarity(results["training_samples"])
    test_avg_similarity = _average_similarity(results["test_samples"])

    print(f"\nπŸ“ˆ Training Samples:")
    print(f"   Average Similarity: {train_avg_similarity:.2f}%")
    print(f"   Samples Tested: {len(results['training_samples'])}")

    print(f"\nπŸ“ˆ Test Samples:")
    print(f"   Average Similarity: {test_avg_similarity:.2f}%")
    print(f"   Samples Tested: {len(results['test_samples'])}")

    # NOTE: a legitimate 0% average is treated the same as "no data" here,
    # matching the original behavior.
    if train_avg_similarity > 0 and test_avg_similarity > 0:
        overall_avg = (train_avg_similarity + test_avg_similarity) / 2
    else:
        overall_avg = train_avg_similarity if train_avg_similarity > 0 else test_avg_similarity
    print(f"\nπŸ“Š Overall Average Similarity: {overall_avg:.2f}%")

    # Persist full results (including complete generated responses).
    output_file = script_dir / "evaluation_results.json"
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\nπŸ’Ύ Detailed results saved to: {output_file}")
    print("=" * 80)

    return results
269
+
270
+ if __name__ == "__main__":
271
+ main()
272
+