stmasson committed
Commit d7f1905 · verified · Parent(s): 188cdd5

Upload eval_comparison.py with huggingface_hub

Files changed (1)
  1. eval_comparison.py +154 -0
eval_comparison.py ADDED
@@ -0,0 +1,154 @@
# /// script
# dependencies = ["transformers>=4.46.0", "torch", "peft", "bitsandbytes", "accelerate", "datasets", "human-eval", "tqdm"]
# ///
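# The block above is PEP 723 inline script metadata: a runner that supports
# it (e.g. `uv run eval_comparison.py`) will install the listed dependencies
# into a throwaway environment before executing the script.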

import gc
import json
import os
import tempfile

import torch
from human_eval.data import read_problems, write_jsonl
from human_eval.evaluation import evaluate_functional_correctness
from peft import PeftModel
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

print("=" * 60)
print("EVALUATION: Base vs Fine-tuned on HumanEval")
print("=" * 60)

# Configuration
BASE_MODEL = "mistralai/Devstral-Small-2505"
FINETUNED_MODEL = "stmasson/alizee-coder-devstral-1-small"
NUM_SAMPLES = 1  # samples per problem
TEMPERATURE = 0.1
MAX_NEW_TOKENS = 512
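
# Note: with NUM_SAMPLES = 1 and a non-zero temperature, the pass@1 estimate
# is unbiased but noisy; using TEMPERATURE = 0 (greedy decoding) or raising
# NUM_SAMPLES should give a more stable base-vs-fine-tuned comparison, at the
# cost of runtime.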

# 4-bit quantization for memory efficiency
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
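
# Both models are loaded under the same NF4 quantization, so the comparison
# stays like-for-like; absolute scores may still differ slightly from any
# full-precision (bf16) numbers published for the same checkpoints.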

def load_model(model_name, adapter_name=None):
    """Load model with optional LoRA adapter"""
    print(f"\nLoading model: {model_name}")
    if adapter_name:
        print(f"With adapter: {adapter_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )

    if adapter_name:
        model = PeftModel.from_pretrained(model, adapter_name)
        model = model.merge_and_unload()

    model.eval()
    return model, tokenizer
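
# Note: merging LoRA weights into a bitsandbytes 4-bit base with
# merge_and_unload() works only on sufficiently recent peft releases; if it
# raises on your setup, a safe fallback is to skip the merge and run
# inference through the PeftModel wrapper directly.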

def generate_completion(model, tokenizer, prompt, max_new_tokens=MAX_NEW_TOKENS):
    """Generate code completion"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=TEMPERATURE,
            do_sample=TEMPERATURE > 0,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    completion = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

    # Extract code until the function ends or a stop token appears
    stop_tokens = ["\ndef ", "\nclass ", "\n#", "\nif __name__", "\n```"]
    for stop in stop_tokens:
        if stop in completion:
            completion = completion[:completion.index(stop)]

    return completion
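
# HumanEval prompts end mid-function, so the model only needs to emit the
# function body; truncating at the first new top-level construct keeps stray
# trailing text (extra functions, markdown fences) out of the scored
# completion.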

def evaluate_model(model, tokenizer, problems, model_name):
    """Evaluate model on HumanEval"""
    print(f"\nEvaluating {model_name}...")
    samples = []

    for task_id, problem in tqdm(problems.items(), desc=f"Generating ({model_name})"):
        prompt = problem["prompt"]

        for _ in range(NUM_SAMPLES):
            completion = generate_completion(model, tokenizer, prompt)
            samples.append({
                "task_id": task_id,
                "completion": completion,
            })

    # Write samples and evaluate
    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
        sample_file = f.name
    write_jsonl(sample_file, samples)

    results = evaluate_functional_correctness(sample_file, k=[1])
    os.unlink(sample_file)

    return results
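
# Heads-up: the upstream human-eval package ships with sandboxed code
# execution disabled; per its README you must deliberately uncomment the
# exec call in human_eval/execution.py before evaluate_functional_correctness
# will actually score anything. It also leaves a "<sample_file>_results.jsonl"
# report next to the samples file, which this script does not clean up.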

def main():
    # Load HumanEval problems
    print("\nLoading HumanEval problems...")
    problems = read_problems()
    print(f"Total problems: {len(problems)}")

    results = {}

    # Evaluate base model
    print("\n" + "=" * 60)
    print("EVALUATING BASE MODEL")
    print("=" * 60)
    base_model, base_tokenizer = load_model(BASE_MODEL)
    results["base"] = evaluate_model(base_model, base_tokenizer, problems, "Devstral-Small (Base)")
    print(f"\nBase Model Results: {results['base']}")

    # Free GPU memory before loading the second model; collect dropped
    # references first so the cached allocations can actually be released
    del base_model
    gc.collect()
    torch.cuda.empty_cache()

    # Evaluate fine-tuned model
    print("\n" + "=" * 60)
    print("EVALUATING FINE-TUNED MODEL")
    print("=" * 60)
    ft_model, ft_tokenizer = load_model(BASE_MODEL, FINETUNED_MODEL)
    results["finetuned"] = evaluate_model(ft_model, ft_tokenizer, problems, "Alizee-Coder (Fine-tuned)")
    print(f"\nFine-tuned Model Results: {results['finetuned']}")

    # Summary
    print("\n" + "=" * 60)
    print("COMPARISON SUMMARY")
    print("=" * 60)
    print(f"\n{'Model':<40} {'pass@1':>10}")
    print("-" * 52)
    print(f"{'Devstral-Small-2505 (Base)':<40} {results['base']['pass@1'] * 100:>9.1f}%")
    print(f"{'Alizee-Coder-Devstral (Fine-tuned)':<40} {results['finetuned']['pass@1'] * 100:>9.1f}%")

    # Difference in percentage points, not a relative improvement
    improvement = (results["finetuned"]["pass@1"] - results["base"]["pass@1"]) * 100
    print(f"\n{'Improvement:':<40} {improvement:>+9.1f} pp")

    # Save results; cast to float first, since human-eval returns numpy
    # scalars, which the stdlib json encoder cannot serialize
    serializable = {name: {k: float(v) for k, v in r.items()} for name, r in results.items()}
    with open("eval_results.json", "w") as f:
        json.dump(serializable, f, indent=2)
    print("\nResults saved to eval_results.json")


if __name__ == "__main__":
    main()