Prithvik-1 committed on
Commit
a2b3989
·
verified ·
1 Parent(s): 0361c24

Upload test_exact_training_format.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. test_exact_training_format.py +105 -0
test_exact_training_format.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test with EXACT training format to see if model generates correctly
4
+ """
5
+
6
+ import json
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ sys.path.insert(0, str(Path(__file__).parent / "scripts" / "inference"))
11
+
12
+ from inference_codellama import load_local_model
13
+ import torch
14
+ from transformers import AutoTokenizer
15
+
16
def main():
    """Sanity-check the fine-tuned model with the EXACT training prompt format.

    Loads the first sample from the training JSONL, builds the prompt exactly
    as it appeared during training (instruction + EOS token, so the model is
    expected to continue with the response), generates greedily, and prints a
    quick heuristic analysis of whether the output looks like Verilog code.

    No parameters; no return value. Exits with whatever exceptions arise from
    missing files or model-loading failures.
    """
    script_dir = Path(__file__).parent
    model_path = script_dir / "training-outputs" / "codellama-fifo-v1"
    base_model_path = script_dir / "models" / "base-models" / "CodeLlama-7B-Instruct"
    train_dataset = script_dir / "datasets" / "processed" / "split" / "train.jsonl"

    print("=" * 80)
    print("🧪 TESTING WITH EXACT TRAINING FORMAT")
    print("=" * 80)

    # Load only the first training sample (one JSON object per line).
    with open(train_dataset, 'r', encoding="utf-8") as f:
        sample = json.loads(f.readline())

    instruction = sample["instruction"]
    expected_response = sample["response"]

    print(f"\n📝 Instruction ({len(instruction)} chars):")
    print(instruction[:300] + "...")

    print(f"\n🎯 Expected Response ({len(expected_response)} chars):")
    print(expected_response[:300] + "...")

    # Load adapter + (optionally) base model weights via the project helper.
    print("\n📦 Loading model...")
    model, tokenizer = load_local_model(
        str(model_path),
        str(base_model_path) if base_model_path.exists() else None
    )

    # EXACT training format: instruction + EOS (model continues with response).
    prompt = f"{instruction}{tokenizer.eos_token}"

    print(f"\n🔍 Prompt format (EXACT training format):")
    print(f"   Format: instruction + EOS")
    print(f"   Length: {len(prompt)} chars")
    print()

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1536).to(model.device)

    print(f"📊 Tokenized: {inputs['input_ids'].shape[1]} tokens")
    # Greedy decoding: temperature is NOT passed because `do_sample=False`
    # ignores it (transformers warns if both are given).
    print(f"\n🤖 Generating (greedy decoding)...")
    print("=" * 80)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,
            do_sample=False,  # Greedy decoding
            repetition_penalty=1.2,
            # BUGFIX: compare against None, not truthiness — a valid pad
            # token id of 0 is falsy and would wrongly fall back to EOS.
            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens (strip the prompt prefix).
    input_length = inputs['input_ids'].shape[1]
    generated_ids = outputs[0][input_length:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=False)

    # Trim a trailing EOS token if generation stopped on it.
    if generated_text.endswith(tokenizer.eos_token):
        generated_text = generated_text[:-len(tokenizer.eos_token)].rstrip()

    print("\n" + "=" * 80)
    print("✅ GENERATED OUTPUT:")
    print("=" * 80)
    print(generated_text)
    print("=" * 80)

    # Cheap heuristics for "did it produce Verilog?" — keyword presence only.
    has_module = "module" in generated_text.lower()
    has_endmodule = "endmodule" in generated_text.lower()
    has_verilog = "verilog" in generated_text.lower() or "```" in generated_text

    print(f"\n📊 Analysis:")
    print(f"   Contains 'module': {has_module}")
    print(f"   Contains 'endmodule': {has_endmodule}")
    print(f"   Contains 'verilog': {has_verilog}")
    print(f"   Length: {len(generated_text)} chars")

    if has_module and has_endmodule:
        print(f"   ✅ STATUS: Generated Verilog code!")
    elif has_module:
        print(f"   ⚠️  STATUS: Partial code")
    else:
        print(f"   ❌ STATUS: Not generating code")
103
+ if __name__ == "__main__":
104
+ main()
105
+