Upload reformat_dataset_for_codellama.py with huggingface_hub
Browse files
reformat_dataset_for_codellama.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Reformat dataset to use CodeLlama chat template format
|
| 4 |
+
CodeLlama-Instruct expects: <s>[INST] <<SYS>>...<</SYS>> User [/INST] Response </s>
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from transformers import AutoTokenizer
|
| 11 |
+
|
| 12 |
+
def extract_system_and_user(instruction: str):
    """Split a combined instruction into a system prompt and a user message.

    The instruction is expected to hold a system prompt and a task
    description separated by a blank line.  The text before the first blank
    line is accepted as the system prompt only when that text itself contains
    one of the known role markers; a marker that appears only later in the
    task description does not count.

    Args:
        instruction: Raw instruction text from a dataset record.

    Returns:
        A ``(system_message, user_message)`` tuple.  When a system prompt is
        recognised, both parts are whitespace-stripped.  Otherwise the default
        Elinnos system prompt is returned together with the instruction
        exactly as it was passed in.
    """
    # Phrases that identify the leading paragraph as a role description.
    role_markers = (
        "Elinnos RTL Code Generator",
        "specialized Verilog",
        "You are",
    )

    # Split once on the first blank line; `separator` is empty when the
    # instruction has no paragraph break at all.
    head, separator, tail = instruction.partition("\n\n")
    if separator:
        system_msg = head.strip()
        user_msg = tail.strip()
        # Only the candidate system segment is inspected, so a task such as
        # "Design a FIFO.\n\nYou are free to ..." is not split by mistake.
        if any(marker in system_msg for marker in role_markers):
            return system_msg, user_msg

    # Fallback: no recognisable system prompt, so use the default one and
    # treat the whole instruction as the user message.
    default_system_msg = (
        "You are Elinnos RTL Code Generator v1.0, a specialized "
        "Verilog/SystemVerilog code generation agent. Your role: Generate "
        "clean, synthesizable RTL code for hardware design tasks. Output ONLY "
        "functional RTL code with no $display, assertions, comments, or debug "
        "statements."
    )
    return default_system_msg, instruction
|
| 37 |
+
|
| 38 |
+
def reformat_dataset(
    input_file: str,
    output_file: str,
    tokenizer_path: str = "models/base-models/CodeLlama-7B-Instruct",
):
    """Reformat a JSONL dataset to use the CodeLlama chat template format.

    Each input line must be a JSON object with string ``instruction`` and
    ``response`` fields.  The instruction is split into system and user
    messages, rendered through the tokenizer's chat template, and written
    back out as a JSONL record with the rendered prompt in ``instruction``
    and the stripped original ``response``.

    Blank lines, lines with invalid JSON, records that are not JSON objects,
    and records whose instruction or response is missing, empty or not a
    string are skipped with a warning instead of aborting the run.

    Args:
        input_file: Path of the source JSONL dataset.
        output_file: Path of the JSONL file to write; parent directories are
            created when they do not exist.
        tokenizer_path: Location passed to ``AutoTokenizer.from_pretrained``.
            The default is a relative path, so it is resolved against the
            current working directory.

    Returns:
        The number of samples written to ``output_file``.
    """
    print("=" * 80)
    print("🔄 REFORMATTING DATASET FOR CODELLAMA CHAT TEMPLATE")
    print("=" * 80)

    # Load tokenizer
    print(f"\n📦 Loading tokenizer from: {tokenizer_path}")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Read input dataset, remembering line numbers so bad lines can be located
    samples = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            if not line.strip():
                continue
            try:
                samples.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping invalid JSON on line {line_no}: {e}")

    print(f"✅ Loaded {len(samples)} samples from {input_file}")

    # Reformat each sample
    reformatted_samples = []

    for i, sample in enumerate(samples, 1):
        # A valid JSON line can still be a list, number, string, ...
        if not isinstance(sample, dict):
            print(f"⚠️ Skipping sample {i}: missing instruction or response")
            continue

        # Non-string values (e.g. JSON null) are treated like missing fields
        # rather than crashing on .strip().
        raw_instruction = sample.get("instruction")
        raw_response = sample.get("response")
        instruction = raw_instruction.strip() if isinstance(raw_instruction, str) else ""
        response = raw_response.strip() if isinstance(raw_response, str) else ""

        if not instruction or not response:
            print(f"⚠️ Skipping sample {i}: missing instruction or response")
            continue

        # Extract system and user messages
        system_message, user_message = extract_system_and_user(instruction)

        # Create messages for CodeLlama chat template
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]

        # Render only the prompt part, expected to look like:
        #   <s>[INST] <<SYS>>...<</SYS>> User [/INST]
        # NOTE(review): if the rendered text really starts with the BOS
        # marker, the training script presumably must not add special tokens
        # again when tokenizing it - confirm against the training code.
        formatted_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # For training the full text is formatted_prompt + response + EOS;
        # only the two parts are stored here, EOS is left to the trainer.
        reformatted_samples.append({
            "instruction": formatted_prompt,  # The prompt part
            "response": response,  # What the model should generate
        })

        # Progress is reported only for samples that were kept.
        if i % 10 == 0:
            print(f" Processed {i}/{len(samples)} samples...")

    # Save reformatted dataset
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        for sample in reformatted_samples:
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')

    print(f"\n✅ Reformatted {len(reformatted_samples)} samples")
    print(f"💾 Saved to: {output_file}")

    # Show example
    if reformatted_samples:
        print("\n📝 Example reformatted sample:")
        print("-" * 80)
        example = reformatted_samples[0]
        print("Instruction (first 400 chars):")
        print(example["instruction"][:400] + "...")
        print("\nResponse (first 200 chars):")
        print(example["response"][:200] + "...")
        print("=" * 80)

    return len(reformatted_samples)
|
| 127 |
+
|
| 128 |
+
if __name__ == "__main__":
    # Dataset paths are resolved relative to the directory holding this script.
    processed_dir = Path(__file__).parent / "datasets" / "processed"
    input_file = processed_dir / "elinnos_fifo_codellama_v1.jsonl"
    output_file = processed_dir / "elinnos_fifo_codellama_chat_format.jsonl"

    # Nothing to do without the source dataset: report and exit non-zero.
    if not input_file.exists():
        print(f"❌ Error: Input file not found: {input_file}")
        sys.exit(1)

    count = reformat_dataset(str(input_file), str(output_file))
    print(f"\n✅ Successfully reformatted {count} samples!")

    # Closing hints for the operator, printed one per line.
    closing_lines = (
        "\nNext steps:",
        f"1. Split the reformatted dataset: python3 scripts/dataset_split.py --input {output_file}",
        "2. Update training script to use chat template format",
        "3. Retrain with new format",
    )
    for text in closing_lines:
        print(text)
|