Elinnos
/

codellama-fine-tuning

Model card Files Files and versions

xet

Community

Prithvik-1 committed on Nov 25, 2025

Commit

0361c24

verified ·

1 Parent(s): 195b569

Upload reformat_dataset_for_codellama.py with huggingface_hub

Browse files

Files changed (1) hide show

reformat_dataset_for_codellama.py +143 -0

reformat_dataset_for_codellama.py ADDED Viewed

	@@ -0,0 +1,143 @@

+#!/usr/bin/env python3
+"""
+Reformat dataset to use CodeLlama chat template format
+CodeLlama-Instruct expects: <s>[INST] <<SYS>>...<</SYS>> User [/INST] Response </s>
+"""
+import json
+import sys
+from pathlib import Path
+from transformers import AutoTokenizer
def extract_system_and_user(instruction: str) -> tuple:
    """Split a raw instruction into a system prompt and a user message.

    Dataset instructions are laid out as a system prompt, a blank line, and
    then the task description. The text before the first blank line is
    accepted as the system prompt only when that paragraph itself contains a
    role marker. Otherwise the whole instruction is kept as the user message
    and the default Elinnos system prompt is used.

    Args:
        instruction: Raw instruction text from a dataset record.

    Returns:
        A ``(system_message, user_message)`` tuple of strings. When a system
        prompt is extracted, both parts are stripped of surrounding
        whitespace; in the fallback case the instruction is returned as-is.
    """
    role_markers = (
        "Elinnos RTL Code Generator",
        "specialized Verilog",
        "You are",
    )
    default_system = (
        "You are Elinnos RTL Code Generator v1.0, a specialized "
        "Verilog/SystemVerilog code generation agent. Your role: Generate "
        "clean, synthesizable RTL code for hardware design tasks. Output ONLY "
        "functional RTL code with no $display, assertions, comments, or debug "
        "statements."
    )

    head, separator, tail = instruction.partition("\n\n")
    if separator:
        system_msg = head.strip()
        # Only the first paragraph is inspected: a "You are ..." that shows up
        # later, inside the task text, must not promote an ordinary first
        # paragraph to a system prompt.
        if any(marker in system_msg for marker in role_markers):
            return system_msg, tail.strip()

    # No recognizable system prompt: keep the full text as the user message.
    return default_system, instruction
def reformat_dataset(
    input_file: str,
    output_file: str,
    tokenizer_path: str = "models/base-models/CodeLlama-7B-Instruct",
) -> int:
    """Rewrite a JSONL dataset so prompts use the CodeLlama chat template.

    Each input line is a JSON object with ``instruction`` and ``response``
    fields. The instruction is split into system and user messages, rendered
    through the tokenizer's chat template, and written back as a JSONL record
    with the same two keys: ``instruction`` now holds the rendered prompt and
    ``response`` the stripped target text.

    Lines that are not valid JSON are skipped with a warning. Records that are
    not JSON objects, or whose instruction/response is missing, empty or not a
    string, are skipped as well.

    Args:
        input_file: Path to the source JSONL file.
        output_file: Path of the JSONL file to write; parent directories are
            created if needed.
        tokenizer_path: Model directory or identifier passed to
            ``AutoTokenizer.from_pretrained``. The default is a relative path
            and is therefore resolved against the current working directory.

    Returns:
        The number of samples written to ``output_file``.
    """
    print("=" * 80)
    print("🔄 REFORMATTING DATASET FOR CODELLAMA CHAT TEMPLATE")
    print("=" * 80)

    # Load tokenizer
    print(f"\n📦 Loading tokenizer from: {tokenizer_path}")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Read input dataset, tolerating blank and malformed lines
    samples = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            try:
                samples.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"⚠️  Skipping invalid JSON: {e}")
    print(f"✅ Loaded {len(samples)} samples from {input_file}")

    # Reformat each sample
    reformatted_samples = []
    for i, sample in enumerate(samples, 1):
        # A JSON line may decode to a list/number, and fields may be null or
        # non-string; treat all of those as "missing" instead of crashing.
        fields = sample if isinstance(sample, dict) else {}
        instruction = fields.get("instruction")
        response = fields.get("response")
        instruction = instruction.strip() if isinstance(instruction, str) else ""
        response = response.strip() if isinstance(response, str) else ""
        if not instruction or not response:
            print(f"⚠️  Skipping sample {i}: missing instruction or response")
            continue

        # Extract system and user messages
        system_message, user_message = extract_system_and_user(instruction)
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]

        # Render only the prompt part; expected shape:
        #   <s>[INST] <<SYS>>...<</SYS>> User [/INST]
        # NOTE(review): with the Llama-2 style template the user turn is
        # presumably closed with [/INST] by the template itself, so
        # add_generation_prompt may be a no-op here -- confirm against the
        # tokenizer's chat_template.
        formatted_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # The response is stored separately and no EOS is appended here; the
        # training code has to build prompt + response + EOS itself.
        # NOTE(review): the rendered prompt is expected to already contain the
        # BOS token, so it should be tokenized later without adding special
        # tokens again -- verify in the training script.
        reformatted_samples.append({
            "instruction": formatted_prompt,
            "response": response,
        })

        if i % 10 == 0:
            print(f"   Processed {i}/{len(samples)} samples...")

    # Save reformatted dataset
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        for sample in reformatted_samples:
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')
    print(f"\n✅ Reformatted {len(reformatted_samples)} samples")
    print(f"💾 Saved to: {output_file}")

    # Show example
    if reformatted_samples:
        print("\n📝 Example reformatted sample:")
        print("-" * 80)
        example = reformatted_samples[0]
        print("Instruction (first 400 chars):")
        print(example["instruction"][:400] + "...")
        print("\nResponse (first 200 chars):")
        print(example["response"][:200] + "...")
        print("=" * 80)

    return len(reformatted_samples)
if __name__ == "__main__":
    # Script entry point: convert the processed FIFO dataset that lives next
    # to this script and print the follow-up steps.
    processed_dir = Path(__file__).parent / "datasets" / "processed"
    input_file = processed_dir / "elinnos_fifo_codellama_v1.jsonl"
    output_file = processed_dir / "elinnos_fifo_codellama_chat_format.jsonl"

    # Bail out early with a non-zero exit code when there is nothing to convert.
    if not input_file.exists():
        print(f"❌ Error: Input file not found: {input_file}")
        sys.exit(1)

    count = reformat_dataset(str(input_file), str(output_file))
    print(f"\n✅ Successfully reformatted {count} samples!")

    next_steps = (
        f"1. Split the reformatted dataset: python3 scripts/dataset_split.py --input {output_file}",
        "2. Update training script to use chat template format",
        "3. Retrain with new format",
    )
    print("\nNext steps:")
    for step in next_steps:
        print(step)