#!/usr/bin/env python3 """ Reformat dataset to use CodeLlama chat template format CodeLlama-Instruct expects: [INST] <>...<> User [/INST] Response """ import json import sys from pathlib import Path from transformers import AutoTokenizer def extract_system_and_user(instruction: str): """Extract system prompt and user message from instruction""" # The instruction contains: "System prompt...\n\nTask description" parts = instruction.split("\n\n", 1) if len(parts) == 2: system_msg = parts[0].strip() user_msg = parts[1].strip() # Check if system message contains the role description if "Elinnos RTL Code Generator" in system_msg or "specialized Verilog" in system_msg: return system_msg, user_msg # Default: extract system prompt if "You are" in instruction and "\n\n" in instruction: parts = instruction.split("\n\n", 1) system_msg = parts[0] user_msg = parts[1] if len(parts) > 1 else "" return system_msg, user_msg # Fallback: use default system prompt system_msg = "You are Elinnos RTL Code Generator v1.0, a specialized Verilog/SystemVerilog code generation agent. Your role: Generate clean, synthesizable RTL code for hardware design tasks. Output ONLY functional RTL code with no $display, assertions, comments, or debug statements." user_msg = instruction return system_msg, user_msg def reformat_dataset(input_file: str, output_file: str): """Reformat dataset to use CodeLlama chat template format""" print("=" * 80) print("šŸ”„ REFORMATTING DATASET FOR CODELLAMA CHAT TEMPLATE") print("=" * 80) # Load tokenizer tokenizer_path = "models/base-models/CodeLlama-7B-Instruct" print(f"\nšŸ“¦ Loading tokenizer from: {tokenizer_path}") tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) # Read input dataset samples = [] with open(input_file, 'r', encoding='utf-8') as f: for line in f: if line.strip(): try: samples.append(json.loads(line)) except json.JSONDecodeError as e: print(f"āš ļø Skipping invalid JSON: {e}") continue print(f"āœ… Loaded {len(samples)} samples from {input_file}") # Reformat each sample reformatted_samples = [] for i, sample in enumerate(samples, 1): instruction = sample.get("instruction", "").strip() response = sample.get("response", "").strip() if not instruction or not response: print(f"āš ļø Skipping sample {i}: missing instruction or response") continue # Extract system and user messages system_message, user_message = extract_system_and_user(instruction) # Create messages for CodeLlama chat template messages = [ {"role": "system", "content": system_message}, {"role": "user", "content": user_message} ] # Apply chat template to get the prompt part # This will create: [INST] <>...<> User [/INST] formatted_prompt = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True # Adds [/INST] at the end, ready for generation ) # For training, we need: # formatted_prompt + response + EOS # The formatted_prompt already ends with [/INST] # We append response + EOS reformatted_samples.append({ "instruction": formatted_prompt, # The prompt part (ends with [/INST]) "response": response # What model should generate }) if i % 10 == 0: print(f" Processed {i}/{len(samples)} samples...") # Save reformatted dataset output_path = Path(output_file) output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_file, 'w', encoding='utf-8') as f: for sample in reformatted_samples: f.write(json.dumps(sample, ensure_ascii=False) + '\n') print(f"\nāœ… Reformatted {len(reformatted_samples)} samples") print(f"šŸ’¾ Saved to: {output_file}") # Show example if reformatted_samples: print("\nšŸ“ Example reformatted sample:") print("-" * 80) example = reformatted_samples[0] print(f"Instruction (first 400 chars):") print(example["instruction"][:400] + "...") print(f"\nResponse (first 200 chars):") print(example["response"][:200] + "...") print("=" * 80) return len(reformatted_samples) if __name__ == "__main__": script_dir = Path(__file__).parent input_file = script_dir / "datasets" / "processed" / "elinnos_fifo_codellama_v1.jsonl" output_file = script_dir / "datasets" / "processed" / "elinnos_fifo_codellama_chat_format.jsonl" if not input_file.exists(): print(f"āŒ Error: Input file not found: {input_file}") sys.exit(1) count = reformat_dataset(str(input_file), str(output_file)) print(f"\nāœ… Successfully reformatted {count} samples!") print(f"\nNext steps:") print(f"1. Split the reformatted dataset: python3 scripts/dataset_split.py --input {output_file}") print(f"2. Update training script to use chat template format") print(f"3. Retrain with new format")