| |
| """ |
| Reformat dataset to use CodeLlama chat template format |
| CodeLlama-Instruct expects: <s>[INST] <<SYS>>...<</SYS>> User [/INST] Response </s> |
| """ |
|
|
| import json |
| import sys |
| from pathlib import Path |
| from transformers import AutoTokenizer |
|
|
def extract_system_and_user(instruction: str):
    """Split a raw instruction into a (system prompt, user message) pair.

    The instruction is cut at its first blank line (a double newline). The
    leading paragraph is accepted as the system prompt only when that
    paragraph itself contains one of the known persona markers
    ("Elinnos RTL Code Generator", "specialized Verilog" or "You are").
    In every other case the whole instruction is treated as the user
    message and the default Elinnos system prompt is used.

    Args:
        instruction: Raw instruction text, optionally starting with a system
            prompt paragraph followed by a blank line.

    Returns:
        A ``(system_msg, user_msg)`` tuple of strings.
    """
    default_system_msg = (
        "You are Elinnos RTL Code Generator v1.0, a specialized "
        "Verilog/SystemVerilog code generation agent. Your role: Generate clean, "
        "synthesizable RTL code for hardware design tasks. Output ONLY functional "
        "RTL code with no $display, assertions, comments, or debug statements."
    )
    system_markers = (
        "Elinnos RTL Code Generator",
        "specialized Verilog",
        "You are",
    )

    head, separator, tail = instruction.partition("\n\n")
    if separator:
        system_msg = head.strip()
        user_msg = tail.strip()
        # Look for the markers in the leading paragraph only: a "You are ..."
        # phrase that appears later, inside the user's request, must not cause
        # the first paragraph to be mistaken for a system prompt.
        if any(marker in system_msg for marker in system_markers):
            return system_msg, user_msg

    # No recognisable system paragraph: keep the instruction intact as the
    # user message and fall back to the default persona.
    return default_system_msg, instruction
|
|
def reformat_dataset(
    input_file: str,
    output_file: str,
    tokenizer_path: str = "models/base-models/CodeLlama-7B-Instruct",
):
    """Rewrite a JSONL instruction dataset using the tokenizer's chat template.

    Each non-blank line of ``input_file`` is parsed as a JSON object holding
    ``instruction`` and ``response`` strings. The instruction is split into a
    system and a user message with ``extract_system_and_user``, rendered to a
    prompt string through ``tokenizer.apply_chat_template`` and written to
    ``output_file`` as one JSON object per line with the keys ``instruction``
    (the rendered prompt) and ``response``.

    Lines that are not valid JSON objects, and samples whose instruction or
    response is missing, empty or not a string, are reported and skipped
    instead of aborting the run.

    Args:
        input_file: Path of the source JSONL file.
        output_file: Path of the JSONL file to create; missing parent
            directories are created.
        tokenizer_path: Location passed to ``AutoTokenizer.from_pretrained``.
            Defaults to the local CodeLlama-7B-Instruct directory.

    Returns:
        The number of samples written to ``output_file``.
    """
    banner = "=" * 80
    print(banner)
    print("🔄 REFORMATTING DATASET FOR CODELLAMA CHAT TEMPLATE")
    print(banner)

    print(f"\n📦 Loading tokenizer from: {tokenizer_path}")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Load every usable JSON object, one per non-blank line.
    samples = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping invalid JSON: {e}")
                continue
            if not isinstance(record, dict):
                # Valid JSON that is not an object (list, number, ...) has no
                # instruction/response fields to read.
                print(f"⚠️ Skipping non-object JSON line: {type(record).__name__}")
                continue
            samples.append(record)

    print(f"✅ Loaded {len(samples)} samples from {input_file}")

    reformatted_samples = []

    for i, sample in enumerate(samples, 1):
        raw_instruction = sample.get("instruction")
        raw_response = sample.get("response")
        # Null or non-string values are treated as missing rather than
        # crashing on .strip().
        instruction = raw_instruction.strip() if isinstance(raw_instruction, str) else ""
        response = raw_response.strip() if isinstance(raw_response, str) else ""

        if not instruction or not response:
            print(f"⚠️ Skipping sample {i}: missing instruction or response")
            continue

        system_message, user_message = extract_system_and_user(instruction)

        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]

        # tokenize=False asks for the rendered prompt text instead of token
        # ids; the response is stored separately and is not part of the prompt.
        formatted_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        reformatted_samples.append({
            "instruction": formatted_prompt,
            "response": response,
        })

        if i % 10 == 0:
            print(f" Processed {i}/{len(samples)} samples...")

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        for sample in reformatted_samples:
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')

    print(f"\n✅ Reformatted {len(reformatted_samples)} samples")
    print(f"💾 Saved to: {output_file}")

    # Show a truncated preview of the first converted sample as a sanity check.
    if reformatted_samples:
        print("\n📝 Example reformatted sample:")
        print("-" * 80)
        example = reformatted_samples[0]
        print("Instruction (first 400 chars):")
        print(example["instruction"][:400] + "...")
        print("\nResponse (first 200 chars):")
        print(example["response"][:200] + "...")
        print(banner)

    return len(reformatted_samples)
|
|
if __name__ == "__main__":
    # Dataset paths are resolved relative to the directory containing this
    # script, so the script can be launched from any working directory.
    script_dir = Path(__file__).parent

    input_file = script_dir / "datasets" / "processed" / "elinnos_fifo_codellama_v1.jsonl"
    output_file = script_dir / "datasets" / "processed" / "elinnos_fifo_codellama_chat_format.jsonl"

    # Stop with a non-zero exit status before any work is done when the
    # source dataset is absent.
    if not input_file.exists():
        print(f"❌ Error: Input file not found: {input_file}")
        sys.exit(1)

    count = reformat_dataset(str(input_file), str(output_file))
    print(f"\n✅ Successfully reformatted {count} samples!")
    print("\nNext steps:")
    print(f"1. Split the reformatted dataset: python3 scripts/dataset_split.py --input {output_file}")
    print("2. Update training script to use chat template format")
    print("3. Retrain with new format")
|
|