|
|
|
|
|
""" |
|
|
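Reformat a JSONL dataset so that each sample's instruction becomes a
CodeLlama chat-template prompt (system + user turn). The response is kept
as a separate field.

CodeLlama-Instruct expects: <s>[INST] <<SYS>>...<</SYS>> User [/INST] Response </s>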
|
|
""" |
|
|
|
|
|
import json |
|
|
import sys |
|
|
from pathlib import Path |
|
|
from transformers import AutoTokenizer |
|
|
|
|
|
def extract_system_and_user(instruction: str):
    """Split a raw instruction into a ``(system_msg, user_msg)`` pair.

    The instruction is divided at its first blank line. The leading
    paragraph is accepted as the system prompt only when it looks like one,
    i.e. when that paragraph itself contains one of the known markers
    ("Elinnos RTL Code Generator", "specialized Verilog" or "You are").
    The markers are deliberately not searched for in the rest of the text:
    a user request that merely mentions "You are ..." further down must not
    cause its first paragraph to be mistaken for a system prompt.

    Args:
        instruction: Raw instruction text, optionally starting with a
            system-prompt paragraph followed by a blank line.

    Returns:
        A tuple of two strings. When a system prompt is detected, both
        parts are returned with surrounding whitespace stripped. Otherwise
        the built-in default system prompt is returned together with the
        instruction exactly as it was passed in.
    """
    default_system_msg = (
        "You are Elinnos RTL Code Generator v1.0, a specialized "
        "Verilog/SystemVerilog code generation agent. Your role: Generate "
        "clean, synthesizable RTL code for hardware design tasks. Output ONLY "
        "functional RTL code with no $display, assertions, comments, or debug "
        "statements."
    )
    system_markers = (
        "Elinnos RTL Code Generator",
        "specialized Verilog",
        "You are",
    )

    # partition() yields an empty separator when there is no blank line,
    # which means there is no candidate system paragraph at all.
    head, separator, tail = instruction.partition("\n\n")
    if separator:
        system_msg = head.strip()
        user_msg = tail.strip()
        if any(marker in system_msg for marker in system_markers):
            return system_msg, user_msg

    # No recognisable system prompt: the whole instruction is the user turn.
    return default_system_msg, instruction
|
|
|
|
|
def _load_jsonl_samples(input_file):
    """Parse *input_file* as JSON Lines, skipping blank and malformed lines."""
    samples = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            try:
                samples.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping invalid JSON: {e}")
    return samples


def _clean_text_field(sample, key):
    """Return the stripped string stored under *key*, or "" if unusable."""
    # A JSONL line may legally decode to a list/number/null, and a field may
    # be null or non-string; treat all of those as "missing".
    if not isinstance(sample, dict):
        return ""
    value = sample.get(key)
    return value.strip() if isinstance(value, str) else ""


def _preview(text, limit):
    """Return *text* cut to *limit* characters, adding "..." only if cut."""
    return text if len(text) <= limit else text[:limit] + "..."


def reformat_dataset(
    input_file: str,
    output_file: str,
    tokenizer_path: str = "models/base-models/CodeLlama-7B-Instruct",
) -> int:
    """Rewrite a JSONL dataset so every instruction is a chat-template prompt.

    Each input line is expected to be a JSON object with string fields
    ``"instruction"`` and ``"response"``. The instruction is split into a
    system and a user message with ``extract_system_and_user``, rendered
    through the tokenizer's ``apply_chat_template`` (``tokenize=False``,
    ``add_generation_prompt=True``) and written back under ``"instruction"``.
    The stripped response is written unchanged under ``"response"``.

    Lines that are not valid JSON, records that are not objects, and records
    whose instruction or response is missing, empty or not a string are
    skipped with a warning instead of aborting the run.

    Args:
        input_file: Path of the source JSONL file (read as UTF-8).
        output_file: Path of the JSONL file to write; parent directories
            are created when missing.
        tokenizer_path: Location passed to ``AutoTokenizer.from_pretrained``.
            Defaults to the local CodeLlama-7B-Instruct directory, which is
            resolved relative to the current working directory.

    Returns:
        The number of samples written to *output_file*.
    """
    print("=" * 80)
    print("🔄 REFORMATTING DATASET FOR CODELLAMA CHAT TEMPLATE")
    print("=" * 80)

    print(f"\n📦 Loading tokenizer from: {tokenizer_path}")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    samples = _load_jsonl_samples(input_file)
    print(f"✅ Loaded {len(samples)} samples from {input_file}")

    reformatted_samples = []
    for i, sample in enumerate(samples, 1):
        instruction = _clean_text_field(sample, "instruction")
        response = _clean_text_field(sample, "response")

        if not instruction or not response:
            print(f"⚠️ Skipping sample {i}: missing instruction or response")
            continue

        system_message, user_message = extract_system_and_user(instruction)
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]

        # tokenize=False asks for the rendered prompt text rather than ids.
        formatted_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        reformatted_samples.append({
            "instruction": formatted_prompt,
            "response": response,
        })

        if i % 10 == 0:
            print(f" Processed {i}/{len(samples)} samples...")

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(sample, ensure_ascii=False) + '\n'
            for sample in reformatted_samples
        )

    print(f"\n✅ Reformatted {len(reformatted_samples)} samples")
    print(f"💾 Saved to: {output_file}")

    if reformatted_samples:
        example = reformatted_samples[0]
        print("\n📝 Example reformatted sample:")
        print("-" * 80)
        print("Instruction (first 400 chars):")
        print(_preview(example["instruction"], 400))
        print("\nResponse (first 200 chars):")
        print(_preview(example["response"], 200))
        print("=" * 80)

    return len(reformatted_samples)
|
|
|
|
|
def main():
    """Reformat the bundled FIFO dataset and print follow-up instructions.

    Input and output paths are resolved relative to this script's directory
    (``datasets/processed/``). Exits with status 1 when the input file does
    not exist.
    """
    script_dir = Path(__file__).parent
    processed_dir = script_dir / "datasets" / "processed"

    input_file = processed_dir / "elinnos_fifo_codellama_v1.jsonl"
    output_file = processed_dir / "elinnos_fifo_codellama_chat_format.jsonl"

    if not input_file.exists():
        # Report failures on stderr so they are not mixed into normal output.
        print(f"❌ Error: Input file not found: {input_file}", file=sys.stderr)
        sys.exit(1)

    count = reformat_dataset(str(input_file), str(output_file))

    print(f"\n✅ Successfully reformatted {count} samples!")
    print("\nNext steps:")
    print(f"1. Split the reformatted dataset: python3 scripts/dataset_split.py --input {output_file}")
    print("2. Update training script to use chat template format")
    print("3. Retrain with new format")


if __name__ == "__main__":
    main()
|
|
|