"""Build an LLM fine-tuning dataset from the Spartan-6 pin database.

Reads ``pin_database.json`` and writes ``fpga_training_data.jsonl`` in a
conversational format: one JSON object per line, each holding a user
question and an assistant answer under the ``"messages"`` key.

Run: python build_training_data.py
"""
import json
import random
import re
from collections import defaultdict
from functools import lru_cache
from itertools import islice

PIN_DB_PATH = "pin_database.json"
OUTPUT_PATH = "fpga_training_data.jsonl"

# Known orderable part numbers, keyed by device name.
PART_NUMBER_PATTERNS = {
    "LX4": ["XC6SLX4-2TQG144C", "XC6SLX4-2CPG196C", "XC6SLX4-2CSG225C"],
    "LX9": ["XC6SLX9-2TQG144C", "XC6SLX9-2CPG196C", "XC6SLX9-2CSG225C", "XC6SLX9-2FTG256C", "XC6SLX9-2CSG324C"],
    "LX16": ["XC6SLX16-2CPG196C", "XC6SLX16-2CSG225C", "XC6SLX16-2FTG256C", "XC6SLX16-2CSG324C"],
    "LX25": ["XC6SLX25-2FTG256C", "XC6SLX25-2CSG324C"],
    "LX25T": ["XC6SLX25T-2CSG324C", "XC6SLX25T-2FGG484C"],
    "LX45": ["XC6SLX45-2CSG324C", "XC6SLX45-2FGG484C", "XC6SLX45-2CSG484C", "XC6SLX45-2FGG676C"],
    "LX45T": ["XC6SLX45T-2CSG324C", "XC6SLX45T-2FGG484C", "XC6SLX45T-2CSG484C"],
    "LX75": ["XC6SLX75-2FGG484C", "XC6SLX75-2CSG484C", "XC6SLX75-2FGG676C"],
    "LX75T": ["XC6SLX75T-2FGG484C", "XC6SLX75T-2CSG484C", "XC6SLX75T-2FGG676C"],
    "LX100": ["XC6SLX100-2FGG484C", "XC6SLX100-2CSG484C", "XC6SLX100-2FGG676C"],
    "LX100T": ["XC6SLX100T-2FGG484C", "XC6SLX100T-2CSG484C", "XC6SLX100T-2FGG676C", "XC6SLX100T-2FGG900C"],
    "LX150": ["XC6SLX150-2FGG484C", "XC6SLX150-2CSG484C", "XC6SLX150-2FGG676C", "XC6SLX150-2FGG900C"],
    "LX150T": ["XC6SLX150T-2FGG484C", "XC6SLX150T-2CSG484C", "XC6SLX150T-2FGG676C", "XC6SLX150T-2FGG900C"],
}

# Groups: device, speed grade, package, temperature grade letter.
_PART_NUMBER_RE = re.compile(r'XC6S(LX\d+T?)-(\d+)([A-Z]+\d+)([A-Z])')
_TEMP_GRADES = {"C": "Commercial", "I": "Industrial"}

# Upper bounds that keep the dataset size manageable.
_MAX_PINS_PER_PACKAGE = 60
_MAX_DEVICES_PER_PIN = 2
_MAX_PACKAGES_PER_DEVICE = 3
_MAX_LISTED_PINS = 5
_MAX_LISTED_BANKS = 6
_MAX_PAIRS_PER_PACKAGE = 3


def _load_pin_database(path=PIN_DB_PATH):
    """Read the pin database JSON file at *path* and return the parsed object."""
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _build_device_packages(pin_db):
    """Return a reverse index mapping device name -> set of package names."""
    device_packages = defaultdict(set)
    for db in pin_db.values():
        for dev in db["devices"]:
            device_packages[dev].add(db["package"])
    return device_packages


@lru_cache(maxsize=None)
def _default_pin_db():
    """Load ``PIN_DB_PATH`` once and reuse the parsed object afterwards."""
    return _load_pin_database()


@lru_cache(maxsize=None)
def _default_device_packages():
    """Build the device -> packages index for the default database once."""
    return _build_device_packages(_default_pin_db())


def __getattr__(name):
    """Lazily expose ``PIN_DB`` and ``DEVICE_PACKAGES`` as module attributes.

    The database used to be read at import time; deferring the read keeps
    the module importable when the JSON file is absent while attribute
    access such as ``build_training_data.PIN_DB`` keeps working.
    """
    if name == "PIN_DB":
        return _default_pin_db()
    if name == "DEVICE_PACKAGES":
        return _default_device_packages()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def _qa(question, answer):
    """Wrap one question/answer pair in the conversational sample format."""
    return {"messages": [{"role": "user", "content": question}, {"role": "assistant", "content": answer}]}


def _find_part_number(package, devices):
    """Return the first known part number matching *package*, or None.

    Devices are tried in order. A package name such as ``TQ(G)144`` matches
    part numbers containing either ``TQG144`` or ``TQ144``.
    """
    candidates = (package.replace("(G)", "G"), package.replace("(G)", ""))
    for dev in devices:
        for pn in PART_NUMBER_PATTERNS.get(dev, ()):
            if any(candidate in pn for candidate in candidates):
                return pn
    return None


def _bank_sort_key(item):
    """Sort key for ``(bank, count)``: numeric banks by value, others first."""
    # str() keeps this working when the JSON stores banks as integers/null.
    bank = str(item[0])
    return int(bank) if bank.isdigit() else -1


def _pin_lookup_samples(pin_db):
    """Section 1: what a given pin is, by part number and by device/package."""
    samples = []
    for db in pin_db.values():
        package = db["package"]
        devices = db["devices"]
        pins = db["pins"]
        # The part number depends only on the package entry, not on the pin.
        part_num = _find_part_number(package, devices)
        # Randomly pick a subset of pins (not all, to limit dataset size).
        for pin in random.sample(pins, min(_MAX_PINS_PER_PACKAGE, len(pins))):
            pin_num = pin["pin_number"]
            desc = pin["pin_description"]
            bank = pin["bank"]

            # Q&A type 1: part number + pin (English, then Vietnamese).
            if part_num:
                samples.append(_qa(
                    f"What is pin {pin_num} on {part_num}?",
                    f"Pin {pin_num} on {part_num} is **{desc}** (Bank {bank}).",
                ))
                samples.append(_qa(
                    f"{part_num} ball {pin_num} là gì?",
                    f"Ball {pin_num} trên {part_num} là **{desc}** (Bank {bank}).",
                ))

            # Q&A type 2: package + device + pin, limited to avoid explosion.
            for dev in devices[:_MAX_DEVICES_PER_PIN]:
                samples.append(_qa(
                    f"What is the function of pin {pin_num} on Spartan-6 {dev} in {package}?",
                    f"On Spartan-6 {dev} in {package} package, pin {pin_num} is **{desc}** (Bank {bank}).",
                ))
                samples.append(_qa(
                    f"Pin {pin_num} trên {package} {dev} là gì?",
                    f"Pin {pin_num} trên {package} {dev} là **{desc}** (Bank {bank}).",
                ))
    return samples


def _device_info_samples(pin_db, device_packages):
    """Section 2: packages supported by each device and their pin counts."""
    samples = []
    for device, packages in device_packages.items():
        packages_list = sorted(packages)
        joined = ", ".join(packages_list)
        samples.append(_qa(
            f"What packages does Spartan-6 {device} support?",
            f"Spartan-6 {device} supports the following packages: {joined}.",
        ))
        samples.append(_qa(
            f"{device} hỗ trợ những package nào?",
            f"{device} hỗ trợ các package: {joined}.",
        ))

        # Pin count per package, using the first matching database entry.
        for pkg in packages_list[:_MAX_PACKAGES_PER_DEVICE]:
            entry = next(
                (db for db in pin_db.values()
                 if db["package"] == pkg and device in db["devices"]),
                None,
            )
            if entry is not None:
                total = entry["total_pins"]
                samples.append(_qa(
                    f"How many pins does {device} in {pkg} have?",
                    f"{device} in {pkg} package has **{total} pins**.",
                ))
    return samples


def _function_search_samples(pin_db):
    """Section 3: GCLK pins, VREF pins and per-bank pin counts per package."""
    samples = []
    for db in pin_db.values():
        package = db["package"]
        devices = db["devices"]
        pins = db["pins"]
        if not devices:
            continue
        dev = devices[0]  # only the first device is used for these questions

        gclk_pins = [p for p in pins if "GCLK" in p["pin_description"]]
        if gclk_pins:
            pin_list = ", ".join(
                f"{p['pin_number']} ({p['pin_description']}, Bank {p['bank']})"
                for p in gclk_pins[:_MAX_LISTED_PINS]
            )
            samples.append(_qa(
                f"Where are the GCLK pins on {dev} {package}?",
                f"GCLK pins on {dev} {package}: {pin_list}. Total: {len(gclk_pins)} GCLK pins.",
            ))

        vref_pins = [p for p in pins if "VREF" in p["pin_description"]]
        if vref_pins:
            pin_list = ", ".join(
                f"{p['pin_number']} (Bank {p['bank']})"
                for p in vref_pins[:_MAX_LISTED_PINS]
            )
            samples.append(_qa(
                f"VREF pins on {dev} {package}?",
                f"VREF pins on {dev} {package}: {pin_list}. Total: {len(vref_pins)} VREF pins.",
            ))

        # Bank summary; a plain dict keeps first-seen order for equal keys.
        bank_counts = defaultdict(int)
        for p in pins:
            bank_counts[p["bank"]] += 1
        ordered = sorted(bank_counts.items(), key=_bank_sort_key)
        bank_info = ", ".join(
            f"Bank {b}: {c} pins" for b, c in ordered[:_MAX_LISTED_BANKS]
        )
        samples.append(_qa(
            f"How many pins per bank on {dev} {package}?",
            f"{dev} {package} pin distribution: {bank_info}.",
        ))
    return samples


def _part_number_samples():
    """Section 4: decode each known part number into its fields."""
    samples = []
    for pns in PART_NUMBER_PATTERNS.values():
        for pn in pns:
            m = _PART_NUMBER_RE.match(pn)
            if not m:
                continue
            dev, speed, pkg, temp = m.groups()
            # Unknown grade letters are shown as-is.
            temp_str = _TEMP_GRADES.get(temp, temp)
            samples.append(_qa(
                f"What is {pn}?",
                f"{pn} is a Xilinx Spartan-6 FPGA: Device {dev}, Speed Grade {speed}, Package {pkg}, Temperature Grade {temp_str}.",
            ))
    return samples


def _differential_pair_samples(pin_db):
    """Section 5: P/N pin locations for a few differential pairs per package."""
    samples = []
    for db in pin_db.values():
        package = db["package"]
        devices = db["devices"]

        # Group pins whose descriptions differ only by the _P_/_N_ marker.
        pairs = defaultdict(list)
        for p in db["pins"]:
            desc = p["pin_description"]
            if "_P_" in desc or "_N_" in desc:
                base = desc.replace("_P_", "_").replace("_N_", "_")
                pairs[base].append(p)

        if not devices:
            continue
        dev = devices[0]

        # Only the first few groups are considered.
        for base, pins in islice(pairs.items(), _MAX_PAIRS_PER_PACKAGE):
            if len(pins) != 2:
                continue
            p_pins = [p for p in pins if "_P_" in p["pin_description"]]
            n_pins = [p for p in pins if "_N_" in p["pin_description"]]
            # Two pins of the same polarity do not form a pair; skip them
            # instead of failing on an empty list.
            if not p_pins or not n_pins:
                continue
            p_pin, n_pin = p_pins[0], n_pins[0]
            name = base.rstrip('_')
            samples.append(_qa(
                f"What is the differential pair {name} on {dev} {package}?",
                f"{name} differential pair on {dev} {package}: Positive (P) at pin **{p_pin['pin_number']}** (Bank {p_pin['bank']}), Negative (N) at pin **{n_pin['pin_number']}** (Bank {n_pin['bank']}).",
            ))
    return samples


def create_qa_samples(pin_db=None, device_packages=None):
    """Create question/answer training samples from the pin database.

    Args:
        pin_db: Mapping of entry key -> entry dict. Each entry provides
            "package", "devices", "pins" and "total_pins"; each pin provides
            "pin_number", "pin_description" and "bank". When None, the
            database is loaded from ``PIN_DB_PATH`` (cached after first use).
        device_packages: Mapping of device name -> iterable of package
            names. When None, it is derived from *pin_db*.

    Returns:
        A list of ``{"messages": [user_message, assistant_message]}`` dicts,
        in section order: pin lookup, device info, search by function,
        part number parsing, differential pairs. Pin lookup draws pins with
        ``random.sample``, so its order varies between runs.

    Raises:
        OSError: If the default database file cannot be opened.
        KeyError: If an entry or pin lacks one of the expected keys.
    """
    if pin_db is None:
        pin_db = _default_pin_db()
    if device_packages is None:
        device_packages = _build_device_packages(pin_db)

    samples = []
    samples.extend(_pin_lookup_samples(pin_db))
    samples.extend(_device_info_samples(pin_db, device_packages))
    samples.extend(_function_search_samples(pin_db))
    samples.extend(_part_number_samples())
    samples.extend(_differential_pair_samples(pin_db))
    return samples


def main():
    """Generate the samples, shuffle them and write the JSONL output file."""
    print("Building training data from pin_database.json...")
    samples = create_qa_samples()
    print(f"Generated {len(samples)} training samples")

    random.shuffle(samples)

    output_file = OUTPUT_PATH
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(s, ensure_ascii=False) + "\n" for s in samples)
    print(f"Saved to {output_file}")

    # Show one sample, if there is any.
    if samples:
        print("\n=== Sample ===")
        print(json.dumps(samples[0], indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()