"""
build_training_data.py - Convert pin_database.json into a training dataset for fine-tuning an LLM.
Run: python build_training_data.py
Output: fpga_training_data.jsonl (conversational format)
"""
|
|
| import json |
| import random |
| from collections import defaultdict |
|
|
# Load the pin database once at import time.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open("pin_database.json", "r", encoding="utf-8") as f:
    PIN_DB = json.load(f)
|
|
| |
# Reverse index built from the pin database:
# device name -> set of package names that list that device.
DEVICE_PACKAGES = defaultdict(set)
for _entry in PIN_DB.values():
    for _device in _entry["devices"]:
        DEVICE_PACKAGES[_device].add(_entry["package"])
|
|
# Package codes known for each device, in the order the part numbers are listed.
_DEVICE_PACKAGE_CODES = {
    "LX4": ("TQG144", "CPG196", "CSG225"),
    "LX9": ("TQG144", "CPG196", "CSG225", "FTG256", "CSG324"),
    "LX16": ("CPG196", "CSG225", "FTG256", "CSG324"),
    "LX25": ("FTG256", "CSG324"),
    "LX25T": ("CSG324", "FGG484"),
    "LX45": ("CSG324", "FGG484", "CSG484", "FGG676"),
    "LX45T": ("CSG324", "FGG484", "CSG484"),
    "LX75": ("FGG484", "CSG484", "FGG676"),
    "LX75T": ("FGG484", "CSG484", "FGG676"),
    "LX100": ("FGG484", "CSG484", "FGG676"),
    "LX100T": ("FGG484", "CSG484", "FGG676", "FGG900"),
    "LX150": ("FGG484", "CSG484", "FGG676", "FGG900"),
    "LX150T": ("FGG484", "CSG484", "FGG676", "FGG900"),
}

# Device name -> example full part numbers.
# Every entry has the shape XC6S<device>-2<package code>C, i.e. speed grade 2
# and temperature grade "C" in all cases.
PART_NUMBER_PATTERNS = {
    device: [f"XC6S{device}-2{code}C" for code in codes]
    for device, codes in _DEVICE_PACKAGE_CODES.items()
}
|
|
def _make_sample(question, answer):
    """Wrap one question/answer pair in the conversational ``messages`` record."""
    return {
        "messages": [
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        ]
    }


def _find_part_number(package, devices):
    """Return the first PART_NUMBER_PATTERNS entry matching *package*, or None.

    Devices are tried in the given order. A part number matches when the
    package name, with "(G)" either expanded to "G" or removed, occurs in it
    as a substring.
    """
    with_g = package.replace("(G)", "G")
    without_g = package.replace("(G)", "")
    for dev in devices:
        for pn in PART_NUMBER_PATTERNS.get(dev, ()):
            if with_g in pn or without_g in pn:
                return pn
    return None


def create_qa_samples():
    """Create question/answer training samples from the pin database.

    Reads the module-level PIN_DB, DEVICE_PACKAGES and PART_NUMBER_PATTERNS
    and emits, in this order:

    1. Per-pin lookups for up to 60 randomly sampled pins of every database
       entry (English and Vietnamese phrasing), by full part number when one
       matches the package and by device + package for the first two devices.
    2. Supported packages per device, plus total pin counts for the first
       three packages of each device.
    3. GCLK pins, VREF pins and pins-per-bank summaries for the first device
       of every database entry.
    4. A breakdown of every known part number into device, speed grade,
       package and temperature grade.
    5. Up to three differential-pair lookups for the first device of every
       database entry.

    Returns:
        list[dict]: records of the form
        ``{"messages": [{"role": "user", ...}, {"role": "assistant", ...}]}``.
        The pin selection uses the global ``random`` state, so the result is
        not reproducible unless the caller seeds it.
    """
    # The file does not import ``re`` at module level; import it once here
    # rather than on every iteration of the part-number loop.
    import re

    part_number_re = re.compile(r'XC6S(LX\d+T?)-(\d+)([A-Z]+\d+)([A-Z])')
    samples = []

    def add(question, answer):
        samples.append(_make_sample(question, answer))

    def bank_sort_key(item):
        # Numeric banks sort by value; anything else (e.g. a non-numeric or
        # missing bank) sorts first. str() keeps this working when the JSON
        # stores the bank as a number or null instead of a string.
        text = str(item[0])
        return int(text) if text.isdigit() else -1

    # --- 1. Individual pin lookups -------------------------------------
    for db in PIN_DB.values():
        package = db["package"]
        devices = db["devices"]
        pins = db["pins"]

        # Depends only on the package and its devices, so resolve it once
        # per database entry instead of once per sampled pin.
        part_num = _find_part_number(package, devices)

        for pin in random.sample(pins, min(60, len(pins))):
            pin_num = pin["pin_number"]
            desc = pin["pin_description"]
            bank = pin["bank"]

            if part_num:
                add(
                    f"What is pin {pin_num} on {part_num}?",
                    f"Pin {pin_num} on {part_num} is **{desc}** (Bank {bank}).",
                )
                add(
                    f"{part_num} ball {pin_num} là gì?",
                    f"Ball {pin_num} trên {part_num} là **{desc}** (Bank {bank}).",
                )

            for dev in devices[:2]:
                add(
                    f"What is the function of pin {pin_num} on Spartan-6 {dev} in {package}?",
                    f"On Spartan-6 {dev} in {package} package, pin {pin_num} is **{desc}** (Bank {bank}).",
                )
                add(
                    f"Pin {pin_num} trên {package} {dev} là gì?",
                    f"Pin {pin_num} trên {package} {dev} là **{desc}** (Bank {bank}).",
                )

    # --- 2. Packages per device and total pin counts -------------------
    for device, packages in DEVICE_PACKAGES.items():
        packages_list = sorted(packages)
        joined = ", ".join(packages_list)

        add(
            f"What packages does Spartan-6 {device} support?",
            f"Spartan-6 {device} supports the following packages: {joined}.",
        )
        add(
            f"{device} hỗ trợ những package nào?",
            f"{device} hỗ trợ các package: {joined}.",
        )

        for pkg in packages_list[:3]:
            # First database entry for this (package, device) combination.
            # Compared against None so an entry stored under a falsy key
            # (such as "") is not mistaken for "not found".
            entry = next(
                (
                    db
                    for db in PIN_DB.values()
                    if db["package"] == pkg and device in db["devices"]
                ),
                None,
            )
            if entry is not None:
                total = entry["total_pins"]
                add(
                    f"How many pins does {device} in {pkg} have?",
                    f"{device} in {pkg} package has **{total} pins**.",
                )

    # --- 3. Special pins and bank distribution -------------------------
    for db in PIN_DB.values():
        package = db["package"]
        devices = db["devices"]
        pins = db["pins"]
        if not devices:
            continue
        dev = devices[0]  # only the first device of each entry is used here

        gclk_pins = [p for p in pins if "GCLK" in p["pin_description"]]
        if gclk_pins:
            pin_list = ", ".join(
                f"{p['pin_number']} ({p['pin_description']}, Bank {p['bank']})"
                for p in gclk_pins[:5]
            )
            add(
                f"Where are the GCLK pins on {dev} {package}?",
                f"GCLK pins on {dev} {package}: {pin_list}. Total: {len(gclk_pins)} GCLK pins.",
            )

        vref_pins = [p for p in pins if "VREF" in p["pin_description"]]
        if vref_pins:
            pin_list = ", ".join(
                f"{p['pin_number']} (Bank {p['bank']})" for p in vref_pins[:5]
            )
            add(
                f"VREF pins on {dev} {package}?",
                f"VREF pins on {dev} {package}: {pin_list}. Total: {len(vref_pins)} VREF pins.",
            )

        bank_counts = defaultdict(int)
        for p in pins:
            bank_counts[p["bank"]] += 1

        # Skip entries without pins; they would yield an empty answer.
        if bank_counts:
            bank_info = ", ".join(
                f"Bank {b}: {c} pins"
                for b, c in sorted(bank_counts.items(), key=bank_sort_key)[:6]
            )
            add(
                f"How many pins per bank on {dev} {package}?",
                f"{dev} {package} pin distribution: {bank_info}.",
            )

    # --- 4. Part number breakdown --------------------------------------
    temperature_names = {"C": "Commercial", "I": "Industrial"}
    for pns in PART_NUMBER_PATTERNS.values():
        for pn in pns:
            m = part_number_re.match(pn)
            if not m:
                continue
            dev, speed, pkg, temp = m.groups()
            # Unknown grade letters are passed through unchanged.
            temp_str = temperature_names.get(temp, temp)
            add(
                f"What is {pn}?",
                f"{pn} is a Xilinx Spartan-6 FPGA: Device {dev}, Speed Grade {speed}, Package {pkg}, Temperature Grade {temp_str}.",
            )

    # --- 5. Differential pairs -----------------------------------------
    for db in PIN_DB.values():
        package = db["package"]
        devices = db["devices"]
        if not devices:
            continue
        dev = devices[0]

        # Group pins whose descriptions differ only by "_P_" vs "_N_".
        pairs = defaultdict(list)
        for p in db["pins"]:
            desc = p["pin_description"]
            if "_P_" in desc or "_N_" in desc:
                base = desc.replace("_P_", "_").replace("_N_", "_")
                pairs[base].append(p)

        for base, members in list(pairs.items())[:3]:
            if len(members) != 2:
                continue
            p_pins = [p for p in members if "_P_" in p["pin_description"]]
            n_pins = [p for p in members if "_N_" in p["pin_description"]]
            # Only emit a sample for a clean P/N pair; indexing [0] blindly
            # would raise IndexError when both pins carry the same polarity.
            if len(p_pins) != 1 or len(n_pins) != 1:
                continue
            p_pin, n_pin = p_pins[0], n_pins[0]
            name = base.rstrip('_')
            add(
                f"What is the differential pair {name} on {dev} {package}?",
                f"{name} differential pair on {dev} {package}: "
                f"Positive (P) at pin **{p_pin['pin_number']}** (Bank {p_pin['bank']}), "
                f"Negative (N) at pin **{n_pin['pin_number']}** (Bank {n_pin['bank']}).",
            )

    return samples
|
|
# Script entry point: build the samples, shuffle them, write one JSON object
# per line, then print one sample as a preview.
if __name__ == "__main__":
    print("Building training data from pin_database.json...")
    samples = create_qa_samples()
    print(f"Generated {len(samples)} training samples")

    # Randomize sample order before writing (unseeded, so it differs per run).
    random.shuffle(samples)

    output_file = "fpga_training_data.jsonl"
    with open(output_file, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps the Vietnamese text readable in the file.
        f.writelines(json.dumps(s, ensure_ascii=False) + "\n" for s in samples)

    print(f"Saved to {output_file}")

    print("\n=== Sample ===")
    # An empty database yields no samples; samples[0] would raise IndexError.
    if samples:
        print(json.dumps(samples[0], indent=2, ensure_ascii=False))
    else:
        print("(no samples generated)")
|
|