"""
build_tree_training_data.py - Build tree-structured training data.

The generated chat samples teach a model to navigate the FPGA hierarchy
top-down (Root -> Device -> Package -> Bank -> Pin) and bottom-up
(Pin -> path back to Root).

Run:    python build_tree_training_data.py
Input:  pin_database.json
Output: fpga_tree_training_data.jsonl
"""
import json
import random
from collections import defaultdict

PIN_DATABASE_FILE = "pin_database.json"
OUTPUT_FILE = "fpga_tree_training_data.jsonl"

# Upper bounds on the number of samples emitted by the capped sections.
TOP_DOWN_LIMIT = 3000
BOTTOM_UP_LIMIT = 2000
SIBLING_LIMIT = 1000
CROSS_BRANCH_LIMIT = 800

# Maximum number of matching pins collected per keyword in the search section.
SEARCH_MAX_HITS = 20

ROOT_NAME = "Xilinx Integrated Circuits"
FAMILY_NAME = "Spartan-6 FPGA"

# ═════════════════════════════════════════════════════════════════════════════
# STATIC DESCRIPTION OF THE FPGA HIERARCHY
# ═════════════════════════════════════════════════════════════════════════════

# Device info
DEVICE_INFO = {
    "LX4": {
        "transceiver": False,
        "description": "Smallest Spartan-6, 3,840 logic cells",
        "packages": ["TQG144", "CPG196", "CSG225"],
    },
    "LX9": {
        "transceiver": False,
        "description": "9,152 logic cells",
        "packages": ["TQG144", "CPG196", "CSG225", "FT(G)256", "CSG324"],
    },
    "LX16": {
        "transceiver": False,
        "description": "14,579 logic cells",
        "packages": ["CPG196", "CSG225", "FT(G)256", "CSG324"],
    },
    "LX25": {
        "transceiver": False,
        "description": "24,051 logic cells",
        "packages": ["FT(G)256", "CSG324"],
    },
    "LX25T": {
        "transceiver": True,
        "description": "24,051 logic cells + GTP transceivers",
        "packages": ["CSG324", "FG(G)484"],
    },
    "LX45": {
        "transceiver": False,
        "description": "43,661 logic cells",
        "packages": ["CSG324", "FG(G)484", "CS(G)484", "FG(G)676"],
    },
    "LX45T": {
        "transceiver": True,
        "description": "43,661 logic cells + GTP transceivers",
        "packages": ["CSG324", "FG(G)484", "CS(G)484"],
    },
    "LX75": {
        "transceiver": False,
        "description": "74,637 logic cells",
        "packages": ["FG(G)484", "CS(G)484", "FG(G)676"],
    },
    "LX75T": {
        "transceiver": True,
        "description": "74,637 logic cells + GTP transceivers",
        "packages": ["FG(G)484", "CS(G)484", "FG(G)676"],
    },
    "LX100": {
        "transceiver": False,
        "description": "101,261 logic cells",
        "packages": ["FG(G)484", "CS(G)484", "FG(G)676"],
    },
    "LX100T": {
        "transceiver": True,
        "description": "101,261 logic cells + GTP transceivers",
        "packages": ["FG(G)484", "CS(G)484", "FG(G)676", "FG(G)900"],
    },
    "LX150": {
        "transceiver": False,
        "description": "147,443 logic cells",
        "packages": ["FG(G)484", "CS(G)484", "FG(G)676", "FG(G)900"],
    },
    "LX150T": {
        "transceiver": True,
        "description": "147,443 logic cells + GTP transceivers",
        "packages": ["FG(G)484", "CS(G)484", "FG(G)676", "FG(G)900"],
    },
}

# Package info
PACKAGE_INFO = {
    "TQG144": {"type": "TQFP", "pitch": "0.5mm", "size": "20x20mm", "pins": 144, "description": "Thin Quad Flat Pack"},
    "CPG196": {"type": "CSBGA", "pitch": "0.5mm", "size": "8x8mm", "pins": 196, "description": "Chip Scale BGA"},
    "CSG225": {"type": "CSBGA", "pitch": "0.8mm", "size": "13x13mm", "pins": 225, "description": "Chip Scale BGA"},
    "FT(G)256": {"type": "BGA", "pitch": "1.0mm", "size": "17x17mm", "pins": 256, "description": "Fine-pitch Thin BGA"},
    "CSG324": {"type": "CSBGA", "pitch": "0.8mm", "size": "15x15mm", "pins": 324, "description": "Chip Scale BGA"},
    "FG(G)484": {"type": "BGA", "pitch": "1.0mm", "size": "23x23mm", "pins": 484, "description": "Fine-pitch BGA"},
    "CS(G)484": {"type": "CSBGA", "pitch": "0.8mm", "size": "19x19mm", "pins": 484, "description": "Chip Scale BGA"},
    "FG(G)676": {"type": "BGA", "pitch": "1.0mm", "size": "27x27mm", "pins": 676, "description": "Fine-pitch BGA"},
    "FG(G)900": {"type": "BGA", "pitch": "1.0mm", "size": "31x31mm", "pins": 900, "description": "Fine-pitch BGA"},
}

# Bank locations
BANK_LOCATION = {
    "0": "Top-Left",
    "1": "Right-Top",
    "2": "Bottom",
    "3": "Left",
    "4": "Left-Top (LX75T/LX100T/LX150T only)",
    "5": "Right-Top extra (LX75T/LX100T/LX150T only)",
    "101": "GTP Transceiver Top-Left",
    "123": "GTP Transceiver Top-Right",
    "NA": "Dedicated pins (JTAG, Config, Power)",
}

# Device and package pairs used by the cross-branch comparison section.
COMPARE_DEVICE = "LX150T"
PACKAGES_TO_COMPARE = [
    ("FG(G)484", "CS(G)484"),
    ("FG(G)484", "FG(G)676"),
    ("CS(G)484", "FG(G)676"),
]

# Orderable part numbers mapped onto the tree by the part-number section.
PART_NUMBERS = {
    "XC6SLX150T-2FGG484C": {"device": "LX150T", "package": "FG(G)484", "speed": "2", "temp": "Commercial"},
    "XC6SLX150T-2CSG484C": {"device": "LX150T", "package": "CS(G)484", "speed": "2", "temp": "Commercial"},
    "XC6SLX150T-2FGG676C": {"device": "LX150T", "package": "FG(G)676", "speed": "2", "temp": "Commercial"},
    "XC6SLX45-3CSG324I": {"device": "LX45", "package": "CSG324", "speed": "3", "temp": "Industrial"},
    "XC6SLX9-2TQG144C": {"device": "LX9", "package": "TQG144", "speed": "2", "temp": "Commercial"},
}

# Keywords searched for (as substrings of the pin description) by the
# pin-function section, with a human-readable label for each.
PIN_FUNCTIONS = {
    "GCLK": "Global Clock pins",
    "VREF": "Voltage Reference pins",
    "HSWAPEN": "Hot Swap Enable pin",
    "CCLK": "Configuration Clock",
    "PROGRAM_B": "Program/Reset configuration",
    "DONE": "Configuration Done indicator",
    "TCK": "JTAG Test Clock",
    "TDI": "JTAG Test Data In",
    "TDO": "JTAG Test Data Out",
    "TMS": "JTAG Test Mode Select",
}

# Bank summary used in part-number answers when no database entry matches.
_GENERIC_BANK_TEXT = "0-3, NA, 101, 123"


# ═════════════════════════════════════════════════════════════════════════════
# SHARED HELPERS
# ═════════════════════════════════════════════════════════════════════════════

def load_pin_database(path=PIN_DATABASE_FILE):
    """Load and return the pin database JSON stored at *path*."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def group_pins_by_bank(pins):
    """Group pin records by their ``"bank"`` value, keeping input order."""
    grouped = defaultdict(list)
    for pin in pins:
        grouped[pin["bank"]].append(pin)
    return grouped


def _chat_sample(question, answer):
    """Wrap one question/answer pair in the chat ``messages`` format."""
    return {
        "messages": [
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        ]
    }


def _tree_path(device, package, bank=None, pin_num=None):
    """Return the slash-separated tree path down to the deepest given node."""
    parts = ["Root", "Xilinx", "Spartan-6", str(device), str(package)]
    if bank is not None:
        parts.append(f"Bank_{bank}")
    if pin_num is not None:
        parts.append(str(pin_num))
    return "/".join(parts)


def _matching_entries(pin_db, package, device):
    """Return the database entries for *package* that list *device*."""
    return [
        db for db in pin_db.values()
        if db["package"] == package and device in db["devices"]
    ]


def classify_pin_function(desc):
    """Return a coarse function label for a pin description.

    Rules are checked in a fixed order and the first match wins; JTAG pins
    are matched by exact name, everything else by substring. Descriptions
    matching no rule are labelled ``"User I/O"``.
    """
    if "GCLK" in desc:
        return "Global Clock"
    if "VREF" in desc:
        return "Voltage Reference"
    if "HSWAPEN" in desc:
        return "Hot Swap Enable"
    if desc in ("TCK", "TDI", "TDO", "TMS"):
        return "JTAG"
    if "CCLK" in desc:
        return "Config Clock"
    if "PROGRAM" in desc:
        return "Config Reset"
    if "DONE" in desc:
        return "Config Done"
    if "GND" in desc:
        return "Ground"
    if "VCC" in desc:
        return "Power"
    return "User I/O"


# ═════════════════════════════════════════════════════════════════════════════
# BUILD THE FPGA HIERARCHY TREE
# ═════════════════════════════════════════════════════════════════════════════

def build_fpga_tree(pin_db):
    """Build the nested Root -> Family -> Device -> Package -> Bank -> Pin tree.

    Args:
        pin_db: Mapping of database entries. Each entry is read for the keys
            ``"package"``, ``"devices"`` and ``"pins"``; each pin is read for
            ``"bank"``, ``"pin_number"``, ``"pin_description"`` and the
            optional ``"bufio2_region"`` / ``"no_connect"``.

    Returns:
        A nested dict in which every node has a ``"type"`` and every
        non-leaf node has a ``"children"`` dict.
    """
    tree = {
        "name": ROOT_NAME,
        "type": "root",
        "children": {
            FAMILY_NAME: {
                "type": "family",
                "description": "Low-cost FPGA family for high-volume applications",
                "children": {},
            }
        },
    }
    device_nodes = tree["children"][FAMILY_NAME]["children"]

    for db in pin_db.values():
        package = db["package"]
        pkg_info = PACKAGE_INFO.get(package, {})
        # The grouping depends only on the entry, so do it once per entry
        # rather than once per device.
        bank_pins = group_pins_by_bank(db["pins"])

        for device in db["devices"]:
            dev_info = DEVICE_INFO.get(device, {})
            device_node = device_nodes.setdefault(device, {
                "type": "device",
                "transceiver": dev_info.get("transceiver", False),
                "description": dev_info.get("description", ""),
                "children": {},
            })
            package_node = device_node["children"].setdefault(package, {
                "type": "package",
                "pins": pkg_info.get("pins", 0),
                "description": pkg_info.get("description", ""),
                "children": {},
            })
            for bank, pins in bank_pins.items():
                bank_node = package_node["children"].setdefault(bank, {
                    "type": "bank",
                    "location": BANK_LOCATION.get(bank, "Unknown"),
                    "pin_count": len(pins),
                    "children": {},
                })
                for pin in pins:
                    bank_node["children"][pin["pin_number"]] = {
                        "type": "pin",
                        "description": pin["pin_description"],
                        "bufio2": pin.get("bufio2_region", ""),
                        "no_connect": pin.get("no_connect", False),
                    }
    return tree


# ═════════════════════════════════════════════════════════════════════════════
# TREE-STRUCTURED TRAINING DATA
# ═════════════════════════════════════════════════════════════════════════════

def create_tree_path(device, package, bank, pin_num, pin_desc):
    """Describe the path from the root down to one pin (leaf).

    Args:
        device: Device name, looked up in ``DEVICE_INFO``.
        package: Package name, looked up in ``PACKAGE_INFO``.
        bank: Bank identifier, looked up in ``BANK_LOCATION``.
        pin_num: Ball / pin number of the leaf.
        pin_desc: Pin description of the leaf.

    Returns:
        ``(path, explanation)``: the slash-separated tree path and a Markdown
        explanation of every level. Unknown devices, packages or banks fall
        back to empty / ``N/A`` / ``Unknown`` placeholders.
    """
    pkg_info = PACKAGE_INFO.get(package, {})
    dev_info = DEVICE_INFO.get(device, {})
    bank_loc = BANK_LOCATION.get(bank, "Unknown")
    has_transceiver = "Yes" if dev_info.get("transceiver") else "No"

    path = _tree_path(device, package, bank, pin_num)
    lines = [
        "Đường đi trong cây phân cấp:",
        "",
        "- **Root**: Xilinx Integrated Circuits",
        "- **Family**: Spartan-6 FPGA (low-cost, high-volume)",
        f"- **Device**: {device} ({dev_info.get('description', '')})",
        f"  - Transceiver: {has_transceiver}",
        f"- **Package**: {package} ({pkg_info.get('description', '')})",
        f"  - Type: {pkg_info.get('type', 'N/A')}",
        f"  - Pitch: {pkg_info.get('pitch', 'N/A')}",
        f"  - Size: {pkg_info.get('size', 'N/A')}",
        f"  - Total pins: {pkg_info.get('pins', 'N/A')}",
        f"- **Bank**: {bank} ({bank_loc})",
        f"- **Pin (Leaf Node)**: Ball {pin_num}",
        f"  - **Định nghĩa**: {pin_desc}",
    ]
    return path, "\n".join(lines) + "\n"


def generate_top_down_samples(pin_db, limit=TOP_DOWN_LIMIT):
    """Section 1 - TOP-DOWN: root-to-leaf questions (at most *limit* samples).

    For every device of every entry, up to three random pins per bank are
    picked and each is asked about in four phrasings.
    """
    samples = []
    for db in pin_db.values():
        package = db["package"]
        bank_pins = group_pins_by_bank(db["pins"])
        for device in db["devices"]:
            for bank, pins in bank_pins.items():
                for pin in random.sample(pins, min(3, len(pins))):
                    pin_num = pin["pin_number"]
                    path, explanation = create_tree_path(
                        device, package, bank, pin_num, pin["pin_description"]
                    )
                    # Various question styles
                    questions = [
                        f"Đi từ gốc Root xuống lá: Spartan-6 -> {device} -> {package} -> Bank {bank} -> Pin {pin_num}. Node lá này là gì?",
                        f"Trong cây phân cấp FPGA, tìm đường đến ball {pin_num} của {device} {package} Bank {bank}",
                        f"Navigate tree: {path}",
                        f"Từ Spartan-6 đi xuống {device} package {package} bank {bank}, ball {pin_num} là gì?",
                    ]
                    for q in questions:
                        # Stop all work as soon as the cap is reached.
                        if len(samples) >= limit:
                            return samples
                        samples.append(_chat_sample(q, explanation))
    return samples


def generate_bottom_up_samples(pin_db, limit=BOTTOM_UP_LIMIT):
    """Section 2 - BOTTOM-UP: leaf-to-root questions (at most *limit* samples).

    For every device of every entry, up to ten random pins are picked and
    each is asked about in four phrasings.
    """
    samples = []
    for db in pin_db.values():
        package = db["package"]
        pkg_info = PACKAGE_INFO.get(package, {})
        all_pins = db["pins"]
        for device in db["devices"]:
            dev_info = DEVICE_INFO.get(device, {})
            for pin in random.sample(all_pins, min(10, len(all_pins))):
                pin_num = pin["pin_number"]
                pin_desc = pin["pin_description"]
                bank = pin["bank"]

                questions = [
                    f"Tôi có node lá {pin_desc} tại ball {pin_num}. Tìm đường đi ngược về gốc.",
                    f"Pin {pin_num} ({pin_desc}) thuộc device nào, package nào, bank nào?",
                    f"Navigate up: {pin_desc}@{pin_num} -> ? -> ? -> Root",
                    f"Ball {pin_num} trong {package} là {pin_desc}. Nó nằm ở đâu trong cây FPGA?",
                ]
                answer = "\n".join([
                    "Đường đi ngược từ lá về gốc:",
                    "",
                    f"- **Node lá**: Ball {pin_num} = **{pin_desc}**",
                    f"- **Bank**: {bank} ({BANK_LOCATION.get(bank, 'Unknown')})",
                    f"- **Package**: {package} ({pkg_info.get('description', '')}, {pkg_info.get('pins', 'N/A')} pins)",
                    f"- **Device**: {device} ({dev_info.get('description', '')})",
                    "- **Family**: Spartan-6 FPGA",
                    "- **Root**: Xilinx Integrated Circuits",
                    "",
                    f"Full path: `{pin_num}/{pin_desc} -> Bank {bank} -> {package} -> {device} -> Spartan-6 -> Xilinx`",
                ]) + "\n"

                for q in questions:
                    if len(samples) >= limit:
                        return samples
                    samples.append(_chat_sample(q, answer))
    return samples


def generate_sibling_samples(pin_db, limit=SIBLING_LIMIT):
    """Section 3 - SIBLING: pins sharing a bank (at most *limit* samples).

    Only the first device of each entry is used. Banks with fewer than three
    pins are skipped; otherwise up to five random pins are tabulated.
    """
    samples = []
    for db in pin_db.values():
        package = db["package"]
        bank_pins = group_pins_by_bank(db["pins"])
        for device in db["devices"][:1]:
            for bank, pins in bank_pins.items():
                if len(pins) < 3:
                    continue
                if len(samples) >= limit:
                    return samples

                question = f"Trong cây {device}/{package}/Bank_{bank}, liệt kê các node lá (siblings)"
                rows = [
                    f"| {p['pin_number']} | {p['pin_description']} | {classify_pin_function(p['pin_description'])} |"
                    for p in random.sample(pins, min(5, len(pins)))
                ]
                answer = "\n".join(
                    [
                        f"Các node lá (pins) trong nhánh **{device}/{package}/Bank {bank}** (siblings):",
                        "",
                        "| Ball | Pin Description | Chức năng |",
                        "|------|----------------|-----------|",
                    ]
                    + rows
                    + [
                        "",
                        f"Tổng: {len(pins)} pins trong Bank {bank} ({BANK_LOCATION.get(bank, '')})",
                    ]
                )
                samples.append(_chat_sample(question, answer))
    return samples


def generate_cross_branch_samples(pin_db, limit=CROSS_BRANCH_LIMIT, device=COMPARE_DEVICE):
    """Section 4 - CROSS-BRANCH: compare Bank 0 of one device in two packages.

    One sample is produced per pair in ``PACKAGES_TO_COMPARE`` for which the
    database holds an entry for *device* in both packages, so the output is
    bounded by the number of pairs as well as by *limit*.
    """
    samples = []
    for pkg1, pkg2 in PACKAGES_TO_COMPARE:
        if len(samples) >= limit:
            break
        matches1 = _matching_entries(pin_db, pkg1, device)
        matches2 = _matching_entries(pin_db, pkg2, device)
        # When several entries match, the last one wins.
        db1 = matches1[-1] if matches1 else None
        db2 = matches2[-1] if matches2 else None
        if not (db1 and db2):
            continue

        bank0_count1 = len(group_pins_by_bank(db1["pins"]).get("0", []))
        bank0_count2 = len(group_pins_by_bank(db2["pins"]).get("0", []))
        info1 = PACKAGE_INFO.get(pkg1, {})
        info2 = PACKAGE_INFO.get(pkg2, {})

        question = f"So sánh cây {device} {pkg1} và {device} {pkg2}, Bank 0 khác nhau thế nào?"
        answer = "\n".join([
            f"So sánh nhánh **{device}/{pkg1}/Bank_0** vs **{device}/{pkg2}/Bank_0**:",
            "",
            f"| Thuộc tính | {pkg1} | {pkg2} |",
            "|------------|--------|--------|",
            f"| Tổng pins | {bank0_count1} | {bank0_count2} |",
            f"| Package type | {info1.get('type', 'N/A')} | {info2.get('type', 'N/A')} |",
            f"| Package size | {info1.get('size', 'N/A')} | {info2.get('size', 'N/A')} |",
            f"| Pitch | {info1.get('pitch', 'N/A')} | {info2.get('pitch', 'N/A')} |",
            "",
            f"**Giống nhau**: Cùng Device ({device}), cùng Bank 0 (Top-Left region), cùng các chức năng pin (IO_LXXY, GCLK, VREF).",
            "**Khác nhau**: Số lượng pin khác do package size khác. Ball assignments (vị trí vật lý) khác nhau hoàn toàn.",
        ]) + "\n"
        samples.append(_chat_sample(question, answer))
    return samples


def generate_part_number_samples(pin_db):
    """Section 5 - PART NUMBER: map each entry of ``PART_NUMBERS`` to a path.

    The temperature letter is taken from the part-number suffix, and the
    bank list from the matching database entry (the first match is used).
    When no entry matches, the pin count is 0 and a generic bank list is
    used.
    """
    samples = []
    for pn, info in PART_NUMBERS.items():
        dev = info["device"]
        pkg = info["package"]
        dev_info = DEVICE_INFO.get(dev, {})
        pkg_info = PACKAGE_INFO.get(pkg, {})

        total_pins = 0
        bank_text = _GENERIC_BANK_TEXT
        matches = _matching_entries(pin_db, pkg, dev)
        if matches:
            entry = matches[0]
            total_pins = entry["total_pins"]
            # Banks actually present for this device/package, first-seen order.
            seen_banks = dict.fromkeys(pin["bank"] for pin in entry["pins"])
            bank_text = ", ".join(str(b) for b in seen_banks) or _GENERIC_BANK_TEXT

        package_code = pkg.replace("(G)", "G")
        # The last character of the part number is the temperature letter.
        temp_code = pn[-1]
        has_transceiver = "Yes" if dev_info.get("transceiver") else "No"

        question = f"Part number {pn} ánh xạ vào cây phân cấp như thế nào?"
        answer = "\n".join([
            f"Part Number **{pn}** -> Tree Path:",
            "",
            "```",
            _tree_path(dev, pkg),
            "```",
            "",
            "**Phân tích Part Number:**",
            "- **XC6S**: Xilinx Spartan-6 Family",
            f"- **{dev}**: Device {dev} ({dev_info.get('description', '')})",
            f"- **-{info['speed']}**: Speed Grade {info['speed']}",
            f"- **{package_code}**: Package {pkg} ({pkg_info.get('description', '')})",
            f"- **{temp_code}**: {info['temp']} Temperature",
            "",
            "**Thuộc tính trong cây:**",
            f"- Node: Device ({dev})",
            f"  - Transceiver: {has_transceiver}",
            f"- Node con: Package ({pkg})",
            f"  - Type: {pkg_info.get('type', 'N/A')}",
            f"  - Pins: {total_pins}",
            f"  - Size: {pkg_info.get('size', 'N/A')}",
            f"- Các lá: {total_pins} pin balls (chia thành các Bank {bank_text})",
        ]) + "\n"
        samples.append(_chat_sample(question, answer))
    return samples


def _find_pins_by_keyword(pin_db, keyword, max_hits=SEARCH_MAX_HITS):
    """Collect up to *max_hits* pins whose description contains *keyword*."""
    hits = []
    for db in pin_db.values():
        devices = db["devices"]
        device = devices[0] if devices else "Unknown"
        for pin in db["pins"]:
            if keyword in pin["pin_description"]:
                hits.append({
                    "device": device,
                    "package": db["package"],
                    "bank": pin["bank"],
                    "ball": pin["pin_number"],
                    "desc": pin["pin_description"],
                })
                if len(hits) >= max_hits:
                    return hits
    return hits


def generate_pin_function_samples(pin_db):
    """Section 6 - PIN FUNCTION: one search sample per ``PIN_FUNCTIONS`` key.

    Keywords with no matching pin are skipped; otherwise up to eight of the
    collected pins are tabulated with their full tree paths.
    """
    samples = []
    for func_keyword, func_desc in PIN_FUNCTIONS.items():
        found_pins = _find_pins_by_keyword(pin_db, func_keyword)
        if not found_pins:
            continue

        rows = [
            f"| {p['device']} | {p['package']} | {p['bank']} | {p['ball']} | {p['desc']} "
            f"| {_tree_path(p['device'], p['package'], p['bank'], p['ball'])} |"
            for p in random.sample(found_pins, min(8, len(found_pins)))
        ]
        question = f"Tìm tất cả node lá có chức năng {func_keyword} ({func_desc}) trong cây Spartan-6"
        answer = "\n".join(
            [
                f"Tìm kiếm trong cây phân cấp: nodes có chức năng **{func_desc}** ({func_keyword})",
                "",
                "| Device | Package | Bank | Ball | Pin Description | Path |",
                "|--------|---------|------|------|-----------------|------|",
            ]
            + rows
            + [
                "",
                f"**Pattern trong cây**: Các node {func_keyword} thường nằm rải rác ở nhiều Bank khác nhau, nhưng tập trung nhiều ở Bank 0 và Bank 2 (Top và Bottom regions).",
            ]
        ) + "\n"
        samples.append(_chat_sample(question, answer))
    return samples


def _print_preview(title, section_samples):
    """Print a heading and, if available, the section's first sample."""
    print(f"\n=== {title} ===")
    if section_samples:
        print(json.dumps(section_samples[0], indent=2, ensure_ascii=False)[:1500])


def main():
    """Generate all sample sections, shuffle them and write the JSONL file."""
    pin_db = load_pin_database()

    # The tree itself is not written to the output file; it is still built,
    # as in the original script, so malformed entries fail before generation.
    build_fpga_tree(pin_db)

    print("[1/6] Generating TOP-DOWN tree path samples...")
    top_down = generate_top_down_samples(pin_db)

    print("[2/6] Generating BOTTOM-UP samples...")
    bottom_up = generate_bottom_up_samples(pin_db)

    print("[3/6] Generating SIBLING relationship samples...")
    siblings = generate_sibling_samples(pin_db)

    print("[4/6] Generating CROSS-BRANCH comparison samples...")
    cross_branch = generate_cross_branch_samples(pin_db)

    print("[5/6] Generating Part Number -> Tree Path samples...")
    part_numbers = generate_part_number_samples(pin_db)

    print("[6/6] Generating Pin Function -> Tree Search samples...")
    pin_functions = generate_pin_function_samples(pin_db)

    samples = top_down + bottom_up + siblings + cross_branch + part_numbers + pin_functions

    # Save
    print(f"\nTotal tree-structured samples: {len(samples)}")
    random.shuffle(samples)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for sample in samples:
            f.write(json.dumps(sample, ensure_ascii=False) + "\n")
    print(f"Saved to {OUTPUT_FILE}")

    # Show samples. The previews come from the per-section lists so that each
    # heading matches what is printed, and an empty section prints nothing.
    _print_preview("Sample 1: TOP-DOWN", top_down)
    _print_preview("Sample 2: BOTTOM-UP", bottom_up)


if __name__ == "__main__":
    main()