# Repository: hdmt-rag-local / build_tree_training_data.py
# Author: trandangduc0
# Commit: "Upload build_tree_training_data.py with huggingface_hub" (7031f38, verified)
"""
build_tree_training_data.py - Tạo training data có cấu trúc CÂY (tree-structured)
Cho phép AI navigate: Root -> Device -> Package -> Bank -> Pin
Và ngược lại: Pin -> tìm đường về Root
Chạy: python build_tree_training_data.py
Output: fpga_tree_training_data.jsonl
"""
import json
import random
from collections import defaultdict
# Load the pin database that every section of this script iterates over.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's locale default; the output file of this script is UTF-8 as well.
with open("pin_database.json", "r", encoding="utf-8") as f:
    PIN_DB = json.load(f)
# ═══════════════════════════════════════════════════════════════════════════════
# BUILD THE FPGA HIERARCHY TREE
# ═══════════════════════════════════════════════════════════════════════════════
# Static skeleton of the hierarchy: the root node and the single family node.
# The family's "children" mapping starts out empty.
FPGA_TREE = dict(
    name="Xilinx Integrated Circuits",
    type="root",
    children={
        "Spartan-6 FPGA": dict(
            type="family",
            description="Low-cost FPGA family for high-volume applications",
            children={},
        ),
    },
)
# Device info: per-device metadata keyed by the device name used in PIN_DB.
#   transceiver - True for the "T" variants that carry GTP transceivers
#   description - short text that is embedded verbatim in generated answers
#   packages    - packages the device is offered in
# Logic-cell counts follow the Spartan-6 family overview (slices x 6.4):
# LX16 is 14,579 and LX75/LX75T are 74,637. LX25 is also offered in FG(G)484.
DEVICE_INFO = {
    "LX4": {
        "transceiver": False,
        "description": "Smallest Spartan-6, 3,840 logic cells",
        "packages": ["TQG144", "CPG196", "CSG225"],
    },
    "LX9": {
        "transceiver": False,
        "description": "9,152 logic cells",
        "packages": ["TQG144", "CPG196", "CSG225", "FT(G)256", "CSG324"],
    },
    "LX16": {
        "transceiver": False,
        "description": "14,579 logic cells",
        "packages": ["CPG196", "CSG225", "FT(G)256", "CSG324"],
    },
    "LX25": {
        "transceiver": False,
        "description": "24,051 logic cells",
        "packages": ["FT(G)256", "CSG324", "FG(G)484"],
    },
    "LX25T": {
        "transceiver": True,
        "description": "24,051 logic cells + GTP transceivers",
        "packages": ["CSG324", "FG(G)484"],
    },
    "LX45": {
        "transceiver": False,
        "description": "43,661 logic cells",
        "packages": ["CSG324", "FG(G)484", "CS(G)484", "FG(G)676"],
    },
    "LX45T": {
        "transceiver": True,
        "description": "43,661 logic cells + GTP transceivers",
        "packages": ["CSG324", "FG(G)484", "CS(G)484"],
    },
    "LX75": {
        "transceiver": False,
        "description": "74,637 logic cells",
        "packages": ["FG(G)484", "CS(G)484", "FG(G)676"],
    },
    "LX75T": {
        "transceiver": True,
        "description": "74,637 logic cells + GTP transceivers",
        "packages": ["FG(G)484", "CS(G)484", "FG(G)676"],
    },
    "LX100": {
        "transceiver": False,
        "description": "101,261 logic cells",
        "packages": ["FG(G)484", "CS(G)484", "FG(G)676"],
    },
    "LX100T": {
        "transceiver": True,
        "description": "101,261 logic cells + GTP transceivers",
        "packages": ["FG(G)484", "CS(G)484", "FG(G)676", "FG(G)900"],
    },
    "LX150": {
        "transceiver": False,
        "description": "147,443 logic cells",
        "packages": ["FG(G)484", "CS(G)484", "FG(G)676", "FG(G)900"],
    },
    "LX150T": {
        "transceiver": True,
        "description": "147,443 logic cells + GTP transceivers",
        "packages": ["FG(G)484", "CS(G)484", "FG(G)676", "FG(G)900"],
    },
}
# Package info: physical attributes per package name, built from a compact
# table. Columns: name, type, ball/lead pitch, body size, pin count, description.
PACKAGE_INFO = {
    name: dict(zip(("type", "pitch", "size", "pins", "description"), attributes))
    for name, *attributes in (
        ("TQG144", "TQFP", "0.5mm", "20x20mm", 144, "Thin Quad Flat Pack"),
        ("CPG196", "CSBGA", "0.5mm", "8x8mm", 196, "Chip Scale BGA"),
        ("CSG225", "CSBGA", "0.8mm", "13x13mm", 225, "Chip Scale BGA"),
        ("FT(G)256", "BGA", "1.0mm", "17x17mm", 256, "Fine-pitch Thin BGA"),
        ("CSG324", "CSBGA", "0.8mm", "15x15mm", 324, "Chip Scale BGA"),
        ("FG(G)484", "BGA", "1.0mm", "23x23mm", 484, "Fine-pitch BGA"),
        ("CS(G)484", "CSBGA", "0.8mm", "19x19mm", 484, "Chip Scale BGA"),
        ("FG(G)676", "BGA", "1.0mm", "27x27mm", 676, "Fine-pitch BGA"),
        ("FG(G)900", "BGA", "1.0mm", "31x31mm", 900, "Fine-pitch BGA"),
    )
}
# Bank locations: human-readable placement label for each bank id (bank ids
# are strings). Assembled group by group; insertion order is kept.
BANK_LOCATION = {}
# General-purpose I/O banks.
BANK_LOCATION.update({
    "0": "Top-Left",
    "1": "Right-Top",
    "2": "Bottom",
    "3": "Left",
    "4": "Left-Top (LX75T/LX100T/LX150T only)",
    "5": "Right-Top extra (LX75T/LX100T/LX150T only)",
})
# GTP transceiver banks.
BANK_LOCATION.update({
    "101": "GTP Transceiver Top-Left",
    "123": "GTP Transceiver Top-Right",
})
# Pins that belong to no I/O bank.
BANK_LOCATION["NA"] = "Dedicated pins (JTAG, Config, Power)"
# Build tree from PIN_DB: attach device -> package -> bank -> pin nodes under
# the Spartan-6 family node. Existing nodes are reused (setdefault), so a
# device/package/bank seen in an earlier entry keeps its original attributes.
family_children = FPGA_TREE["children"]["Spartan-6 FPGA"]["children"]
for db in PIN_DB.values():
    package = db["package"]
    pkg_info = PACKAGE_INFO.get(package, {})

    # Group pins by bank once per entry: the grouping is the same for every
    # device that shares this pinout.
    bank_pins = defaultdict(list)
    for pin in db["pins"]:
        bank_pins[pin["bank"]].append(pin)

    for device in db["devices"]:
        dev_info = DEVICE_INFO.get(device, {})
        device_node = family_children.setdefault(device, {
            "type": "device",
            "transceiver": dev_info.get("transceiver", False),
            "description": dev_info.get("description", ""),
            "children": {},
        })
        package_node = device_node["children"].setdefault(package, {
            "type": "package",
            "pins": pkg_info.get("pins", 0),
            "description": pkg_info.get("description", ""),
            "children": {},
        })
        for bank, pins in bank_pins.items():
            bank_node = package_node["children"].setdefault(bank, {
                "type": "bank",
                "location": BANK_LOCATION.get(bank, "Unknown"),
                "pin_count": len(pins),
                "children": {},
            })
            # Leaf nodes are always (re)written, keyed by ball / pin number.
            for pin in pins:
                bank_node["children"][pin["pin_number"]] = {
                    "type": "pin",
                    "description": pin["pin_description"],
                    "bufio2": pin.get("bufio2_region", ""),
                    "no_connect": pin.get("no_connect", False),
                }
# ═══════════════════════════════════════════════════════════════════════════════
# GENERATE TREE-STRUCTURED TRAINING DATA
# ═══════════════════════════════════════════════════════════════════════════════
def create_tree_path(device, package, bank, pin_num, pin_desc):
    """Describe the route from the tree root down to a single pin.

    Args:
        device: Device name, looked up in DEVICE_INFO.
        package: Package name, looked up in PACKAGE_INFO.
        bank: Bank id, looked up in BANK_LOCATION.
        pin_num: Ball / pin number of the leaf.
        pin_desc: Pin description shown as the leaf's definition.

    Returns:
        A ``(path, explanation)`` tuple: the slash-separated tree path and a
        Markdown bullet list walking every level from root to leaf. Missing
        metadata is rendered as an empty string, 'N/A' or 'Unknown'.
    """
    package_meta = PACKAGE_INFO.get(package, {})
    device_meta = DEVICE_INFO.get(device, {})
    bank_label = BANK_LOCATION.get(bank, "Unknown")
    transceiver_flag = "Yes" if device_meta.get("transceiver") else "No"

    bullet_lines = [
        "Đường đi trong cây phân cấp:",
        "- **Root**: Xilinx Integrated Circuits",
        "- **Family**: Spartan-6 FPGA (low-cost, high-volume)",
        f"- **Device**: {device} ({device_meta.get('description', '')})",
        f"- Transceiver: {transceiver_flag}",
        f"- **Package**: {package} ({package_meta.get('description', '')})",
        f"- Type: {package_meta.get('type', 'N/A')}",
        f"- Pitch: {package_meta.get('pitch', 'N/A')}",
        f"- Size: {package_meta.get('size', 'N/A')}",
        f"- Total pins: {package_meta.get('pins', 'N/A')}",
        f"- **Bank**: {bank} ({bank_label})",
        f"- **Pin (Leaf Node)**: Ball {pin_num}",
        f"- **Định nghĩa**: {pin_desc}",
    ]
    # The explanation ends with a newline after the last bullet.
    explanation = "\n".join(bullet_lines) + "\n"
    path = "/".join(
        ["Root", "Xilinx", "Spartan-6", f"{device}", f"{package}", f"Bank_{bank}", f"{pin_num}"]
    )
    return path, explanation
# Generate samples
samples = []


def _iter_top_down_samples():
    """Lazily yield TOP-DOWN chat samples (root -> leaf navigation).

    For every PIN_DB entry and each of its devices, up to three random pins
    are drawn from every bank. Each drawn pin is phrased as four different
    questions that all share the explanation from create_tree_path().
    Being a generator, no sampling work happens beyond what the consumer
    actually pulls.
    """
    for db in PIN_DB.values():
        package = db["package"]
        # Grouping by bank is device-independent, so do it once per entry.
        bank_pins = defaultdict(list)
        for pin in db["pins"]:
            bank_pins[pin["bank"]].append(pin)
        for device in db["devices"]:
            for bank, pins in bank_pins.items():
                for pin in random.sample(pins, min(3, len(pins))):
                    pin_num = pin["pin_number"]
                    _, explanation = create_tree_path(
                        device, package, bank, pin_num, pin["pin_description"]
                    )
                    # Various question styles for the same leaf.
                    questions = [
                        f"Đi từ gốc Root xuống lá: Spartan-6 -> {device} -> {package} -> Bank {bank} -> Pin {pin_num}. Node lá này là gì?",
                        f"Trong cây phân cấp FPGA, tìm đường đến ball {pin_num} của {device} {package} Bank {bank}",
                        f"Navigate tree: Root/Xilinx/Spartan-6/{device}/{package}/Bank_{bank}/{pin_num}",
                        f"Từ Spartan-6 đi xuống {device} package {package} bank {bank}, ball {pin_num} là gì?",
                    ]
                    for q in questions:
                        yield {
                            "messages": [
                                {"role": "user", "content": q},
                                {"role": "assistant", "content": explanation},
                            ]
                        }


# 1. TOP-DOWN: Root -> Leaf (capped at 3000 samples)
print("[1/6] Generating TOP-DOWN tree path samples...")
count = 0
for sample in _iter_top_down_samples():
    samples.append(sample)
    count += 1
    if count >= 3000:
        # Stop pulling from the generator so no further pins are sampled.
        break
def _iter_bottom_up_samples():
    """Lazily yield BOTTOM-UP chat samples (leaf -> root navigation).

    For every PIN_DB entry and each of its devices, up to ten random pins are
    drawn; each pin is phrased as four questions sharing one answer that
    walks from the ball back up to the root.
    """
    for db in PIN_DB.values():
        package = db["package"]
        pkg_info = PACKAGE_INFO.get(package, {})
        for device in db["devices"]:
            dev_info = DEVICE_INFO.get(device, {})
            for pin in random.sample(db["pins"], min(10, len(db["pins"]))):
                pin_num = pin["pin_number"]
                pin_desc = pin["pin_description"]
                bank = pin["bank"]
                questions = [
                    f"Tôi có node lá {pin_desc} tại ball {pin_num}. Tìm đường đi ngược về gốc.",
                    f"Pin {pin_num} ({pin_desc}) thuộc device nào, package nào, bank nào?",
                    f"Navigate up: {pin_desc}@{pin_num} -> ? -> ? -> Root",
                    # The package and the pin description are separated here;
                    # without a separator the two names ran together.
                    f"Ball {pin_num} trong {package} ({pin_desc}). Nó nằm ở đâu trong cây FPGA?",
                ]
                answer = f"""Đường đi ngược từ lá về gốc:
- **Node lá**: Ball {pin_num} = **{pin_desc}**
- **Bank**: {bank} ({BANK_LOCATION.get(bank, 'Unknown')})
- **Package**: {package} ({pkg_info.get('description', '')}, {pkg_info.get('pins', 'N/A')} pins)
- **Device**: {device} ({dev_info.get('description', '')})
- **Family**: Spartan-6 FPGA
- **Root**: Xilinx Integrated Circuits
Full path: `{pin_num}/{pin_desc} -> Bank {bank} -> {package} -> {device} -> Spartan-6 -> Xilinx`
"""
                for q in questions:
                    yield {
                        "messages": [
                            {"role": "user", "content": q},
                            {"role": "assistant", "content": answer},
                        ]
                    }


# 2. BOTTOM-UP: Leaf -> Root (capped at 2000 samples)
print("[2/6] Generating BOTTOM-UP samples...")
count = 0
for sample in _iter_bottom_up_samples():
    samples.append(sample)
    count += 1
    if count >= 2000:
        # Stop pulling from the generator so no further pins are sampled.
        break
def _pin_function_label(desc):
    """Return the coarse function label shown for a pin description.

    Rules are checked in a fixed priority order and the first match wins.
    JTAG names must match the whole description; every other rule is a
    substring test. Anything unmatched is reported as user I/O.
    """
    if "GCLK" in desc:
        return "Global Clock"
    if "VREF" in desc:
        return "Voltage Reference"
    if "HSWAPEN" in desc:
        return "Hot Swap Enable"
    if desc in ("TCK", "TDI", "TDO", "TMS"):
        return "JTAG"
    if "CCLK" in desc:
        return "Config Clock"
    if "PROGRAM" in desc:
        return "Config Reset"
    if "DONE" in desc:
        return "Config Done"
    if "GND" in desc:
        return "Ground"
    if "VCC" in desc:
        return "Power"
    return "User I/O"


# 3. SIBLING: pins that share a bank (capped at 1000 samples)
print("[3/6] Generating SIBLING relationship samples...")
count = 0
for db in PIN_DB.values():
    if count >= 1000:
        break
    package = db["package"]
    devices = db["devices"]
    if not devices:
        continue
    # Only the first device of each entry is used for this section.
    device = devices[0]

    # Group by bank
    bank_pins = defaultdict(list)
    for pin in db["pins"]:
        bank_pins[pin["bank"]].append(pin)

    for bank, pins in bank_pins.items():
        if count >= 1000:
            break
        # Banks with fewer than three pins are skipped.
        if len(pins) < 3:
            continue
        sampled = random.sample(pins, min(5, len(pins)))
        q = f"Trong cây {device}/{package}/Bank_{bank}, liệt kê các node lá (siblings)"
        header = f"""Các node lá (pins) trong nhánh **{device}/{package}/Bank {bank}** (siblings):
| Ball | Pin Description | Chức năng |
|------|----------------|-----------|
"""
        rows = "".join(
            f"| {p['pin_number']} | {p['pin_description']} | {_pin_function_label(p['pin_description'])} |\n"
            for p in sampled
        )
        # The total counts every pin in the bank, not just the sampled rows.
        footer = f"\nTổng: {len(pins)} pins trong Bank {bank} ({BANK_LOCATION.get(bank, '')})"
        a = header + rows + footer
        samples.append({
            "messages": [
                {"role": "user", "content": q},
                {"role": "assistant", "content": a},
            ]
        })
        count += 1
# 4. CROSS-BRANCH: compare Bank 0 of LX150T across two packages
# (one sample per listed package pair, never more than 800)
print("[4/6] Generating CROSS-BRANCH comparison samples...")
count = 0
packages_to_compare = [("FG(G)484", "CS(G)484"), ("FG(G)484", "FG(G)676"), ("CS(G)484", "FG(G)676")]
# Index the LX150T entries by package. When several entries share a package,
# the one that appears last in PIN_DB is the one kept.
lx150t_by_package = {
    entry["package"]: entry
    for entry in PIN_DB.values()
    if "LX150T" in entry["devices"]
}
for pkg1, pkg2 in packages_to_compare:
    db1 = lx150t_by_package.get(pkg1)
    db2 = lx150t_by_package.get(pkg2)
    # Both packages must have (non-empty) data to be comparable.
    if not (db1 and db2):
        continue
    # Number of pins assigned to bank "0" on each side.
    bank0_count1 = sum(1 for p in db1["pins"] if p["bank"] == "0")
    bank0_count2 = sum(1 for p in db2["pins"] if p["bank"] == "0")
    info1 = PACKAGE_INFO.get(pkg1, {})
    info2 = PACKAGE_INFO.get(pkg2, {})
    q = f"So sánh cây LX150T {pkg1} và LX150T {pkg2}, Bank 0 khác nhau thế nào?"
    a = f"""So sánh nhánh **LX150T/{pkg1}/Bank_0** vs **LX150T/{pkg2}/Bank_0**:
| Thuộc tính | {pkg1} | {pkg2} |
|------------|--------|--------|
| Tổng pins | {bank0_count1} | {bank0_count2} |
| Package type | {info1.get('type', 'N/A')} | {info2.get('type', 'N/A')} |
| Package size | {info1.get('size', 'N/A')} | {info2.get('size', 'N/A')} |
| Pitch | {info1.get('pitch', 'N/A')} | {info2.get('pitch', 'N/A')} |
**Giống nhau**: Cùng Device (LX150T), cùng Bank 0 (Top-Left region), cùng các chức năng pin (IO_LXXY, GCLK, VREF).
**Khác nhau**: Số lượng pin khác do package size khác. Ball assignments (vị trí vật lý) khác nhau hoàn toàn.
"""
    if count < 800:
        samples.append({
            "messages": [
                {"role": "user", "content": q},
                {"role": "assistant", "content": a},
            ]
        })
        count += 1
# 5. PART NUMBER -> TREE PATH (one sample per entry in PART_NUMBERS)
print("[5/6] Generating Part Number -> Tree Path samples...")
PART_NUMBERS = {
    "XC6SLX150T-2FGG484C": {"device": "LX150T", "package": "FG(G)484", "speed": "2", "temp": "Commercial"},
    "XC6SLX150T-2CSG484C": {"device": "LX150T", "package": "CS(G)484", "speed": "2", "temp": "Commercial"},
    "XC6SLX150T-2FGG676C": {"device": "LX150T", "package": "FG(G)676", "speed": "2", "temp": "Commercial"},
    "XC6SLX45-3CSG324I": {"device": "LX45", "package": "CSG324", "speed": "3", "temp": "Industrial"},
    "XC6SLX9-2TQG144C": {"device": "LX9", "package": "TQG144", "speed": "2", "temp": "Commercial"},
}
for pn, info in PART_NUMBERS.items():
    dev = info["device"]
    pkg = info["package"]
    dev_info = DEVICE_INFO.get(dev, {})
    pkg_info = PACKAGE_INFO.get(pkg, {})
    has_transceiver = bool(dev_info.get("transceiver"))

    # Total pins come from the first matching PIN_DB entry. When the database
    # has no entry for this device/package, fall back to the nominal package
    # pin count rather than reporting 0 pins.
    total_pins = next(
        (
            db["total_pins"]
            for db in PIN_DB.values()
            if db["package"] == pkg and dev in db["devices"]
        ),
        pkg_info.get("pins", 0),
    )

    # The temperature code is the last character of the part number
    # ("C" or "I"); it must agree with the temperature name printed beside it.
    temp_code = pn[-1]

    # Banks 101/123 are GTP transceiver banks, so they are only listed for
    # devices that have transceivers.
    bank_summary = "Bank 0-3, NA, 101, 123" if has_transceiver else "Bank 0-3, NA"

    q = f"Part number {pn} ánh xạ vào cây phân cấp như thế nào?"
    a = f"""Part Number **{pn}** -> Tree Path:
```
Root/Xilinx/Spartan-6/{dev}/{pkg}
```
**Phân tích Part Number:**
- **XC6S**: Xilinx Spartan-6 Family
- **{dev}**: Device {dev} ({dev_info.get('description', '')})
- **-{info['speed']}**: Speed Grade {info['speed']}
- **{pkg.replace('(G)', 'G')}**: Package {pkg} ({pkg_info.get('description', '')})
- **{temp_code}**: {info['temp']} Temperature
**Thuộc tính trong cây:**
- Node: Device ({dev})
- Transceiver: {'Yes' if has_transceiver else 'No'}
- Node con: Package ({pkg})
- Type: {pkg_info.get('type', 'N/A')}
- Pins: {total_pins}
- Size: {pkg_info.get('size', 'N/A')}
- Các lá: {total_pins} pin balls (chia thành các {bank_summary})
"""
    samples.append({
        "messages": [
            {"role": "user", "content": q},
            {"role": "assistant", "content": a},
        ]
    })
# 6. PIN FUNCTION -> FIND IN TREE (at most one sample per keyword below)
print("[6/6] Generating Pin Function -> Tree Search samples...")
PIN_FUNCTIONS = {
    "GCLK": "Global Clock pins",
    "VREF": "Voltage Reference pins",
    "HSWAPEN": "Hot Swap Enable pin",
    "CCLK": "Configuration Clock",
    "PROGRAM_B": "Program/Reset configuration",
    "DONE": "Configuration Done indicator",
    "TCK": "JTAG Test Clock",
    "TDI": "JTAG Test Data In",
    "TDO": "JTAG Test Data Out",
    "TMS": "JTAG Test Mode Select",
}
for func_keyword, func_desc in PIN_FUNCTIONS.items():
    # Find example pins: a pin matches when the keyword occurs anywhere in
    # its description. Each entry is attributed to its first listed device.
    found_pins = []
    for db in PIN_DB.values():
        package = db["package"]
        devices = db["devices"]
        first_device = devices[0] if devices else "Unknown"
        found_pins.extend(
            {
                "device": first_device,
                "package": package,
                "bank": pin["bank"],
                "ball": pin["pin_number"],
                "desc": pin["pin_description"],
            }
            for pin in db["pins"]
            if func_keyword in pin["pin_description"]
        )
        # Stop scanning further entries once enough candidates are collected.
        if len(found_pins) >= 20:
            break
    if not found_pins:
        continue

    sampled = random.sample(found_pins, min(8, len(found_pins)))
    q = f"Tìm tất cả node lá có chức năng {func_keyword} ({func_desc}) trong cây Spartan-6"
    header = f"""Tìm kiếm trong cây phân cấp: nodes có chức năng **{func_desc}** ({func_keyword})
| Device | Package | Bank | Ball | Pin Description | Path |
|--------|---------|------|------|-----------------|------|
"""
    # The Path column uses the same Root/Xilinx/Spartan-6/... prefix as the
    # tree paths emitted by create_tree_path() and the part-number section.
    rows = "".join(
        f"| {p['device']} | {p['package']} | {p['bank']} | {p['ball']} | {p['desc']} | "
        f"Root/Xilinx/Spartan-6/{p['device']}/{p['package']}/Bank_{p['bank']}/{p['ball']} |\n"
        for p in sampled
    )
    # NOTE(review): the closing remark about Bank 0 / Bank 2 is fixed text and
    # is not derived from the pins found above -- confirm it holds for every
    # keyword (e.g. pins whose bank is "NA").
    footer = f"""
**Pattern trong cây**: Các node {func_keyword} thường nằm rải rác ở nhiều Bank khác nhau,
nhưng tập trung nhiều ở Bank 0 và Bank 2 (Top và Bottom regions).
"""
    a = header + rows + footer
    samples.append({
        "messages": [
            {"role": "user", "content": q},
            {"role": "assistant", "content": a},
        ]
    })
# Save: shuffle, then write one JSON object per line (JSON Lines), keeping
# non-ASCII text readable in the file.
print(f"\nTotal tree-structured samples: {len(samples)}")
random.shuffle(samples)
output_file = "fpga_tree_training_data.jsonl"
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(s, ensure_ascii=False) + "\n" for s in samples)
print(f"Saved to {output_file}")

# Show samples. The list has just been shuffled, so each preview is selected
# by the wording of its question rather than by position; a preview is
# skipped when no matching sample exists (including an empty dataset).
top_down_markers = ("Đi từ gốc", "tìm đường đến ball", "Navigate tree", "đi xuống")
print("\n=== Sample 1: TOP-DOWN ===")
top_down = next(
    (
        s for s in samples
        if any(marker in s["messages"][0]["content"] for marker in top_down_markers)
    ),
    None,
)
if top_down is not None:
    print(json.dumps(top_down, indent=2, ensure_ascii=False)[:1500])

print("\n=== Sample 2: BOTTOM-UP ===")
bottom_up = [
    s for s in samples
    if "ngược" in s["messages"][0]["content"] or "up" in s["messages"][0]["content"].lower()
]
if bottom_up:
    print(json.dumps(bottom_up[0], indent=2, ensure_ascii=False)[:1500])