# hdmt-rag-local / build_training_data.py
# Uploaded by: trandangduc0
# Commit message: Upload build_training_data.py with huggingface_hub
# Commit: 45c62e9 (verified)
"""
build_training_data.py - Convert pin_database.json into a training dataset for fine-tuning an LLM.

Run:    python build_training_data.py
Output: fpga_training_data.jsonl (conversational format)
"""
import json
import random
import re
from collections import defaultdict
# Load the pin database. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's locale default encoding
# (the output file written by this script is UTF-8 as well).
with open("pin_database.json", "r", encoding="utf-8") as f:
    PIN_DB = json.load(f)

# Build reverse index: device name -> set of package names it appears with.
# Each database entry is read for its "devices" list and its "package" value.
DEVICE_PACKAGES = defaultdict(set)
for db in PIN_DB.values():
    for dev in db["devices"]:
        DEVICE_PACKAGES[dev].add(db["package"])
# Package codes listed per device, in the order the part numbers are emitted.
_DEVICE_PACKAGE_CODES = {
    "LX4": ("TQG144", "CPG196", "CSG225"),
    "LX9": ("TQG144", "CPG196", "CSG225", "FTG256", "CSG324"),
    "LX16": ("CPG196", "CSG225", "FTG256", "CSG324"),
    "LX25": ("FTG256", "CSG324"),
    "LX25T": ("CSG324", "FGG484"),
    "LX45": ("CSG324", "FGG484", "CSG484", "FGG676"),
    "LX45T": ("CSG324", "FGG484", "CSG484"),
    "LX75": ("FGG484", "CSG484", "FGG676"),
    "LX75T": ("FGG484", "CSG484", "FGG676"),
    "LX100": ("FGG484", "CSG484", "FGG676"),
    "LX100T": ("FGG484", "CSG484", "FGG676", "FGG900"),
    "LX150": ("FGG484", "CSG484", "FGG676", "FGG900"),
    "LX150T": ("FGG484", "CSG484", "FGG676", "FGG900"),
}

# Device name -> list of full part-number strings.
# Every entry has the shape "XC6S<device>-2<package code>C".
PART_NUMBER_PATTERNS = {
    device: [f"XC6S{device}-2{code}C" for code in codes]
    for device, codes in _DEVICE_PACKAGE_CODES.items()
}
# Splits a part number such as "XC6SLX9-2TQG144C" into
# (device, speed grade, package code, temperature grade letter).
_PART_NUMBER_RE = re.compile(r'XC6S(LX\d+T?)-(\d+)([A-Z]+\d+)([A-Z])')

# Temperature grade letter -> readable name; unknown letters are echoed as-is.
_TEMPERATURE_GRADES = {"C": "Commercial", "I": "Industrial"}


def _qa(question, answer):
    """Wrap one question/answer pair in the conversational ``messages`` format."""
    return {
        "messages": [
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        ]
    }


def _bank_sort_key(bank):
    """Return the numeric value of a bank label, or -1 when it is not all digits.

    The label is converted with ``str`` first so that banks stored as JSON
    numbers or ``null`` do not raise ``AttributeError`` on ``.isdigit()``.
    """
    text = str(bank)
    return int(text) if text.isdigit() else -1


def _find_part_number(devices, package):
    """Return the first known part number matching any of *devices* in *package*.

    The package name is tried with "(G)" rewritten both as "G" and as "",
    and matched as a substring of each candidate part number. Returns
    ``None`` when nothing matches.
    """
    candidates = (package.replace("(G)", "G"), package.replace("(G)", ""))
    for dev in devices:
        for pn in PART_NUMBER_PATTERNS.get(dev, ()):
            if any(candidate in pn for candidate in candidates):
                return pn
    return None


def _pin_lookup_samples():
    """Build pin-lookup Q&A (English and Vietnamese) for a random subset of pins."""
    samples = []
    for db in PIN_DB.values():
        package = db["package"]
        devices = db["devices"]
        pins = db["pins"]
        # The part number depends only on the database entry, not on the pin,
        # so resolve it once per entry instead of once per sampled pin.
        part_num = _find_part_number(devices, package)

        # Randomly pick at most 60 pins per entry (not all, to save size).
        for pin in random.sample(pins, min(60, len(pins))):
            pin_num = pin["pin_number"]
            desc = pin["pin_description"]
            bank = pin["bank"]

            # Q&A type 1: part number + pin
            if part_num:
                samples.append(_qa(
                    f"What is pin {pin_num} on {part_num}?",
                    f"Pin {pin_num} on {part_num} is **{desc}** (Bank {bank}).",
                ))
                samples.append(_qa(
                    f"{part_num} ball {pin_num} là gì?",
                    f"Ball {pin_num} trên {part_num} là **{desc}** (Bank {bank}).",
                ))

            # Q&A type 2: package + device + pin (first two devices only,
            # to keep the sample count from exploding).
            for dev in devices[:2]:
                samples.append(_qa(
                    f"What is the function of pin {pin_num} on Spartan-6 {dev} in {package}?",
                    f"On Spartan-6 {dev} in {package} package, pin {pin_num} is **{desc}** (Bank {bank}).",
                ))
                samples.append(_qa(
                    f"Pin {pin_num} trên {package} {dev} là gì?",
                    f"Pin {pin_num} trên {package} {dev} là **{desc}** (Bank {bank}).",
                ))
    return samples


def _device_info_samples():
    """Build per-device Q&A: supported packages and pin counts per package."""
    samples = []
    for device, packages in DEVICE_PACKAGES.items():
        packages_list = sorted(packages)
        samples.append(_qa(
            f"What packages does Spartan-6 {device} support?",
            f"Spartan-6 {device} supports the following packages: {', '.join(packages_list)}.",
        ))
        samples.append(_qa(
            f"{device} hỗ trợ những package nào?",
            f"{device} hỗ trợ các package: {', '.join(packages_list)}.",
        ))

        # Pin count per package (first three packages in sorted order).
        for pkg in packages_list[:3]:
            entry = next(
                (
                    db for db in PIN_DB.values()
                    if db["package"] == pkg and device in db["devices"]
                ),
                None,
            )
            # Compare against None so an entry stored under a falsy key
            # (e.g. "") is not skipped by accident.
            if entry is not None:
                total = entry["total_pins"]
                samples.append(_qa(
                    f"How many pins does {device} in {pkg} have?",
                    f"{device} in {pkg} package has **{total} pins**.",
                ))
    return samples


def _function_search_samples():
    """Build search-by-function Q&A: GCLK pins, VREF pins and per-bank pin counts."""
    samples = []
    for db in PIN_DB.values():
        package = db["package"]
        devices = db["devices"]
        pins = db["pins"]
        if not devices:
            continue
        dev = devices[0]  # only the first listed device is used here

        # GCLK pins: list the first five, report the total.
        gclk_pins = [p for p in pins if "GCLK" in p["pin_description"]]
        if gclk_pins:
            pin_list = ", ".join(
                f"{p['pin_number']} ({p['pin_description']}, Bank {p['bank']})"
                for p in gclk_pins[:5]
            )
            samples.append(_qa(
                f"Where are the GCLK pins on {dev} {package}?",
                f"GCLK pins on {dev} {package}: {pin_list}. Total: {len(gclk_pins)} GCLK pins.",
            ))

        # VREF pins: list the first five, report the total.
        vref_pins = [p for p in pins if "VREF" in p["pin_description"]]
        if vref_pins:
            pin_list = ", ".join(
                f"{p['pin_number']} (Bank {p['bank']})" for p in vref_pins[:5]
            )
            samples.append(_qa(
                f"VREF pins on {dev} {package}?",
                f"VREF pins on {dev} {package}: {pin_list}. Total: {len(vref_pins)} VREF pins.",
            ))

        # Bank summary: count pins per bank, show at most six banks,
        # non-numeric bank labels first, then numeric banks ascending.
        bank_counts = defaultdict(int)
        for p in pins:
            bank_counts[p["bank"]] += 1
        ordered = sorted(bank_counts.items(), key=lambda item: _bank_sort_key(item[0]))
        bank_info = ", ".join(f"Bank {b}: {c} pins" for b, c in ordered[:6])
        samples.append(_qa(
            f"How many pins per bank on {dev} {package}?",
            f"{dev} {package} pin distribution: {bank_info}.",
        ))
    return samples


def _part_number_samples():
    """Build Q&A that decode each known part number into its fields."""
    samples = []
    for pns in PART_NUMBER_PATTERNS.values():
        for pn in pns:
            m = _PART_NUMBER_RE.match(pn)
            if not m:
                continue
            dev, speed, pkg, temp = m.groups()
            temp_str = _TEMPERATURE_GRADES.get(temp, temp)
            samples.append(_qa(
                f"What is {pn}?",
                f"{pn} is a Xilinx Spartan-6 FPGA: Device {dev}, Speed Grade {speed}, "
                f"Package {pkg}, Temperature Grade {temp_str}.",
            ))
    return samples


def _differential_pair_samples():
    """Build Q&A for up to three P/N pin groups per database entry."""
    samples = []
    for db in PIN_DB.values():
        package = db["package"]
        devices = db["devices"]

        # Group pins whose description contains "_P_" or "_N_" under the
        # description with that marker collapsed to a single underscore.
        pairs = defaultdict(list)
        for p in db["pins"]:
            desc = p["pin_description"]
            if "_P_" in desc or "_N_" in desc:
                base = desc.replace("_P_", "_").replace("_N_", "_")
                pairs[base].append(p)

        if not devices:
            continue
        dev = devices[0]

        # Only the first three groups (in insertion order) are considered.
        for base, pins in list(pairs.items())[:3]:
            if len(pins) != 2:
                continue
            p_pin = next((p for p in pins if "_P_" in p["pin_description"]), None)
            n_pin = next((p for p in pins if "_N_" in p["pin_description"]), None)
            # A two-pin group may hold two P pins or two N pins; skip it
            # rather than fail on the missing half.
            if p_pin is None or n_pin is None:
                continue
            name = base.rstrip('_')
            samples.append(_qa(
                f"What is the differential pair {name} on {dev} {package}?",
                f"{name} differential pair on {dev} {package}: "
                f"Positive (P) at pin **{p_pin['pin_number']}** (Bank {p_pin['bank']}), "
                f"Negative (N) at pin **{n_pin['pin_number']}** (Bank {n_pin['bank']}).",
            ))
    return samples


def create_qa_samples():
    """Create question/answer training samples from the pin database.

    Reads the module-level ``PIN_DB``, ``DEVICE_PACKAGES`` and
    ``PART_NUMBER_PATTERNS`` and produces, in this order:

    1. pin lookup samples (random subset of pins per database entry),
    2. device info samples (supported packages, pin counts),
    3. search-by-function samples (GCLK, VREF, pins per bank),
    4. part number parsing samples,
    5. differential pair samples.

    Returns:
        A list of dicts, each ``{"messages": [user_message, assistant_message]}``.
    """
    samples = []
    samples.extend(_pin_lookup_samples())
    samples.extend(_device_info_samples())
    samples.extend(_function_search_samples())
    samples.extend(_part_number_samples())
    samples.extend(_differential_pair_samples())
    return samples
if __name__ == "__main__":
    # Script entry point: generate the samples, shuffle them, write them out
    # as JSON Lines, then echo one record for a quick visual check.
    print("Building training data from pin_database.json...")
    records = create_qa_samples()
    print(f"Generated {len(records)} training samples")

    # Shuffle in place so the sample categories are interleaved in the file.
    random.shuffle(records)

    # One JSON object per line; ensure_ascii=False keeps the Vietnamese text
    # readable instead of emitting \u escapes.
    out_path = "fpga_training_data.jsonl"
    with open(out_path, "w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )
    print(f"Saved to {out_path}")

    # Show the first record after shuffling.
    print("\n=== Sample ===")
    print(json.dumps(records[0], indent=2, ensure_ascii=False))