File size: 5,699 Bytes
b1cb0ff | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | """
data_processor.py - Convert xlsb -> ChromaDB vectors for the HDMT chatbot
Run: python data_processor.py
Output: ./chroma_db/ (ChromaDB persistent)
"""
import os, re, json
from collections import defaultdict
from pyxlsb import open_workbook
# CONFIG
# Source workbook, and the directory that holds the persistent ChromaDB store.
XLSB_PATH = "data/Debug (version 1).xlsb"
CHROMA_DB_PATH = "./chroma_db"

# Create the input and output directories up front if they are missing.
for _required_dir in ("data", CHROMA_DB_PATH):
    os.makedirs(_required_dir, exist_ok=True)
# LOAD EMBEDDING MODEL
# Announce the step first. The import below sits mid-file rather than at the
# top, presumably so this progress line is shown before the slow import and
# model load start.
print("[1/6] Loading embedding model (intfloat/multilingual-e5-large)...")
from sentence_transformers import SentenceTransformer
# Module-level model instance; later in this script it encodes each document
# with a "passage: " prefix.
EMBED_MODEL = SentenceTransformer("intfloat/multilingual-e5-large")
# NOTE(review): the dimension count in this message is hard-coded, not read
# back from the loaded model.
print(" Model loaded (1024 dimensions)")
# HELPERS
# Patterns used by normalize_text, compiled once instead of being looked up on
# every call (the function runs once per case row).
_BRACKETED_TIMESTAMP_RE = re.compile(
    r'\d{4}-[A-Za-z]{3}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+\s*\[.*?\]\s*'
)
_NUMERIC_TIMESTAMP_RE = re.compile(r'\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+\s*')
_SERIAL_RE = re.compile(r'IWHT\d+')
_HEX_RE = re.compile(r'0x[0-9A-Fa-f]+')
_LONG_NUMBER_RE = re.compile(r'\b\d{6,}\b')
_WHITESPACE_RE = re.compile(r'\s+')


def normalize_text(text):
    """Strip run-specific noise from a failure description.

    The steps are applied in this order:

    1. Remove ``YYYY-Mon-DD HH:MM:SS.fff [tag]`` prefixes.
    2. Remove ``YYYY-MM-DD HH:MM:SS.fff`` timestamps.
    3. Replace ``IWHT<digits>`` tokens with ``SN``.
    4. Replace ``0x...`` hexadecimal literals with ``HEX``.
    5. Replace standalone numbers of six or more digits with ``N``.
    6. Collapse every whitespace run to a single space and trim the ends.

    Args:
        text: Any value; it is converted with ``str()``. ``None`` yields "".

    Returns:
        The normalized string.
    """
    # Only None means "no text". A plain truthiness test would also discard
    # falsy but meaningful values such as the number 0.
    if text is None:
        return ""
    text = str(text).strip()
    text = _BRACKETED_TIMESTAMP_RE.sub('', text)
    text = _NUMERIC_TIMESTAMP_RE.sub('', text)
    text = _SERIAL_RE.sub('SN', text)
    text = _HEX_RE.sub('HEX', text)
    text = _LONG_NUMBER_RE.sub('N', text)
    return _WHITESPACE_RE.sub(' ', text).strip()
def extract_components(text):
    """Return the distinct component designators found in *text*.

    A designator is 1-3 uppercase letters followed by 1-4 digits and an
    optional trailing uppercase letter, delimited by word boundaries
    (for example ``U12``, ``R5`` or ``C101A``).

    Args:
        text: Any value; it is converted with ``str()``. Falsy input yields [].

    Returns:
        A list of unique designators in order of first appearance. The stable
        order keeps everything derived from it reproducible between runs,
        which a ``set`` round-trip does not guarantee for strings.
    """
    if not text:
        return []
    matches = re.findall(r'\b([A-Z]{1,3}\d{1,4}[A-Z]?)\b', str(text))
    # dict.fromkeys drops duplicates while preserving insertion order.
    return list(dict.fromkeys(matches))
# READ XLSB
def _cell_strings(row):
    """Return the cell values of one pyxlsb row as strings; falsy values become ""."""
    return [str(cell.v) if cell.v else "" for cell in row]


def _load_lookup(workbook, sheet_name, key_col, min_cols, make_entry):
    """Build a lookup dict from one cross-reference sheet.

    The first row is skipped as a header. A row is kept only when it has at
    least *min_cols* cells and a non-empty value in column *key_col*; that
    value becomes the key and ``make_entry(vals)`` the stored entry. A later
    row with the same key replaces the earlier one.

    Args:
        workbook: An open pyxlsb workbook.
        sheet_name: Name of the sheet to read.
        key_col: Zero-based column index holding the lookup key.
        min_cols: Minimum number of cells a row needs to be considered.
        make_entry: Callable mapping the row's string values to the entry.

    Returns:
        dict mapping key strings to entries.
    """
    lookup = {}
    with workbook.get_sheet(sheet_name) as sheet:
        sheet_rows = iter(sheet.rows())
        next(sheet_rows, None)  # skip the header row without copying the sheet
        for r in sheet_rows:
            vals = _cell_strings(r)
            if len(vals) >= min_cols and vals[key_col]:
                lookup[vals[key_col]] = make_entry(vals)
    return lookup


print(f"[2/6] Reading {XLSB_PATH}...")
# The context manager closes the workbook as soon as every sheet has been
# read, so the file is not held open during the later embedding step.
with open_workbook(XLSB_PATH) as wb:
    with wb.get_sheet('AI_Training_Combined') as sheet:
        rows = list(sheet.rows())
    if not rows:
        # Fail with a readable message instead of an IndexError on rows[0].
        raise ValueError(f"Sheet 'AI_Training_Combined' in {XLSB_PATH} is empty")
    headers = _cell_strings(rows[0])
    # Header text -> column position; used later to locate fields by name.
    idx = {h: i for i, h in enumerate(headers)}
    cases = [_cell_strings(r) for r in rows[1:]]
    print(f" {len(cases)} cases loaded")

    # Read cross-ref sheets
    print("[3/6] Reading cross-reference sheets...")
    fpga_map = _load_lookup(
        wb, 'FPGA_Channel_Map', key_col=1, min_cols=4,
        make_entry=lambda v: {"fpga": v[0], "pmu": v[2], "adate": v[3]},
    )
    memory_map = _load_lookup(
        wb, 'Memory_Slot_Failures', key_col=0, min_cols=4,
        make_entry=lambda v: {"msg": v[1], "ecc": v[2], "fpga_conv": v[3]},
    )
    pin_map = _load_lookup(
        wb, 'Pin_Signal_Map', key_col=0, min_cols=3,
        make_entry=lambda v: {"pins": v[1], "desc": v[2]},
    )
print(f" FPGA:{len(fpga_map)} Memory:{len(memory_map)} Pin:{len(pin_map)}")
# BUILD DOCUMENTS
print("[4/6] Building documents...")


def _field(row, col):
    """Return ``row[col]``, or "" when the row has no such column."""
    return row[col] if col < len(row) else ""


# Column positions are resolved once, outside the row loop. Each is looked up
# by header text in ``idx`` and falls back to a fixed position when that
# header is not present in the sheet.
_COL_FAILURE_KEY = idx.get('Failure_Key', 1)
_COL_FAILURE_DESC = idx.get('Failure_Description', 2)
_COL_COMPONENTS = idx.get('Components_Replaced', 5)
_COL_RESULT = idx.get('Result', 6)
_COL_BEST = idx.get('Best_Actions (Weighted)', 9)
_COL_PRIORITY = idx.get('Priority_Replace (Ranked)', 11)
_COL_BKM = idx.get('BKM_Procedure', 12)
_COL_BKM_COMP = idx.get('BKM_Focus_Components', 13)

documents = []
# component designator -> ids of the cases whose replaced-components field mentions it
comp_index = defaultdict(list)
for i, row in enumerate(cases):
    # _field returns "" for a missing column, so a short row produces empty
    # fields instead of aborting the whole build with an IndexError.
    board = _field(row, 0)
    fk = _field(row, _COL_FAILURE_KEY)
    fd = _field(row, _COL_FAILURE_DESC)
    comps = _field(row, _COL_COMPONENTS)
    result = _field(row, _COL_RESULT)
    best = _field(row, _COL_BEST)
    priority = _field(row, _COL_PRIORITY)
    bkm = _field(row, _COL_BKM)
    bkm_comp = _field(row, _COL_BKM_COMP)
    norm = normalize_text(fd)

    # Index the case under every component found, and collect a cross-reference
    # line for each one that appears in the pin or FPGA lookup tables.
    xref = []
    cf = extract_components(comps)
    for c in cf:
        comp_index[c].append(f"case_{i}")
        if c in pin_map:
            xref.append(f"{c}: {pin_map[c]['desc']} ({pin_map[c]['pins']})")
        if c in fpga_map:
            xref.append(f"{c}: FPGA {fpga_map[c]['fpga']}, PMU {fpga_map[c]['pmu']}")

    # Flat text that gets embedded; the description is used in normalized form.
    text = f"Board:{board} Failure:{fk} Error:{norm} Result:{result} Components:{comps} Best:{best} Priority:{priority} BKM:{bkm} BKMComp:{bkm_comp} CrossRef:{'|'.join(xref)}"
    documents.append({
        "id": f"case_{i}",
        "text": text,
        "metadata": {
            "failure_key": fk, "failure_desc": norm, "failure_raw": fd[:200],
            "components": comps, "result": result, "best_actions": best,
            "priority_replace": priority, "bkm_procedure": bkm,
            # NOTE(review): cross_ref and comps_found are lists, while Chroma
            # documents metadata values as scalars - confirm the store
            # accepts them.
            "bkm_components": bkm_comp, "cross_ref": xref, "comps_found": cf,
        }
    })
print(f" {len(documents)} documents")
# BUILD CHROMA DB
print("[5/6] Building ChromaDB (~5-10 min)...")
import chromadb
from chromadb.config import Settings

# Number of documents embedded and inserted per call.
_BATCH_SIZE = 50


def _scalar_metadata(metadata):
    """Return a copy of *metadata* whose list/tuple values are "|"-joined strings.

    Chroma documents metadata values as str, int, float or bool, so sequence
    values are flattened before insertion. Scalar values pass through
    unchanged and the input dict is not modified.
    """
    return {
        key: "|".join(str(item) for item in value) if isinstance(value, (list, tuple)) else value
        for key, value in metadata.items()
    }


client = chromadb.PersistentClient(path=CHROMA_DB_PATH, settings=Settings(anonymized_telemetry=False))
# Rebuild from scratch on every run: drop any earlier collection first.
# Deleting is best-effort because the collection may not exist yet.
# NOTE(review): the exception type raised for a missing collection appears to
# differ between chromadb releases, hence the broad ``Exception``. A bare
# ``except`` would also swallow KeyboardInterrupt and SystemExit.
try:
    client.delete_collection("hdmt_cases")
except Exception:
    pass
collection = client.create_collection(name="hdmt_cases")
for i in range(0, len(documents), _BATCH_SIZE):
    batch = documents[i:i + _BATCH_SIZE]
    texts = [d["text"] for d in batch]
    # Each text is encoded with a "passage: " prefix; the stored document is
    # the unprefixed text.
    emb = EMBED_MODEL.encode([f"passage: {t}" for t in texts], normalize_embeddings=True).tolist()
    collection.add(
        embeddings=emb,
        documents=texts,
        metadatas=[_scalar_metadata(d["metadata"]) for d in batch],
        ids=[d["id"] for d in batch],
    )
    print(f" {min(i + _BATCH_SIZE, len(documents))}/{len(documents)}")
# SAVE AUXILIARY
print("[6/6] Saving auxiliary data...")
# component -> case ids. Duplicates are dropped while keeping first-seen order
# so the file content is identical from run to run; a set round-trip would
# reorder the ids arbitrarily.
with open("component_index.json", "w", encoding="utf-8") as f:
    json.dump({k: list(dict.fromkeys(v)) for k, v in comp_index.items()}, f)
# The three cross-reference lookup tables, bundled into a single file.
with open("cross_ref_maps.json", "w", encoding="utf-8") as f:
    json.dump({"fpga": fpga_map, "memory": memory_map, "pin": pin_map}, f)
# Per-failure-key outcome counts: total cases, passes, and everything else as fails.
fk_stats = defaultdict(lambda: {"pass": 0, "fail": 0, "total": 0})
for d in documents:
    meta = d["metadata"]
    # Every counter must use this document's own failure key. Indexing with a
    # variable left over from an earlier loop would credit all passes and
    # fails to whichever key that loop ended on.
    bucket = fk_stats[meta["failure_key"]]
    bucket["total"] += 1
    if meta["result"] == "Pass":
        bucket["pass"] += 1
    else:
        bucket["fail"] += 1

# Add the pass rate as a percentage with one decimal; 0 when a key has no cases.
stats = {
    k: {
        "total": v["total"],
        "pass": v["pass"],
        "fail": v["fail"],
        "rate": round(v["pass"] / v["total"] * 100, 1) if v["total"] else 0,
    }
    for k, v in fk_stats.items()
}
with open("stats_summary.json", "w", encoding="utf-8") as f:
    json.dump(stats, f)
print("\nDone! Database built in ./chroma_db/")
print("Next: bash vllm_server.sh")
|