"""Convert the HDMT debug workbook (.xlsb) into ChromaDB vectors for the chatbot.

Run:    python data_processor.py
Output: ./chroma_db/ (persistent ChromaDB collection "hdmt_cases"), plus
        component_index.json, cross_ref_maps.json and stats_summary.json
        written to the current working directory.
"""
import json
import os
import re
from collections import defaultdict

from pyxlsb import open_workbook

# CONFIG
XLSB_PATH = "data/Debug (version 1).xlsb"
CHROMA_DB_PATH = "./chroma_db"
COLLECTION_NAME = "hdmt_cases"
EMBED_MODEL_NAME = "intfloat/multilingual-e5-large"
BATCH_SIZE = 50  # number of documents embedded and inserted per round trip

os.makedirs("data", exist_ok=True)
os.makedirs(CHROMA_DB_PATH, exist_ok=True)

# LOAD EMBEDDING MODEL
print(f"[1/6] Loading embedding model ({EMBED_MODEL_NAME})...")
# Imported here (not at the top) so the progress line above is printed before
# the slow sentence_transformers import starts.
from sentence_transformers import SentenceTransformer

EMBED_MODEL = SentenceTransformer(EMBED_MODEL_NAME)
print(" Model loaded (1024 dimensions)")


# HELPERS
# Substitutions applied in order by normalize_text(); compiled once because
# the function runs for every case row.
_NORMALIZE_RULES = (
    # "YYYY-Mon-DD hh:mm:ss.fff [tag]" prefix
    (re.compile(r'\d{4}-[A-Za-z]{3}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+\s*\[.*?\]\s*'), ''),
    # "YYYY-MM-DD hh:mm:ss.fff" timestamp
    (re.compile(r'\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+\s*'), ''),
    # "IWHT" followed by digits -> placeholder token
    (re.compile(r'IWHT\d+'), 'SN'),
    # hexadecimal literal -> placeholder token
    (re.compile(r'0x[0-9A-Fa-f]+'), 'HEX'),
    # standalone run of six or more digits -> placeholder token
    (re.compile(r'\b\d{6,}\b'), 'N'),
)
_WHITESPACE_RE = re.compile(r'\s+')
_COMPONENT_RE = re.compile(r'\b([A-Z]{1,3}\d{1,4}[A-Z]?)\b')


def normalize_text(text):
    """Return *text* with volatile tokens replaced so similar errors compare equal.

    Timestamps are removed; ``IWHT<digits>``, hex literals and long digit runs
    are replaced by the placeholders ``SN``, ``HEX`` and ``N``; whitespace is
    collapsed to single spaces. Falsy input (``None``, ``""``, ``0``) yields
    ``""``.
    """
    if not text:
        return ""
    text = str(text).strip()
    for pattern, replacement in _NORMALIZE_RULES:
        text = pattern.sub(replacement, text)
    return _WHITESPACE_RE.sub(' ', text).strip()


def extract_components(text):
    """Return the unique component designators found in *text*, sorted.

    A designator is 1-3 uppercase letters, 1-4 digits and an optional trailing
    uppercase letter, delimited by word boundaries. The result is sorted so
    that everything derived from it (cross-reference text, embeddings) is
    reproducible between runs; a plain ``set`` of strings iterates in a
    different order per process.
    """
    if not text:
        return []
    return sorted(set(_COMPONENT_RE.findall(str(text))))


def _cell_text(cell):
    """Return a cell's value as text; empty/falsy cells become ""."""
    return str(cell.v) if cell.v else ""


def _sheet_rows(workbook, sheet_name):
    """Read every row of *sheet_name* as a list of cell-text lists."""
    with workbook.get_sheet(sheet_name) as sheet:
        return [[_cell_text(cell) for cell in row] for row in sheet.rows()]


def _chroma_metadata(metadata):
    """Return a copy of *metadata* whose list values are "|"-joined strings.

    Chroma documents metadata values as scalars (str/int/float/bool); list
    values such as ``cross_ref`` and ``comps_found`` are rejected by
    ``collection.add`` on the versions this was written against.
    """
    return {
        key: "|".join(map(str, value)) if isinstance(value, (list, tuple)) else value
        for key, value in metadata.items()
    }


# READ XLSB
print(f"[2/6] Reading {XLSB_PATH}...")
fpga_map, memory_map, pin_map = {}, {}, {}
# The context manager closes the workbook's underlying archive once every
# sheet has been read.
with open_workbook(XLSB_PATH) as wb:
    rows = _sheet_rows(wb, 'AI_Training_Combined')
    if not rows:
        raise SystemExit(f"Sheet 'AI_Training_Combined' in {XLSB_PATH} is empty")
    headers = rows[0]
    idx = {h: i for i, h in enumerate(headers)}  # header text -> column index
    cases = rows[1:]
    print(f" {len(cases)} cases loaded")

    # Read cross-ref sheets (first row of each sheet is a header and is skipped)
    print("[3/6] Reading cross-reference sheets...")
    for vals in _sheet_rows(wb, 'FPGA_Channel_Map')[1:]:
        # Keyed by the second column.
        if len(vals) >= 4 and vals[1]:
            fpga_map[vals[1]] = {"fpga": vals[0], "pmu": vals[2], "adate": vals[3]}
    for vals in _sheet_rows(wb, 'Memory_Slot_Failures')[1:]:
        if len(vals) >= 4 and vals[0]:
            memory_map[vals[0]] = {"msg": vals[1], "ecc": vals[2], "fpga_conv": vals[3]}
    for vals in _sheet_rows(wb, 'Pin_Signal_Map')[1:]:
        if len(vals) >= 3 and vals[0]:
            pin_map[vals[0]] = {"pins": vals[1], "desc": vals[2]}
print(f" FPGA:{len(fpga_map)} Memory:{len(memory_map)} Pin:{len(pin_map)}")

# BUILD DOCUMENTS
print("[4/6] Building documents...")
documents = []
comp_index = defaultdict(list)  # component designator -> ids of cases replacing it
for case_no, row in enumerate(cases):
    # Columns are looked up by header name; the numeric fallbacks are the
    # positions used when a header is missing from the sheet.
    fk = row[idx.get('Failure_Key', 1)]
    fd = row[idx.get('Failure_Description', 2)]
    comps = row[idx.get('Components_Replaced', 5)]
    result = row[idx.get('Result', 6)]
    best = row[idx.get('Best_Actions (Weighted)', 9)]
    priority = row[idx.get('Priority_Replace (Ranked)', 11)]
    bkm = row[idx.get('BKM_Procedure', 12)]
    bkm_comp = row[idx.get('BKM_Focus_Components', 13)]

    case_id = f"case_{case_no}"
    norm = normalize_text(fd)
    xref = []
    cf = extract_components(comps)
    for c in cf:
        comp_index[c].append(case_id)
        if c in pin_map:
            xref.append(f"{c}: {pin_map[c]['desc']} ({pin_map[c]['pins']})")
        if c in fpga_map:
            xref.append(f"{c}: FPGA {fpga_map[c]['fpga']}, PMU {fpga_map[c]['pmu']}")

    # This flattened text is what gets embedded and stored as the document.
    text = f"Board:{row[0]} Failure:{fk} Error:{norm} Result:{result} Components:{comps} Best:{best} Priority:{priority} BKM:{bkm} BKMComp:{bkm_comp} CrossRef:{'|'.join(xref)}"
    documents.append({
        "id": case_id,
        "text": text,
        "metadata": {
            "failure_key": fk,
            "failure_desc": norm,
            "failure_raw": fd[:200],
            "components": comps,
            "result": result,
            "best_actions": best,
            "priority_replace": priority,
            "bkm_procedure": bkm,
            "bkm_components": bkm_comp,
            "cross_ref": xref,
            "comps_found": cf,
        },
    })
print(f" {len(documents)} documents")

# BUILD CHROMA DB
print("[5/6] Building ChromaDB (~5-10 min)...")
import chromadb
from chromadb.config import Settings

client = chromadb.PersistentClient(
    path=CHROMA_DB_PATH,
    settings=Settings(anonymized_telemetry=False),
)
try:
    client.delete_collection(COLLECTION_NAME)
except Exception:
    # Best effort: on a first run there is nothing to delete. The exception
    # type raised for a missing collection differs between chromadb versions,
    # so it is not narrowed further; KeyboardInterrupt/SystemExit still
    # propagate, unlike with a bare ``except``.
    pass
collection = client.create_collection(name=COLLECTION_NAME)

for start in range(0, len(documents), BATCH_SIZE):
    batch = documents[start:start + BATCH_SIZE]
    texts = [d["text"] for d in batch]
    # The "passage: " prefix is prepended only for embedding; the stored
    # document text is kept without it.
    emb = EMBED_MODEL.encode(
        [f"passage: {t}" for t in texts],
        normalize_embeddings=True,
    ).tolist()
    collection.add(
        embeddings=emb,
        documents=texts,
        metadatas=[_chroma_metadata(d["metadata"]) for d in batch],
        ids=[d["id"] for d in batch],
    )
    print(f" {min(start + BATCH_SIZE, len(documents))}/{len(documents)}")

# SAVE AUXILIARY
print("[6/6] Saving auxiliary data...")
with open("component_index.json", "w", encoding="utf-8") as f:
    # dict.fromkeys de-duplicates while keeping case order (a set would not).
    json.dump({k: list(dict.fromkeys(v)) for k, v in comp_index.items()}, f)
with open("cross_ref_maps.json", "w", encoding="utf-8") as f:
    json.dump({"fpga": fpga_map, "memory": memory_map, "pin": pin_map}, f)

# Pass/fail tally per failure key. Each document is counted under ITS OWN
# failure key (not under whatever key the build loop happened to end on).
fk_stats = defaultdict(lambda: {"pass": 0, "fail": 0, "total": 0})
for d in documents:
    meta = d["metadata"]
    tally = fk_stats[meta["failure_key"]]
    tally["total"] += 1
    tally["pass" if meta["result"] == "Pass" else "fail"] += 1

stats = {
    k: {
        "total": v["total"],
        "pass": v["pass"],
        "fail": v["fail"],
        # Pass rate as a percentage rounded to one decimal place.
        "rate": round(v["pass"] / v["total"] * 100, 1) if v["total"] else 0,
    }
    for k, v in fk_stats.items()
}
with open("stats_summary.json", "w", encoding="utf-8") as f:
    json.dump(stats, f)

print("\nDone! Database built in ./chroma_db/")
print("Next: bash vllm_server.sh")