| """ |
| data_processor.py - Convert xlsb -> ChromaDB vectors for the HDMT chatbot |
| Run: python data_processor.py |
| Output: ./chroma_db/ (ChromaDB persistent) |
| """ |
| import os, re, json |
| from collections import defaultdict |
| from pyxlsb import open_workbook |
|
|
| |
# Location of the input workbook and of the persistent ChromaDB store.
XLSB_PATH = "data/Debug (version 1).xlsb"
CHROMA_DB_PATH = "./chroma_db"

# Create both working directories up front; exist_ok makes re-runs harmless.
for _directory in ("data", CHROMA_DB_PATH):
    os.makedirs(_directory, exist_ok=True)
|
|
| |
# The import sits after the progress message, so the message is printed
# before the library and the model are loaded.
print("[1/6] Loading embedding model (intfloat/multilingual-e5-large)...")
from sentence_transformers import SentenceTransformer

_EMBED_MODEL_NAME = "intfloat/multilingual-e5-large"
# Module-level encoder used by the batch-embedding step further down.
EMBED_MODEL = SentenceTransformer(_EMBED_MODEL_NAME)
print(" Model loaded (1024 dimensions)")
|
|
| |
def normalize_text(text):
    """Return *text* with run-specific details replaced by stable placeholders.

    Falsy input yields "". Otherwise the value is converted to ``str`` and
    the following substitutions are applied in order:

    * timestamps such as ``2024-Jan-05 12:34:56.789 [tag]`` are removed,
    * timestamps such as ``2024-01-05 12:34:56.789`` are removed,
    * ``IWHT`` followed by digits becomes ``SN``,
    * ``0x`` hexadecimal literals become ``HEX``,
    * standalone runs of six or more digits become ``N``.

    Finally every run of whitespace is collapsed to a single space and the
    result is stripped.
    """
    if not text:
        return ""

    cleaned = str(text).strip()

    # Order matters: timestamps are removed before the generic digit-run rule
    # so their numeric parts are not turned into placeholders first.
    substitutions = (
        (r'\d{4}-[A-Za-z]{3}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+\s*\[.*?\]\s*', ''),
        (r'\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+\s*', ''),
        (r'IWHT\d+', 'SN'),
        (r'0x[0-9A-Fa-f]+', 'HEX'),
        (r'\b\d{6,}\b', 'N'),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip()
|
|
def extract_components(text):
    """Return the distinct component-style tokens found in *text*.

    A token is one to three uppercase letters, one to four digits and an
    optional trailing uppercase letter, delimited by word boundaries
    (for example ``U12`` or ``C3A``).

    Args:
        text: Any value; it is converted with ``str`` before matching.

    Returns:
        A list of unique tokens in order of first appearance, or ``[]``
        when *text* is falsy or contains no match.
    """
    if not text:
        return []
    matches = re.findall(r'\b([A-Z]{1,3}\d{1,4}[A-Z]?)\b', str(text))
    # dict.fromkeys de-duplicates while keeping insertion order. Going through
    # a set would return the tokens in an order that can differ between runs
    # (string hashing is randomized per process), which makes any text built
    # from this list non-reproducible.
    return list(dict.fromkeys(matches))
|
|
| |
def _cell_text(cell):
    """Return the cell's value as a string, or "" when the value is falsy."""
    return str(cell.v) if cell.v else ""


def _data_rows(workbook, sheet_name):
    """Return every row after the first of *sheet_name* as lists of cell text."""
    with workbook.get_sheet(sheet_name) as sheet:
        return [[_cell_text(c) for c in r] for r in list(sheet.rows())[1:]]


print(f"[2/6] Reading {XLSB_PATH}...")
# Open the workbook as a context manager so it is closed once every sheet has
# been read (it was previously left open for the rest of the run).
with open_workbook(XLSB_PATH) as wb:
    with wb.get_sheet('AI_Training_Combined') as sheet:
        rows = list(sheet.rows())
    if not rows:
        # Fail with a clear message instead of an IndexError on rows[0].
        raise ValueError(f"Sheet 'AI_Training_Combined' in {XLSB_PATH} has no rows")
    headers = [_cell_text(c) for c in rows[0]]
    # Header text -> column position, used to locate columns by name.
    idx = {h: i for i, h in enumerate(headers)}
    cases = [[_cell_text(c) for c in r] for r in rows[1:]]
    print(f" {len(cases)} cases loaded")

    print("[3/6] Reading cross-reference sheets...")
    fpga_map, memory_map, pin_map = {}, {}, {}

    # Keyed by the second column; rows with fewer than four cells or an
    # empty key are skipped.
    for vals in _data_rows(wb, 'FPGA_Channel_Map'):
        if len(vals) >= 4 and vals[1]:
            fpga_map[vals[1]] = {"fpga": vals[0], "pmu": vals[2], "adate": vals[3]}

    # Keyed by the first column; rows with fewer than four cells or an
    # empty key are skipped.
    for vals in _data_rows(wb, 'Memory_Slot_Failures'):
        if len(vals) >= 4 and vals[0]:
            memory_map[vals[0]] = {"msg": vals[1], "ecc": vals[2], "fpga_conv": vals[3]}

    # Keyed by the first column; rows with fewer than three cells or an
    # empty key are skipped.
    for vals in _data_rows(wb, 'Pin_Signal_Map'):
        if len(vals) >= 3 and vals[0]:
            pin_map[vals[0]] = {"pins": vals[1], "desc": vals[2]}

print(f" FPGA:{len(fpga_map)} Memory:{len(memory_map)} Pin:{len(pin_map)}")
|
|
| |
print("[4/6] Building documents...")
documents = []
comp_index = defaultdict(list)

# Resolve each column position once: by header name when the header exists,
# otherwise the fixed fallback position given here.
col_failure_key = idx.get('Failure_Key', 1)
col_failure_desc = idx.get('Failure_Description', 2)
col_components = idx.get('Components_Replaced', 5)
col_result = idx.get('Result', 6)
col_best = idx.get('Best_Actions (Weighted)', 9)
col_priority = idx.get('Priority_Replace (Ranked)', 11)
col_bkm = idx.get('BKM_Procedure', 12)
col_bkm_comp = idx.get('BKM_Focus_Components', 13)

for case_no, row in enumerate(cases):
    case_id = f"case_{case_no}"

    fk = row[col_failure_key]
    raw_desc = row[col_failure_desc]
    replaced = row[col_components]
    outcome = row[col_result]
    best_actions = row[col_best]
    priority_replace = row[col_priority]
    bkm_procedure = row[col_bkm]
    bkm_focus = row[col_bkm_comp]

    clean_desc = normalize_text(raw_desc)
    found = extract_components(replaced)

    # For every component named in the row: record the case in the reverse
    # index and attach whatever the pin / FPGA lookup tables know about it.
    cross_refs = []
    for comp in found:
        comp_index[comp].append(case_id)
        pin_info = pin_map.get(comp)
        if pin_info is not None:
            cross_refs.append(f"{comp}: {pin_info['desc']} ({pin_info['pins']})")
        fpga_info = fpga_map.get(comp)
        if fpga_info is not None:
            cross_refs.append(f"{comp}: FPGA {fpga_info['fpga']}, PMU {fpga_info['pmu']}")

    # Single flat string per case; this is the text that gets embedded.
    text = f"Board:{row[0]} Failure:{fk} Error:{clean_desc} Result:{outcome} Components:{replaced} Best:{best_actions} Priority:{priority_replace} BKM:{bkm_procedure} BKMComp:{bkm_focus} CrossRef:{'|'.join(cross_refs)}"

    # NOTE(review): "cross_ref" and "comps_found" are Python lists; confirm
    # the installed chromadb version accepts list-valued metadata.
    metadata = {
        "failure_key": fk,
        "failure_desc": clean_desc,
        "failure_raw": raw_desc[:200],
        "components": replaced,
        "result": outcome,
        "best_actions": best_actions,
        "priority_replace": priority_replace,
        "bkm_procedure": bkm_procedure,
        "bkm_components": bkm_focus,
        "cross_ref": cross_refs,
        "comps_found": found,
    }
    documents.append({"id": case_id, "text": text, "metadata": metadata})

print(f" {len(documents)} documents")
|
|
| |
print("[5/6] Building ChromaDB (~5-10 min)...")
import chromadb
from chromadb.config import Settings

client = chromadb.PersistentClient(
    path=CHROMA_DB_PATH,
    settings=Settings(anonymized_telemetry=False),
)

# Start from an empty collection on every run. The delete is best-effort:
# on a first run there is nothing to delete.
try:
    client.delete_collection("hdmt_cases")
except Exception:
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt / SystemExit still stop the script.
    # NOTE(review): the exception type raised for a missing collection appears
    # to differ between chromadb releases - narrow this once the version is
    # pinned. If the delete failed for another reason, create_collection
    # below will surface it.
    pass
collection = client.create_collection(name="hdmt_cases")
|
|
# Embed and insert the documents in slices of 50, printing progress after
# each slice. Each text is prefixed with "passage: " for encoding only; the
# stored document text is the unprefixed original.
total_docs = len(documents)
for start in range(0, total_docs, 50):
    stop = start + 50
    batch = documents[start:stop]

    texts = [doc["text"] for doc in batch]
    prefixed = [f"passage: {t}" for t in texts]
    vectors = EMBED_MODEL.encode(prefixed, normalize_embeddings=True)
    emb = vectors.tolist()

    collection.add(
        embeddings=emb,
        documents=texts,
        metadatas=[doc["metadata"] for doc in batch],
        ids=[doc["id"] for doc in batch],
    )
    print(f" {min(stop, total_docs)}/{total_docs}")
|
|
| |
print("[6/6] Saving auxiliary data...")

# Reverse index: component -> case ids. Duplicates are dropped via a set, so
# the order of ids within each list is not fixed.
with open("component_index.json", "w") as index_file:
    component_cases = {
        comp: list(set(case_ids)) for comp, case_ids in comp_index.items()
    }
    json.dump(component_cases, index_file)

# The three lookup tables read from the cross-reference sheets, in one file.
with open("cross_ref_maps.json", "w") as maps_file:
    lookup_tables = {"fpga": fpga_map, "memory": memory_map, "pin": pin_map}
    json.dump(lookup_tables, maps_file)
|
|
# Tally pass/fail outcomes per failure key across all documents.
fk_stats = defaultdict(lambda: {"pass": 0, "fail": 0, "total": 0})
for d in documents:
    meta = d["metadata"]
    # Every counter is keyed by this document's own failure key. The pass and
    # fail increments previously used a leftover variable from the document
    # building loop, so they were all credited to the last row's key.
    tally = fk_stats[meta["failure_key"]]
    tally["total"] += 1
    if meta["result"] == "Pass":
        tally["pass"] += 1
    else:
        tally["fail"] += 1

# "rate" is the pass percentage rounded to one decimal place (0 when a key
# has no cases).
stats = {
    k: {
        "total": v["total"],
        "pass": v["pass"],
        "fail": v["fail"],
        "rate": round(v["pass"] / v["total"] * 100, 1) if v["total"] else 0,
    }
    for k, v in fk_stats.items()
}
with open("stats_summary.json", "w") as f:
    json.dump(stats, f)
|
|
# Final status: where the database was written and the suggested next command.
for _status_line in (
    "\nDone! Database built in ./chroma_db/",
    "Next: bash vllm_server.sh",
):
    print(_status_line)
|
|