# hdmt-rag-local / data_processor.py
# trandangduc0's picture
# Upload data_processor.py
# b1cb0ff verified
"""
data_processor.py - Convert xlsb -> ChromaDB vectors for the HDMT chatbot
Run: python data_processor.py
Output: ./chroma_db/ (ChromaDB persistent), plus component_index.json,
cross_ref_maps.json and stats_summary.json in the working directory
"""
import os, re, json
from collections import defaultdict
from pyxlsb import open_workbook
# CONFIG
# Source workbook and the directory that holds the persistent ChromaDB store.
XLSB_PATH = "data/Debug (version 1).xlsb"
CHROMA_DB_PATH = "./chroma_db"

# Make sure the input and output directories exist; existing ones are left as-is.
for _required_dir in ("data", CHROMA_DB_PATH):
    os.makedirs(_required_dir, exist_ok=True)

# LOAD EMBEDDING MODEL
# The import follows the progress message, so the message is printed before
# the sentence_transformers library is loaded.
print("[1/6] Loading embedding model (intfloat/multilingual-e5-large)...")
from sentence_transformers import SentenceTransformer

EMBED_MODEL = SentenceTransformer("intfloat/multilingual-e5-large")
print(" Model loaded (1024 dimensions)")
# HELPERS
# Substitutions applied by normalize_text, in this order: (pattern, replacement).
_NORMALIZE_RULES = (
    # Timestamp with a month abbreviation followed by a bracketed tag,
    # e.g. "2024-Jan-05 12:34:56.789 [tag] ".
    (re.compile(r'\d{4}-[A-Za-z]{3}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+\s*\[.*?\]\s*'), ''),
    # Purely numeric timestamp, e.g. "2024-01-05 12:34:56.789 ".
    (re.compile(r'\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+\s*'), ''),
    # "IWHT" followed by digits (presumably a serial number) -> "SN".
    (re.compile(r'IWHT\d+'), 'SN'),
    # Hexadecimal literals -> "HEX".
    (re.compile(r'0x[0-9A-Fa-f]+'), 'HEX'),
    # Stand-alone runs of six or more digits -> "N".
    (re.compile(r'\b\d{6,}\b'), 'N'),
)
_WHITESPACE_RUN = re.compile(r'\s+')


def normalize_text(text):
    """Return *text* with run-specific details replaced by stable placeholders.

    Timestamps are removed, ``IWHT<digits>`` becomes ``SN``, hex literals
    become ``HEX`` and stand-alone numbers of six or more digits become ``N``.
    Whitespace runs are then collapsed to a single space.

    Args:
        text: Any value; it is converted with ``str()``. Falsy values
            (``None``, ``""``, ``0``) yield an empty string.

    Returns:
        The normalized string, stripped of leading/trailing whitespace.
    """
    if not text:
        return ""
    cleaned = str(text).strip()
    for pattern, replacement in _NORMALIZE_RULES:
        cleaned = pattern.sub(replacement, cleaned)
    return _WHITESPACE_RUN.sub(' ', cleaned).strip()
def extract_components(text):
    """Return the unique component designators found in *text*.

    A designator is 1-3 uppercase letters, 1-4 digits and an optional
    trailing uppercase letter, delimited by word boundaries (for example
    ``U12``, ``R5`` or ``C101A``).

    Designators are returned in order of first appearance. Going through a
    plain ``set`` would give an arbitrary order that changes between
    interpreter runs, which would make everything derived from this list
    non-reproducible.

    Args:
        text: Any value; it is converted with ``str()``. Falsy values yield
            an empty list.

    Returns:
        A list of distinct designator strings.
    """
    if not text:
        return []
    matches = re.findall(r'\b([A-Z]{1,3}\d{1,4}[A-Z]?)\b', str(text))
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(matches))
# READ XLSB
def _cell_text(cell):
    """Return the cell value as a string; falsy values (None, "", 0) become ""."""
    return str(cell.v) if cell.v else ""


def _data_rows(workbook, sheet_name):
    """Yield every row below the header of *sheet_name* as a list of strings."""
    with workbook.get_sheet(sheet_name) as sheet:
        row_iter = iter(sheet.rows())
        next(row_iter, None)  # skip the header row without building a full list
        for raw_row in row_iter:
            yield [_cell_text(cell) for cell in raw_row]


print(f"[2/6] Reading {XLSB_PATH}...")
# Open the workbook as a context manager so it is closed once all four
# sheets have been read (it was previously left open).
with open_workbook(XLSB_PATH) as wb:
    with wb.get_sheet('AI_Training_Combined') as sheet:
        rows = list(sheet.rows())
    if not rows:
        # Fail with a clear message instead of an IndexError on rows[0].
        raise ValueError("Sheet 'AI_Training_Combined' is empty: no header row found")
    headers = [_cell_text(c) for c in rows[0]]
    # Header text -> column position; used later to locate fields by name.
    idx = {h: i for i, h in enumerate(headers)}
    cases = [[_cell_text(c) for c in r] for r in rows[1:]]
    print(f" {len(cases)} cases loaded")

    # Read cross-ref sheets
    print("[3/6] Reading cross-reference sheets...")
    fpga_map, memory_map, pin_map = {}, {}, {}
    # FPGA_Channel_Map: keyed by the second column; rows with an empty key are skipped.
    for vals in _data_rows(wb, 'FPGA_Channel_Map'):
        if len(vals) >= 4 and vals[1]:
            fpga_map[vals[1]] = {"fpga": vals[0], "pmu": vals[2], "adate": vals[3]}
    # Memory_Slot_Failures: keyed by the first column.
    for vals in _data_rows(wb, 'Memory_Slot_Failures'):
        if len(vals) >= 4 and vals[0]:
            memory_map[vals[0]] = {"msg": vals[1], "ecc": vals[2], "fpga_conv": vals[3]}
    # Pin_Signal_Map: keyed by the first column.
    for vals in _data_rows(wb, 'Pin_Signal_Map'):
        if len(vals) >= 3 and vals[0]:
            pin_map[vals[0]] = {"pins": vals[1], "desc": vals[2]}
print(f" FPGA:{len(fpga_map)} Memory:{len(memory_map)} Pin:{len(pin_map)}")
# BUILD DOCUMENTS
print("[4/6] Building documents...")


def _field(row, header, fallback_pos):
    """Return the cell of *row* under *header*, or at *fallback_pos* when the header is absent."""
    return row[idx.get(header, fallback_pos)]


documents = []
# Component designator -> ids of the cases whose "Components_Replaced" mention it.
comp_index = defaultdict(list)
for i, row in enumerate(cases):
    case_id = f"case_{i}"
    fk = _field(row, 'Failure_Key', 1)
    fd = _field(row, 'Failure_Description', 2)
    comps = _field(row, 'Components_Replaced', 5)
    result = _field(row, 'Result', 6)
    best = _field(row, 'Best_Actions (Weighted)', 9)
    priority = _field(row, 'Priority_Replace (Ranked)', 11)
    bkm = _field(row, 'BKM_Procedure', 12)
    bkm_comp = _field(row, 'BKM_Focus_Components', 13)
    norm = normalize_text(fd)

    # Index the case under each component and collect the cross-reference
    # notes available for that component in the pin and FPGA maps.
    xref = []
    comps_found = extract_components(comps)
    for comp in comps_found:
        comp_index[comp].append(case_id)
        pin = pin_map.get(comp)
        if pin is not None:
            xref.append(f"{comp}: {pin['desc']} ({pin['pins']})")
        fpga = fpga_map.get(comp)
        if fpga is not None:
            xref.append(f"{comp}: FPGA {fpga['fpga']}, PMU {fpga['pmu']}")

    # Single flat string that gets embedded for this case.
    text = f"Board:{row[0]} Failure:{fk} Error:{norm} Result:{result} Components:{comps} Best:{best} Priority:{priority} BKM:{bkm} BKMComp:{bkm_comp} CrossRef:{'|'.join(xref)}"

    # NOTE(review): "cross_ref" and "comps_found" are lists; some ChromaDB
    # releases only accept scalar metadata values - confirm against the
    # installed version.
    metadata = {
        "failure_key": fk,
        "failure_desc": norm,
        "failure_raw": fd[:200],  # raw description truncated to 200 characters
        "components": comps,
        "result": result,
        "best_actions": best,
        "priority_replace": priority,
        "bkm_procedure": bkm,
        "bkm_components": bkm_comp,
        "cross_ref": xref,
        "comps_found": comps_found,
    }
    documents.append({"id": case_id, "text": text, "metadata": metadata})
print(f" {len(documents)} documents")
# BUILD CHROMA DB
print("[5/6] Building ChromaDB (~5-10 min)...")
import chromadb
from chromadb.config import Settings

client = chromadb.PersistentClient(path=CHROMA_DB_PATH, settings=Settings(anonymized_telemetry=False))

# Best-effort removal of a collection left by a previous run, so the rebuild
# starts empty. Only Exception is caught (a bare "except" would also swallow
# KeyboardInterrupt/SystemExit), and the reason is reported rather than hidden.
# The exact exception raised for a missing collection differs between
# ChromaDB releases, hence the broad class.
try:
    client.delete_collection("hdmt_cases")
except Exception as exc:
    print(f" No existing collection removed ({type(exc).__name__}: {exc})")
collection = client.create_collection(name="hdmt_cases")

# Number of documents embedded and inserted per round trip.
BATCH_SIZE = 50
for i in range(0, len(documents), BATCH_SIZE):
    batch = documents[i:i + BATCH_SIZE]
    texts = [d["text"] for d in batch]
    # Texts are embedded with a "passage: " prefix (the E5 convention for
    # indexed content - see the model card); the stored documents keep the
    # unprefixed text.
    emb = EMBED_MODEL.encode([f"passage: {t}" for t in texts], normalize_embeddings=True).tolist()
    collection.add(
        embeddings=emb,
        documents=texts,
        metadatas=[d["metadata"] for d in batch],
        ids=[d["id"] for d in batch],
    )
    print(f" {min(i + BATCH_SIZE, len(documents))}/{len(documents)}")
# SAVE AUXILIARY
print("[6/6] Saving auxiliary data...")
# Component -> case ids. dict.fromkeys removes duplicates while keeping the
# ids in insertion order, so the file is identical from run to run.
with open("component_index.json", "w", encoding="utf-8") as f:
    json.dump({k: list(dict.fromkeys(v)) for k, v in comp_index.items()}, f)
with open("cross_ref_maps.json", "w", encoding="utf-8") as f:
    json.dump({"fpga": fpga_map, "memory": memory_map, "pin": pin_map}, f)

# Per-failure-key pass/fail counts. Each document is counted under its OWN
# failure key; the earlier code indexed with a stale global left over from
# the document-building loop, so every pass/fail landed on the last case's key.
fk_stats = defaultdict(lambda: {"pass": 0, "fail": 0, "total": 0})
for d in documents:
    meta = d["metadata"]
    bucket = fk_stats[meta["failure_key"]]
    bucket["total"] += 1
    # Anything other than the exact string "Pass" is counted as a failure.
    if meta["result"] == "Pass":
        bucket["pass"] += 1
    else:
        bucket["fail"] += 1

# "rate" is the pass percentage rounded to one decimal place.
stats = {
    k: {
        "total": v["total"],
        "pass": v["pass"],
        "fail": v["fail"],
        "rate": round(v["pass"] / v["total"] * 100, 1) if v["total"] else 0,
    }
    for k, v in fk_stats.items()
}
with open("stats_summary.json", "w", encoding="utf-8") as f:
    json.dump(stats, f)
print("\nDone! Database built in ./chroma_db/")
print("Next: bash vllm_server.sh")