File size: 5,699 Bytes
b1cb0ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
data_processor.py - Convert xlsb -> ChromaDB vectors for the HDMT chatbot
Run: python data_processor.py
Output: ./chroma_db/ (ChromaDB persistent)
"""
import os, re, json
from collections import defaultdict
from pyxlsb import open_workbook

# CONFIG
# Input workbook and the on-disk location of the persistent vector store.
XLSB_PATH = "data/Debug (version 1).xlsb"
CHROMA_DB_PATH = "./chroma_db"
# Ensure the input folder and the database folder both exist up front.
for _required_dir in ("data", CHROMA_DB_PATH):
    os.makedirs(_required_dir, exist_ok=True)

# LOAD EMBEDDING MODEL
# The library import sits after the progress message (not at the top of the
# file), so the message is printed before the import and model load begin.
print("[1/6] Loading embedding model (intfloat/multilingual-e5-large)...")
from sentence_transformers import SentenceTransformer
# Module-level model instance used to encode the document batches further down.
EMBED_MODEL = SentenceTransformer("intfloat/multilingual-e5-large")
print("      Model loaded (1024 dimensions)")

# HELPERS
def normalize_text(text):
    """Return `text` with volatile log details masked and whitespace collapsed.

    Any falsy input (None, "", 0, ...) yields "". Otherwise the value is
    converted to `str`, stripped, and rewritten in this order:

    1. timestamps like ``2024-Jan-05 12:34:56.789 [tag]`` are removed,
    2. timestamps like ``2024-01-05 12:34:56.789`` are removed,
    3. ``IWHT<digits>`` becomes ``SN`` (presumably a serial number - confirm),
    4. ``0x...`` hexadecimal literals become ``HEX``,
    5. standalone runs of six or more digits become ``N``.

    Finally every whitespace run is squeezed to a single space.
    """
    if not text:
        return ""

    # (pattern, replacement) pairs; the order matters because each
    # substitution runs on the output of the previous one.
    substitutions = (
        (r'\d{4}-[A-Za-z]{3}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+\s*\[.*?\]\s*', ''),
        (r'\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+\s*', ''),
        (r'IWHT\d+', 'SN'),
        (r'0x[0-9A-Fa-f]+', 'HEX'),
        (r'\b\d{6,}\b', 'N'),
    )

    cleaned = str(text).strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip()

def extract_components(text):
    """Return the unique component-like tokens in `text`, in first-seen order.

    A token is 1-3 uppercase letters, then 1-4 digits, then an optional
    trailing uppercase letter, delimited by word boundaries (e.g. ``U12``,
    ``R5``, ``C3A``). Falsy input yields an empty list.

    Duplicates are dropped while keeping the order of first appearance.
    Going through a `set` would make the order depend on string hash
    randomisation, so anything built from the result could differ from
    one run to the next.
    """
    if not text:
        return []
    matches = re.findall(r'\b([A-Z]{1,3}\d{1,4}[A-Z]?)\b', str(text))
    # dict keys keep insertion order, giving a stable de-duplication.
    return list(dict.fromkeys(matches))

# READ XLSB
# Loads the main sheet into memory: the first row supplies the column names,
# every following row becomes one case (a list of strings).
print(f"[2/6] Reading {XLSB_PATH}...")
wb = open_workbook(XLSB_PATH)
with wb.get_sheet('AI_Training_Combined') as sheet:
    rows = list(sheet.rows())

    # Header row: falsy cells become "", and each name maps to its column
    # position (a repeated name keeps the position of its last occurrence).
    headers = []
    idx = {}
    for col_no, header_cell in enumerate(rows[0]):
        header_name = str(header_cell.v) if header_cell.v else ""
        headers.append(header_name)
        idx[header_name] = col_no

    # Data rows: every cell is rendered as text, falsy values as "".
    cases = []
    for record in rows[1:]:
        cases.append([str(cell.v) if cell.v else "" for cell in record])
print(f"      {len(cases)} cases loaded")

# Read cross-ref sheets
print("[3/6] Reading cross-reference sheets...")


def _data_rows(sheet_name):
    """Return all rows after the first of `sheet_name` as lists of strings.

    Falsy cell values are rendered as "".
    """
    with wb.get_sheet(sheet_name) as ws:
        return [
            [str(cell.v) if cell.v else "" for cell in record]
            for record in list(ws.rows())[1:]
        ]


# Each map is keyed by one column of its sheet; rows that are too short or
# have an empty key are skipped, and a repeated key keeps its last row.
fpga_map = {
    vals[1]: {"fpga": vals[0], "pmu": vals[2], "adate": vals[3]}
    for vals in _data_rows('FPGA_Channel_Map')
    if len(vals) >= 4 and vals[1]
}
memory_map = {
    vals[0]: {"msg": vals[1], "ecc": vals[2], "fpga_conv": vals[3]}
    for vals in _data_rows('Memory_Slot_Failures')
    if len(vals) >= 4 and vals[0]
}
pin_map = {
    vals[0]: {"pins": vals[1], "desc": vals[2]}
    for vals in _data_rows('Pin_Signal_Map')
    if len(vals) >= 3 and vals[0]
}
print(f"      FPGA:{len(fpga_map)} Memory:{len(memory_map)} Pin:{len(pin_map)}")

# BUILD DOCUMENTS
# Turns every case row into one document: an id, a flat text used for
# embedding, and a metadata dict. Also fills comp_index, which maps each
# component token to the ids of the cases that mention it.
print("[4/6] Building documents...")
documents = []
comp_index = defaultdict(list)


def _col(row, header, fallback):
    """Return the cell of `row` under `header`; use position `fallback` when the header is unknown."""
    return row[idx.get(header, fallback)]


for i, row in enumerate(cases):
    doc_id = f"case_{i}"

    fk = _col(row, 'Failure_Key', 1)
    fd = _col(row, 'Failure_Description', 2)
    comps = _col(row, 'Components_Replaced', 5)
    result = _col(row, 'Result', 6)
    best = _col(row, 'Best_Actions (Weighted)', 9)
    priority = _col(row, 'Priority_Replace (Ranked)', 11)
    bkm = _col(row, 'BKM_Procedure', 12)
    bkm_comp = _col(row, 'BKM_Focus_Components', 13)

    norm = normalize_text(fd)
    cf = extract_components(comps)

    # Cross-reference lines: pin information first, then FPGA information,
    # for every component that appears in the respective map.
    xref = []
    for c in cf:
        comp_index[c].append(doc_id)
        pin_entry = pin_map.get(c)
        if pin_entry is not None:
            xref.append(f"{c}: {pin_entry['desc']} ({pin_entry['pins']})")
        fpga_entry = fpga_map.get(c)
        if fpga_entry is not None:
            xref.append(f"{c}: FPGA {fpga_entry['fpga']}, PMU {fpga_entry['pmu']}")

    text = f"Board:{row[0]} Failure:{fk} Error:{norm} Result:{result} Components:{comps} Best:{best} Priority:{priority} BKM:{bkm} BKMComp:{bkm_comp} CrossRef:{'|'.join(xref)}"

    metadata = {
        "failure_key": fk,
        "failure_desc": norm,
        "failure_raw": fd[:200],  # raw description capped at 200 characters
        "components": comps,
        "result": result,
        "best_actions": best,
        "priority_replace": priority,
        "bkm_procedure": bkm,
        "bkm_components": bkm_comp,
        "cross_ref": xref,
        "comps_found": cf,
    }
    documents.append({"id": doc_id, "text": text, "metadata": metadata})
print(f"      {len(documents)} documents")

# BUILD CHROMA DB
# Recreates the "hdmt_cases" collection from scratch and inserts every
# document in batches together with its embedding.
print("[5/6] Building ChromaDB (~5-10 min)...")
import chromadb
from chromadb.config import Settings

COLLECTION_NAME = "hdmt_cases"
EMBED_BATCH_SIZE = 50


def _chroma_metadata(metadata):
    """Return a copy of `metadata` whose list values are "|"-joined strings.

    Chroma documents metadata values as scalars (str/int/float/bool), and the
    documents carry list values that are frequently empty. Flattening them
    here keeps the insert from being rejected, while the in-memory documents
    keep their lists.
    NOTE(review): consumers of the stored metadata must split on "|" -
    confirm against the query side.
    """
    return {
        key: "|".join(str(item) for item in value) if isinstance(value, (list, tuple)) else value
        for key, value in metadata.items()
    }


client = chromadb.PersistentClient(path=CHROMA_DB_PATH, settings=Settings(anonymized_telemetry=False))
try:
    client.delete_collection(COLLECTION_NAME)
except Exception:
    # Best effort: the collection does not exist on a first run, and the
    # exception type raised for that differs between chromadb releases.
    # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
    # If the delete failed for another reason, create_collection below is
    # expected to fail and surface the problem.
    pass
collection = client.create_collection(name=COLLECTION_NAME)

total_docs = len(documents)
for i in range(0, total_docs, EMBED_BATCH_SIZE):
    batch = documents[i:i + EMBED_BATCH_SIZE]
    texts = [d["text"] for d in batch]
    # The "passage: " prefix is added only for embedding; the stored document
    # text stays unprefixed.
    emb = EMBED_MODEL.encode([f"passage: {t}" for t in texts], normalize_embeddings=True).tolist()
    collection.add(
        embeddings=emb,
        documents=texts,
        metadatas=[_chroma_metadata(d["metadata"]) for d in batch],
        ids=[d["id"] for d in batch],
    )
    print(f"      {min(i + EMBED_BATCH_SIZE, total_docs)}/{total_docs}")

# SAVE AUXILIARY
# Writes three JSON side files next to the database: the component -> case-id
# index, the cross-reference maps, and per-failure-key pass/fail statistics.
print("[6/6] Saving auxiliary data...")
with open("component_index.json", "w", encoding="utf-8") as f:
    # De-duplicate while keeping insertion order, so the file is identical
    # from run to run (a set would reorder the ids arbitrarily).
    json.dump({k: list(dict.fromkeys(v)) for k, v in comp_index.items()}, f)
with open("cross_ref_maps.json", "w", encoding="utf-8") as f:
    json.dump({"fpga": fpga_map, "memory": memory_map, "pin": pin_map}, f)

fk_stats = defaultdict(lambda: {"pass": 0, "fail": 0, "total": 0})
for d in documents:
    meta = d["metadata"]
    # Use this document's own failure key. Reusing a variable left over from
    # the document-building loop would credit every result to the last case.
    bucket = fk_stats[meta["failure_key"]]
    bucket["total"] += 1
    if meta["result"] == "Pass":
        bucket["pass"] += 1
    else:
        bucket["fail"] += 1

# "rate" is the pass percentage, rounded to one decimal place.
stats = {
    k: {
        "total": v["total"],
        "pass": v["pass"],
        "fail": v["fail"],
        "rate": round(v["pass"] / v["total"] * 100, 1) if v["total"] else 0,
    }
    for k, v in fk_stats.items()
}
with open("stats_summary.json", "w", encoding="utf-8") as f:
    json.dump(stats, f)

print("\nDone! Database built in ./chroma_db/")
print("Next: bash vllm_server.sh")