Heng2004's picture
Update loader.py
1b5b80c verified
raw
history blame
6.54 kB
# loader.py
import os
import json
from typing import List, Dict, Any
import qa_store
# Base paths (make them relative to this file)
# BASE_DIR is the directory containing this module, so the data files are
# found regardless of the process's current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# All JSONL data files live in the "data" folder next to this module.
DATA_DIR = os.path.join(BASE_DIR, "data")
# Curriculum JSONL, read by load_curriculum().
CURRICULUM_PATH = os.path.join(DATA_DIR, "M_1_U_1.jsonl")
# Manual QA JSONL, read by load_manual_qa() and rewritten by save_manual_qa_file().
MANUAL_QA_PATH = os.path.join(DATA_DIR, "manual_qa.jsonl")
# Glossary JSONL, read by load_glossary().
GLOSSARY_PATH = os.path.join(DATA_DIR, "glossary.jsonl")
def load_curriculum() -> None:
    """
    Load official textbook JSONL into qa_store.ENTRIES and AUTO_QA_KNOWLEDGE.

    Reads CURRICULUM_PATH one JSON document per line and rebuilds:

    * qa_store.ENTRIES           - every JSON object that has a string "text".
    * qa_store.AUTO_QA_KNOWLEDGE - one record per usable {"q", "a"} pair found
                                   under an entry's "qa" list, tagged with
                                   source "auto" and the entry's "id".
    * qa_store.RAW_KNOWLEDGE     - the entry texts joined by blank lines, or a
                                   placeholder message when nothing was loaded.

    Malformed input never aborts the load: blank lines, invalid JSON, JSON
    values that are not objects, entries whose "text" is not a string, a "qa"
    field that is not a list, and pairs that are not objects or whose "q"/"a"
    are not non-empty strings are all skipped.
    """
    qa_store.ENTRIES.clear()
    qa_store.AUTO_QA_KNOWLEDGE.clear()

    if not os.path.exists(CURRICULUM_PATH):
        print(f"[WARN] Curriculum file not found: {CURRICULUM_PATH}")
        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດຖືກໂຫຼດ."
        return

    with open(CURRICULUM_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj: Any = json.loads(line)
            except json.JSONDecodeError:
                print("[WARN] Skipping invalid JSON line in curriculum file.")
                continue

            # json.loads may return a str/list/number; only objects are entries.
            if not isinstance(obj, dict) or "text" not in obj:
                continue
            # A non-string text would make the "\n\n".join below raise TypeError.
            if not isinstance(obj["text"], str):
                print("[WARN] Skipping curriculum entry whose 'text' is not a string.")
                continue
            qa_store.ENTRIES.append(obj)

            # "qa" is optional and may be null or malformed in the data file.
            qa_pairs = obj.get("qa")
            if not isinstance(qa_pairs, list):
                continue

            for pair in qa_pairs:
                if not isinstance(pair, dict):
                    continue
                q = pair.get("q")
                a = pair.get("a")
                if not isinstance(q, str) or not isinstance(a, str):
                    continue
                q = q.strip()
                a = a.strip()
                if not q or not a:
                    continue
                norm_q = qa_store.normalize_question(q)
                qa_store.AUTO_QA_KNOWLEDGE.append(
                    {
                        "norm_q": norm_q,
                        "q": q,
                        "a": a,
                        "source": "auto",
                        "id": obj.get("id", ""),
                    }
                )

    if qa_store.ENTRIES:
        qa_store.RAW_KNOWLEDGE = "\n\n".join(e["text"] for e in qa_store.ENTRIES)
    else:
        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດທີ່ອ່ານໄດ້."
def load_glossary() -> None:
    """
    Refill qa_store.GLOSSARY from the glossary JSONL file.

    The list is emptied first. If GLOSSARY_PATH does not exist a warning is
    printed and the list stays empty. Otherwise every non-blank line that
    parses as JSON is appended as-is; unparsable lines are reported and
    skipped. A summary with the number of loaded terms is printed at the end.
    """
    glossary = qa_store.GLOSSARY
    glossary.clear()

    if os.path.exists(GLOSSARY_PATH):
        with open(GLOSSARY_PATH, "r", encoding="utf-8") as handle:
            for raw in handle:
                stripped = raw.strip()
                if stripped:
                    try:
                        term = json.loads(stripped)
                    except json.JSONDecodeError:
                        print("[WARN] Skipping invalid glossary JSON line")
                    else:
                        glossary.append(term)
        print(f"[INFO] Loaded {len(glossary)} glossary terms.")
    else:
        print(f"[WARN] Glossary file not found: {GLOSSARY_PATH}")
def load_manual_qa() -> None:
    """
    Load manual_qa.jsonl into qa_store.MANUAL_QA_LIST and MANUAL_QA_INDEX.

    Each usable line of MANUAL_QA_PATH is a JSON object with non-empty string
    "q" and "a" fields and an optional "id". For every such record an entry
    {"id", "q", "a", "norm_q"} is appended to MANUAL_QA_LIST and stored in
    MANUAL_QA_INDEX under its normalized question (a later duplicate question
    replaces the earlier one in the index).

    Records without an "id" receive a generated "manual_NNNN" id. Generated
    numbers continue from the largest number seen so far and skip every
    number that an explicit id anywhere in the file already ends with, so a
    generated id cannot duplicate an explicit one that appears later.

    qa_store.NEXT_MANUAL_ID is set to one past the largest id number in use
    (1 when the file is missing or holds no numbered ids).

    Blank lines, invalid JSON, JSON values that are not objects, and records
    with a missing/empty/non-string "q" or "a" are skipped.
    """
    # Imported locally (as the original did) because the module header
    # does not import re; done once per call instead of once per line.
    import re as _re

    qa_store.MANUAL_QA_LIST.clear()
    qa_store.MANUAL_QA_INDEX.clear()

    if not os.path.exists(MANUAL_QA_PATH):
        print(f"[WARN] Manual QA file not found: {MANUAL_QA_PATH}")
        qa_store.NEXT_MANUAL_ID = 1
        return

    trailing_number = _re.compile(r"(\d+)$")

    # Pass 1: parse the file and remember which id numbers are explicitly used.
    records = []  # (explicit_id_or_empty, question, answer)
    reserved_nums = set()
    with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                print("[WARN] Skipping invalid JSON line in manual QA file.")
                continue
            # json.loads may return a str/list/number; only objects are records.
            if not isinstance(obj, dict):
                print("[WARN] Skipping non-object JSON line in manual QA file.")
                continue

            q = obj.get("q")
            a = obj.get("a")
            if not isinstance(q, str) or not isinstance(a, str):
                continue
            q = q.strip()
            a = a.strip()
            if not q or not a:
                continue

            explicit_id = str(obj.get("id") or "")
            m = trailing_number.search(explicit_id)
            if m:
                reserved_nums.add(int(m.group(1)))
            records.append((explicit_id, q, a))

    # Pass 2: assign ids in file order and populate the store.
    max_num = 0
    for explicit_id, q, a in records:
        entry_id = explicit_id
        if entry_id:
            # track biggest number in id
            m = trailing_number.search(entry_id)
            if m:
                max_num = max(max_num, int(m.group(1)))
        else:
            max_num += 1
            # Never hand out a number that an explicit id already owns.
            while max_num in reserved_nums:
                max_num += 1
            entry_id = f"manual_{max_num:04d}"

        norm_q = qa_store.normalize_question(q)
        entry = {
            "id": entry_id,
            "q": q,
            "a": a,
            "norm_q": norm_q,
        }
        qa_store.MANUAL_QA_LIST.append(entry)
        qa_store.MANUAL_QA_INDEX[norm_q] = entry

    qa_store.NEXT_MANUAL_ID = max_num + 1
# loader.py
def generate_new_manual_id() -> str:
    """
    Return the lowest unused id of the form manual_XXXX.

    The trailing digits of every id currently in qa_store.MANUAL_QA_LIST
    count as "used"; the result is built from the smallest positive integer
    not among them, so numbers freed by deleted entries are handed out again.
    qa_store.NEXT_MANUAL_ID is set to the returned number plus one.
    """
    import re as _re

    trailing_digits = _re.compile(r"(\d+)$")

    # Gather the numeric suffix of every id that has one.
    taken = set()
    for record in qa_store.MANUAL_QA_LIST:
        found = trailing_digits.search(str(record.get("id") or ""))
        if found is not None:
            taken.add(int(found.group(1)))

    # The smallest free positive integer can never exceed len(taken) + 1.
    candidate = next(n for n in range(1, len(taken) + 2) if n not in taken)

    # Keep the global counter roughly in step with what was just issued.
    qa_store.NEXT_MANUAL_ID = candidate + 1
    return f"manual_{candidate:04d}"
def save_manual_qa_file() -> None:
    """
    Persist MANUAL_QA_LIST to data/manual_qa.jsonl.

    Writes one JSON object per line containing only "id", "q" and "a"
    (the derived "norm_q" is not stored), with non-ASCII text kept readable.

    The existing file is only replaced after the new content has been fully
    serialized and written: records are serialized in memory first, written
    to a sibling temporary file, and then swapped in with os.replace. A
    failure while serializing or writing therefore leaves the previous
    manual_qa.jsonl untouched instead of truncated.

    Raises:
        KeyError: if an entry lacks "id", "q" or "a".
        OSError: if the directory or file cannot be written.
    """
    # Serialize everything before touching the disk so a bad entry cannot
    # leave a half-written file behind.
    payload = "".join(
        json.dumps({"id": e["id"], "q": e["q"], "a": e["a"]}, ensure_ascii=False) + "\n"
        for e in qa_store.MANUAL_QA_LIST
    )

    os.makedirs(os.path.dirname(MANUAL_QA_PATH), exist_ok=True)

    # Same directory as the target, so the final rename stays on one filesystem.
    tmp_path = MANUAL_QA_PATH + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write(payload)
    os.replace(tmp_path, MANUAL_QA_PATH)
def rebuild_combined_qa() -> None:
    """
    Rebuild qa_store.QA_INDEX and qa_store.ALL_QA_KNOWLEDGE from scratch.

    Both containers are emptied, then filled with the auto-generated items
    followed by the manual ones. Because manual items are written last, their
    answer replaces the auto answer in QA_INDEX whenever the normalized
    question is the same. ALL_QA_KNOWLEDGE receives every item from both
    sources, auto items first.
    """
    index = qa_store.QA_INDEX
    combined = qa_store.ALL_QA_KNOWLEDGE
    index.clear()
    combined.clear()

    # Auto-generated knowledge goes in first (the existing dicts are reused).
    for auto_item in qa_store.AUTO_QA_KNOWLEDGE:
        index[auto_item["norm_q"]] = auto_item["a"]
        combined.append(auto_item)

    # Manual entries come second so they win on identical normalized questions.
    for manual in qa_store.MANUAL_QA_LIST:
        record = dict(
            norm_q=manual["norm_q"],
            q=manual["q"],
            a=manual["a"],
            source="manual",
            id=manual["id"],
        )
        index[record["norm_q"]] = record["a"]
        combined.append(record)
def manual_qa_table_data() -> List[List[str]]:
    """
    Build the Teacher Panel table rows.

    Returns one [id, question, answer] row per entry of
    qa_store.MANUAL_QA_LIST, in list order.
    """
    rows: List[List[str]] = []
    for record in qa_store.MANUAL_QA_LIST:
        rows.append([record["id"], record["q"], record["a"]])
    return rows