File size: 9,310 Bytes
a2877a1 31e421c 921357f 237554a 31e421c 921357f 237554a 31e421c 405e720 31e421c 405e720 31e421c 405e720 8798e5f dd57cad e7ab1a5 31e421c e7ab1a5 dd57cad e7ab1a5 dd57cad 8798e5f 31e421c 8798e5f 31e421c 8798e5f 31e421c 8798e5f 31e421c 8798e5f 31e421c 8798e5f 237554a a2877a1 31e421c a2877a1 237554a 31e421c 237554a 31e421c 237554a 31e421c 237554a 31e421c 237554a 31e421c 237554a 31e421c 237554a 31e421c 237554a 1b5b80c 31e421c a2877a1 921357f a2877a1 31e421c a2877a1 31e421c a2877a1 31e421c ad8bcc2 31e421c ad8bcc2 31e421c ad8bcc2 a2877a1 237554a a2877a1 237554a 31e421c 237554a a2877a1 31e421c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 |
# loader.py
import os
import json
from typing import List, Dict, Any
import qa_store
# ---------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------
# Directory containing this module; every data path below is built from it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Root folder that the curriculum and glossary loaders walk recursively.
DATA_DIR = os.path.join(BASE_DIR, "data")
# Keep Manual QA global so Teacher Panel can write to it easily
MANUAL_QA_PATH = os.path.join(DATA_DIR, "manual_qa.jsonl")
# Cache file (Generated locally)
CACHE_FILENAME = "cached_embeddings.pt"
CACHE_PATH = os.path.join(DATA_DIR, CACHE_FILENAME)
# Hugging Face dataset repo that the sync_* functions upload to / download from.
DATASET_REPO_ID = "Heng2004/lao-science-qa-store"
# Name of the manual Q&A file inside that dataset repo.
DATASET_FILENAME = "manual_qa.jsonl"
# ---------------------------------------------------------
# CLOUD SYNC (upload / download via the Hugging Face dataset repo)
# ---------------------------------------------------------
def sync_upload_cache() -> str:
    """Push the local embedding cache file to the Hugging Face dataset repo.

    Returns:
        A short status string: a "skipped" notice when the repo id is empty
        or still contains the ``YOUR_USERNAME`` placeholder, a success
        message, or a failure message that embeds the error text.
    """
    repo_is_set = bool(DATASET_REPO_ID) and "YOUR_USERNAME" not in DATASET_REPO_ID
    if not repo_is_set:
        return "⚠️ Upload Skipped (Repo ID not set)"

    try:
        # Imported inside the try so that a missing huggingface_hub package
        # is reported as an upload failure instead of raising.
        from huggingface_hub import HfApi

        HfApi().upload_file(
            path_or_fileobj=CACHE_PATH,
            path_in_repo=CACHE_FILENAME,
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message="System: Updated embedding cache",
        )
    except Exception as err:
        print(f"[ERROR] Upload cache failed: {err}")
        return f"⚠️ Cache Upload Failed: {err}"

    return "☁️ Cache Upload Success"
def sync_download_cache() -> None:
    """Download cached_embeddings.pt from the dataset repo at startup.

    The file is fetched with ``hf_hub_download`` (using the ``HF_TOKEN``
    environment variable as the token, if set) and copied to ``CACHE_PATH``.
    This is best effort: any failure is printed as a warning and never
    raised to the caller. Nothing happens when ``DATASET_REPO_ID`` is empty.
    """
    if not DATASET_REPO_ID:
        return
    try:
        from huggingface_hub import hf_hub_download
        import shutil

        downloaded_path = hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename=CACHE_FILENAME,
            repo_type="dataset",
            token=os.environ.get("HF_TOKEN"),
        )
        # The data folder may not exist yet; without this the copy fails and
        # is misreported below as a download problem.
        os.makedirs(os.path.dirname(CACHE_PATH), exist_ok=True)
        shutil.copy(downloaded_path, CACHE_PATH)
        print("[INFO] Cache download success!")
    except Exception as e:
        print(f"[WARN] Could not download cache (First run?): {e}")
def sync_upload_manual_qa() -> str:
    """Push the local manual Q&A file to the Hugging Face dataset repo.

    Returns:
        A short status string: a "skipped" notice when the repo id is empty
        or still contains the ``YOUR_USERNAME`` placeholder, a success
        message, or a failure message that embeds the error text.
    """
    repo_is_set = bool(DATASET_REPO_ID) and "YOUR_USERNAME" not in DATASET_REPO_ID
    if not repo_is_set:
        return "⚠️ Upload Skipped"

    try:
        # Imported inside the try so that a missing huggingface_hub package
        # is reported as an upload failure instead of raising.
        from huggingface_hub import HfApi

        HfApi().upload_file(
            path_or_fileobj=MANUAL_QA_PATH,
            path_in_repo=DATASET_FILENAME,
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message="Teacher Panel: Updated Q&A data",
        )
    except Exception as err:
        return f"⚠️ Cloud Upload Failed: {err}"

    return "☁️ Cloud Upload Success"
def sync_download_manual_qa() -> None:
    """Download manual_qa.jsonl from the dataset repo into the data folder.

    The file is fetched with ``hf_hub_download`` (using the ``HF_TOKEN``
    environment variable as the token, if set) and copied to
    ``MANUAL_QA_PATH``. This is best effort: any failure is printed as a
    warning and never raised to the caller. Nothing happens when
    ``DATASET_REPO_ID`` is empty.
    """
    if not DATASET_REPO_ID:
        return
    try:
        from huggingface_hub import hf_hub_download
        import shutil

        downloaded_path = hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename=DATASET_FILENAME,
            repo_type="dataset",
            token=os.environ.get("HF_TOKEN"),
        )
        # The data folder may not exist yet; without this the copy fails and
        # is misreported below as a download problem.
        os.makedirs(os.path.dirname(MANUAL_QA_PATH), exist_ok=True)
        shutil.copy(downloaded_path, MANUAL_QA_PATH)
        print("[INFO] Manual QA download success!")
    except Exception as e:
        print(f"[WARN] Could not download manual_qa.jsonl: {e}")
# ---------------------------------------------------------
# RECURSIVE LOADERS (walk data/ for textbook and glossary JSONL files)
# ---------------------------------------------------------
def load_curriculum() -> None:
    """Recursively find and load all textbook JSONL files under ``DATA_DIR``.

    A file counts as textbook content when it is named ``textbook.jsonl`` or
    when its name starts with an uppercase ``M`` and ends with ``.jsonl``.
    The match is case-sensitive, so ``manual_qa.jsonl`` is not picked up.

    Side effects on ``qa_store``:
        * ``ENTRIES`` and ``AUTO_QA_KNOWLEDGE`` are cleared, then refilled by
          ``_parse_curriculum_file`` for every matching file.
        * ``RAW_KNOWLEDGE`` is set to the entries' ``text`` fields joined by
          blank lines, or to a placeholder message when nothing was loaded.

    Directories and files are visited in sorted order so the resulting entry
    order is the same on every machine, rather than depending on the
    filesystem's directory-listing order.
    """
    qa_store.ENTRIES.clear()
    qa_store.AUTO_QA_KNOWLEDGE.clear()
    print(f"[INFO] Scanning {DATA_DIR} for textbook content...")

    file_count = 0
    # os.walk goes deep into M_1/U_1/...
    for root, dirs, files in os.walk(DATA_DIR):
        # Sorting dirs in place makes os.walk descend in a stable order.
        dirs.sort()
        for file in sorted(files):
            is_textbook = file == "textbook.jsonl" or (
                file.startswith("M") and file.endswith(".jsonl")
            )
            if not is_textbook:
                continue
            _parse_curriculum_file(os.path.join(root, file))
            file_count += 1

    if qa_store.ENTRIES:
        qa_store.RAW_KNOWLEDGE = "\n\n".join(e["text"] for e in qa_store.ENTRIES)
        print(f"[INFO] Loaded {len(qa_store.ENTRIES)} entries from {file_count} files.")
    else:
        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນ."
        print("[WARN] No curriculum files found.")
def _parse_curriculum_file(path: str) -> None:
    """Read one textbook JSONL file into ``qa_store``.

    Each non-blank line is parsed as JSON. A row is kept only when it is a
    JSON object containing a ``"text"`` key; it is then appended to
    ``qa_store.ENTRIES``. Every usable question/answer pair in its optional
    ``"qa"`` list is appended to ``qa_store.AUTO_QA_KNOWLEDGE`` with
    ``source="auto"`` and the row's ``id``.

    Malformed input is skipped rather than aborting the whole load: invalid
    JSON, rows that are not objects, a ``"qa"`` value that is not a list
    (including ``null``), pairs that are not objects, and pairs whose ``q``
    or ``a`` is missing, empty, or not a string.

    Args:
        path: Path of the JSONL file to read (opened as UTF-8).
    """
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON can still be a list/number/string; only objects
            # with a "text" field are curriculum entries.
            if not isinstance(obj, dict) or "text" not in obj:
                continue
            qa_store.ENTRIES.append(obj)

            # Extract Auto-QA
            qa_pairs = obj.get("qa")
            if not isinstance(qa_pairs, list):
                continue
            for pair in qa_pairs:
                if not isinstance(pair, dict):
                    continue
                q = pair.get("q")
                a = pair.get("a")
                if not isinstance(q, str) or not isinstance(a, str):
                    continue
                q = q.strip()
                a = a.strip()
                if q and a:
                    qa_store.AUTO_QA_KNOWLEDGE.append({
                        "norm_q": qa_store.normalize_question(q),
                        "q": q,
                        "a": a,
                        "source": "auto",
                        "id": obj.get("id", ""),
                    })
def load_glossary() -> None:
    """Recursively find and load all glossary JSONL files under ``DATA_DIR``.

    A file is treated as a glossary when its name contains ``glossary``
    anywhere and ends with ``.jsonl``. Every non-blank line that parses as
    JSON is appended to ``qa_store.GLOSSARY`` (which is cleared first);
    lines that are not valid JSON are skipped.

    Directories and files are visited in sorted order so the glossary order
    is the same on every machine, rather than depending on the filesystem's
    directory-listing order.
    """
    qa_store.GLOSSARY.clear()
    print(f"[INFO] Scanning {DATA_DIR} for glossary files...")

    for root, dirs, files in os.walk(DATA_DIR):
        # Sorting dirs in place makes os.walk descend in a stable order.
        dirs.sort()
        for file in sorted(files):
            is_glossary = "glossary" in file and file.endswith(".jsonl")
            if not is_glossary:
                continue
            full_path = os.path.join(root, file)
            with open(full_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        obj = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    qa_store.GLOSSARY.append(obj)

    print(f"[INFO] Loaded {len(qa_store.GLOSSARY)} glossary terms.")
# ---------------------------------------------------------
# MANUAL QA & UTILS (teacher-entered Q&A: load, save, ids, combined index)
# ---------------------------------------------------------
def load_manual_qa() -> None:
    """Load the manual Q&A JSONL file into ``qa_store``.

    ``MANUAL_QA_LIST`` and ``MANUAL_QA_INDEX`` are cleared and refilled from
    ``MANUAL_QA_PATH``. Each kept row becomes
    ``{"id", "q", "a", "norm_q"}``; the index is keyed by ``norm_q``, so a
    later row with the same normalized question replaces the earlier one in
    the index while both remain in the list.

    ``qa_store.NEXT_MANUAL_ID`` is set to one more than the largest trailing
    number found in any row id, or to 1 when none is found. This includes
    rows later rejected for an empty question or answer. If the file is
    missing, it is set to 1.

    Malformed input is skipped rather than aborting the load: invalid JSON,
    rows that are not JSON objects, and rows whose ``q`` or ``a`` is
    missing, empty, or not a string.
    """
    import re

    qa_store.MANUAL_QA_LIST.clear()
    qa_store.MANUAL_QA_INDEX.clear()

    if not os.path.exists(MANUAL_QA_PATH):
        print(f"[WARN] Manual QA file not found: {MANUAL_QA_PATH}")
        qa_store.NEXT_MANUAL_ID = 1
        return

    # Compiled once instead of importing/searching from scratch per line.
    trailing_number = re.compile(r"(\d+)$")
    max_num = 0

    with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object has no id/q/a to read.
            if not isinstance(obj, dict):
                continue

            # ID tracking logic: runs before q/a validation so that ids of
            # rejected rows still count towards the maximum.
            entry_id = str(obj.get("id") or "")
            m = trailing_number.search(entry_id)
            if m:
                max_num = max(max_num, int(m.group(1)))

            q = obj.get("q")
            a = obj.get("a")
            if not isinstance(q, str) or not isinstance(a, str):
                continue
            q = q.strip()
            a = a.strip()
            if not (q and a):
                continue

            norm_q = qa_store.normalize_question(q)
            entry = {"id": entry_id, "q": q, "a": a, "norm_q": norm_q}
            qa_store.MANUAL_QA_LIST.append(entry)
            qa_store.MANUAL_QA_INDEX[norm_q] = entry

    # max_num is never negative, so this yields 1 when no id was found.
    qa_store.NEXT_MANUAL_ID = max_num + 1
def generate_new_manual_id() -> str:
    """Return the lowest unused manual id, formatted as ``manual_NNNN``.

    The trailing digits of every id in ``qa_store.MANUAL_QA_LIST`` are
    treated as taken; the smallest positive integer not among them is
    chosen, so gaps left by removed entries are reused.
    """
    import re

    taken = set()
    for entry in qa_store.MANUAL_QA_LIST:
        found = re.search(r"(\d+)$", str(entry.get("id") or ""))
        if found is not None:
            taken.add(int(found.group(1)))

    # Among 1..len(taken)+1 at least one number must be free, so this
    # bounded search always finds the same value as counting up from 1.
    free = next(n for n in range(1, len(taken) + 2) if n not in taken)
    return f"manual_{free:04d}"
def save_manual_qa_file() -> None:
    """Rewrite the manual Q&A file from ``qa_store.MANUAL_QA_LIST``.

    The parent folder of ``MANUAL_QA_PATH`` is created if needed. The file
    is overwritten with one JSON object per line holding only the ``id``,
    ``q`` and ``a`` fields, written as UTF-8 without ASCII escaping.
    """
    target_dir = os.path.dirname(MANUAL_QA_PATH)
    os.makedirs(target_dir, exist_ok=True)

    with open(MANUAL_QA_PATH, "w", encoding="utf-8") as out:
        for entry in qa_store.MANUAL_QA_LIST:
            record = {key: entry[key] for key in ("id", "q", "a")}
            serialized = json.dumps(record, ensure_ascii=False)
            out.write(serialized + "\n")
def rebuild_combined_qa() -> None:
    """Rebuild the combined Q&A structures from the auto and manual sources.

    ``qa_store.QA_INDEX`` (normalized question -> answer) and
    ``qa_store.ALL_QA_KNOWLEDGE`` are cleared and refilled: first with the
    auto-extracted items as they are, then with one ``source="manual"``
    record per manual entry. Manual entries are written last, so they
    override an auto answer for the same normalized question in the index.
    """
    index = qa_store.QA_INDEX
    combined = qa_store.ALL_QA_KNOWLEDGE
    index.clear()
    combined.clear()

    for auto_item in qa_store.AUTO_QA_KNOWLEDGE:
        index[auto_item["norm_q"]] = auto_item["a"]
        combined.append(auto_item)

    for manual in qa_store.MANUAL_QA_LIST:
        record = {
            "norm_q": manual["norm_q"],
            "q": manual["q"],
            "a": manual["a"],
            "source": "manual",
            "id": manual["id"],
        }
        index[record["norm_q"]] = record["a"]
        combined.append(record)
def manual_qa_table_data() -> List[List[str]]:
    """Return the manual Q&A entries as rows of ``[id, question, answer]``.

    Rows follow the order of ``qa_store.MANUAL_QA_LIST``.
    """
    rows: List[List[str]] = []
    for entry in qa_store.MANUAL_QA_LIST:
        rows.append([entry["id"], entry["q"], entry["a"]])
    return rows