# Page header captured from the Hugging Face file viewer (not part of the module):
#   author: Heng2004 | commit 04301d9 (verified): "Update loader.py" | size: 4.98 kB
# loader.py
import os
import json
from typing import List, Dict, Any
from huggingface_hub import hf_hub_download, HfApi
import qa_store
# --- CONFIGURATION ---
# Hugging Face dataset repository that stores the manual Q&A file.
# CHANGE THIS to your actual dataset ID (username/dataset-name)
DATASET_REPO_ID = "YourUsername/lao-science-qa-store"
# Name of the Q&A file inside the dataset repository; used both as the
# download `filename` and as the upload `path_in_repo`.
DATASET_FILENAME = "manual_qa.jsonl"
# ---------------------
# Local data files live in a "data" directory next to this module.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")
CURRICULUM_PATH = os.path.join(DATA_DIR, "M_1_U_1.jsonl")
# NOTE: sync_download_manual_qa() downloads DATASET_FILENAME into DATA_DIR,
# while load_manual_qa() / save_manual_qa_file() use MANUAL_QA_PATH, so the
# file name here must stay identical to DATASET_FILENAME.
MANUAL_QA_PATH = os.path.join(DATA_DIR, "manual_qa.jsonl")
GLOSSARY_PATH = os.path.join(DATA_DIR, "glossary.jsonl")
def sync_download_manual_qa():
    """Download the latest manual Q&A file from the Hugging Face dataset.

    Intended as a startup step. The sync is best-effort and never raises:

    * If the ``HF_TOKEN`` environment variable is missing or empty, a warning
      is printed and the function returns without contacting the Hub.
    * Any error during the download (for example the remote file not existing
      yet on a first run) is printed and otherwise ignored.

    On success ``DATASET_FILENAME`` from ``DATASET_REPO_ID`` is placed in
    ``DATA_DIR`` (created if necessary).

    Returns:
        None.
    """
    print("☁️ [Sync] Checking for remote QA data...")

    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("⚠️ [Sync] No HF_TOKEN found. Data will not persist across restarts!")
        return

    try:
        # Download file to local path
        os.makedirs(DATA_DIR, exist_ok=True)
        hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename=DATASET_FILENAME,
            repo_type="dataset",
            local_dir=DATA_DIR,
            local_dir_use_symlinks=False,  # force real file
            token=hf_token,
        )
    except Exception as e:
        # Deliberately broad: a failed sync must not prevent the app from
        # starting with whatever local data is available.
        print(f"ℹ️ [Sync] Could not download remote file (might be first run): {e}")
    else:
        print("✅ [Sync] Downloaded latest manual_qa.jsonl from Dataset.")
def load_curriculum() -> None:
    """Placeholder for the curriculum loader.

    The real implementation is not included in this revision of the file;
    as written here the function does nothing and returns ``None``.
    """
    # ... (Keep your existing code for load_curriculum) ...
def load_glossary() -> None:
    """Placeholder for the glossary loader.

    The real implementation is not included in this revision of the file;
    as written here the function does nothing and returns ``None``.
    """
    # ... (Keep your existing code for load_glossary) ...
def load_manual_qa() -> None:
    """Load manual_qa.jsonl into qa_store.MANUAL_QA_LIST and MANUAL_QA_INDEX.

    Both containers are cleared first and then repopulated from the JSON-lines
    file at ``MANUAL_QA_PATH``. Each stored entry is a dict with the keys
    ``id``, ``q``, ``a`` and ``norm_q`` (the question passed through
    ``qa_store.normalize_question``); ``MANUAL_QA_INDEX`` is keyed by
    ``norm_q``, so a later entry with the same normalized question replaces
    the earlier one in the index (both stay in the list).

    A line is skipped when it is blank, is not valid JSON, is not a JSON
    object, or does not carry a non-empty string for both ``q`` and ``a``.

    Entries without an ``id`` receive a generated ``manual_NNNN`` id that is
    guaranteed not to clash with any id present in the file. Finally
    ``qa_store.NEXT_MANUAL_ID`` is set to one more than the largest trailing
    number found in any id (``1`` when there are none or the file is missing).
    """
    import re  # local import: only needed here, and only once per call

    qa_store.MANUAL_QA_LIST.clear()
    qa_store.MANUAL_QA_INDEX.clear()

    if not os.path.exists(MANUAL_QA_PATH):
        print(f"[WARN] Manual QA file not found: {MANUAL_QA_PATH}")
        qa_store.NEXT_MANUAL_ID = 1
        return

    trailing_number = re.compile(r"(\d+)$")

    # Pass 1: parse and validate every line, remembering the raw id (if any).
    records = []  # (raw_id or "", question, answer)
    with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object (list, number, ...) has no
            # fields to read; skip it instead of crashing on ``.get``.
            if not isinstance(obj, dict):
                continue
            q = obj.get("q")
            a = obj.get("a")
            # Non-string values cannot be stripped/normalized; treat as missing.
            if not isinstance(q, str) or not isinstance(a, str):
                continue
            q = q.strip()
            a = a.strip()
            if not q or not a:
                continue
            records.append((str(obj.get("id") or ""), q, a))

    # Ids that appear explicitly anywhere in the file. Generated ids must
    # avoid all of them, including ones on lines after the id-less entry.
    taken_ids = {raw_id for raw_id, _, _ in records if raw_id}

    # Pass 2: assign missing ids and build the store.
    max_num = 0
    for entry_id, q, a in records:
        if not entry_id:
            max_num += 1
            entry_id = f"manual_{max_num:04d}"
            while entry_id in taken_ids:
                max_num += 1
                entry_id = f"manual_{max_num:04d}"
            taken_ids.add(entry_id)

        # track biggest number for ID generation
        m = trailing_number.search(entry_id)
        if m:
            max_num = max(max_num, int(m.group(1)))

        norm_q = qa_store.normalize_question(q)
        entry = {
            "id": entry_id,
            "q": q,
            "a": a,
            "norm_q": norm_q,
        }
        qa_store.MANUAL_QA_LIST.append(entry)
        qa_store.MANUAL_QA_INDEX[norm_q] = entry

    qa_store.NEXT_MANUAL_ID = max_num + 1
def generate_new_manual_id() -> str:
    """Return the lowest unused id of the form ``manual_NNNN``.

    The trailing run of digits of every id in ``qa_store.MANUAL_QA_LIST`` is
    treated as that entry's number (ids without trailing digits are ignored).
    The result uses the smallest positive integer not in that set, zero-padded
    to at least four digits, so gaps left by removed entries are reused.
    """
    import re
    from itertools import count

    trailing_number = re.compile(r"(\d+)$")

    used_nums = set()
    for entry in qa_store.MANUAL_QA_LIST:
        match = trailing_number.search(str(entry.get("id") or ""))
        if match:
            used_nums.add(int(match.group(1)))

    first_free = next(n for n in count(1) if n not in used_nums)
    return f"manual_{first_free:04d}"
def save_manual_qa_file() -> None:
    """Persist the manual Q&A list locally and push it to the HF dataset.

    1. Save to local disk: every entry of ``qa_store.MANUAL_QA_LIST`` is
       written to ``MANUAL_QA_PATH`` as one JSON object per line containing
       only ``id``, ``q`` and ``a``. The data is written to a temporary file
       next to the target and then moved into place with ``os.replace``, so
       an interrupted or failed save never leaves a truncated Q&A file.
    2. Upload to HF Dataset immediately: only when the ``HF_TOKEN``
       environment variable is set. Upload errors are printed, not raised,
       so the local save still counts as successful.

    Raises:
        OSError: if the local file cannot be written.
        KeyError: if an entry lacks ``id``, ``q`` or ``a``.
    """
    os.makedirs(os.path.dirname(MANUAL_QA_PATH), exist_ok=True)

    # 1. Save Local (write-then-rename keeps the existing file intact on error)
    tmp_path = MANUAL_QA_PATH + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.writelines(
                json.dumps(
                    {"id": e["id"], "q": e["q"], "a": e["a"]},
                    ensure_ascii=False,
                )
                + "\n"
                for e in qa_store.MANUAL_QA_LIST
            )
        os.replace(tmp_path, MANUAL_QA_PATH)
    except Exception:
        # Do not leave a half-written temporary file behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    # 2. Upload to HF Dataset
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        return

    try:
        api = HfApi(token=hf_token)
        api.upload_file(
            path_or_fileobj=MANUAL_QA_PATH,
            path_in_repo=DATASET_FILENAME,
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message="Teacher Panel: Updated Q&A",
        )
    except Exception as e:
        # Deliberately broad: a failed upload must not undo or block the
        # local save that already succeeded.
        print(f"❌ [Sync] Upload failed: {e}")
    else:
        print("☁️ [Sync] Uploaded manual_qa.jsonl to Dataset.")
def rebuild_combined_qa() -> None:
    """Placeholder for rebuilding the combined Q&A data.

    The real implementation is not included in this revision of the file;
    as written here the function does nothing and returns ``None``.
    """
    # ... (Keep existing code) ...
def manual_qa_table_data() -> List[List[str]]:
    """Return the manual Q&A entries as table rows.

    Returns:
        One ``[id, q, a]`` row per entry of ``qa_store.MANUAL_QA_LIST``, in
        list order. A new list is built on every call.
    """
    # ... (Keep existing code) ...
    return [[e["id"], e["q"], e["a"]] for e in qa_store.MANUAL_QA_LIST]