# loader.py
"""Load curriculum/glossary/manual Q&A data and sync manual Q&A with a
Hugging Face Dataset repo so teacher edits persist across Space restarts.

Expects an ``HF_TOKEN`` environment variable for read/write access to the
dataset; without it the app still runs but edits are local-only.
"""

import json
import os
import re
from typing import Any, Dict, List

from huggingface_hub import HfApi, hf_hub_download

import qa_store

# --- CONFIGURATION ---
# CHANGE THIS to your actual dataset ID (username/dataset-name)
DATASET_REPO_ID = "YourUsername/lao-science-qa-store"
DATASET_FILENAME = "manual_qa.jsonl"
# ---------------------

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")
CURRICULUM_PATH = os.path.join(DATA_DIR, "M_1_U_1.jsonl")
MANUAL_QA_PATH = os.path.join(DATA_DIR, "manual_qa.jsonl")
GLOSSARY_PATH = os.path.join(DATA_DIR, "glossary.jsonl")

# Matches the trailing digit run of an entry id, e.g. "manual_0042" -> "0042".
# Compiled once at module level instead of `import re` inside hot loops.
_TRAILING_NUM_RE = re.compile(r"(\d+)$")


def sync_download_manual_qa() -> None:
    """
    Startup Step: Download the latest manual_qa.jsonl from HF Dataset.
    If it doesn't exist yet (first run), we ignore the error.
    """
    print("☁️ [Sync] Checking for remote QA data...")
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("⚠️ [Sync] No HF_TOKEN found. Data will not persist across restarts!")
        return
    try:
        # Download file to local path
        os.makedirs(DATA_DIR, exist_ok=True)
        hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename=DATASET_FILENAME,
            repo_type="dataset",
            local_dir=DATA_DIR,
            # NOTE(review): deprecated/no-op in recent huggingface_hub
            # releases (files are always real copies now); kept for
            # compatibility with older versions.
            local_dir_use_symlinks=False,  # force real file
            token=hf_token,
        )
        print("✅ [Sync] Downloaded latest manual_qa.jsonl from Dataset.")
    except Exception as e:
        # Best-effort: on a first run the remote file does not exist yet.
        print(f"ℹ️ [Sync] Could not download remote file (might be first run): {e}")


def load_curriculum() -> None:
    # ... (Keep your existing code for load_curriculum) ...
    pass  # Placeholder to indicate keeping existing code


def load_glossary() -> None:
    # ... (Keep your existing code for load_glossary) ...
    pass


def load_manual_qa() -> None:
    """
    Load manual_qa.jsonl into qa_store.MANUAL_QA_LIST and MANUAL_QA_INDEX.

    Malformed JSON lines and entries missing a question or answer are
    skipped silently. Entries without an ``id`` get a generated
    ``manual_NNNN`` id. ``qa_store.NEXT_MANUAL_ID`` is set to one past the
    highest numeric id suffix seen.
    """
    qa_store.MANUAL_QA_LIST.clear()
    qa_store.MANUAL_QA_INDEX.clear()
    max_num = 0
    if not os.path.exists(MANUAL_QA_PATH):
        print(f"[WARN] Manual QA file not found: {MANUAL_QA_PATH}")
        qa_store.NEXT_MANUAL_ID = 1
        return
    with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            q = (obj.get("q") or "").strip()
            a = (obj.get("a") or "").strip()
            if not q or not a:
                continue
            entry_id = str(obj.get("id") or "")
            if not entry_id:
                max_num += 1
                entry_id = f"manual_{max_num:04d}"
            # Track the biggest trailing number for future ID generation.
            m = _TRAILING_NUM_RE.search(entry_id)
            if m:
                max_num = max(max_num, int(m.group(1)))
            norm_q = qa_store.normalize_question(q)
            entry = {
                "id": entry_id,
                "q": q,
                "a": a,
                "norm_q": norm_q,
            }
            qa_store.MANUAL_QA_LIST.append(entry)
            # Later duplicates of the same normalized question overwrite
            # earlier ones in the index (list keeps all of them).
            qa_store.MANUAL_QA_INDEX[norm_q] = entry
    qa_store.NEXT_MANUAL_ID = max_num + 1 if max_num > 0 else 1


def generate_new_manual_id() -> str:
    """Return the lowest unused ``manual_NNNN`` id among loaded entries."""
    used_nums = set()
    for e in qa_store.MANUAL_QA_LIST:
        raw_id = str(e.get("id") or "")
        m = _TRAILING_NUM_RE.search(raw_id)
        if m:
            used_nums.add(int(m.group(1)))
    i = 1
    while i in used_nums:
        i += 1
    return f"manual_{i:04d}"


def save_manual_qa_file() -> None:
    """
    1. Save to local disk.
    2. Upload to HF Dataset immediately.

    Upload is skipped when ``HF_TOKEN`` is unset; upload failures are
    logged but never raised, so the local save always succeeds on its own.
    """
    os.makedirs(os.path.dirname(MANUAL_QA_PATH), exist_ok=True)
    # 1. Save Local (only the persistent fields; norm_q is derived on load)
    with open(MANUAL_QA_PATH, "w", encoding="utf-8") as f:
        for e in qa_store.MANUAL_QA_LIST:
            obj = {"id": e["id"], "q": e["q"], "a": e["a"]}
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")
    # 2. Upload to HF Dataset
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        try:
            api = HfApi(token=hf_token)
            api.upload_file(
                path_or_fileobj=MANUAL_QA_PATH,
                path_in_repo=DATASET_FILENAME,
                repo_id=DATASET_REPO_ID,
                repo_type="dataset",
                commit_message="Teacher Panel: Updated Q&A",
            )
            print("☁️ [Sync] Uploaded manual_qa.jsonl to Dataset.")
        except Exception as e:
            print(f"❌ [Sync] Upload failed: {e}")


def rebuild_combined_qa() -> None:
    # ... (Keep existing code) ...
    pass


def manual_qa_table_data() -> List[List[str]]:
    """Return [[id, question, answer], ...] rows for the teacher-panel table."""
    return [[e["id"], e["q"], e["a"]] for e in qa_store.MANUAL_QA_LIST]