Spaces:

Heng2004
/

Laos-Natural-Science-Chatbot

Running

App Files Files Community

Heng2004 committed on 2 days ago

Commit

04301d9

verified ·

1 Parent(s): 9403520

Update loader.py

Browse files

Files changed (1) hide show

loader.py +60 -126

loader.py CHANGED Viewed

@@ -2,92 +2,55 @@
 import os
 import json
 from typing import List, Dict, Any
 import qa_store
-# Base paths (make them relative to this file)
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 DATA_DIR = os.path.join(BASE_DIR, "data")
 CURRICULUM_PATH = os.path.join(DATA_DIR, "M_1_U_1.jsonl")
 MANUAL_QA_PATH = os.path.join(DATA_DIR, "manual_qa.jsonl")
 GLOSSARY_PATH = os.path.join(DATA_DIR, "glossary.jsonl")
-def load_curriculum() -> None:
     """
-    Load official textbook JSONL into qa_store.ENTRIES and AUTO_QA_KNOWLEDGE.
     """
-    qa_store.ENTRIES.clear()
-    qa_store.AUTO_QA_KNOWLEDGE.clear()
-    if not os.path.exists(CURRICULUM_PATH):
-        print(f"[WARN] Curriculum file not found: {CURRICULUM_PATH}")
-        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດຖືກໂຫຼດ."
         return
-    with open(CURRICULUM_PATH, "r", encoding="utf-8") as f:
-        for line in f:
-            line = line.strip()
-            if not line:
-                continue
-            try:
-                obj: Dict[str, Any] = json.loads(line)
-            except json.JSONDecodeError:
-                print("[WARN] Skipping invalid JSON line in curriculum file.")
-                continue
-            if "text" not in obj:
-                continue
-            qa_store.ENTRIES.append(obj)
-            for pair in obj.get("qa", []):
-                q = (pair.get("q") or "").strip()
-                a = (pair.get("a") or "").strip()
-                if not q or not a:
-                    continue
-                norm_q = qa_store.normalize_question(q)
-                qa_store.AUTO_QA_KNOWLEDGE.append(
-                    {
-                        "norm_q": norm_q,
-                        "q": q,
-                        "a": a,
-                        "source": "auto",
-                        "id": obj.get("id", ""),
-                    }
-                )
-    if qa_store.ENTRIES:
-        qa_store.RAW_KNOWLEDGE = "\n\n".join(e["text"] for e in qa_store.ENTRIES)
-    else:
-        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດທີ່ອ່ານໄດ້."
 def load_glossary() -> None:
-    """Load glossary entries into qa_store.GLOSSARY."""
-    qa_store.GLOSSARY.clear()
-    if not os.path.exists(GLOSSARY_PATH):
-        print(f"[WARN] Glossary file not found: {GLOSSARY_PATH}")
-        return
-    with open(GLOSSARY_PATH, "r", encoding="utf-8") as f:
-        for line in f:
-            line = line.strip()
-            if not line:
-                continue
-            try:
-                obj = json.loads(line)
-            except json.JSONDecodeError:
-                print("[WARN] Skipping invalid glossary JSON line")
-                continue
-            qa_store.GLOSSARY.append(obj)
-    print(f"[INFO] Loaded {len(qa_store.GLOSSARY)} glossary terms.")
 def load_manual_qa() -> None:
     """
@@ -102,7 +65,6 @@ def load_manual_qa() -> None:
         qa_store.NEXT_MANUAL_ID = 1
         return
     with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
         for line in f:
             line = line.strip()
@@ -111,10 +73,8 @@ def load_manual_qa() -> None:
             try:
                 obj = json.loads(line)
             except json.JSONDecodeError:
-                print("[WARN] Skipping invalid JSON line in manual QA file.")
                 continue
             q = (obj.get("q") or "").strip()
             a = (obj.get("a") or "").strip()
             if not q or not a:
@@ -125,9 +85,8 @@ def load_manual_qa() -> None:
                 max_num += 1
                 entry_id = f"manual_{max_num:04d}"
-            # track biggest number in id
             import re as _re
             m = _re.search(r"(\d+)$", entry_id)
             if m:
                 max_num = max(max_num, int(m.group(1)))
@@ -144,78 +103,53 @@ def load_manual_qa() -> None:
     qa_store.NEXT_MANUAL_ID = max_num + 1 if max_num > 0 else 1
-# loader.py
 def generate_new_manual_id() -> str:
-    """
-    Generate the smallest free manual_XXXX ID based on the
-    current MANUAL_QA_LIST (so gaps like 11 after delete
-    are reused).
-    """
     import re as _re
     used_nums = set()
-    # collect all numbers that are already used in IDs
     for e in qa_store.MANUAL_QA_LIST:
         raw_id = str(e.get("id") or "")
         m = _re.search(r"(\d+)$", raw_id)
         if m:
             used_nums.add(int(m.group(1)))
-    # find the smallest positive integer that is not used
     i = 1
     while i in used_nums:
         i += 1
-    # keep the global counter roughly in sync (optional)
-    qa_store.NEXT_MANUAL_ID = i + 1
     return f"manual_{i:04d}"
 def save_manual_qa_file() -> None:
     """
-    Persist MANUAL_QA_LIST to data/manual_qa.jsonl.
     """
     os.makedirs(os.path.dirname(MANUAL_QA_PATH), exist_ok=True)
     with open(MANUAL_QA_PATH, "w", encoding="utf-8") as f:
         for e in qa_store.MANUAL_QA_LIST:
             obj = {"id": e["id"], "q": e["q"], "a": e["a"]}
             f.write(json.dumps(obj, ensure_ascii=False) + "\n")
 def rebuild_combined_qa() -> None:
-    """
-    Combine auto and manual QA into QA_INDEX & ALL_QA_KNOWLEDGE.
-    Manual answers override auto ones if same normalized question.
-    """
-    qa_store.QA_INDEX.clear()
-    qa_store.ALL_QA_KNOWLEDGE.clear()
-    # auto first
-    for item in qa_store.AUTO_QA_KNOWLEDGE:
-        norm_q = item["norm_q"]
-        qa_store.QA_INDEX[norm_q] = item["a"]
-        qa_store.ALL_QA_KNOWLEDGE.append(item)
-    # manual overrides
-    for e in qa_store.MANUAL_QA_LIST:
-        item = {
-            "norm_q": e["norm_q"],
-            "q": e["q"],
-            "a": e["a"],
-            "source": "manual",
-            "id": e["id"],
-        }
-        qa_store.QA_INDEX[item["norm_q"]] = item["a"]
-        qa_store.ALL_QA_KNOWLEDGE.append(item)
 def manual_qa_table_data() -> List[List[str]]:
-    """
-    Table rows for Teacher Panel.
-    """
-    return [[e["id"], e["q"], e["a"]] for e in qa_store.MANUAL_QA_LIST]

 import os
 import json
 from typing import List, Dict, Any
+from huggingface_hub import hf_hub_download, HfApi
 import qa_store
# --- CONFIGURATION ---
# Hugging Face dataset repo that stores the manual Q&A file, written as
# "username/dataset-name". Set the HF_DATASET_REPO_ID environment variable to
# point at the real dataset; the fallback below is only a placeholder and must
# be replaced (or overridden) before remote sync can work.
DATASET_REPO_ID = (
    os.environ.get("HF_DATASET_REPO_ID") or "YourUsername/lao-science-qa-store"
)
# Name of the manual Q&A file inside the dataset repo.
DATASET_FILENAME = "manual_qa.jsonl"
# ---------------------

# Local data paths, all resolved relative to the directory containing this file.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")
CURRICULUM_PATH = os.path.join(DATA_DIR, "M_1_U_1.jsonl")
MANUAL_QA_PATH = os.path.join(DATA_DIR, "manual_qa.jsonl")
GLOSSARY_PATH = os.path.join(DATA_DIR, "glossary.jsonl")
def sync_download_manual_qa() -> None:
    """
    Startup step: pull the latest manual Q&A file from the Hugging Face dataset.

    Reads the access token from the ``HF_TOKEN`` environment variable. When the
    token is missing, only a warning is printed and the function returns.
    Otherwise ``DATASET_FILENAME`` is downloaded from ``DATASET_REPO_ID``
    (repo type "dataset") into ``DATA_DIR``.

    The download is best-effort: any failure (for example the file not existing
    yet on a first run) is printed and swallowed so that startup can continue.
    """
    print("☁️ [Sync] Checking for remote QA data...")

    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("⚠️ [Sync] No HF_TOKEN found. Data will not persist across restarts!")
        return

    try:
        os.makedirs(DATA_DIR, exist_ok=True)
        # `local_dir_use_symlinks` is deliberately not passed: it is deprecated
        # in current huggingface_hub releases. Passing it risks a warning or an
        # argument error, which the broad handler below would misreport as a
        # harmless "first run" miss.
        hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename=DATASET_FILENAME,
            repo_type="dataset",
            local_dir=DATA_DIR,
            token=hf_token,
        )
        print("✅ [Sync] Downloaded latest manual_qa.jsonl from Dataset.")
    except Exception as e:
        # Deliberately broad: a failed sync must never block application startup.
        print(f"ℹ️ [Sync] Could not download remote file (might be first run): {e}")
def load_curriculum() -> None:
    """
    Load the curriculum JSONL file into qa_store.

    Clears ``qa_store.ENTRIES`` and ``qa_store.AUTO_QA_KNOWLEDGE``, then reads
    ``CURRICULUM_PATH`` line by line. Every JSON object that has a "text" key
    is appended to ``ENTRIES``; each of its "qa" pairs with a non-empty
    question and answer is appended to ``AUTO_QA_KNOWLEDGE`` together with its
    normalized question. Finally ``qa_store.RAW_KNOWLEDGE`` is set to the
    entries' texts joined by blank lines, or to a fallback message when the
    file is missing or yields no entries.

    Blank lines, invalid JSON lines and objects without "text" are skipped.
    """
    qa_store.ENTRIES.clear()
    qa_store.AUTO_QA_KNOWLEDGE.clear()

    if not os.path.exists(CURRICULUM_PATH):
        print(f"[WARN] Curriculum file not found: {CURRICULUM_PATH}")
        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດຖືກໂຫຼດ."
        return

    with open(CURRICULUM_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            try:
                obj: Dict[str, Any] = json.loads(line)
            except json.JSONDecodeError:
                print("[WARN] Skipping invalid JSON line in curriculum file.")
                continue

            if "text" not in obj:
                continue

            qa_store.ENTRIES.append(obj)

            # Index the question/answer pairs attached to this entry.
            for pair in obj.get("qa", []):
                q = (pair.get("q") or "").strip()
                a = (pair.get("a") or "").strip()
                if not q or not a:
                    continue
                norm_q = qa_store.normalize_question(q)
                qa_store.AUTO_QA_KNOWLEDGE.append(
                    {
                        "norm_q": norm_q,
                        "q": q,
                        "a": a,
                        "source": "auto",
                        "id": obj.get("id", ""),
                    }
                )

    if qa_store.ENTRIES:
        qa_store.RAW_KNOWLEDGE = "\n\n".join(e["text"] for e in qa_store.ENTRIES)
    else:
        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດທີ່ອ່ານໄດ້."
 def load_glossary() -> None:
     """
     Load glossary entries from GLOSSARY_PATH into qa_store.GLOSSARY.

     Clears ``qa_store.GLOSSARY`` first. If the file does not exist, a warning
     is printed and the list stays empty. Otherwise every non-blank line that
     parses as JSON is appended; invalid JSON lines are skipped with a warning.
     The number of loaded terms is printed at the end.
     """
     qa_store.GLOSSARY.clear()

     if not os.path.exists(GLOSSARY_PATH):
         print(f"[WARN] Glossary file not found: {GLOSSARY_PATH}")
         return

     with open(GLOSSARY_PATH, "r", encoding="utf-8") as f:
         for line in f:
             line = line.strip()
             if not line:
                 continue
             try:
                 obj = json.loads(line)
             except json.JSONDecodeError:
                 print("[WARN] Skipping invalid glossary JSON line")
                 continue
             qa_store.GLOSSARY.append(obj)

     print(f"[INFO] Loaded {len(qa_store.GLOSSARY)} glossary terms.")
 def load_manual_qa() -> None:
     """
         qa_store.NEXT_MANUAL_ID = 1
         return
     with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
         for line in f:
             line = line.strip()
             try:
                 obj = json.loads(line)
             except json.JSONDecodeError:
                 continue
             q = (obj.get("q") or "").strip()
             a = (obj.get("a") or "").strip()
             if not q or not a:
                 max_num += 1
                 entry_id = f"manual_{max_num:04d}"
+            # track biggest number for ID generation
             import re as _re
             m = _re.search(r"(\d+)$", entry_id)
             if m:
                 max_num = max(max_num, int(m.group(1)))
     qa_store.NEXT_MANUAL_ID = max_num + 1 if max_num > 0 else 1
 def generate_new_manual_id() -> str:
     """
     Return the smallest unused ``manual_XXXX`` ID.

     The trailing digits of every ID in ``qa_store.MANUAL_QA_LIST`` are
     collected, and the lowest positive integer not among them is formatted
     as a zero-padded four-digit ID, so gaps left by deleted entries are
     reused.
     """
     import re as _re

     trailing_digits = _re.compile(r"(\d+)$")
     taken = set()
     for entry in qa_store.MANUAL_QA_LIST:
         match = trailing_digits.search(str(entry.get("id") or ""))
         if match is not None:
             taken.add(int(match.group(1)))

     # Among 1..len(taken)+1 at least one number must be free.
     free = next(n for n in range(1, len(taken) + 2) if n not in taken)
     return f"manual_{free:04d}"
 def save_manual_qa_file() -> None:
     """
     Persist qa_store.MANUAL_QA_LIST locally, then mirror it to the HF dataset.

     1. Write one JSON object (``id``, ``q``, ``a``) per line to
        ``MANUAL_QA_PATH``.
     2. If the ``HF_TOKEN`` environment variable is set, upload that file to
        ``DATASET_REPO_ID`` as ``DATASET_FILENAME``. Upload failures are
        printed and swallowed so a network problem never loses the local save.

     Raises:
         KeyError: if an entry lacks "id", "q" or "a". This is raised before
             the existing file is touched.
     """
     # Serialize everything first so a malformed entry fails before the
     # existing file is truncated.
     payload = "".join(
         json.dumps({"id": e["id"], "q": e["q"], "a": e["a"]}, ensure_ascii=False)
         + "\n"
         for e in qa_store.MANUAL_QA_LIST
     )

     # 1. Save locally: write a sibling temp file, then swap it into place so
     #    readers never observe a half-written file.
     os.makedirs(os.path.dirname(MANUAL_QA_PATH), exist_ok=True)
     tmp_path = MANUAL_QA_PATH + ".tmp"
     with open(tmp_path, "w", encoding="utf-8") as f:
         f.write(payload)
     os.replace(tmp_path, MANUAL_QA_PATH)

     # 2. Upload to the HF dataset (skipped silently when no token is set).
     hf_token = os.environ.get("HF_TOKEN")
     if hf_token:
         try:
             api = HfApi(token=hf_token)
             api.upload_file(
                 path_or_fileobj=MANUAL_QA_PATH,
                 path_in_repo=DATASET_FILENAME,
                 repo_id=DATASET_REPO_ID,
                 repo_type="dataset",
                 commit_message="Teacher Panel: Updated Q&A"
             )
             print("☁️ [Sync] Uploaded manual_qa.jsonl to Dataset.")
         except Exception as e:
             # Deliberately broad: the local copy is already saved.
             print(f"❌ [Sync] Upload failed: {e}")
 def rebuild_combined_qa() -> None:
     """
     Combine auto and manual QA into QA_INDEX and ALL_QA_KNOWLEDGE.

     Both ``qa_store.QA_INDEX`` and ``qa_store.ALL_QA_KNOWLEDGE`` are cleared
     and rebuilt. Auto items are added first and manual items second, so a
     manual answer replaces the auto answer in ``QA_INDEX`` when both share
     the same normalized question. ``ALL_QA_KNOWLEDGE`` keeps every item from
     both sources.
     """
     qa_store.QA_INDEX.clear()
     qa_store.ALL_QA_KNOWLEDGE.clear()

     # Auto-generated pairs first.
     for item in qa_store.AUTO_QA_KNOWLEDGE:
         norm_q = item["norm_q"]
         qa_store.QA_INDEX[norm_q] = item["a"]
         qa_store.ALL_QA_KNOWLEDGE.append(item)

     # Manual pairs last so they override auto answers in the index.
     for e in qa_store.MANUAL_QA_LIST:
         item = {
             "norm_q": e["norm_q"],
             "q": e["q"],
             "a": e["a"],
             "source": "manual",
             "id": e["id"],
         }
         qa_store.QA_INDEX[item["norm_q"]] = item["a"]
         qa_store.ALL_QA_KNOWLEDGE.append(item)
 def manual_qa_table_data() -> List[List[str]]:
     """
     Build the table rows for the Teacher Panel.

     Returns one ``[id, q, a]`` row per entry in ``qa_store.MANUAL_QA_LIST``,
     in list order.
     """
     rows = []
     for entry in qa_store.MANUAL_QA_LIST:
         rows.append([entry["id"], entry["q"], entry["a"]])
     return rows