|
|
|
|
|
import os |
|
|
import json |
|
|
from typing import List, Dict, Any |
|
|
from huggingface_hub import hf_hub_download, HfApi |
|
|
|
|
|
import qa_store |
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face dataset repository used as the remote store for the manual Q&A
# file. NOTE(review): "YourUsername" looks like a template placeholder --
# confirm it has been replaced with the real account/organisation name.
DATASET_REPO_ID: str = "YourUsername/lao-science-qa-store"

# Name of the JSON Lines file inside that dataset repository.
DATASET_FILENAME: str = "manual_qa.jsonl"
|
|
|
|
|
|
|
|
# Directory containing this module; every data path below is anchored here so
# the paths do not depend on the process's current working directory.
BASE_DIR: str = os.path.dirname(os.path.abspath(__file__))

# Folder (next to this module) that holds all JSON Lines data files.
DATA_DIR: str = os.path.join(BASE_DIR, "data")


def _data_file(filename: str) -> str:
    """Return the path of *filename* inside DATA_DIR."""
    return os.path.join(DATA_DIR, filename)


CURRICULUM_PATH: str = _data_file("M_1_U_1.jsonl")
MANUAL_QA_PATH: str = _data_file("manual_qa.jsonl")
GLOSSARY_PATH: str = _data_file("glossary.jsonl")
|
|
|
|
|
def sync_download_manual_qa():
    """
    Startup step: fetch the remote copy of the manual Q&A file.

    Downloads DATASET_FILENAME from the DATASET_REPO_ID dataset repository
    into DATA_DIR, authenticating with the HF_TOKEN environment variable.

    This is best-effort: when HF_TOKEN is unset a warning is printed and
    nothing is downloaded; when the download raises for any reason (for
    example the remote file does not exist yet on a first run) the error is
    printed and swallowed so startup can continue.
    """
    print("☁️ [Sync] Checking for remote QA data...")

    token = os.environ.get("HF_TOKEN")
    if not token:
        print("⚠️ [Sync] No HF_TOKEN found. Data will not persist across restarts!")
        return

    try:
        # The target folder must exist before the file is placed in it.
        os.makedirs(DATA_DIR, exist_ok=True)
        download_args = {
            "repo_id": DATASET_REPO_ID,
            "filename": DATASET_FILENAME,
            "repo_type": "dataset",
            "local_dir": DATA_DIR,
            "local_dir_use_symlinks": False,
            "token": token,
        }
        hf_hub_download(**download_args)
        print("✅ [Sync] Downloaded latest manual_qa.jsonl from Dataset.")
    except Exception as err:
        # Deliberately broad: a failed sync must never abort application startup.
        print(f"ℹ️ [Sync] Could not download remote file (might be first run): {err}")
|
|
|
|
|
def load_curriculum() -> None:
    """No-op placeholder: this block contains no loading logic.

    NOTE(review): presumably meant to read CURRICULUM_PATH -- confirm whether
    an implementation was elided from this copy of the file.
    """
    return None
|
|
|
|
|
def load_glossary() -> None:
    """No-op placeholder: this block contains no loading logic.

    NOTE(review): presumably meant to read GLOSSARY_PATH -- confirm whether
    an implementation was elided from this copy of the file.
    """
    return None
|
|
|
|
|
def load_manual_qa() -> None:
    """
    Load manual_qa.jsonl into qa_store.MANUAL_QA_LIST and MANUAL_QA_INDEX.

    Both containers are cleared first, then MANUAL_QA_PATH is read as JSON
    Lines (one JSON object per line). A line is skipped when it is blank, is
    not valid JSON, is valid JSON but not an object, or lacks a non-empty
    string "q" or "a".

    Each accepted record becomes ``{"id", "q", "a", "norm_q"}`` where
    ``norm_q`` comes from ``qa_store.normalize_question``. Records without an
    id are given ``manual_NNNN`` using the next number after the highest
    numeric id suffix seen so far. Every record is appended to
    MANUAL_QA_LIST; MANUAL_QA_INDEX is keyed by ``norm_q``, so a later record
    with the same normalized question replaces the earlier one in the index.

    Finally qa_store.NEXT_MANUAL_ID is set to the highest numeric id suffix
    plus one (1 when the file is missing or holds no numbered ids).
    """
    # Function-scope import, as in the original code, but done once per call
    # instead of once per line of the file.
    import re

    qa_store.MANUAL_QA_LIST.clear()
    qa_store.MANUAL_QA_INDEX.clear()
    max_num = 0

    if not os.path.exists(MANUAL_QA_PATH):
        print(f"[WARN] Manual QA file not found: {MANUAL_QA_PATH}")
        qa_store.NEXT_MANUAL_ID = 1
        return

    # Trailing digits of an id, e.g. "manual_0042" -> "0042".
    trailing_number = re.compile(r"(\d+)$")

    with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue

            # A line such as `[1, 2]` or `42` is valid JSON but has no
            # fields; skip it rather than crash on `.get`.
            if not isinstance(obj, dict):
                continue

            raw_q = obj.get("q")
            raw_a = obj.get("a")
            # Missing, null or non-string fields mark a malformed record.
            if not isinstance(raw_q, str) or not isinstance(raw_a, str):
                continue
            q = raw_q.strip()
            a = raw_a.strip()
            if not q or not a:
                continue

            entry_id = str(obj.get("id") or "")
            if not entry_id:
                max_num += 1
                entry_id = f"manual_{max_num:04d}"

            # Track the largest numeric suffix so new ids continue after it.
            m = trailing_number.search(entry_id)
            if m:
                max_num = max(max_num, int(m.group(1)))

            norm_q = qa_store.normalize_question(q)
            entry = {
                "id": entry_id,
                "q": q,
                "a": a,
                "norm_q": norm_q,
            }
            qa_store.MANUAL_QA_LIST.append(entry)
            qa_store.MANUAL_QA_INDEX[norm_q] = entry

    # max_num is never negative, so this is 1 when no numbered id was seen.
    qa_store.NEXT_MANUAL_ID = max_num + 1
|
|
|
|
|
def generate_new_manual_id() -> str:
    """Return the lowest-numbered ``manual_NNNN`` id that is still free.

    The numeric suffix (trailing digits) of every id currently in
    qa_store.MANUAL_QA_LIST counts as taken; entries whose id has no
    trailing digits are ignored. The result is the smallest positive integer
    not taken, zero-padded to at least four digits.
    """
    import re as _re

    suffix_pattern = _re.compile(r"(\d+)$")
    found = (
        suffix_pattern.search(str(entry.get("id") or ""))
        for entry in qa_store.MANUAL_QA_LIST
    )
    taken = {int(hit.group(1)) for hit in found if hit}

    # Among 1 .. len(taken) + 1 at least one number must be unused, so the
    # search below always yields a value.
    free = next(n for n in range(1, len(taken) + 2) if n not in taken)
    return f"manual_{free:04d}"
|
|
|
|
|
def save_manual_qa_file() -> None:
    """
    Persist the in-memory manual Q&A list.

    1. Rewrite MANUAL_QA_PATH on local disk, one JSON object per line holding
       only the "id", "q" and "a" fields of each entry in
       qa_store.MANUAL_QA_LIST.
    2. If the HF_TOKEN environment variable is set, upload that file to the
       DATASET_REPO_ID dataset right away. An upload failure is printed and
       swallowed; the local file is kept either way.
    """
    target_dir = os.path.dirname(MANUAL_QA_PATH)
    os.makedirs(target_dir, exist_ok=True)

    with open(MANUAL_QA_PATH, "w", encoding="utf-8") as out:
        out.writelines(
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dumps(
                {"id": entry["id"], "q": entry["q"], "a": entry["a"]},
                ensure_ascii=False,
            )
            + "\n"
            for entry in qa_store.MANUAL_QA_LIST
        )

    token = os.environ.get("HF_TOKEN")
    if not token:
        # No credentials: local-only save.
        return

    try:
        client = HfApi(token=token)
        client.upload_file(
            path_or_fileobj=MANUAL_QA_PATH,
            path_in_repo=DATASET_FILENAME,
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message="Teacher Panel: Updated Q&A",
        )
        print("☁️ [Sync] Uploaded manual_qa.jsonl to Dataset.")
    except Exception as err:
        # Deliberately broad: a failed upload must not undo the local save.
        print(f"❌ [Sync] Upload failed: {err}")
|
|
|
|
|
def rebuild_combined_qa() -> None:
    """No-op placeholder: this block contains no rebuild logic.

    NOTE(review): confirm whether an implementation was elided from this
    copy of the file.
    """
    return None
|
|
|
|
|
def manual_qa_table_data() -> List[List[str]]:
    """Return qa_store.MANUAL_QA_LIST as table rows of ``[id, q, a]``.

    One row is produced per entry, in list order.
    """
    columns = ("id", "q", "a")
    return [[entry[col] for col in columns] for entry in qa_store.MANUAL_QA_LIST]