File size: 4,979 Bytes
a2877a1
 
 
 
04301d9
a2877a1
 
 
04301d9
 
 
 
 
 
921357f
 
8ff3919
921357f
1b5b80c
a2877a1
04301d9
a2877a1
04301d9
 
a2877a1
04301d9
 
 
 
a2877a1
 
04301d9
 
 
 
 
 
 
 
 
 
 
 
 
 
921357f
04301d9
 
 
a2877a1
1b5b80c
04301d9
 
1b5b80c
a2877a1
 
 
 
 
 
 
 
 
921357f
a2877a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
04301d9
a2877a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
04301d9
ad8bcc2
 
 
 
 
 
 
 
 
 
 
 
a2877a1
 
04301d9
 
a2877a1
 
04301d9
 
a2877a1
 
 
 
04301d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2877a1
 
04301d9
 
 
a2877a1
04301d9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# loader.py
import json
import os
import re
from typing import Any, Dict, List

from huggingface_hub import HfApi, hf_hub_download

import qa_store

# --- CONFIGURATION ---
# CHANGE THIS to your actual dataset ID (username/dataset-name)
DATASET_REPO_ID = "YourUsername/lao-science-qa-store" 
DATASET_FILENAME = "manual_qa.jsonl"
# ---------------------

# All data files live under <this file's directory>/data.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")
# Curriculum content for M1 Unit 1 (JSONL).
CURRICULUM_PATH = os.path.join(DATA_DIR, "M_1_U_1.jsonl")
# Teacher-authored Q&A pairs; synced to/from the HF Dataset above.
MANUAL_QA_PATH = os.path.join(DATA_DIR, "manual_qa.jsonl")
# Term glossary (JSONL).
GLOSSARY_PATH = os.path.join(DATA_DIR, "glossary.jsonl")

def sync_download_manual_qa():
    """
    Startup step: pull the latest manual_qa.jsonl from the HF Dataset
    into the local data directory.

    Requires the HF_TOKEN environment variable; without it the function
    only warns (local edits then won't survive a Space restart).  Any
    download failure — including the dataset file simply not existing
    yet on a first run — is logged and otherwise ignored.
    """
    print("☁️ [Sync] Checking for remote QA data...")
    token = os.environ.get("HF_TOKEN")
    if not token:
        print("⚠️ [Sync] No HF_TOKEN found. Data will not persist across restarts!")
        return

    try:
        # Fetch the remote snapshot into DATA_DIR as a plain file.
        os.makedirs(DATA_DIR, exist_ok=True)
        hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename=DATASET_FILENAME,
            repo_type="dataset",
            local_dir=DATA_DIR,
            local_dir_use_symlinks=False,  # materialize a real file, not a cache symlink
            token=token,
        )
    except Exception as exc:
        print(f"ℹ️ [Sync] Could not download remote file (might be first run): {exc}")
    else:
        print("✅ [Sync] Downloaded latest manual_qa.jsonl from Dataset.")

def load_curriculum() -> None:
    """Load the curriculum file (CURRICULUM_PATH) into the QA store.

    NOTE(review): placeholder — the author elided the real body here;
    the existing implementation is meant to be kept as-is.
    """
    # ... (Keep your existing code for load_curriculum) ...
    pass # Placeholder to indicate keeping existing code

def load_glossary() -> None:
    """Load the glossary file (GLOSSARY_PATH) into the QA store.

    NOTE(review): placeholder — the author elided the real body here;
    the existing implementation is meant to be kept as-is.
    """
    # ... (Keep your existing code for load_glossary) ...
    pass

def load_manual_qa() -> None:
    """
    Load manual_qa.jsonl into qa_store.MANUAL_QA_LIST and MANUAL_QA_INDEX.

    Each file line is a JSON object with "q" (question), "a" (answer) and
    optionally "id".  Blank lines, malformed JSON, and entries missing a
    question or answer are skipped.  Afterwards qa_store.NEXT_MANUAL_ID is
    set to one past the largest numeric ID suffix seen (1 when the file is
    absent or yielded no IDs).
    """
    qa_store.MANUAL_QA_LIST.clear()
    qa_store.MANUAL_QA_INDEX.clear()
    max_num = 0

    if not os.path.exists(MANUAL_QA_PATH):
        print(f"[WARN] Manual QA file not found: {MANUAL_QA_PATH}")
        qa_store.NEXT_MANUAL_ID = 1
        return

    # Fix: the original executed `import re as _re` and re-searched a string
    # pattern on every line of the file; hoist the import to module level
    # and compile the pattern once before the loop.
    trailing_digits = re.compile(r"(\d+)$")

    with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue  # skip malformed lines rather than aborting the load

            q = (obj.get("q") or "").strip()
            a = (obj.get("a") or "").strip()
            if not q or not a:
                continue  # both question and answer are required

            entry_id = str(obj.get("id") or "")
            if not entry_id:
                # Auto-assign a sequential ID when the record has none.
                max_num += 1
                entry_id = f"manual_{max_num:04d}"

            # Track the biggest numeric suffix for NEXT_MANUAL_ID generation.
            m = trailing_digits.search(entry_id)
            if m:
                max_num = max(max_num, int(m.group(1)))

            norm_q = qa_store.normalize_question(q)
            entry = {
                "id": entry_id,
                "q": q,
                "a": a,
                "norm_q": norm_q,
            }
            qa_store.MANUAL_QA_LIST.append(entry)
            # NOTE: a later entry with the same normalized question replaces
            # the earlier one in the index; the list keeps both.
            qa_store.MANUAL_QA_INDEX[norm_q] = entry

    qa_store.NEXT_MANUAL_ID = max_num + 1 if max_num > 0 else 1

def generate_new_manual_id() -> str:
    """Return the lowest unused `manual_NNNN` ID.

    Scans the numeric suffixes of all IDs currently in
    qa_store.MANUAL_QA_LIST and picks the smallest positive integer
    not already taken.
    """
    import re as _re

    taken = set()
    for entry in qa_store.MANUAL_QA_LIST:
        match = _re.search(r"(\d+)$", str(entry.get("id") or ""))
        if match is not None:
            taken.add(int(match.group(1)))

    candidate = 1
    while candidate in taken:
        candidate += 1
    return f"manual_{candidate:04d}"

def save_manual_qa_file() -> None:
    """
    Persist the manual Q&A list in two steps:

    1. Write qa_store.MANUAL_QA_LIST to MANUAL_QA_PATH as JSONL.
    2. Upload that file to the HF Dataset (skipped when HF_TOKEN is
       unset; upload errors are logged, not raised).
    """
    os.makedirs(os.path.dirname(MANUAL_QA_PATH), exist_ok=True)

    # 1. Save Local — one JSON object per line, Unicode kept readable.
    serialized = [
        json.dumps({"id": entry["id"], "q": entry["q"], "a": entry["a"]},
                   ensure_ascii=False)
        for entry in qa_store.MANUAL_QA_LIST
    ]
    with open(MANUAL_QA_PATH, "w", encoding="utf-8") as out:
        out.writelines(record + "\n" for record in serialized)

    # 2. Upload to HF Dataset — best effort; nothing to do without a token.
    token = os.environ.get("HF_TOKEN")
    if not token:
        return
    try:
        HfApi(token=token).upload_file(
            path_or_fileobj=MANUAL_QA_PATH,
            path_in_repo=DATASET_FILENAME,
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message="Teacher Panel: Updated Q&A"
        )
        print("☁️ [Sync] Uploaded manual_qa.jsonl to Dataset.")
    except Exception as e:
        print(f"❌ [Sync] Upload failed: {e}")

def rebuild_combined_qa() -> None:
    """Rebuild the combined QA structures after a load or edit.

    NOTE(review): placeholder — the author elided the real body here;
    the existing implementation is meant to be kept as-is.
    """
    # ... (Keep existing code) ...
    pass
    
def manual_qa_table_data() -> List[List[str]]:
    """Rows for the teacher-panel table: one [id, question, answer] per entry."""
    rows: List[List[str]] = []
    for entry in qa_store.MANUAL_QA_LIST:
        rows.append([entry["id"], entry["q"], entry["a"]])
    return rows