Heng2004 committed on
Commit
04301d9
·
verified ·
1 Parent(s): 9403520

Update loader.py

Browse files
Files changed (1) hide show
  1. loader.py +60 -126
loader.py CHANGED
@@ -2,92 +2,55 @@
2
  import os
3
  import json
4
  from typing import List, Dict, Any
 
5
 
6
  import qa_store
7
 
8
# Every data file is resolved relative to the directory containing this module.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")

# Curriculum text, teacher-entered Q&A and glossary, all stored as JSON Lines.
CURRICULUM_PATH, MANUAL_QA_PATH, GLOSSARY_PATH = (
    os.path.join(DATA_DIR, filename)
    for filename in ("M_1_U_1.jsonl", "manual_qa.jsonl", "glossary.jsonl")
)
16
 
17
def load_curriculum() -> None:
    """
    Rebuild the curriculum stores from the textbook JSONL file.

    Empties qa_store.ENTRIES and qa_store.AUTO_QA_KNOWLEDGE, then reads
    CURRICULUM_PATH one JSON object per line. Objects containing a "text"
    key are appended to ENTRIES; each of their "qa" pairs that has both a
    non-empty question and answer becomes an AUTO_QA_KNOWLEDGE record
    tagged source="auto". qa_store.RAW_KNOWLEDGE ends up as the entry texts
    joined by blank lines, or a fallback message when the file is missing
    or produced no entries.
    """
    entries = qa_store.ENTRIES
    auto_qa = qa_store.AUTO_QA_KNOWLEDGE
    entries.clear()
    auto_qa.clear()

    if not os.path.exists(CURRICULUM_PATH):
        print(f"[WARN] Curriculum file not found: {CURRICULUM_PATH}")
        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດຖືກໂຫຼດ."
        return

    with open(CURRICULUM_PATH, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            try:
                record: Dict[str, Any] = json.loads(stripped)
            except json.JSONDecodeError:
                print("[WARN] Skipping invalid JSON line in curriculum file.")
                continue

            # Only records that carry textbook text count as entries.
            if "text" not in record:
                continue
            entries.append(record)

            for pair in record.get("qa", []):
                question = (pair.get("q") or "").strip()
                answer = (pair.get("a") or "").strip()
                if question and answer:
                    auto_qa.append(
                        {
                            "norm_q": qa_store.normalize_question(question),
                            "q": question,
                            "a": answer,
                            "source": "auto",
                            "id": record.get("id", ""),
                        }
                    )

    texts = [entry["text"] for entry in entries]
    if texts:
        qa_store.RAW_KNOWLEDGE = "\n\n".join(texts)
    else:
        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດທີ່ອ່ານໄດ້."
67
-
68
 
69
def load_glossary() -> None:
    """
    Refill qa_store.GLOSSARY from the glossary JSONL file.

    The list is cleared first. A missing GLOSSARY_PATH only prints a
    warning. Otherwise every non-blank line is parsed as JSON and appended;
    lines that fail to parse are reported and skipped. The number of loaded
    terms is printed at the end.
    """
    glossary = qa_store.GLOSSARY
    glossary.clear()

    if not os.path.exists(GLOSSARY_PATH):
        print(f"[WARN] Glossary file not found: {GLOSSARY_PATH}")
        return

    with open(GLOSSARY_PATH, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text:
                try:
                    term = json.loads(text)
                except json.JSONDecodeError:
                    print("[WARN] Skipping invalid glossary JSON line")
                else:
                    glossary.append(term)

    print(f"[INFO] Loaded {len(glossary)} glossary terms.")
90
-
91
 
92
  def load_manual_qa() -> None:
93
  """
@@ -102,7 +65,6 @@ def load_manual_qa() -> None:
102
  qa_store.NEXT_MANUAL_ID = 1
103
  return
104
 
105
-
106
  with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
107
  for line in f:
108
  line = line.strip()
@@ -111,10 +73,8 @@ def load_manual_qa() -> None:
111
  try:
112
  obj = json.loads(line)
113
  except json.JSONDecodeError:
114
- print("[WARN] Skipping invalid JSON line in manual QA file.")
115
  continue
116
 
117
-
118
  q = (obj.get("q") or "").strip()
119
  a = (obj.get("a") or "").strip()
120
  if not q or not a:
@@ -125,9 +85,8 @@ def load_manual_qa() -> None:
125
  max_num += 1
126
  entry_id = f"manual_{max_num:04d}"
127
 
128
- # track biggest number in id
129
  import re as _re
130
-
131
  m = _re.search(r"(\d+)$", entry_id)
132
  if m:
133
  max_num = max(max_num, int(m.group(1)))
@@ -144,78 +103,53 @@ def load_manual_qa() -> None:
144
 
145
  qa_store.NEXT_MANUAL_ID = max_num + 1 if max_num > 0 else 1
146
 
147
-
148
- # loader.py
149
-
150
def generate_new_manual_id() -> str:
    """
    Return the lowest unused "manual_XXXX" id.

    The numeric suffix of every id in qa_store.MANUAL_QA_LIST is collected,
    and the smallest positive integer not among them is chosen, so numbers
    freed by deletions are handed out again. qa_store.NEXT_MANUAL_ID is set
    to one past the chosen number as a side effect.
    """
    import re as _re

    suffix_matches = (
        _re.search(r"(\d+)$", str(entry.get("id") or ""))
        for entry in qa_store.MANUAL_QA_LIST
    )
    taken = {int(found.group(1)) for found in suffix_matches if found}

    candidate = 1
    while candidate in taken:
        candidate += 1

    # Keep the global counter roughly in step with the id just issued.
    qa_store.NEXT_MANUAL_ID = candidate + 1
    return f"manual_{candidate:04d}"
176
 
177
-
178
-
179
def save_manual_qa_file() -> None:
    """
    Write qa_store.MANUAL_QA_LIST to MANUAL_QA_PATH as JSON Lines.

    The parent directory is created when absent. Each entry is reduced to
    its "id", "q" and "a" fields and written as one JSON object per line,
    with non-ASCII characters kept unescaped.
    """
    target_dir = os.path.dirname(MANUAL_QA_PATH)
    os.makedirs(target_dir, exist_ok=True)

    with open(MANUAL_QA_PATH, "w", encoding="utf-8") as out:
        for entry in qa_store.MANUAL_QA_LIST:
            record = {field: entry[field] for field in ("id", "q", "a")}
            out.write(json.dumps(record, ensure_ascii=False) + "\n")
188
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
def rebuild_combined_qa() -> None:
    """
    Regenerate qa_store.QA_INDEX and qa_store.ALL_QA_KNOWLEDGE.

    Both containers are emptied, then filled from AUTO_QA_KNOWLEDGE followed
    by MANUAL_QA_LIST. Because manual entries are indexed last, a manual
    answer replaces the auto answer in QA_INDEX when both share the same
    normalized question; ALL_QA_KNOWLEDGE keeps every record from both.
    """
    index = qa_store.QA_INDEX
    combined = qa_store.ALL_QA_KNOWLEDGE
    index.clear()
    combined.clear()

    # Auto-generated pairs go in first so manual ones can overwrite them.
    for auto_item in qa_store.AUTO_QA_KNOWLEDGE:
        index[auto_item["norm_q"]] = auto_item["a"]
        combined.append(auto_item)

    for manual in qa_store.MANUAL_QA_LIST:
        record = {key: manual[key] for key in ("norm_q", "q", "a")}
        record["source"] = "manual"
        record["id"] = manual["id"]
        index[record["norm_q"]] = record["a"]
        combined.append(record)
215
-
216
-
217
def manual_qa_table_data() -> List[List[str]]:
    """
    Build the Teacher Panel table: one [id, question, answer] row per
    entry in qa_store.MANUAL_QA_LIST, in list order.
    """
    rows = []
    for entry in qa_store.MANUAL_QA_LIST:
        rows.append([entry["id"], entry["q"], entry["a"]])
    return rows
 
2
  import os
3
  import json
4
  from typing import List, Dict, Any
5
+ from huggingface_hub import hf_hub_download, HfApi
6
 
7
  import qa_store
8
 
9
# --- CONFIGURATION ---
# Hugging Face dataset repo ("username/dataset-name") holding the manual Q&A.
# Set the HF_DATASET_REPO_ID environment variable to configure it without
# editing this file; otherwise CHANGE the fallback to your actual dataset ID.
DATASET_REPO_ID = (
    os.environ.get("HF_DATASET_REPO_ID") or "YourUsername/lao-science-qa-store"
)
DATASET_FILENAME = "manual_qa.jsonl"
# ---------------------

# Every data file is resolved relative to the directory containing this module.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")
CURRICULUM_PATH = os.path.join(DATA_DIR, "M_1_U_1.jsonl")
# Derived from DATASET_FILENAME so the file downloaded into DATA_DIR is
# always the same file that is loaded, saved and uploaded.
MANUAL_QA_PATH = os.path.join(DATA_DIR, DATASET_FILENAME)
GLOSSARY_PATH = os.path.join(DATA_DIR, "glossary.jsonl")
20
 
21
def sync_download_manual_qa():
    """
    Startup step: fetch the latest manual_qa.jsonl from the HF Dataset repo.

    When the HF_TOKEN environment variable is unset, a warning is printed
    and nothing is downloaded. Otherwise DATA_DIR is created if needed and
    DATASET_FILENAME is downloaded into it. Any error is printed and
    swallowed, so a first run (remote file not yet created) is not fatal.
    """
    print("☁️ [Sync] Checking for remote QA data...")

    token = os.environ.get("HF_TOKEN")
    if not token:
        print("⚠️ [Sync] No HF_TOKEN found. Data will not persist across restarts!")
        return

    download_args = {
        "repo_id": DATASET_REPO_ID,
        "filename": DATASET_FILENAME,
        "repo_type": "dataset",
        "local_dir": DATA_DIR,
        "local_dir_use_symlinks": False,  # force real file
        "token": token,
    }
    try:
        os.makedirs(DATA_DIR, exist_ok=True)
        hf_hub_download(**download_args)
        print("✅ [Sync] Downloaded latest manual_qa.jsonl from Dataset.")
    except Exception as err:
        print(f"ℹ️ [Sync] Could not download remote file (might be first run): {err}")
46
 
47
def load_curriculum() -> None:
    """
    Load the official textbook JSONL into qa_store.ENTRIES and
    qa_store.AUTO_QA_KNOWLEDGE.

    Restores the implementation that a "keep your existing code" placeholder
    had replaced with ``pass`` (which left the curriculum permanently empty).

    Both stores are cleared first. Each non-blank line of CURRICULUM_PATH is
    parsed as JSON; invalid lines and objects without a "text" key are
    skipped. Every "qa" pair with a non-empty question and answer is added
    to AUTO_QA_KNOWLEDGE with source="auto". qa_store.RAW_KNOWLEDGE is set
    to the entry texts joined by blank lines, or to a fallback message when
    the file is missing or yields no entries.
    """
    qa_store.ENTRIES.clear()
    qa_store.AUTO_QA_KNOWLEDGE.clear()

    if not os.path.exists(CURRICULUM_PATH):
        print(f"[WARN] Curriculum file not found: {CURRICULUM_PATH}")
        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດຖືກໂຫຼດ."
        return

    with open(CURRICULUM_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj: Dict[str, Any] = json.loads(line)
            except json.JSONDecodeError:
                print("[WARN] Skipping invalid JSON line in curriculum file.")
                continue

            # Only records that carry textbook text count as entries.
            if "text" not in obj:
                continue

            qa_store.ENTRIES.append(obj)

            for pair in obj.get("qa", []):
                q = (pair.get("q") or "").strip()
                a = (pair.get("a") or "").strip()
                if not q or not a:
                    continue
                norm_q = qa_store.normalize_question(q)
                qa_store.AUTO_QA_KNOWLEDGE.append(
                    {
                        "norm_q": norm_q,
                        "q": q,
                        "a": a,
                        "source": "auto",
                        "id": obj.get("id", ""),
                    }
                )

    if qa_store.ENTRIES:
        qa_store.RAW_KNOWLEDGE = "\n\n".join(e["text"] for e in qa_store.ENTRIES)
    else:
        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດທີ່ອ່ານໄດ້."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
def load_glossary() -> None:
    """
    Load glossary entries into qa_store.GLOSSARY.

    Restores the implementation that a "keep your existing code" placeholder
    had replaced with ``pass`` (which left the glossary permanently empty).

    The list is cleared first. If GLOSSARY_PATH does not exist a warning is
    printed and the list stays empty. Otherwise each non-blank line is
    parsed as JSON and appended; invalid lines are reported and skipped.
    """
    qa_store.GLOSSARY.clear()

    if not os.path.exists(GLOSSARY_PATH):
        print(f"[WARN] Glossary file not found: {GLOSSARY_PATH}")
        return

    with open(GLOSSARY_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                print("[WARN] Skipping invalid glossary JSON line")
                continue
            qa_store.GLOSSARY.append(obj)

    print(f"[INFO] Loaded {len(qa_store.GLOSSARY)} glossary terms.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  def load_manual_qa() -> None:
56
  """
 
65
  qa_store.NEXT_MANUAL_ID = 1
66
  return
67
 
 
68
  with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
69
  for line in f:
70
  line = line.strip()
 
73
  try:
74
  obj = json.loads(line)
75
  except json.JSONDecodeError:
 
76
  continue
77
 
 
78
  q = (obj.get("q") or "").strip()
79
  a = (obj.get("a") or "").strip()
80
  if not q or not a:
 
85
  max_num += 1
86
  entry_id = f"manual_{max_num:04d}"
87
 
88
+ # track biggest number for ID generation
89
  import re as _re
 
90
  m = _re.search(r"(\d+)$", entry_id)
91
  if m:
92
  max_num = max(max_num, int(m.group(1)))
 
103
 
104
  qa_store.NEXT_MANUAL_ID = max_num + 1 if max_num > 0 else 1
105
 
 
 
 
106
def generate_new_manual_id() -> str:
    """
    Return the lowest unused "manual_XXXX" id.

    Collects the numeric suffix of every id in qa_store.MANUAL_QA_LIST and
    picks the smallest positive integer not among them, so numbers freed by
    deleted entries are reused.
    """
    import re as _re

    trailing_digits = _re.compile(r"(\d+)$")
    taken = set()
    for entry in qa_store.MANUAL_QA_LIST:
        found = trailing_digits.search(str(entry.get("id") or ""))
        if found is not None:
            taken.add(int(found.group(1)))

    candidate = 1
    while candidate in taken:
        candidate += 1
    return f"manual_{candidate:04d}"
119
 
 
 
120
def save_manual_qa_file() -> None:
    """
    Persist qa_store.MANUAL_QA_LIST locally, then mirror it to the HF Dataset.

    1. Save to local disk: each entry's "id", "q" and "a" is written to
       MANUAL_QA_PATH as one JSON object per line.
    2. Upload to the HF Dataset immediately, if the HF_TOKEN environment
       variable is set. Upload failures are printed rather than raised, so
       a network problem never undoes the local save.

    Raises:
        KeyError: if an entry lacks "id", "q" or "a"; the existing file is
            left untouched in that case.
    """
    os.makedirs(os.path.dirname(MANUAL_QA_PATH), exist_ok=True)

    # 1. Save Local
    # Serialize everything before touching the file: opening with "w"
    # truncates immediately, so a malformed entry would otherwise destroy
    # the previously saved data.
    lines = [
        json.dumps({"id": e["id"], "q": e["q"], "a": e["a"]}, ensure_ascii=False)
        + "\n"
        for e in qa_store.MANUAL_QA_LIST
    ]
    tmp_path = MANUAL_QA_PATH + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.writelines(lines)
    # Swap the finished file in so an interrupted write never leaves a
    # partially written manual_qa.jsonl behind.
    os.replace(tmp_path, MANUAL_QA_PATH)

    # 2. Upload to HF Dataset
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        try:
            api = HfApi(token=hf_token)
            api.upload_file(
                path_or_fileobj=MANUAL_QA_PATH,
                path_in_repo=DATASET_FILENAME,
                repo_id=DATASET_REPO_ID,
                repo_type="dataset",
                commit_message="Teacher Panel: Updated Q&A",
            )
            print("☁️ [Sync] Uploaded manual_qa.jsonl to Dataset.")
        except Exception as e:
            # Best-effort sync: the local copy is already saved.
            print(f"❌ [Sync] Upload failed: {e}")
148
 
149
def rebuild_combined_qa() -> None:
    """
    Combine auto and manual QA into qa_store.QA_INDEX and
    qa_store.ALL_QA_KNOWLEDGE.

    Restores the implementation that a "keep existing code" placeholder had
    replaced with ``pass`` (which left both containers never rebuilt).

    Both containers are cleared, then filled from AUTO_QA_KNOWLEDGE followed
    by MANUAL_QA_LIST. Manual answers override auto ones in QA_INDEX when
    the normalized question is the same; ALL_QA_KNOWLEDGE keeps every
    record from both sources.
    """
    qa_store.QA_INDEX.clear()
    qa_store.ALL_QA_KNOWLEDGE.clear()

    # Auto first, so manual entries written afterwards win on key clashes.
    for item in qa_store.AUTO_QA_KNOWLEDGE:
        norm_q = item["norm_q"]
        qa_store.QA_INDEX[norm_q] = item["a"]
        qa_store.ALL_QA_KNOWLEDGE.append(item)

    # Manual overrides
    for e in qa_store.MANUAL_QA_LIST:
        item = {
            "norm_q": e["norm_q"],
            "q": e["q"],
            "a": e["a"],
            "source": "manual",
            "id": e["id"],
        }
        qa_store.QA_INDEX[item["norm_q"]] = item["a"]
        qa_store.ALL_QA_KNOWLEDGE.append(item)
152
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
def manual_qa_table_data() -> List[List[str]]:
    """
    Return the Teacher Panel table rows: [id, question, answer] for each
    entry in qa_store.MANUAL_QA_LIST, in list order.
    """
    columns = ("id", "q", "a")
    return [[entry[column] for column in columns] for entry in qa_store.MANUAL_QA_LIST]