Heng2004 committed on
Commit
237554a
·
verified ·
1 Parent(s): 181cebd

Update loader.py

Browse files
Files changed (1) hide show
  1. loader.py +127 -59
loader.py CHANGED
@@ -2,55 +2,94 @@
2
  import os
3
  import json
4
  from typing import List, Dict, Any
 
5
  from huggingface_hub import hf_hub_download, HfApi
6
 
7
  import qa_store
8
 
9
- # --- CONFIGURATION ---
10
- # CHANGE THIS to your actual dataset ID (username/dataset-name)
11
- DATASET_REPO_ID = "YourUsername/lao-science-qa-store"
12
- DATASET_FILENAME = "manual_qa.jsonl"
13
- # ---------------------
14
-
15
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
16
  DATA_DIR = os.path.join(BASE_DIR, "data")
 
17
  CURRICULUM_PATH = os.path.join(DATA_DIR, "M_1_U_1.jsonl")
18
  MANUAL_QA_PATH = os.path.join(DATA_DIR, "manual_qa.jsonl")
 
19
  GLOSSARY_PATH = os.path.join(DATA_DIR, "glossary.jsonl")
20
 
21
- def sync_download_manual_qa():
22
  """
23
- Startup Step: Download the latest manual_qa.jsonl from HF Dataset.
24
- If it doesn't exist yet (first run), we ignore the error.
25
  """
26
- print("☁️ [Sync] Checking for remote QA data...")
27
- hf_token = os.environ.get("HF_TOKEN")
28
- if not hf_token:
29
- print("⚠️ [Sync] No HF_TOKEN found. Data will not persist across restarts!")
 
 
30
  return
31
 
32
- try:
33
- # Download file to local path
34
- os.makedirs(DATA_DIR, exist_ok=True)
35
- hf_hub_download(
36
- repo_id=DATASET_REPO_ID,
37
- filename=DATASET_FILENAME,
38
- repo_type="dataset",
39
- local_dir=DATA_DIR,
40
- local_dir_use_symlinks=False, # force real file
41
- token=hf_token
42
- )
43
- print("✅ [Sync] Downloaded latest manual_qa.jsonl from Dataset.")
44
- except Exception as e:
45
- print(f"ℹ️ [Sync] Could not download remote file (might be first run): {e}")
46
 
47
- def load_curriculum() -> None:
48
- # ... (Keep your existing code for load_curriculum) ...
49
- pass # Placeholder to indicate keeping existing code
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  def load_glossary() -> None:
52
- # ... (Keep your existing code for load_glossary) ...
53
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  def load_manual_qa() -> None:
56
  """
@@ -65,6 +104,7 @@ def load_manual_qa() -> None:
65
  qa_store.NEXT_MANUAL_ID = 1
66
  return
67
 
 
68
  with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
69
  for line in f:
70
  line = line.strip()
@@ -73,8 +113,10 @@ def load_manual_qa() -> None:
73
  try:
74
  obj = json.loads(line)
75
  except json.JSONDecodeError:
 
76
  continue
77
 
 
78
  q = (obj.get("q") or "").strip()
79
  a = (obj.get("a") or "").strip()
80
  if not q or not a:
@@ -85,8 +127,9 @@ def load_manual_qa() -> None:
85
  max_num += 1
86
  entry_id = f"manual_{max_num:04d}"
87
 
88
- # track biggest number for ID generation
89
  import re as _re
 
90
  m = _re.search(r"(\d+)$", entry_id)
91
  if m:
92
  max_num = max(max_num, int(m.group(1)))
@@ -103,53 +146,78 @@ def load_manual_qa() -> None:
103
 
104
  qa_store.NEXT_MANUAL_ID = max_num + 1 if max_num > 0 else 1
105
 
 
 
 
106
  def generate_new_manual_id() -> str:
107
- # ... (Keep existing code) ...
 
 
 
 
108
  import re as _re
 
109
  used_nums = set()
 
 
110
  for e in qa_store.MANUAL_QA_LIST:
111
  raw_id = str(e.get("id") or "")
112
  m = _re.search(r"(\d+)$", raw_id)
113
  if m:
114
  used_nums.add(int(m.group(1)))
 
 
115
  i = 1
116
  while i in used_nums:
117
  i += 1
 
 
 
 
118
  return f"manual_{i:04d}"
119
 
 
 
120
  def save_manual_qa_file() -> None:
121
  """
122
- 1. Save to local disk.
123
- 2. Upload to HF Dataset immediately.
124
  """
125
  os.makedirs(os.path.dirname(MANUAL_QA_PATH), exist_ok=True)
126
-
127
- # 1. Save Local
128
  with open(MANUAL_QA_PATH, "w", encoding="utf-8") as f:
129
  for e in qa_store.MANUAL_QA_LIST:
130
  obj = {"id": e["id"], "q": e["q"], "a": e["a"]}
131
  f.write(json.dumps(obj, ensure_ascii=False) + "\n")
132
-
133
- # 2. Upload to HF Dataset
134
- hf_token = os.environ.get("HF_TOKEN")
135
- if hf_token:
136
- try:
137
- api = HfApi(token=hf_token)
138
- api.upload_file(
139
- path_or_fileobj=MANUAL_QA_PATH,
140
- path_in_repo=DATASET_FILENAME,
141
- repo_id=DATASET_REPO_ID,
142
- repo_type="dataset",
143
- commit_message="Teacher Panel: Updated Q&A"
144
- )
145
- print("☁️ [Sync] Uploaded manual_qa.jsonl to Dataset.")
146
- except Exception as e:
147
- print(f"❌ [Sync] Upload failed: {e}")
148
 
149
  def rebuild_combined_qa() -> None:
150
- # ... (Keep existing code) ...
151
- pass
152
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  def manual_qa_table_data() -> List[List[str]]:
154
- # ... (Keep existing code) ...
155
- return [[e["id"], e["q"], e["a"]] for e in qa_store.MANUAL_QA_LIST]
 
 
 
2
  import os
3
  import json
4
  from typing import List, Dict, Any
5
+
6
  from huggingface_hub import hf_hub_download, HfApi
7
 
8
  import qa_store
9
 
10
+ # Base paths (make them relative to this file)
 
 
 
 
 
11
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
12
  DATA_DIR = os.path.join(BASE_DIR, "data")
13
+
14
  CURRICULUM_PATH = os.path.join(DATA_DIR, "M_1_U_1.jsonl")
15
  MANUAL_QA_PATH = os.path.join(DATA_DIR, "manual_qa.jsonl")
16
+
17
  GLOSSARY_PATH = os.path.join(DATA_DIR, "glossary.jsonl")
18
 
19
def load_curriculum() -> None:
    """
    Load the curriculum JSONL file into qa_store.

    Clears and repopulates qa_store.ENTRIES (one dict per usable line) and
    qa_store.AUTO_QA_KNOWLEDGE (one record per non-empty q/a pair found under
    an entry's "qa" key), then sets qa_store.RAW_KNOWLEDGE to the entries'
    "text" values joined by blank lines. When the file is missing, or no
    usable entry was read, RAW_KNOWLEDGE is set to a fallback message.

    Skipped without raising: blank lines, lines that are not valid JSON,
    JSON values that are not objects, and objects without a "text" key.
    A missing, null or non-list "qa" value is treated as "no pairs", and
    non-object items inside "qa" are ignored.
    """
    qa_store.ENTRIES.clear()
    qa_store.AUTO_QA_KNOWLEDGE.clear()

    if not os.path.exists(CURRICULUM_PATH):
        print(f"[WARN] Curriculum file not found: {CURRICULUM_PATH}")
        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດຖືກໂຫຼດ."
        return

    with open(CURRICULUM_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                print("[WARN] Skipping invalid JSON line in curriculum file.")
                continue

            # json.loads may yield a list, string or number; only JSON
            # objects carrying a "text" key count as curriculum entries.
            if not isinstance(obj, dict) or "text" not in obj:
                continue

            qa_store.ENTRIES.append(obj)

            # "qa" may be absent, null, or malformed; never let one bad
            # entry abort the whole load.
            pairs = obj.get("qa")
            if not isinstance(pairs, list):
                pairs = []

            for pair in pairs:
                if not isinstance(pair, dict):
                    continue
                q = (pair.get("q") or "").strip()
                a = (pair.get("a") or "").strip()
                if not q or not a:
                    continue
                norm_q = qa_store.normalize_question(q)
                qa_store.AUTO_QA_KNOWLEDGE.append(
                    {
                        "norm_q": norm_q,
                        "q": q,
                        "a": a,
                        "source": "auto",
                        "id": obj.get("id", ""),
                    }
                )

    if qa_store.ENTRIES:
        qa_store.RAW_KNOWLEDGE = "\n\n".join(e["text"] for e in qa_store.ENTRIES)
    else:
        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດທີ່ອ່ານໄດ້."
69
+
70
 
71
  def load_glossary() -> None:
72
+ """Load glossary entries into qa_store.GLOSSARY."""
73
+ qa_store.GLOSSARY.clear()
74
+
75
+ if not os.path.exists(GLOSSARY_PATH):
76
+ print(f"[WARN] Glossary file not found: {GLOSSARY_PATH}")
77
+ return
78
+
79
+ with open(GLOSSARY_PATH, "r", encoding="utf-8") as f:
80
+ for line in f:
81
+ line = line.strip()
82
+ if not line:
83
+ continue
84
+ try:
85
+ obj = json.loads(line)
86
+ except json.JSONDecodeError:
87
+ print("[WARN] Skipping invalid glossary JSON line")
88
+ continue
89
+ qa_store.GLOSSARY.append(obj)
90
+
91
+ print(f"[INFO] Loaded {len(qa_store.GLOSSARY)} glossary terms.")
92
+
93
 
94
  def load_manual_qa() -> None:
95
  """
 
104
  qa_store.NEXT_MANUAL_ID = 1
105
  return
106
 
107
+
108
  with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
109
  for line in f:
110
  line = line.strip()
 
113
  try:
114
  obj = json.loads(line)
115
  except json.JSONDecodeError:
116
+ print("[WARN] Skipping invalid JSON line in manual QA file.")
117
  continue
118
 
119
+
120
  q = (obj.get("q") or "").strip()
121
  a = (obj.get("a") or "").strip()
122
  if not q or not a:
 
127
  max_num += 1
128
  entry_id = f"manual_{max_num:04d}"
129
 
130
+ # track biggest number in id
131
  import re as _re
132
+
133
  m = _re.search(r"(\d+)$", entry_id)
134
  if m:
135
  max_num = max(max_num, int(m.group(1)))
 
146
 
147
  qa_store.NEXT_MANUAL_ID = max_num + 1 if max_num > 0 else 1
148
 
149
+
150
+ # loader.py
151
+
152
  def generate_new_manual_id() -> str:
153
+ """
154
+ Generate the smallest free manual_XXXX ID based on the
155
+ current MANUAL_QA_LIST (so gaps like 11 after delete
156
+ are reused).
157
+ """
158
  import re as _re
159
+
160
  used_nums = set()
161
+
162
+ # collect all numbers that are already used in IDs
163
  for e in qa_store.MANUAL_QA_LIST:
164
  raw_id = str(e.get("id") or "")
165
  m = _re.search(r"(\d+)$", raw_id)
166
  if m:
167
  used_nums.add(int(m.group(1)))
168
+
169
+ # find the smallest positive integer that is not used
170
  i = 1
171
  while i in used_nums:
172
  i += 1
173
+
174
+ # keep the global counter roughly in sync (optional)
175
+ qa_store.NEXT_MANUAL_ID = i + 1
176
+
177
  return f"manual_{i:04d}"
178
 
179
+
180
+
181
  def save_manual_qa_file() -> None:
182
  """
183
+ Persist MANUAL_QA_LIST to data/manual_qa.jsonl.
 
184
  """
185
  os.makedirs(os.path.dirname(MANUAL_QA_PATH), exist_ok=True)
 
 
186
  with open(MANUAL_QA_PATH, "w", encoding="utf-8") as f:
187
  for e in qa_store.MANUAL_QA_LIST:
188
  obj = {"id": e["id"], "q": e["q"], "a": e["a"]}
189
  f.write(json.dumps(obj, ensure_ascii=False) + "\n")
190
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
  def rebuild_combined_qa() -> None:
193
+ """
194
+ Combine auto and manual QA into QA_INDEX & ALL_QA_KNOWLEDGE.
195
+ Manual answers override auto ones if same normalized question.
196
+ """
197
+ qa_store.QA_INDEX.clear()
198
+ qa_store.ALL_QA_KNOWLEDGE.clear()
199
+
200
+ # auto first
201
+ for item in qa_store.AUTO_QA_KNOWLEDGE:
202
+ norm_q = item["norm_q"]
203
+ qa_store.QA_INDEX[norm_q] = item["a"]
204
+ qa_store.ALL_QA_KNOWLEDGE.append(item)
205
+
206
+ # manual overrides
207
+ for e in qa_store.MANUAL_QA_LIST:
208
+ item = {
209
+ "norm_q": e["norm_q"],
210
+ "q": e["q"],
211
+ "a": e["a"],
212
+ "source": "manual",
213
+ "id": e["id"],
214
+ }
215
+ qa_store.QA_INDEX[item["norm_q"]] = item["a"]
216
+ qa_store.ALL_QA_KNOWLEDGE.append(item)
217
+
218
+
219
  def manual_qa_table_data() -> List[List[str]]:
220
+ """
221
+ Table rows for Teacher Panel.
222
+ """
223
+ return [[e["id"], e["q"], e["a"]] for e in qa_store.MANUAL_QA_LIST]