Heng2004 committed on
Commit
a2877a1
·
verified ·
1 Parent(s): 840ac8c

Create loader.py

Browse files
Files changed (1) hide show
  1. loader.py +163 -0
loader.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # loader.py
2
+ import os
3
+ import json
4
+ from typing import List, Dict, Any
5
+
6
+ import qa_store
7
+
8
+ CURRICULUM_PATH = "data/1_Year_U_1.jsonl"
9
+ MANUAL_QA_PATH = "data/manual_qa.jsonl"
10
+
11
+
12
def load_curriculum() -> None:
    """
    Load the official textbook JSONL into qa_store.ENTRIES and AUTO_QA_KNOWLEDGE.

    Both lists are cleared first. CURRICULUM_PATH is then read line by line;
    each line is expected to hold one JSON object with a string "text" field
    and an optional "qa" list of {"q": ..., "a": ...} objects. Blank lines,
    lines that are not valid JSON, and records of the wrong shape are skipped.

    qa_store.RAW_KNOWLEDGE is set to the "text" of all accepted entries joined
    by blank lines, or to a placeholder message when the file is missing or
    yields no usable entries.
    """
    qa_store.ENTRIES.clear()
    qa_store.AUTO_QA_KNOWLEDGE.clear()

    if not os.path.exists(CURRICULUM_PATH):
        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດຖືກໂຫຼດ."
        return

    with open(CURRICULUM_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue

            # A valid JSON line may still be a list/string/number, and "text"
            # may be null or non-string; either would crash later (.get or
            # the final join), so treat them like any other malformed line.
            if not isinstance(obj, dict) or not isinstance(obj.get("text"), str):
                continue

            qa_store.ENTRIES.append(obj)

            # "qa" may be absent or explicitly null; only a list is iterated.
            pairs = obj.get("qa")
            if not isinstance(pairs, list):
                continue

            for pair in pairs:
                if not isinstance(pair, dict):
                    continue
                q = pair.get("q")
                a = pair.get("a")
                if not isinstance(q, str) or not isinstance(a, str):
                    continue
                q = q.strip()
                a = a.strip()
                if not q or not a:
                    continue
                norm_q = qa_store.normalize_question(q)
                qa_store.AUTO_QA_KNOWLEDGE.append(
                    {
                        "norm_q": norm_q,
                        "q": q,
                        "a": a,
                        "source": "auto",
                        "id": obj.get("id", ""),
                    }
                )

    if qa_store.ENTRIES:
        qa_store.RAW_KNOWLEDGE = "\n\n".join(e["text"] for e in qa_store.ENTRIES)
    else:
        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດທີ່ອ່ານໄດ້."
58
+
59
+
60
def load_manual_qa() -> None:
    """
    Load manual_qa.jsonl into qa_store.MANUAL_QA_LIST and MANUAL_QA_INDEX.

    Both containers are cleared first. MANUAL_QA_PATH is then read line by
    line; each line is expected to hold one JSON object with non-empty string
    "q" and "a" fields and an optional "id". Blank lines, invalid JSON, and
    records of the wrong shape are skipped. Records without an id receive a
    generated "manual_NNNN" id.

    qa_store.NEXT_MANUAL_ID is set to one more than the largest trailing
    number seen in any id (1 when the file is missing or has no such ids).
    """
    # Imported and compiled once per call rather than once per line.
    import re as _re

    trailing_number = _re.compile(r"(\d+)$")

    qa_store.MANUAL_QA_LIST.clear()
    qa_store.MANUAL_QA_INDEX.clear()
    max_num = 0

    if not os.path.exists(MANUAL_QA_PATH):
        qa_store.NEXT_MANUAL_ID = 1
        return

    with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue

            # Valid JSON that is not an object has no .get(); skip it like
            # any other malformed line instead of raising AttributeError.
            if not isinstance(obj, dict):
                continue

            q = obj.get("q")
            a = obj.get("a")
            if not isinstance(q, str) or not isinstance(a, str):
                continue
            q = q.strip()
            a = a.strip()
            if not q or not a:
                continue

            entry_id = str(obj.get("id") or "")
            if not entry_id:
                # NOTE(review): the generated id is based only on ids seen so
                # far, so it can coincide with an explicit id on a later line.
                max_num += 1
                entry_id = f"manual_{max_num:04d}"

            # Track the biggest trailing number found in any id.
            m = trailing_number.search(entry_id)
            if m:
                max_num = max(max_num, int(m.group(1)))

            norm_q = qa_store.normalize_question(q)
            entry = {
                "id": entry_id,
                "q": q,
                "a": a,
                "norm_q": norm_q,
            }
            qa_store.MANUAL_QA_LIST.append(entry)
            # A later record with the same normalized question replaces the
            # earlier one in the index; the list keeps both.
            qa_store.MANUAL_QA_INDEX[norm_q] = entry

    # max_num is never negative, so this yields 1 when no numbered id was seen.
    qa_store.NEXT_MANUAL_ID = max_num + 1
110
+
111
+
112
def generate_new_manual_id() -> str:
    """
    Return the next manual entry id and advance the shared counter.

    The id is "manual_" followed by qa_store.NEXT_MANUAL_ID zero-padded to
    four digits; the counter is incremented by one afterwards.
    """
    current = qa_store.NEXT_MANUAL_ID
    # Format before bumping the counter so a formatting failure leaves it untouched.
    fresh_id = f"manual_{current:04d}"
    qa_store.NEXT_MANUAL_ID = current + 1
    return fresh_id
119
+
120
+
121
def save_manual_qa_file() -> None:
    """
    Persist qa_store.MANUAL_QA_LIST to MANUAL_QA_PATH as JSONL.

    One JSON object with the keys "id", "q" and "a" is written per entry,
    with non-ASCII characters kept as-is. The parent directory is created if
    it does not exist. The existing file is overwritten.
    """
    # Serialize everything before opening the file: opening with "w"
    # truncates it, so a KeyError/TypeError raised mid-way would otherwise
    # leave a partially written file behind.
    lines = [
        json.dumps({"id": e["id"], "q": e["q"], "a": e["a"]}, ensure_ascii=False) + "\n"
        for e in qa_store.MANUAL_QA_LIST
    ]

    # os.makedirs("") raises, so only create a directory when the path has one.
    parent_dir = os.path.dirname(MANUAL_QA_PATH)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(MANUAL_QA_PATH, "w", encoding="utf-8") as f:
        f.writelines(lines)
130
+
131
+
132
def rebuild_combined_qa() -> None:
    """
    Rebuild qa_store.QA_INDEX and qa_store.ALL_QA_KNOWLEDGE from scratch.

    Auto-generated items are added first, then manual ones. Because manual
    items are written to QA_INDEX last, their answer wins when the normalized
    question matches an auto item. ALL_QA_KNOWLEDGE receives every item from
    both sources, auto items first.
    """
    index = qa_store.QA_INDEX
    index.clear()
    combined = qa_store.ALL_QA_KNOWLEDGE
    combined.clear()

    # Auto items are reused as-is (same dict objects).
    for auto_item in qa_store.AUTO_QA_KNOWLEDGE:
        key = auto_item["norm_q"]
        index[key] = auto_item["a"]
        combined.append(auto_item)

    # Manual items are copied into fresh dicts tagged with their source.
    for manual in qa_store.MANUAL_QA_LIST:
        manual_item = {
            "norm_q": manual["norm_q"],
            "q": manual["q"],
            "a": manual["a"],
            "source": "manual",
            "id": manual["id"],
        }
        index[manual_item["norm_q"]] = manual_item["a"]
        combined.append(manual_item)
157
+
158
+
159
def manual_qa_table_data() -> List[List[str]]:
    """
    Build the table rows for the Teacher Panel.

    Returns one [id, question, answer] row per entry in
    qa_store.MANUAL_QA_LIST, in list order.
    """
    rows: List[List[str]] = []
    for entry in qa_store.MANUAL_QA_LIST:
        rows.append([entry["id"], entry["q"], entry["a"]])
    return rows