Heng2004 committed on
Commit
a0daaac
·
verified ·
1 Parent(s): 5b247a6

Delete rag

Browse files
Files changed (1) hide show
  1. rag/retrieval.py +0 -99
rag/retrieval.py DELETED
@@ -1,99 +0,0 @@
1
- # rag/retrieval.py
2
-
3
- import re
4
- from typing import List, Dict
5
-
6
- from data.loader import ENTRIES, RAW_KNOWLEDGE
7
-
8
-
9
def _format_entry(entry: Dict) -> str:
    """Render one entry as a bracketed Lao header line followed by its text."""
    header = (
        f"[ຊັ້ນ {entry.get('grade', '')}, "
        f"ບົດ {entry.get('chapter', '')}, "
        f"ຫົວຂໍ້ {entry.get('section', '')} – {entry.get('title', '')}]"
    )
    return f"{header}\n{entry.get('text', '')}"


def _score_entry(entry: Dict, terms: List[str]) -> int:
    """Return the keyword-match score of *entry* for the lowercased *terms*."""
    # ``or`` guards against explicit nulls in the data, not only missing keys.
    body = f"{entry.get('text') or ''} {entry.get('title') or ''}".lower()
    keywords = [str(kw).lower() for kw in (entry.get("keywords") or [])]
    # Terms are lowercased, so the topic must be lowercased too or a
    # capitalised topic would never match.
    topic = str(entry.get("topic") or "").lower()

    # One point per occurrence of a term in text + title.
    score = sum(body.count(term) for term in terms)
    # Two points for every (keyword, term) pair where the term is a substring.
    score += 2 * sum(term in kw for kw in keywords for term in terms)
    # Single bonus point if any term appears in the topic.
    if any(term in topic for term in terms):
        score += 1
    return score


def retrieve_context(question: str, max_entries: int = 2) -> str:
    """
    Keyword-based retrieval over ENTRIES using text + title + keywords + topic.

    Each entry is expected to be a dict following the JSONL schema:
        {
          "id", "grade", "chapter", "section", "part",
          "title", "topic", "text", "summary", "keywords", "qa"
        }

    Args:
        question: The user question. It is lowercased and split on
            whitespace; tokens of length 1 are ignored. Whitespace
            tokenization is only a rough approximation for Lao text.
        max_entries: Maximum number of entries to include. Values below
            zero are treated as zero.

    Returns:
        RAW_KNOWLEDGE when ENTRIES is empty. Otherwise the best-scoring
        entries (highest score first, ties kept in ENTRIES order), each
        rendered as a header line plus its text and separated by blank
        lines. When the question yields no usable terms, or no entry
        scores above zero, the first ``max_entries`` entries are used.
    """
    if not ENTRIES:
        return RAW_KNOWLEDGE

    # A negative slice bound would silently drop entries from the end.
    limit = max(max_entries, 0)
    terms = [t for t in (question or "").lower().split() if len(t) > 1]

    ranked: List[Dict] = []
    if terms:
        scored = [(_score_entry(e, terms), e) for e in ENTRIES]
        scored = [pair for pair in scored if pair[0] > 0]
        # Stable sort: entries with equal scores keep their ENTRIES order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        ranked = [e for _, e in scored[:limit]]

    # Fall back to the leading entries when nothing matched.
    chosen = ranked or ENTRIES[:limit]
    return "\n\n".join(_format_entry(e) for e in chosen)
78
-
79
-
80
# System instruction (written in Lao) that build_prompt places at the top of
# the prompt. Rough English rendering:
#   "You are a Lao-history assistant for grade M.1 students. Answer only in
#   Lao, briefly (2-3 sentences) and in an easy-to-understand way. Rely only
#   on the information below. If the information is insufficient or unclear,
#   say that you are not sure."
# The pieces are joined by implicit string concatenation; the trailing space
# inside each piece separates the sentences.
SYSTEM_PROMPT = (
    "ທ່ານແມ່ນຜູ້ຊ່ວຍເຫຼືອດ້ານປະຫວັດສາດຂອງປະເທດລາວ "
    "ສໍາລັບນັກຮຽນຊັ້ນ ມ.1. "
    "ຕອບແຕ່ພາສາລາວ ໃຫ້ຕອບສັ້ນໆ 2–3 ປະໂຫຍກ ແລະເຂົ້າໃຈງ່າຍ. "
    "ໃຫ້ອີງຈາກຂໍ້ມູນຂ້າງລຸ່ມນີ້ເທົ່ານັ້ນ. "
    "ຖ້າຂໍ້ມູນບໍ່ພຽງພໍ ຫຼືບໍ່ຊັດເຈນ ໃຫ້ບອກວ່າບໍ່ແນ່ໃຈ."
)
87
-
88
-
89
def build_prompt(question: str, max_entries: int = 2) -> str:
    """
    Assemble the full model prompt for *question*.

    The prompt consists of SYSTEM_PROMPT, a reference-information section
    holding the text returned by retrieve_context, the question itself, and
    a trailing cue asking for the answer in Lao.

    Args:
        question: The user question. Surrounding whitespace is removed
            before it is embedded, so a trailing newline does not add
            stray blank lines to the prompt.
        max_entries: Maximum number of knowledge entries to retrieve as
            context; forwarded to retrieve_context. Defaults to 2, the
            same as retrieve_context's own default.

    Returns:
        The prompt string, ending with the Lao answer cue and no trailing
        newline.
    """
    question = (question or "").strip()
    context = retrieve_context(question, max_entries=max_entries)
    # Built from explicit "\n" pieces so the layout does not depend on how
    # this function body happens to be indented.
    return (
        f"{SYSTEM_PROMPT}\n\n"
        f"ຂໍ້ມູນອ້າງອີງ:\n{context}\n\n"
        f"ຄຳຖາມ: {question}\n\n"
        "ຄຳຕອບດ້ວຍພາສາລາວ:"
    )