Delete rag
Browse files- rag/retrieval.py +0 -99
rag/retrieval.py
DELETED
|
@@ -1,99 +0,0 @@
|
|
| 1 |
-
# rag/retrieval.py
|
| 2 |
-
|
| 3 |
-
import re
|
| 4 |
-
from typing import List, Dict
|
| 5 |
-
|
| 6 |
-
from data.loader import ENTRIES, RAW_KNOWLEDGE
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
def _format_entry(entry: Dict) -> str:
    """Render one entry as a bracketed header line followed by its text.

    The header carries the entry's grade, chapter, section and title
    (with Lao labels); missing fields are rendered as empty strings.
    """
    header = (
        f"[ຊັ້ນ {entry.get('grade','')}, "
        f"ບົດ {entry.get('chapter','')}, "
        f"ຫົວຂໍ້ {entry.get('section','')} – {entry.get('title','')}]"
    )
    return f"{header}\n{entry.get('text','')}"


def _score_entry(entry: Dict, terms: List[str]) -> int:
    """Return the keyword-match score of one entry for lower-cased *terms*.

    Scoring:
      * +1 for every occurrence of a term in ``text + " " + title``
      * +2 for every (keyword, term) pair where the term is a substring
        of the keyword
      * +1 once if any term is a substring of ``topic``
    All comparisons are case-insensitive.
    """
    # `or ""` guards against fields that are present but null in the data.
    text = entry.get("text") or ""
    title = entry.get("title") or ""
    haystack = f"{text} {title}".lower()
    score = sum(haystack.count(term) for term in terms)

    keywords = entry.get("keywords") or []
    if isinstance(keywords, str):
        # A single keyword stored as a bare string would otherwise be
        # iterated character by character and never match anything.
        keywords = [keywords]
    for keyword in keywords:
        keyword_lower = str(keyword).lower()
        score += 2 * sum(term in keyword_lower for term in terms)

    # Terms are lower-cased, so the topic must be lower-cased as well;
    # otherwise a capitalised topic (e.g. "History") can never match.
    topic = str(entry.get("topic") or "").lower()
    if topic and any(term in topic for term in terms):
        score += 1

    return score


def retrieve_context(question: str, max_entries: int = 2) -> str:
    """Build a context string from the best keyword matches in ENTRIES.

    Each entry is a dict; the fields read here are ``grade``, ``chapter``,
    ``section``, ``title``, ``topic``, ``text`` and ``keywords``.

    Args:
        question: The user's question. It is lower-cased and split on
            whitespace; tokens of length 1 are dropped.
        max_entries: Maximum number of entries to include. Negative
            values are treated as 0.

    Returns:
        ``RAW_KNOWLEDGE`` when ENTRIES is empty. Otherwise the selected
        entries, each formatted as a header line plus its text, joined by
        blank lines. When the question yields no usable terms or nothing
        scores above zero, the first ``max_entries`` entries are used.
    """
    if not ENTRIES:
        return RAW_KNOWLEDGE

    # A negative limit would make the slices below drop items from the end.
    max_entries = max(max_entries, 0)

    # Plain whitespace tokenisation; matching is by substring, so a token
    # may be a whole phrase when the question has no spaces between words.
    normalized = question.lower().strip()
    terms = [tok for tok in re.split(r"\s+", normalized) if len(tok) > 1]

    best: List[Dict] = []
    if terms:
        scored = [(_score_entry(entry, terms), entry) for entry in ENTRIES]
        matches = [pair for pair in scored if pair[0] > 0]
        # list.sort is stable, so equal scores keep their ENTRIES order.
        matches.sort(key=lambda pair: pair[0], reverse=True)
        best = [entry for _, entry in matches[:max_entries]]

    # Fall back to the leading entries when nothing matched.
    chosen = best or ENTRIES[:max_entries]
    return "\n\n".join(_format_entry(entry) for entry in chosen)
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
# System instructions (in Lao) placed at the top of every prompt. In English:
# "You are a Lao-history assistant for grade M.1 students. Answer only in
# Lao, briefly (2-3 sentences) and in an easy-to-understand way. Rely only
# on the information below. If the information is insufficient or unclear,
# say that you are not sure."
SYSTEM_PROMPT = " ".join(
    (
        "ທ່ານແມ່ນຜູ້ຊ່ວຍເຫຼືອດ້ານປະຫວັດສາດຂອງປະເທດລາວ",
        "ສໍາລັບນັກຮຽນຊັ້ນ ມ.1.",
        "ຕອບແຕ່ພາສາລາວ ໃຫ້ຕອບສັ້ນໆ 2–3 ປະໂຫຍກ ແລະເຂົ້າໃຈງ່າຍ.",
        "ໃຫ້ອີງຈາກຂໍ້ມູນຂ້າງລຸ່ມນີ້ເທົ່ານັ້ນ.",
        "ຖ້າຂໍ້ມູນບໍ່ພຽງພໍ ຫຼືບໍ່ຊັດເຈນ ໃຫ້ບອກວ່າບໍ່ແນ່ໃຈ.",
    )
)
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
def build_prompt(question: str) -> str:
    """Assemble the full prompt text for *question*.

    Layout, top to bottom: SYSTEM_PROMPT, a blank line, the
    "reference information" label followed by the context returned by
    retrieve_context(question), a blank line, the "question" label with
    the question itself, a blank line, and finally the "answer in Lao"
    cue with nothing after it.
    """
    reference_text = retrieve_context(question)
    pieces = (
        f"{SYSTEM_PROMPT}\n",
        "\n",
        "ຂໍ້ມູນອ້າງອີງ:\n",
        f"{reference_text}\n",
        "\n",
        f"ຄຳຖາມ: {question}\n",
        "\n",
        "ຄຳຕອບດ້ວຍພາສາລາວ:",
    )
    return "".join(pieces)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|