Spaces:

Heng2004
/

Laos-Natural-Science-Chatbot

Sleeping

App Files Files Community

Heng2004 committed on Dec 5, 2025

Commit

a0daaac

verified ·

1 Parent(s): 5b247a6

Delete rag

Browse files

Files changed (1) hide show

rag/retrieval.py +0 -99

rag/retrieval.py DELETED Viewed

@@ -1,99 +0,0 @@
-# rag/retrieval.py
-import re
-from typing import List, Dict
-from data.loader import ENTRIES, RAW_KNOWLEDGE
-def _format_entry(entry: Dict) -> str:
-    """Render one entry as a bracketed grade/chapter/section/title header plus its text."""
-    header = (
-        f"[ຊັ້ນ {entry.get('grade','')}, "
-        f"ບົດ {entry.get('chapter','')}, "
-        f"ຫົວຂໍ້ {entry.get('section','')} – {entry.get('title','')}]"
-    )
-    return f"{header}\n{entry.get('text','')}"
-def _score_entry(entry: Dict, terms: List[str]) -> int:
-    """Return the keyword-overlap score of *entry* for the lower-cased query *terms*."""
-    # `or` fallbacks keep a field that is present but null from raising.
-    text = entry.get("text") or ""
-    title = entry.get("title") or ""
-    haystack = f"{text} {title}".lower()
-    # One point per occurrence of each term in text + title.
-    score = sum(haystack.count(term) for term in terms)
-    # Two points per (keyword, term) pair where the term occurs in the keyword.
-    for keyword in entry.get("keywords") or []:
-        keyword_lower = str(keyword).lower()
-        score += 2 * sum(1 for term in terms if term in keyword_lower)
-    # One bonus point if any term occurs in the topic. The topic is lower-cased
-    # because the terms are; otherwise a capitalised topic could never match.
-    topic = str(entry.get("topic") or "").lower()
-    if topic and any(term in topic for term in terms):
-        score += 1
-    return score
-def retrieve_context(question: str, max_entries: int = 2) -> str:
-    """
-    Return the best-matching knowledge entries for *question* as one text block.
-
-    Keyword-based retrieval over ENTRIES using text + title + keywords + topic.
-    Entries are dicts read with these keys (all optional here):
-    "grade", "chapter", "section", "title", "topic", "text", "keywords".
-
-    The question is lower-cased and split on whitespace; tokens of length 1
-    are dropped. Each entry is scored by `_score_entry`, the `max_entries`
-    highest-scoring entries with a positive score are kept (ties keep ENTRIES
-    order), and each is rendered by `_format_entry`. Blocks are joined by a
-    blank line.
-
-    Fallbacks:
-      * ENTRIES empty -> RAW_KNOWLEDGE is returned unchanged.
-      * no usable terms, or no entry scores above zero -> the first
-        `max_entries` entries are used.
-    """
-    if not ENTRIES:
-        return RAW_KNOWLEDGE
-    # Simple whitespace tokenisation; coarse for Lao, which is usually written
-    # without spaces between words, so a "term" may be a whole phrase.
-    terms = [t for t in re.split(r"\s+", question.lower().strip()) if len(t) > 1]
-    top_entries: List[Dict] = []
-    if terms:
-        scored: List[tuple[int, Dict]] = []
-        for entry in ENTRIES:
-            score = _score_entry(entry, terms)
-            if score > 0:
-                scored.append((score, entry))
-        # Stable sort: equal scores stay in ENTRIES order.
-        scored.sort(key=lambda pair: pair[0], reverse=True)
-        top_entries = [entry for _, entry in scored[:max_entries]]
-    if not top_entries:
-        top_entries = ENTRIES[:max_entries]
-    return "\n\n".join(_format_entry(entry) for entry in top_entries)
-# System instruction (in Lao) placed at the top of every prompt.
-# Rough English gloss, sentence by sentence:
-#   1. "You are an assistant on the history of Laos for grade M.1 students."
-#   2. "Answer only in Lao, briefly (2-3 sentences) and in an easy-to-understand way."
-#   3. "Rely only on the information below."
-#   4. "If the information is insufficient or unclear, say you are not sure."
-# NOTE(review): the text says "history" while the hosting Space is named
-# "Laos-Natural-Science-Chatbot" - confirm which subject is intended.
-SYSTEM_PROMPT = "".join(
-    (
-        "ທ່ານແມ່ນຜູ້ຊ່ວຍເຫຼືອດ້ານປະຫວັດສາດຂອງປະເທດລາວ ",
-        "ສໍາລັບນັກຮຽນຊັ້ນ ມ.1. ",
-        "ຕອບແຕ່ພາສາລາວ ໃຫ້ຕອບສັ້ນໆ 2–3 ປະໂຫຍກ ແລະເຂົ້າໃຈງ່າຍ. ",
-        "ໃຫ້ອີງຈາກຂໍ້ມູນຂ້າງລຸ່ມນີ້ເທົ່ານັ້ນ. ",
-        "ຖ້າຂໍ້ມູນບໍ່ພຽງພໍ ຫຼືບໍ່ຊັດເຈນ ໃຫ້ບອກວ່າບໍ່ແນ່ໃຈ.",
-    )
-)
-def build_prompt(question: str) -> str:
-    """
-    Build the full prompt string for *question*.
-
-    Calls retrieve_context(question) with its default entry limit and fills a
-    fixed template: SYSTEM_PROMPT, then a reference-information label followed
-    by the retrieved context, then a question label with the question as
-    given, and finally an "answer in Lao" label left open for the completion.
-    """
-    context = retrieve_context(question)
-    prompt = f"""{SYSTEM_PROMPT}
-ຂໍ້ມູນອ້າງອີງ:
-{context}
-ຄຳຖາມ: {question}
-ຄຳຕອບດ້ວຍພາສາລາວ:"""
-    return prompt