Spaces:

FabIndy
/

code-education-rag

Running

App Files Community

FabIndy committed on Jan 14

Commit

b00b20e

1 Parent(s): 14fa239

Fix EXPLAIN mode: dedicated LLM prompt, longer output, strict article handling

Browse files

Files changed (1) hide show

src/rag_core.py +140 -323

src/rag_core.py CHANGED Viewed

@@ -2,23 +2,7 @@
 # -*- coding: utf-8 -*-
 """
-rag_core.py
-Transposition FIDÈLE de rag_chat_llama.py (mêmes règles, mêmes seuils, même prompt,
-même validation anti-hallucination), mais sans boucle interactive : on expose
-une fonction answer_query(question) utilisable par une app Hugging Face.
-ROUTAGE AUTO :
-- FULLTEXT : demande "texte exact / intégral / article X" => impression exacte depuis JSONL (SANS LLM)
-- LIST     : demande "quels articles parlent ..." => liste articles + extrait (SANS LLM)
-- EXPLAIN  : demande "explique/résume..." + ID article => LLM sur 1 article (RAG strict)
-            demande "explique/résume..." sans ID => REFUS (orienter vers LIST/FULLTEXT)
-- QA       : RAG => LLM + prompt strict + VALIDATION (anti-hallucinations)
-Prérequis :
-- data/chunks_articles.jsonl (article-level)
-- db/faiss_code_edu_by_article (FAISS)
-- models/model.gguf (GGUF)
 """
 import json
@@ -31,56 +15,47 @@ from langchain_huggingface import HuggingFaceEmbeddings
 from llama_cpp import Llama
-# -------------------- CONFIG --------------------
 CHUNKS_PATH = Path("data/chunks_articles.jsonl")
 DB_DIR = Path("db/faiss_code_edu_by_article")
 EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
-TOP_K_FETCH = 30            # nb de docs candidats récupérés
-TOP_K_FINAL = 3             # nb max envoyés au LLM (QA)
-SCORE_THRESHOLD = 1.10      # à ajuster (voir affichage des scores)
-MAX_CHARS_PER_DOC = 800
 SNIPPET_CHARS = 260
-# Déclencheurs FULLTEXT
-FULLTEXT_TRIGGERS = [
-    "contenu exact", "texte exact", "texte intégral", "texte integral",
-    "intégral", "integral", "cite intégralement", "cite integralement",
-    "donne l'intégralité", "donne l'integralite", "recopie", "reproduis",
-    "affiche l'article", "donne l'article", "donne moi l'article",
 ]
-# Déclencheurs LIST
 LIST_TRIGGERS = [
-    "quels articles", "quelles dispositions", "articles parlent",
-    "articles qui parlent", "articles sur", "donne les articles",
-    "cite les articles", "références", "references",
 ]
-# Déclencheurs EXPLAIN (reformulation)
-EXPLAIN_TRIGGERS = [
-    "explique", "expliquer", "explication",
-    "résume", "resume", "résumé", "resume-moi", "résume-moi",
-    "reformule", "reformuler",
-    "simplifie", "simplifier",
-    "en termes simples", "très simple", "tres simple",
-    "vulgarise", "vulgariser",
-    "clarifie", "clarifier",
 ]
-# Regex article id
-ARTICLE_ID_RE = re.compile(
-    r"\b(?:article\s+)?([LDR]\s?\d{1,4}(?:[.-]\d+){0,4})\b",
-    flags=re.IGNORECASE
 )
-EPLE_RE = re.compile(r"\bEPLE\b", flags=re.IGNORECASE)
-# Pour valider les sorties "Articles cités : ..."
-ARTICLES_CITES_RE = re.compile(r"Articles cités\s*:\s*(.*)$", flags=re.IGNORECASE | re.MULTILINE)
-# -------------------- LLM INIT (FIDÈLE) --------------------
 llm = Llama(
     model_path="models/model.gguf",
     n_ctx=2048,
@@ -90,7 +65,8 @@ llm = Llama(
 )
-def llm_generate(prompt: str) -> str:
     out = llm.create_chat_completion(
         messages=[{"role": "user", "content": prompt}],
         temperature=0.1,
@@ -99,29 +75,30 @@ def llm_generate(prompt: str) -> str:
     return out["choices"][0]["message"]["content"].strip()
-# -------------------- UTILS (FIDÈLES) --------------------
 def normalize_article_id(raw: str) -> str:
-    s = raw.strip().upper().replace(" ", "")
-    s = s.replace(".", "-")
-    return s
 def extract_article_id(q: str) -> Optional[str]:
     m = ARTICLE_ID_RE.search(q)
-    if not m:
-        return None
-    return normalize_article_id(m.group(1))
-def is_fulltext_request(q: str) -> bool:
     ql = q.lower()
-    if any(t in ql for t in FULLTEXT_TRIGGERS):
-        return True
-    aid = extract_article_id(q)
-    if aid and len(ql) <= 25:
-        return True
-    return False
 def is_list_request(q: str) -> bool:
@@ -129,309 +106,149 @@ def is_list_request(q: str) -> bool:
     return any(t in ql for t in LIST_TRIGGERS)
-def is_explain_request(q: str) -> bool:
     ql = q.lower()
-    return any(t in ql for t in EXPLAIN_TRIGGERS)
-def dedupe_keep_order(items: Iterable[str]) -> List[str]:
-    seen = set()
-    out = []
-    for x in items:
-        if x not in seen:
-            out.append(x)
-            seen.add(x)
-    return out
-def safe_snippet(text: str, n: int) -> str:
-    t = " ".join((text or "").split())
-    if len(t) <= n:
-        return t
-    return t[:n].rstrip() + "…"
 def load_article_text(article_id: str) -> Optional[str]:
-    if not CHUNKS_PATH.exists():
-        raise FileNotFoundError(f"Fichier chunks introuvable : {CHUNKS_PATH}")
     with CHUNKS_PATH.open("r", encoding="utf-8") as f:
         for line in f:
-            if not line.strip():
-                continue
             obj = json.loads(line)
-            aid = normalize_article_id(obj.get("article_id", ""))
-            if aid == article_id:
-                return (obj.get("text") or "").strip()
     return None
-def load_vectorstore() -> FAISS:
-    if not DB_DIR.exists():
-        raise FileNotFoundError(f"Index FAISS introuvable : {DB_DIR}")
-    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
-    return FAISS.load_local(str(DB_DIR), embeddings, allow_dangerous_deserialization=True)
-def retrieve_scored(vs: FAISS, query: str) -> List[Tuple[object, float]]:
-    """
-    Retourne liste (Document, score). Plus le score est PETIT, plus c'est proche (distance).
-    """
-    return vs.similarity_search_with_score(query, k=TOP_K_FETCH)
-def filter_docs(scored: List[Tuple[object, float]]) -> List[Tuple[object, float]]:
-    """
-    Filtre simple par seuil + garde TOP_K_FINAL.
-    """
-    kept = [(d, s) for (d, s) in scored if s <= SCORE_THRESHOLD]
-    if not kept:
-        # fallback : au moins TOP_K_FINAL meilleurs, sinon tu refuses trop souvent
-        kept = sorted(scored, key=lambda x: x[1])[:TOP_K_FINAL]
-    else:
-        kept = sorted(kept, key=lambda x: x[1])[:TOP_K_FINAL]
-    return kept
-def build_context(scored_docs: List[Tuple[object, float]]) -> Tuple[str, List[str], Dict[str, str], Dict[str, float]]:
-    used = []
-    by_id: Dict[str, str] = {}
-    by_score: Dict[str, float] = {}
-    blocks = []
-    for d, s in scored_docs:
-        aid = d.metadata.get("article_id", "UNKNOWN")
-        aid_norm = normalize_article_id(aid)
-        used.append(aid_norm)
-        txt = (d.page_content or "").strip()
-        by_id[aid_norm] = txt
-        by_score[aid_norm] = float(s)
-        if len(txt) > MAX_CHARS_PER_DOC:
-            txt = txt[:MAX_CHARS_PER_DOC].rstrip() + "\n[.]"
-        blocks.append(f"[{aid_norm}]\n{txt}")
-    used = dedupe_keep_order(used)
-    return "\n\n".join(blocks), used, by_id, by_score
-def eple_context_ok(question: str, by_id: Dict[str, str]) -> bool:
-    """
-    Si la question contient "EPLE", on veut que le contexte contienne explicitement
-    des indices "collège/lycée/établissement public local d'enseignement".
-    """
-    if not EPLE_RE.search(question):
-        return True
-    joined = "\n".join(by_id.values()).lower()
-    signals = [
-        "établissement public local d'enseignement",
-        "etablissement public local d'enseignement",
-        "collège", "college", "lycée", "lycee",
-        "chef d'établissement", "chef d'etablissement",
-    ]
-    return any(sig in joined for sig in signals)
-def extract_cited_articles(answer: str) -> List[str]:
-    m = ARTICLES_CITES_RE.search(answer)
-    if not m:
-        return []
-    tail = m.group(1).strip()
-    if not tail:
-        return []
-    parts = re.split(r"[,\s]+", tail)
-    out = []
-    for p in parts:
-        p = p.strip()
-        if not p:
-            continue
-        # tolère "D422-15." ou "[D422-15]"
-        p = p.strip("[]().;:")
-        if ARTICLE_ID_RE.match(p) or re.match(r"^[LDR]\d", p, flags=re.I):
-            out.append(normalize_article_id(p))
-    return dedupe_keep_order(out)
-def validate_answer(answer: str, allowed_articles: List[str]) -> bool:
-    cited = extract_cited_articles(answer)
-    allowed_set = set(allowed_articles)
-    # si le LLM ne cite rien => on refuse (sinon il peut raconter)
-    if not cited:
-        return False
-    # interdit de citer un article non présent dans la liste autorisée
-    if any(c not in allowed_set for c in cited):
-        return False
-    return True
-def build_prompt(question: str, context: str, allowed_articles: List[str]) -> str:
-    allowed = ", ".join(allowed_articles)
-    return f"""Tu es un assistant juridique spécialisé dans le Code de l'éducation (France).
-RÈGLES ABSOLUES (non négociables) :
-1) Tu réponds UNIQUEMENT à partir du CONTEXTE fourni ci-dessous.
-2) Tu n'inventes rien, tu ne complètes pas, tu ne "supposes" pas. Interdiction d'utiliser :
-   "on peut supposer", "il est possible que", "on peut déduire", "probablement", etc.
-3) Si le CONTEXTE ne permet pas de répondre, tu dis exactement :
-   "Je ne peux pas répondre avec certitude à partir des articles fournis."
-4) Tu DOIS citer uniquement des articles présents dans la liste autorisée :
-   {allowed}
-5) Attention au sigle EPLE :
-   - EPLE = établissement public local d'enseignement (collèges/lycées).
-   - Ne confonds pas avec d'autres établissements.
-   Si le CONTEXTE ne traite pas clairement des EPLE au sens collèges/lycées, tu refuses de conclure.
-QUESTION :
-{question}
-CONTEXTE :
-{context}
-FORMAT DE SORTIE OBLIGATOIRE :
-- Une réponse courte et factuelle.
-- Dernière ligne STRICTE : "Articles cités : A, B, C" (uniquement parmi la liste autorisée).
 """
-# -------------------- CORE API (HF) --------------------
-_REFUSAL = "Je ne peux pas répondre avec certitude à partir des articles fournis."
-_EXPLAIN_REFUSAL = (
-    "Pour expliquer ou résumer, indique un identifiant d’article (ex : D422-5). "
-    "Sinon, commence par : \"Quels articles parlent de … ?\""
-)
-# cache pour éviter de recharger FAISS à chaque call
-_VS: Optional[FAISS] = None
-def get_vectorstore() -> FAISS:
-    global _VS
-    if _VS is None:
-        _VS = load_vectorstore()
-    return _VS
 def answer_query(q: str) -> Dict[str, Any]:
-    """
-    API équivalente à la boucle interactive de rag_chat_llama.py.
-    Retourne un dict structuré :
-    - mode: "FULLTEXT" | "LIST" | "EXPLAIN" | "QA"
-    - answer: str (réponse finale ou refus)
-    - articles: liste des articles récupérés (pour debug/affichage)
-    - scores: dict {article: score} (pour debug/affichage)
-    - snippets: (LIST) dict {article: snippet}
-    - fulltext: (FULLTEXT) texte exact
-    """
-    q = (q or "").strip()
     if not q:
-        return {"mode": "QA", "answer": _REFUSAL, "articles": [], "scores": {}}
-    # --- EXPLAIN sans ID => REFUS (robuste) ---
-    # On refuse explicitement pour forcer l'utilisateur à donner un article.
-    aid = extract_article_id(q)
-    if is_explain_request(q) and not aid:
-        return {"mode": "EXPLAIN", "answer": _EXPLAIN_REFUSAL, "articles": [], "scores": {}}
-    vs = get_vectorstore()
-    # --- FULLTEXT ---
-    if aid and is_fulltext_request(q):
-        txt = load_article_text(aid)
-        if not txt:
             return {
-                "mode": "FULLTEXT",
-                "answer": f"Je ne trouve pas l'article {aid} dans {CHUNKS_PATH}.",
-                "articles": [],
-                "scores": {},
-                "fulltext": None,
             }
-        return {
-            "mode": "FULLTEXT",
-            "answer": txt,
-            "articles": [aid],
-            "scores": {},
-            "fulltext": txt,
-        }
-    # --- EXPLAIN (article unique forcé) ---
-    # Si l'utilisateur demande une explication ET fournit un ID,
-    # on force le contexte à cet article (plus fiable + souvent plus rapide).
-    if aid and is_explain_request(q):
-        txt = load_article_text(aid)
-        if not txt:
             return {
                 "mode": "EXPLAIN",
-                "answer": f"Je ne trouve pas l'article {aid} dans {CHUNKS_PATH}.",
-                "articles": [],
-                "scores": {},
             }
-        context = f"[{aid}]\n{txt}"
-        articles = [aid]
-        by_id = {aid: txt}
-        # --- EPLE safety gate (inchangé) ---
-        if not eple_context_ok(q, by_id):
-            return {"mode": "EXPLAIN", "answer": _REFUSAL, "articles": articles, "scores": {}}
-        prompt = build_prompt(q, context, articles)
-        answer = llm_generate(prompt)
-        # --- VALIDATION (inchangée) ---
-        if not validate_answer(answer, articles):
-            return {"mode": "EXPLAIN", "answer": _REFUSAL, "articles": articles, "scores": {}}
-        return {"mode": "EXPLAIN", "answer": answer, "articles": articles, "scores": {}}
-    # --- RETRIEVE (scored) ---
-    scored = retrieve_scored(vs, q)
-    scored = filter_docs(scored)
-    context, articles, by_id, by_score = build_context(scored)
-    # --- LIST ---
     if is_list_request(q):
-        snippets = {a: safe_snippet(by_id.get(a, ""), SNIPPET_CHARS) for a in articles}
         return {
             "mode": "LIST",
             "answer": "",
-            "articles": articles,
-            "scores": by_score,
-            "snippets": snippets,
-        }
-    # --- EPLE safety gate ---
-    if not eple_context_ok(q, by_id):
-        return {
-            "mode": "QA",
-            "answer": _REFUSAL,
-            "articles": articles,
-            "scores": by_score,
         }
-    # --- QA (LLM) ---
-    prompt = build_prompt(q, context, articles)
-    answer = llm_generate(prompt)
-    # --- VALIDATION ---
-    if not validate_answer(answer, articles):
-        return {
-            "mode": "QA",
-            "answer": _REFUSAL,
-            "articles": articles,
-            "scores": by_score,
-        }
     return {
         "mode": "QA",
         "answer": answer,
-        "articles": articles,
-        "scores": by_score,
     }

 # -*- coding: utf-8 -*-
 """
+rag_core.py – version corrigée EXPLAIN
 """
 import json
 from llama_cpp import Llama
+# ==================== CONFIG ====================
 CHUNKS_PATH = Path("data/chunks_articles.jsonl")
 DB_DIR = Path("db/faiss_code_edu_by_article")
 EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+TOP_K_FETCH = 30
+TOP_K_FINAL = 3
+SCORE_THRESHOLD = 1.10
+MAX_CHARS_PER_DOC = 1200
 SNIPPET_CHARS = 260
+ARTICLE_ID_RE = re.compile(
+    r"\b(?:article\s+)?([LDR]\s?\d{1,4}(?:[.-]\d+){0,4})\b",
+    flags=re.IGNORECASE
+)
+EXPLAIN_TRIGGERS = [
+    "explique", "explication", "résume", "resume",
+    "simplifie", "en termes simples", "vulgarise"
 ]
 LIST_TRIGGERS = [
+    "quels articles", "articles qui", "articles sur", "références"
 ]
+FULLTEXT_TRIGGERS = [
+    "texte exact", "texte intégral", "donne l'article", "intégralité"
 ]
+_REFUSAL = "Je ne peux pas répondre avec certitude à partir des articles fournis."
+_EXPLAIN_REFUSAL = (
+    "Pour expliquer un article, indique explicitement son identifiant "
+    "(ex : D422-5)."
 )
+# ==================== LLM INIT ====================
 llm = Llama(
     model_path="models/model.gguf",
     n_ctx=2048,
 )
+def llm_generate_qa(prompt: str) -> str:
+    """Réponse courte, stricte"""
     out = llm.create_chat_completion(
         messages=[{"role": "user", "content": prompt}],
         temperature=0.1,
     return out["choices"][0]["message"]["content"].strip()
+def llm_generate_explain(prompt: str) -> str:
+    """Réponse explicative (plus longue)"""
+    out = llm.create_chat_completion(
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0.2,
+        max_tokens=500,
+    )
+    return out["choices"][0]["message"]["content"].strip()
+# ==================== UTILS ====================
 def normalize_article_id(raw: str) -> str:
+    return raw.strip().upper().replace(" ", "").replace(".", "-")
 def extract_article_id(q: str) -> Optional[str]:
     m = ARTICLE_ID_RE.search(q)
+    return normalize_article_id(m.group(1)) if m else None
+def is_explain_request(q: str) -> bool:
     ql = q.lower()
+    return any(t in ql for t in EXPLAIN_TRIGGERS)
 def is_list_request(q: str) -> bool:
     return any(t in ql for t in LIST_TRIGGERS)
+def is_fulltext_request(q: str) -> bool:
     ql = q.lower()
+    return any(t in ql for t in FULLTEXT_TRIGGERS)
 def load_article_text(article_id: str) -> Optional[str]:
     with CHUNKS_PATH.open("r", encoding="utf-8") as f:
         for line in f:
             obj = json.loads(line)
+            if normalize_article_id(obj.get("article_id", "")) == article_id:
+                return obj.get("text", "").strip()
     return None
+# ==================== VECTORSTORE ====================
+_VS: Optional[FAISS] = None
+def get_vectorstore() -> FAISS:
+    global _VS
+    if _VS is None:
+        embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
+        _VS = FAISS.load_local(
+            str(DB_DIR),
+            embeddings,
+            allow_dangerous_deserialization=True
+        )
+    return _VS
+# ==================== PROMPTS ====================
+def build_explain_prompt(article_id: str, article_text: str, level: str) -> str:
+    return f"""
+Tu es un assistant pédagogique spécialisé dans le Code de l'éducation.
+ARTICLE :
+[{article_id}]
+{article_text}
+TÂCHE :
+Explique cet article de façon {level}, fidèle au texte, sans rien inventer.
+INTERDICTIONS :
+- Pas d'ajout juridique
+- Pas de généralisation
+- Pas de suppositions
+FORMAT :
+- Explication structurée
+- Ton clair et accessible
+- Aucune citation d'autres articles
 """
+def build_qa_prompt(question: str, context: str, allowed: List[str]) -> str:
+    return f"""
+Tu es un assistant juridique spécialisé dans le Code de l'éducation.
+RÈGLES STRICTES :
+- Tu réponds uniquement à partir du contexte
+- Tu cites uniquement : {", ".join(allowed)}
+- Sinon tu refuses
+QUESTION :
+{question}
+CONTEXTE :
+{context}
+FORMAT FINAL :
+Réponse courte.
+Dernière ligne : Articles cités : A, B
+"""
+# ==================== CORE ====================
 def answer_query(q: str) -> Dict[str, Any]:
+    q = q.strip()
     if not q:
+        return {"mode": "QA", "answer": _REFUSAL, "articles": []}
+    article_id = extract_article_id(q)
+    # ---------- EXPLAIN ----------
+    if is_explain_request(q):
+        if not article_id:
             return {
+                "mode": "EXPLAIN",
+                "answer": _EXPLAIN_REFUSAL,
+                "articles": []
             }
+        text = load_article_text(article_id)
+        if not text:
             return {
                 "mode": "EXPLAIN",
+                "answer": f"Article {article_id} introuvable.",
+                "articles": []
             }
+        prompt = build_explain_prompt(article_id, text, "simple")
+        answer = llm_generate_explain(prompt)
+        return {
+            "mode": "EXPLAIN",
+            "answer": answer,
+            "articles": [article_id]
+        }
+    # ---------- FULLTEXT ----------
+    if article_id and is_fulltext_request(q):
+        text = load_article_text(article_id)
+        return {
+            "mode": "FULLTEXT",
+            "answer": text or _REFUSAL,
+            "articles": [article_id]
+        }
+    # ---------- LIST ----------
     if is_list_request(q):
+        vs = get_vectorstore()
+        docs = vs.similarity_search(q, k=5)
+        arts = list({normalize_article_id(d.metadata["article_id"]) for d in docs})
         return {
             "mode": "LIST",
             "answer": "",
+            "articles": arts
         }
+    # ---------- QA ----------
+    vs = get_vectorstore()
+    docs = vs.similarity_search(q, k=TOP_K_FINAL)
+    context = "\n\n".join(d.page_content for d in docs)
+    articles = [normalize_article_id(d.metadata["article_id"]) for d in docs]
+    prompt = build_qa_prompt(q, context, articles)
+    answer = llm_generate_qa(prompt)
     return {
         "mode": "QA",
         "answer": answer,
+        "articles": articles
     }