Spaces:

FabIndy
/

code-education-rag

Sleeping

App Files Files Community

FabIndy committed on Jan 15

Commit

1e8b426

1 Parent(s): b00b20e

Speed up EXPLAIN: reduce LLM input and allow extractive-style explanation

Browse files

Files changed (1) hide show

src/rag_core.py +197 -82

src/rag_core.py CHANGED Viewed

@@ -2,13 +2,27 @@
 # -*- coding: utf-8 -*-
 """
-rag_core.py – version corrigée EXPLAIN
 """
 import json
 import re
 from pathlib import Path
-from typing import List, Tuple, Optional, Dict, Iterable, Any
 from langchain_community.vectorstores import FAISS
 from langchain_huggingface import HuggingFaceEmbeddings
@@ -22,43 +36,55 @@ DB_DIR = Path("db/faiss_code_edu_by_article")
 EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
-TOP_K_FETCH = 30
 TOP_K_FINAL = 3
-SCORE_THRESHOLD = 1.10
-MAX_CHARS_PER_DOC = 1200
 SNIPPET_CHARS = 260
 ARTICLE_ID_RE = re.compile(
     r"\b(?:article\s+)?([LDR]\s?\d{1,4}(?:[.-]\d+){0,4})\b",
-    flags=re.IGNORECASE
 )
 EXPLAIN_TRIGGERS = [
-    "explique", "explication", "résume", "resume",
-    "simplifie", "en termes simples", "vulgarise"
 ]
 LIST_TRIGGERS = [
-    "quels articles", "articles qui", "articles sur", "références"
 ]
 FULLTEXT_TRIGGERS = [
-    "texte exact", "texte intégral", "donne l'article", "intégralité"
 ]
 _REFUSAL = "Je ne peux pas répondre avec certitude à partir des articles fournis."
 _EXPLAIN_REFUSAL = (
-    "Pour expliquer un article, indique explicitement son identifiant "
-    "(ex : D422-5)."
 )
-# ==================== LLM INIT ====================
 llm = Llama(
     model_path="models/model.gguf",
-    n_ctx=2048,
     n_threads=10,
     n_batch=128,
     verbose=False,
@@ -66,7 +92,6 @@ llm = Llama(
 def llm_generate_qa(prompt: str) -> str:
-    """Réponse courte, stricte"""
     out = llm.create_chat_completion(
         messages=[{"role": "user", "content": prompt}],
         temperature=0.1,
@@ -75,12 +100,15 @@ def llm_generate_qa(prompt: str) -> str:
     return out["choices"][0]["message"]["content"].strip()
-def llm_generate_explain(prompt: str) -> str:
-    """Réponse explicative (plus longue)"""
     out = llm.create_chat_completion(
         messages=[{"role": "user", "content": prompt}],
         temperature=0.2,
-        max_tokens=500,
     )
     return out["choices"][0]["message"]["content"].strip()
@@ -92,31 +120,42 @@ def normalize_article_id(raw: str) -> str:
 def extract_article_id(q: str) -> Optional[str]:
-    m = ARTICLE_ID_RE.search(q)
     return normalize_article_id(m.group(1)) if m else None
 def is_explain_request(q: str) -> bool:
-    ql = q.lower()
     return any(t in ql for t in EXPLAIN_TRIGGERS)
 def is_list_request(q: str) -> bool:
-    ql = q.lower()
     return any(t in ql for t in LIST_TRIGGERS)
 def is_fulltext_request(q: str) -> bool:
-    ql = q.lower()
     return any(t in ql for t in FULLTEXT_TRIGGERS)
 def load_article_text(article_id: str) -> Optional[str]:
     with CHUNKS_PATH.open("r", encoding="utf-8") as f:
         for line in f:
             obj = json.loads(line)
-            if normalize_article_id(obj.get("article_id", "")) == article_id:
-                return obj.get("text", "").strip()
     return None
@@ -132,33 +171,120 @@ def get_vectorstore() -> FAISS:
         _VS = FAISS.load_local(
             str(DB_DIR),
             embeddings,
-            allow_dangerous_deserialization=True
         )
     return _VS
-# ==================== PROMPTS ====================
-def build_explain_prompt(article_id: str, article_text: str, level: str) -> str:
-    return f"""
-Tu es un assistant pédagogique spécialisé dans le Code de l'éducation.
-ARTICLE :
-[{article_id}]
-{article_text}
-TÂCHE :
-Explique cet article de façon {level}, fidèle au texte, sans rien inventer.
-INTERDICTIONS :
-- Pas d'ajout juridique
-- Pas de généralisation
-- Pas de suppositions
-FORMAT :
-- Explication structurée
-- Ton clair et accessible
-- Aucune citation d'autres articles
 """
@@ -180,75 +306,64 @@ CONTEXTE :
 FORMAT FINAL :
 Réponse courte.
 Dernière ligne : Articles cités : A, B
-"""
 # ==================== CORE ====================
 def answer_query(q: str) -> Dict[str, Any]:
-    q = q.strip()
     if not q:
         return {"mode": "QA", "answer": _REFUSAL, "articles": []}
     article_id = extract_article_id(q)
-    # ---------- EXPLAIN ----------
     if is_explain_request(q):
         if not article_id:
-            return {
-                "mode": "EXPLAIN",
-                "answer": _EXPLAIN_REFUSAL,
-                "articles": []
-            }
         text = load_article_text(article_id)
         if not text:
-            return {
-                "mode": "EXPLAIN",
-                "answer": f"Article {article_id} introuvable.",
-                "articles": []
-            }
-        prompt = build_explain_prompt(article_id, text, "simple")
-        answer = llm_generate_explain(prompt)
-        return {
-            "mode": "EXPLAIN",
-            "answer": answer,
-            "articles": [article_id]
-        }
     # ---------- FULLTEXT ----------
     if article_id and is_fulltext_request(q):
         text = load_article_text(article_id)
-        return {
-            "mode": "FULLTEXT",
-            "answer": text or _REFUSAL,
-            "articles": [article_id]
-        }
     # ---------- LIST ----------
     if is_list_request(q):
         vs = get_vectorstore()
         docs = vs.similarity_search(q, k=5)
-        arts = list({normalize_article_id(d.metadata["article_id"]) for d in docs})
-        return {
-            "mode": "LIST",
-            "answer": "",
-            "articles": arts
-        }
     # ---------- QA ----------
     vs = get_vectorstore()
     docs = vs.similarity_search(q, k=TOP_K_FINAL)
     context = "\n\n".join(d.page_content for d in docs)
-    articles = [normalize_article_id(d.metadata["article_id"]) for d in docs]
     prompt = build_qa_prompt(q, context, articles)
     answer = llm_generate_qa(prompt)
-    return {
-        "mode": "QA",
-        "answer": answer,
-        "articles": articles
-    }

 # -*- coding: utf-8 -*-
 """
+rag_core.py – EXPLAIN ultra rapide via résumé extractif (text mining)
+Objectif :
+- LIST & FULLTEXT restent instantanés (pas de LLM)
+- EXPLAIN devient très rapide : extraction de 3–6 segments clés de l’article
+- QA reste possible (LLM), mais lent (CPU)
+Principe EXPLAIN :
+- ID d’article obligatoire, sinon refus.
+- On charge le texte exact de l’article depuis chunks_articles.jsonl
+- On produit une "explication" par extraction (aucune génération) -> zéro hallucination
+- Optionnel : reformulation LLM sur le résumé (désactivé par défaut)
+Ce fichier remplace le précédent (qui envoyait l’article intégral au LLM en EXPLAIN).
 """
 import json
+import os
 import re
 from pathlib import Path
+from typing import List, Optional, Dict, Any
 from langchain_community.vectorstores import FAISS
 from langchain_huggingface import HuggingFaceEmbeddings
 EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 TOP_K_FINAL = 3
 SNIPPET_CHARS = 260
+# --- Résumé extractif ---
+EXTRACT_MAX_SEGMENTS = 5          # nb max de segments extraits
+EXTRACT_MAX_CHARS_TOTAL = 900     # garde-fou (résumé total)
+EXTRACT_MIN_SEG_LEN = 30          # ignore segments trop courts
+EXTRACT_MAX_SEG_LEN = 420         # tronque segments trop longs
+# option : reformulation LLM sur résumé extractif (OFF par défaut)
+EXPLAIN_USE_LLM = os.environ.get("EXPLAIN_USE_LLM", "0").strip() == "1"
 ARTICLE_ID_RE = re.compile(
     r"\b(?:article\s+)?([LDR]\s?\d{1,4}(?:[.-]\d+){0,4})\b",
+    flags=re.IGNORECASE,
 )
 EXPLAIN_TRIGGERS = [
+    "explique", "expliquer", "explication",
+    "résume", "resume", "résumé", "reformule", "simplifie",
+    "en termes simples", "vulgarise", "clarifie",
 ]
 LIST_TRIGGERS = [
+    "quels articles", "quelles dispositions", "articles parlent",
+    "articles qui parlent", "articles sur", "donne les articles",
+    "cite les articles", "références", "references",
 ]
 FULLTEXT_TRIGGERS = [
+    "contenu exact", "texte exact", "texte intégral", "texte integral",
+    "intégral", "integral", "cite intégralement", "cite integralement",
+    "donne l'intégralité", "donne l'integralite", "recopie", "reproduis",
+    "affiche l'article", "donne l'article", "donne moi l'article",
 ]
 _REFUSAL = "Je ne peux pas répondre avec certitude à partir des articles fournis."
 _EXPLAIN_REFUSAL = (
+    "Pour expliquer ou résumer, indique un identifiant d’article (ex : D422-5). "
+    "Sinon, commence par : \"Quels articles parlent de … ?\""
 )
+# ==================== LLM INIT (QA + option EXPLAIN LLM) ====================
+# Le LLM est utile pour QA. Pour EXPLAIN "très vite", on le désactive par défaut.
 llm = Llama(
     model_path="models/model.gguf",
+    n_ctx=1024,         # réduit pour CPU
     n_threads=10,
     n_batch=128,
     verbose=False,
 def llm_generate_qa(prompt: str) -> str:
     out = llm.create_chat_completion(
         messages=[{"role": "user", "content": prompt}],
         temperature=0.1,
     return out["choices"][0]["message"]["content"].strip()
def llm_generate_explain_from_summary(prompt: str) -> str:
    """Ask the local LLM to rephrase an extractive summary.

    The completion is capped at 160 tokens so that the optional rephrasing
    step stays short and does not blow up latency on CPU.

    Args:
        prompt: Full user prompt, sent as a single chat message.

    Returns:
        The text of the first completion choice, stripped of surrounding
        whitespace.
    """
    request = {
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
        "max_tokens": 160,
    }
    completion = llm.create_chat_completion(**request)
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"].strip()
 def extract_article_id(q: str) -> Optional[str]:
+    m = ARTICLE_ID_RE.search(q or "")
     return normalize_article_id(m.group(1)) if m else None
 def is_explain_request(q: str) -> bool:
+    ql = (q or "").lower()
     return any(t in ql for t in EXPLAIN_TRIGGERS)
 def is_list_request(q: str) -> bool:
+    ql = (q or "").lower()
     return any(t in ql for t in LIST_TRIGGERS)
 def is_fulltext_request(q: str) -> bool:
+    ql = (q or "").lower()
     return any(t in ql for t in FULLTEXT_TRIGGERS)
def safe_snippet(text: str, n: int) -> str:
    """Collapse whitespace in *text* and truncate it to at most *n* characters.

    Runs of whitespace (including newlines) become single spaces. When the
    collapsed text is longer than *n*, it is cut at *n* characters, trailing
    whitespace is removed and an ellipsis character is appended, so the
    result may be one character longer than *n*.

    Args:
        text: Text to shorten; ``None`` or empty yields ``""``.
        n: Maximum number of characters kept before the ellipsis. Negative
            values are treated as 0.

    Returns:
        The collapsed text, truncated with a trailing "…" if needed.
    """
    # A negative bound would make the slice below count from the end of the
    # string and keep almost everything instead of truncating.
    limit = max(n, 0)
    collapsed = " ".join((text or "").split())
    if len(collapsed) <= limit:
        return collapsed
    return collapsed[:limit].rstrip() + "…"
 def load_article_text(article_id: str) -> Optional[str]:
+    if not CHUNKS_PATH.exists():
+        raise FileNotFoundError(f"Fichier chunks introuvable : {CHUNKS_PATH}")
     with CHUNKS_PATH.open("r", encoding="utf-8") as f:
         for line in f:
+            if not line.strip():
+                continue
             obj = json.loads(line)
+            aid = normalize_article_id(obj.get("article_id", ""))
+            if aid == article_id:
+                return (obj.get("text") or "").strip()
     return None
         _VS = FAISS.load_local(
             str(DB_DIR),
             embeddings,
+            allow_dangerous_deserialization=True,
         )
     return _VS
+# ==================== EXTRACTIVE SUMMARY (FAST) ====================
# Regex fragments that mark a segment as legally significant. They are kept
# as plain strings (not compiled) because the scorer passes them to
# ``re.search`` together with a ``flags`` argument.
_NORMATIVE_PATTERNS = [
    # Normative verbs / obligations
    r"\bdoit\b", r"\bdoivent\b", r"\best\b", r"\bsont\b",
    r"\bpeut\b", r"\bpeuvent\b",
    r"\best tenu\b", r"\bsont tenus\b", r"\best tenu de\b",
    r"\best interdit\b", r"\bsont interdits\b", r"\bil est interdit\b",
    r"\bobligatoire\b", r"\bobligation\b",
    # Conditions / exceptions
    r"\bsi\b", r"\blorsque\b", r"\bsauf\b", r"\bà condition\b", r"\ba condition\b",
    r"\bdans le cas\b", r"\ben cas\b", r"\btoutefois\b",
    # Structure markers ("I. ", "1° "). A trailing \b after "." or "°" would
    # only match when a word character follows immediately, so the usual
    # "I. Les ..." / "1° Les ..." forms were never recognised; a negative
    # lookahead is used instead.
    r"\bI\.(?!\w)", r"\bII\.(?!\w)", r"\bIII\.(?!\w)",
    r"\b1°(?!\w)", r"\b2°(?!\w)", r"\b3°(?!\w)",
]
def _split_into_segments(text: str, max_line_len: int = 600) -> List[str]:
    """Split legal text into candidate segments.

    The text is first cut into non-empty, stripped lines (one per
    paragraph). A line longer than *max_line_len* characters is further cut
    at whitespace that follows ".", ";" or ":".

    Args:
        text: Article text; ``None`` or empty yields an empty list.
        max_line_len: Lines longer than this are re-split on punctuation.

    Returns:
        The segments in document order, each stripped and non-empty.
    """
    if not text:
        return []

    # Do not cut right after a single capital letter followed by a dot:
    # that is how article references ("L. 111-1", "R. 421-2") and section
    # markers ("I. ...") are written, and cutting there would separate the
    # prefix from its number.
    boundary = re.compile(r"(?<=[.;:])(?<!\b[A-Z]\.)\s+")

    segments: List[str] = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if len(line) > max_line_len:
            segments.extend(
                part.strip() for part in boundary.split(line) if part.strip()
            )
        else:
            segments.append(line)
    return segments
def _score_segment(seg: str) -> int:
    """Score a segment's relevance for the extractive summary.

    Each entry of ``_NORMATIVE_PATTERNS`` found in the lowercased segment
    is worth 2 points, a mention of décret/arrêté/loi/code adds 1 point,
    and a segment longer than 450 characters loses 1 point.
    """
    lowered = seg.lower()
    matched = sum(
        1
        for pattern in _NORMATIVE_PATTERNS
        if re.search(pattern, lowered, flags=re.IGNORECASE)
    )
    score = 2 * matched
    # Small bonus for explicit legal-source vocabulary.
    if re.search(r"\b(décret|arrêté|loi|code)\b", lowered):
        score += 1
    # Very long segments are harder to read, so they are slightly penalised.
    if len(seg) > 450:
        score -= 1
    return score
def extractive_explain(article_id: str, article_text: str) -> str:
    """Build a fast "explanation" of an article by pure extraction.

    Nothing is generated: the result only contains segments copied from
    *article_text*. Segments are whitespace-normalised, those shorter than
    ``EXTRACT_MIN_SEG_LEN`` are dropped and those longer than
    ``EXTRACT_MAX_SEG_LEN`` are truncated with an ellipsis. Up to
    ``EXTRACT_MAX_SEGMENTS`` segments with a positive score are selected
    (falling back to the first segments when none scores above 0), subject
    to the ``EXTRACT_MAX_CHARS_TOTAL`` budget. At least one segment is
    always kept.

    The selected segments are listed in their original document order, so
    that a condition or exception never appears before the rule it refers
    to.

    Args:
        article_id: Identifier shown on the final citation line.
        article_text: Full text of the article.

    Returns:
        A bullet list of key segments followed by an
        "Articles cités : <id>" line, or a short failure message with the
        same citation line when no usable segment exists.
    """
    citation = f"Articles cités : {article_id}"

    candidates = []
    for raw in _split_into_segments(article_text):
        seg = " ".join(raw.split())
        if len(seg) < EXTRACT_MIN_SEG_LEN:
            continue
        if len(seg) > EXTRACT_MAX_SEG_LEN:
            seg = seg[:EXTRACT_MAX_SEG_LEN].rstrip() + "…"
        candidates.append(seg)

    if not candidates:
        return f"Résumé impossible : texte vide ou non exploitable.\n\n{citation}"

    # Rank candidate positions by score. The sort is stable, so equally
    # scored segments keep their document order.
    scores = [_score_segment(seg) for seg in candidates]
    ranked = sorted(range(len(candidates)), key=lambda i: scores[i], reverse=True)
    chosen = [i for i in ranked if scores[i] > 0][:EXTRACT_MAX_SEGMENTS]
    if not chosen:
        chosen = list(range(min(EXTRACT_MAX_SEGMENTS, len(candidates))))

    # Apply the total-length budget in priority order, always keeping at
    # least one segment.
    kept = []
    total = 0
    for i in chosen:
        seg_len = len(candidates[i])
        if kept and total + seg_len > EXTRACT_MAX_CHARS_TOTAL:
            break
        kept.append(i)
        total += seg_len

    # Present the kept segments in the order they appear in the article.
    bullets = "\n".join(f"- {candidates[i]}" for i in sorted(kept))
    body = "Points clés (extraction du texte, sans reformulation) :\n" + bullets
    return f"{body}\n\n{citation}"
def build_explain_llm_prompt(article_id: str, extractive_summary: str) -> str:
    """Build the prompt for the optional LLM rephrasing step.

    The prompt embeds the short extractive summary (not the full article)
    and instructs the model to rephrase it in simple terms and to end with
    a citation line for *article_id*.
    """
    prompt_lines = (
        "Tu es un assistant pédagogique. Tu dois reformuler en termes simples le contenu fourni.",
        "Interdictions : rien inventer, rien ajouter, pas d’autres articles.",
        "Tu dois rester fidèle aux points ci-dessous.",
        "CONTENU (extrait du texte) :",
        f"{extractive_summary}",
        "Donne une explication en 4–6 phrases maximum.",
        f"Dernière ligne : Articles cités : {article_id}",
    )
    return "\n".join(prompt_lines) + "\n"
 FORMAT FINAL :
 Réponse courte.
 Dernière ligne : Articles cités : A, B
+""".strip()
 # ==================== CORE ====================
 def answer_query(q: str) -> Dict[str, Any]:
+    q = (q or "").strip()
     if not q:
         return {"mode": "QA", "answer": _REFUSAL, "articles": []}
     article_id = extract_article_id(q)
+    # ---------- EXPLAIN (FAST) ----------
     if is_explain_request(q):
         if not article_id:
+            return {"mode": "EXPLAIN", "answer": _EXPLAIN_REFUSAL, "articles": []}
         text = load_article_text(article_id)
         if not text:
+            return {"mode": "EXPLAIN", "answer": f"Article {article_id} introuvable.", "articles": []}
+        # 1) explication immédiate par extraction (très rapide)
+        extractive = extractive_explain(article_id, text)
+        # 2) optionnel : mini reformulation LLM sur le résumé (pas sur l’article)
+        if EXPLAIN_USE_LLM:
+            try:
+                prompt = build_explain_llm_prompt(article_id, extractive)
+                llm_ans = llm_generate_explain_from_summary(prompt).strip()
+                # garantie citation
+                if "Articles cités" not in llm_ans:
+                    llm_ans = llm_ans.rstrip() + f"\n\nArticles cités : {article_id}"
+                return {"mode": "EXPLAIN", "answer": llm_ans, "articles": [article_id]}
+            except Exception:
+                # fallback extractif si souci LLM
+                return {"mode": "EXPLAIN", "answer": extractive, "articles": [article_id]}
+        return {"mode": "EXPLAIN", "answer": extractive, "articles": [article_id]}
     # ---------- FULLTEXT ----------
     if article_id and is_fulltext_request(q):
         text = load_article_text(article_id)
+        return {"mode": "FULLTEXT", "answer": text or _REFUSAL, "articles": [article_id]}
     # ---------- LIST ----------
     if is_list_request(q):
         vs = get_vectorstore()
         docs = vs.similarity_search(q, k=5)
+        arts = list({normalize_article_id(d.metadata.get("article_id", "")) for d in docs})
+        return {"mode": "LIST", "answer": "", "articles": arts}
     # ---------- QA ----------
     vs = get_vectorstore()
     docs = vs.similarity_search(q, k=TOP_K_FINAL)
     context = "\n\n".join(d.page_content for d in docs)
+    articles = [normalize_article_id(d.metadata.get("article_id", "")) for d in docs]
     prompt = build_qa_prompt(q, context, articles)
     answer = llm_generate_qa(prompt)
+    return {"mode": "QA", "answer": answer, "articles": articles}