Fix imports: use src package everywhere
- src/__init__.py +0 -0
- src/list.py +69 -26
- src/rag_core.py +12 -7
- src/resources.py +2 -1
- src/utils.py +3 -1
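
The fix follows one pattern across all four modules: bare top-level imports become package-qualified imports, which the new (empty) src/__init__.py makes resolvable when the app is launched from the repository root. A representative before/after, taken from the rag_core.py hunks below:

# Before: resolves only when the interpreter's working directory is src/ itself.
import list as list_mode
from config import CHUNKS_PATH

# After: src/ is a regular package, so imports resolve from the repo root.
from src import list as list_mode
from src.config import CHUNKS_PATH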
src/__init__.py
ADDED
File without changes
src/list.py
CHANGED

@@ -1,6 +1,9 @@
 # src/list.py
+
 from __future__ import annotations
-from typing import Dict, List, Tuple
+
+from dataclasses import dataclass
+from typing import Dict, List, Any, Callable
 import re


@@ -8,11 +11,18 @@ import re
 # Algorithm configuration
 # -----------------------------

-MAX_NGRAM = 5
-MIN_DOC_FREQ = 2
+@dataclass
+class ListConfig:
+    # n-grams
+    max_ngram: int = 5
+    min_doc_freq: int = 2

-WINDOW = 80
-SCORE_THRESHOLD = 60
+    # scoring
+    window: int = 80
+    score_threshold: float = 60.0
+
+    # output
+    top_k: int = 15


 # -----------------------------

@@ -20,7 +30,7 @@ SCORE_THRESHOLD = 60
 # -----------------------------

 def normalize(text: str) -> str:
-    text = text.lower()
+    text = (text or "").lower()
     text = re.sub(r"[’']", " ", text)
     text = re.sub(r"[^a-zàâçéèêëîïôûùüÿñæœ\s]", " ", text)
     text = re.sub(r"\s+", " ", text).strip()

@@ -31,13 +41,12 @@ def tokenize(text: str) -> List[str]:
     return text.split()


-def generate_ngrams(tokens: List[str]) -> List[Tuple[str, int]]:
-    ngrams = []
+def generate_ngrams(tokens: List[str], max_ngram: int) -> List[str]:
+    ngrams: List[str] = []
     n = len(tokens)
-    for size in range(1, min(MAX_NGRAM, n) + 1):
+    for size in range(1, min(max_ngram, n) + 1):
         for i in range(n - size + 1):
-            seg = " ".join(tokens[i : i + size])
-            ngrams.append((seg, size))
+            ngrams.append(" ".join(tokens[i : i + size]))
     return ngrams


@@ -45,14 +54,14 @@ def generate_ngrams(tokens: List[str]) -> List[Tuple[str, int]]:
 # Pivot phrase (corpus-driven)
 # -----------------------------

-def extract_phrase_pivot(query: str, articles: Dict[str, str]) -> str | None:
+def extract_phrase_pivot(query: str, articles: Dict[str, str], cfg: ListConfig) -> str | None:
     q_norm = normalize(query)
     tokens = tokenize(q_norm)
-    candidates = generate_ngrams(tokens)
+    candidates = generate_ngrams(tokens, cfg.max_ngram)

     stats = []

-    for seg, size in candidates:
+    for seg in candidates:
         seg_re = re.compile(rf"\b{re.escape(seg)}\b")
         doc_freq = 0

@@ -60,8 +69,9 @@ def extract_phrase_pivot(query: str, articles: Dict[str, str]) -> str | None:
             if seg_re.search(normalize(text)):
                 doc_freq += 1

-        if doc_freq >= MIN_DOC_FREQ:
-            stats.append((seg, size, doc_freq))
+        if doc_freq >= cfg.min_doc_freq:
+            # length = word count (prefer more specific pivots)
+            stats.append((seg, len(seg.split()), doc_freq))

     if not stats:
         return None

@@ -100,40 +110,73 @@ def centrality_factor(text: str, pivot: str) -> float:
 # Lexical score
 # -----------------------------

-def lexical_score(text: str, pivot: str) -> int:
+def lexical_score(text: str, pivot: str, window: int) -> int:
     text_norm = normalize(text)
     pivot_norm = normalize(pivot)

     score = 0
     for m in re.finditer(rf"\b{re.escape(pivot_norm)}\b", text_norm):
-        start = max(0, m.start() - WINDOW)
-        end = min(len(text_norm), m.end() + WINDOW)
+        start = max(0, m.start() - window)
+        end = min(len(text_norm), m.end() + window)
         score += (end - start)

     return score


 # -----------------------------
-# LIST algorithm
+# LIST algorithm (core)
 # -----------------------------

-def list_articles(query: str, articles: Dict[str, str], top_k: int = 15) -> List[str]:
-    pivot = extract_phrase_pivot(query, articles)
+def list_articles_lexical(query: str, articles: Dict[str, str], cfg: ListConfig) -> List[str]:
+    pivot = extract_phrase_pivot(query, articles, cfg)
     if not pivot:
         return []

-    scored = []
+    scored: List[tuple[str, float]] = []

     for aid, text in articles.items():
-        s_lex = lexical_score(text, pivot)
+        s_lex = lexical_score(text, pivot, cfg.window)
         if s_lex == 0:
             continue

         factor = centrality_factor(text, pivot)
         s_final = s_lex * factor

-        if s_final >= SCORE_THRESHOLD:
+        if s_final >= cfg.score_threshold:
             scored.append((aid, s_final))

     scored.sort(key=lambda x: x[1], reverse=True)
-    return [aid for aid, _ in scored[:top_k]]
+    return [aid for aid, _ in scored[: cfg.top_k]]
+
+
+# -----------------------------
+# API expected by rag_core.py
+# -----------------------------
+
+def list_articles(
+    query: str,
+    articles: Dict[str, str],
+    vs: Any = None,  # possible fallback later
+    normalize_article_id: Callable[[str], str] | None = None,
+    list_triggers: List[str] | None = None,
+    cfg: ListConfig | None = None,
+) -> Dict[str, Any]:
+    """
+    Signature compatible with rag_core.py.
+
+    For now: lexical-only (your algorithm).
+    The `vs` parameter is accepted for compatibility, but not used here.
+    """
+    cfg = cfg or ListConfig()
+
+    q = (query or "").strip()
+    if not q:
+        return {"mode": "LIST", "answer": "", "articles": []}
+
+    ids = list_articles_lexical(q, articles, cfg)
+
+    return {
+        "mode": "LIST",
+        "answer": "",
+        "articles": ids,
+    }
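For orientation, here is a minimal, hypothetical smoke test of the new list.py API against a toy corpus. The article IDs, texts, and the lowered threshold are invented for illustration, and since the pivot-ranking and centrality_factor code is elided between the hunks above, the exact output is not guaranteed:

# Hypothetical usage sketch (toy data); run from the repository root.
from src.list import ListConfig, list_articles

articles = {
    "art-1": "Le contrat de travail à durée déterminée doit être écrit.",
    "art-2": "Le contrat de travail à durée déterminée prend fin au terme prévu.",
    "art-3": "Les congés payés sont acquis pendant la période de référence.",
}

# Threshold lowered so the tiny corpus can clear it; the 5-gram
# "contrat de travail à durée" appears in two documents, satisfying
# the default min_doc_freq of 2.
cfg = ListConfig(score_threshold=1.0)
result = list_articles("contrat de travail à durée déterminée", articles, cfg=cfg)
print(result["mode"], result["articles"])  # e.g. LIST ['art-2', 'art-1']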
src/rag_core.py
CHANGED

@@ -3,12 +3,15 @@ from __future__ import annotations
 from typing import Dict, Any, List
 import json

-import list as list_mode
-import fulltext as fulltext_mode
-import synthesis as synthesis_mode
-import qa as qa_mode

-from config import (
+from src import list as list_mode
+from src import fulltext as fulltext_mode
+from src import synthesis as synthesis_mode
+from src import qa as qa_mode
+from src import resources
+
+
+from src.config import (
     CHUNKS_PATH,
     LIST_TRIGGERS,
     REFUSAL,

@@ -19,14 +22,16 @@ from config import (
     QA_MAX_TOKENS,
     QA_TEMPERATURE,
 )
-from utils import (
+from src.utils import (
     normalize_article_id,
     extract_article_id,
     is_list_request,
     is_fulltext_request,
     is_synthesis_request,
 )
-from resources import get_vectorstore, get_llm
+
+from src.resources import get_vectorstore, get_llm
+


 # ====================
src/resources.py
CHANGED

@@ -6,7 +6,8 @@ from langchain_community.vectorstores import FAISS
 from langchain_huggingface import HuggingFaceEmbeddings
 from llama_cpp import Llama

-from config import DB_DIR, EMBED_MODEL, LLM_MODEL_PATH, LLM_N_CTX, LLM_N_THREADS, LLM_N_BATCH
+from src.config import DB_DIR, EMBED_MODEL, LLM_MODEL_PATH, LLM_N_CTX, LLM_N_THREADS, LLM_N_BATCH
+


 _VS: Optional[FAISS] = None
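The unchanged `_VS: Optional[FAISS] = None` context line suggests get_vectorstore is a lazy singleton. Its body is not part of this diff, so the following is only an assumed sketch of that pattern, with guessed load_local arguments built from the imported config names:

# Assumed shape of get_vectorstore (not shown in this commit): build the
# FAISS index once, cache it in the module-level _VS, then reuse it.
def get_vectorstore() -> FAISS:
    global _VS
    if _VS is None:
        embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
        _VS = FAISS.load_local(DB_DIR, embeddings, allow_dangerous_deserialization=True)
    return _VS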
src/utils.py
CHANGED

@@ -1,7 +1,9 @@
 # src/utils.py
 from __future__ import annotations
 from typing import Optional
-from config import ARTICLE_ID_RE, LIST_TRIGGERS, FULLTEXT_TRIGGERS, EXPLAIN_TRIGGERS
+
+from src.config import ARTICLE_ID_RE, LIST_TRIGGERS, FULLTEXT_TRIGGERS, EXPLAIN_TRIGGERS
+


 def normalize_article_id(raw: str) -> str: