renatavl commited on
Commit
860ef8a
·
0 Parent(s):
Files changed (2) hide show
  1. app.py +454 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import ast
4
+ import threading
5
+ from dataclasses import dataclass
6
+ from typing import List, Tuple, Optional, Dict, Any
7
+ from itertools import islice
8
+
9
+ import numpy as np
10
+ import gradio as gr
11
+ from rank_bm25 import BM25Okapi
12
+ from sentence_transformers import SentenceTransformer, CrossEncoder
13
+ from litellm import completion
14
+ from datasets import load_dataset
15
+
16
+
17
# -----------------------------
# Config
# -----------------------------
# HF Hub dataset with recipe rows (title / ingredients / directions / link / source).
HF_DATASET_NAME = "CodeKapital/CookingRecipes"

# Bi-encoder used for dense retrieval and cross-encoder used for reranking.
DENSE_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
RERANK_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Word-based chunking parameters (see chunk_text).
CHUNK_SIZE_WORDS = 350
CHUNK_OVERLAP_WORDS = 60

# Candidate pool size per retriever, and final context size after reranking.
TOPK_BM25 = 25
TOPK_DENSE = 25
TOPK_AFTER_RERANK = 6

OLLAMA_BASE_URL = "http://localhost:11434"  # local Ollama server

# Number of recipes indexed at startup.
DEFAULT_N_RECORDS = 500
35
+
36
+
37
+ # -----------------------------
38
+ # Data structures
39
+ # -----------------------------
40
@dataclass
class Chunk:
    """A single retrievable piece of one recipe document."""
    chunk_id: str  # unique id of the form "<source>::chunk<n>"
    source: str    # human-readable source label (dataset row + title/link)
    text: str      # chunk contents fed to BM25 / embeddings / reranker
45
+
46
+
47
# -----------------------------
# Preprocessing + chunking
# -----------------------------
# Matches any run of whitespace (collapsed to a single space by normalize_text).
_whitespace_re = re.compile(r"\s+")
# Word tokens for BM25: Latin letters, Cyrillic (incl. Ukrainian І/Ї/Є), digits.
_token_re = re.compile(r"[A-Za-zА-Яа-яІіЇїЄє0-9]+")
52
+
53
+
54
def normalize_text(text: str) -> str:
    """Replace non-breaking spaces and collapse whitespace runs to single spaces."""
    cleaned = (text or "").replace("\u00a0", " ")
    return _whitespace_re.sub(" ", cleaned).strip()
58
+
59
+
60
def tokenize_for_bm25(text: str) -> List[str]:
    """Lowercased word tokens for BM25 (Latin/Cyrillic letters and digits)."""
    return list(map(str.lower, _token_re.findall(text or "")))
62
+
63
+
64
def chunk_text(
    source: str,
    text: str,
    chunk_size_words: int = CHUNK_SIZE_WORDS,
    overlap_words: int = CHUNK_OVERLAP_WORDS
) -> List[Chunk]:
    """Split *text* into word-based chunks with overlap.

    Args:
        source: label copied onto every chunk and used in chunk ids.
        text: raw document text; split on whitespace.
        chunk_size_words: maximum words per chunk.
        overlap_words: words shared between consecutive chunks.

    Returns:
        List of Chunk objects; empty when *text* contains no words.
    """
    words = (text or "").split()
    if not words:
        return []

    chunks: List[Chunk] = []
    start = 0
    idx = 0

    while start < len(words):
        end = min(start + chunk_size_words, len(words))
        chunk_str = " ".join(words[start:end]).strip()

        if chunk_str:
            chunks.append(Chunk(
                chunk_id=f"{source}::chunk{idx}",
                source=source,
                text=chunk_str
            ))
            idx += 1

        if end == len(words):
            break
        # Advance by at least one word so overlap_words >= chunk_size_words
        # cannot stall the loop (original `max(0, end - overlap_words)` could
        # leave `start` unchanged and loop forever for such parameters).
        start = max(start + 1, end - overlap_words)

    return chunks
96
+
97
+
98
+ # -----------------------------
99
+ # HF dataset helpers
100
+ # -----------------------------
101
+ def _to_list(x: Any) -> List[str]:
102
+ """ingredients/directions можуть бути list або строкою зі списком."""
103
+ if x is None:
104
+ return []
105
+ if isinstance(x, list):
106
+ return [str(i).strip() for i in x if str(i).strip()]
107
+ if isinstance(x, str):
108
+ s = x.strip()
109
+ if not s:
110
+ return []
111
+ try:
112
+ v = ast.literal_eval(s)
113
+ if isinstance(v, list):
114
+ return [str(i).strip() for i in v if str(i).strip()]
115
+ except Exception:
116
+ pass
117
+ if "\n" in s:
118
+ parts = [p.strip(" -•\t") for p in s.splitlines()]
119
+ else:
120
+ parts = [p.strip() for p in s.split(",")]
121
+ return [p for p in parts if p]
122
+ return [str(x).strip()] if str(x).strip() else []
123
+
124
+
125
def recipe_row_to_doc(row: Dict[str, Any], idx: int) -> Tuple[str, str]:
    """Render one dataset row as (source_name, full_text) for indexing."""
    title = (row.get("title") or "").strip()
    link = (row.get("link") or "").strip()
    src = (row.get("source") or "").strip()

    ingredients = _to_list(row.get("ingredients"))
    directions = _to_list(row.get("directions"))

    # Source label: dataset row index, then truncated title and link if present.
    safe_title = title[:80].replace("\n", " ").strip()
    name_bits = [f"CookingRecipes#{idx}"]
    if safe_title:
        name_bits.append(safe_title)
    if link:
        name_bits.append(link)
    source_name = " | ".join(name_bits)

    # Document body: header fields, then bulleted ingredients, numbered steps.
    parts = [f"Title: {title or '(unknown)'}"]
    if src:
        parts.append(f"Source: {src}")
    if link:
        parts.append(f"Link: {link}")
    if ingredients:
        bullet_lines = "\n".join(f"- {item}" for item in ingredients)
        parts.append("Ingredients:\n" + bullet_lines)
    if directions:
        step_lines = "\n".join(f"{n + 1}. {step}" for n, step in enumerate(directions))
        parts.append("Directions:\n" + step_lines)

    return source_name, normalize_text("\n\n".join(parts))
155
+
156
+
157
def load_first_n_recipes(n: int, streaming: bool = True) -> List[Tuple[str, str]]:
    """Fetch the first *n* recipes and convert them to (source, text) docs.

    Streaming avoids downloading the whole dataset; the non-streaming path
    requests a HF split slice instead. Rows that render to empty text are
    dropped.
    """
    n = int(max(0, n))
    if n == 0:
        return []

    if streaming:
        rows = islice(load_dataset(HF_DATASET_NAME, split="train", streaming=True), n)
    else:
        rows = load_dataset(HF_DATASET_NAME, split=f"train[:{n}]")

    docs: List[Tuple[str, str]] = []
    for idx, row in enumerate(rows):
        source_name, text = recipe_row_to_doc(row, idx)
        if text.strip():
            docs.append((source_name, text))
    return docs
175
+
176
+
177
+ # -----------------------------
178
+ # RAG Engine
179
+ # -----------------------------
180
# -----------------------------
# RAG Engine
# -----------------------------
class RAGEngine:
    """Hybrid retrieval (BM25 + dense embeddings) with optional cross-encoder
    reranking and LLM answer generation via LiteLLM.

    Not thread-safe by itself; callers serialize access with an external lock.
    """

    def __init__(self):
        self.chunks: List[Chunk] = []
        self.bm25: Optional[BM25Okapi] = None
        self.bm25_corpus_tokens: List[List[str]] = []

        self.dense_model: Optional[SentenceTransformer] = None
        self.rerank_model: Optional[CrossEncoder] = None
        # Row-aligned with self.chunks; L2-normalized float32 vectors.
        self.chunk_embeddings: Optional[np.ndarray] = None

        self.last_build_info: str = "Index not built yet."

    def ensure_models(self) -> None:
        """Lazily load the embedding and reranking models (idempotent)."""
        if self.dense_model is None:
            self.dense_model = SentenceTransformer(DENSE_MODEL_NAME)
        if self.rerank_model is None:
            self.rerank_model = CrossEncoder(RERANK_MODEL_NAME)

    def build_from_dataset(self, n_records: int, streaming: bool) -> None:
        """(Re)build the BM25 and dense indexes from the first n_records recipes."""
        docs = load_first_n_recipes(n_records, streaming=streaming)

        all_chunks: List[Chunk] = []
        for source, text in docs:
            all_chunks.extend(chunk_text(source, text))

        self.chunks = all_chunks

        if not self.chunks:
            self.bm25 = None
            self.chunk_embeddings = None
            self.last_build_info = "No chunks built (N too small or empty rows)."
            return

        # Models
        self.ensure_models()

        # BM25
        self.bm25_corpus_tokens = [tokenize_for_bm25(c.text) for c in self.chunks]
        self.bm25 = BM25Okapi(self.bm25_corpus_tokens)

        # Dense embeddings (normalized, so a dot product is cosine similarity).
        embs = self.dense_model.encode(
            [c.text for c in self.chunks],
            batch_size=64,
            show_progress_bar=True,
            normalize_embeddings=True
        )
        self.chunk_embeddings = np.asarray(embs, dtype=np.float32)

        self.last_build_info = (
            f"Built index from {len(docs)} recipes → {len(self.chunks)} chunks. "
            f"Streaming={streaming}."
        )

    def retrieve_candidates(
        self,
        query: str,
        use_bm25: bool,
        use_dense: bool,
        topk_bm25: int = TOPK_BM25,
        topk_dense: int = TOPK_DENSE
    ) -> List[int]:
        """Return candidate chunk indices for *query*.

        Fix over the original implementation: candidates were returned as
        ``list(set(...))``, which discards retrieval ranking and makes the
        no-reranker path (``cands[:topk_final]``) pick arbitrary chunks.
        The result is now a deterministic, rank-ordered, de-duplicated list
        (BM25 hits first in score order, then dense hits in score order).
        """
        if not self.chunks:
            return []

        ordered: List[int] = []
        seen: set = set()

        def _add_ranked(indices) -> None:
            # Preserve each retriever's ranking; skip duplicates.
            for i in indices:
                i = int(i)
                if i not in seen:
                    seen.add(i)
                    ordered.append(i)

        if use_bm25 and self.bm25 is not None:
            q_tokens = tokenize_for_bm25(query)
            scores = self.bm25.get_scores(q_tokens)
            _add_ranked(np.argsort(scores)[::-1][:int(topk_bm25)])

        if use_dense and self.dense_model is not None and self.chunk_embeddings is not None:
            q_emb = self.dense_model.encode([query], normalize_embeddings=True)
            q_emb = np.asarray(q_emb, dtype=np.float32)[0]
            sims = self.chunk_embeddings @ q_emb
            _add_ranked(np.argsort(sims)[::-1][:int(topk_dense)])

        return ordered

    def rerank(self, query: str, candidate_idx: List[int], top_n: int = TOPK_AFTER_RERANK) -> List[int]:
        """Re-score (query, chunk) pairs with the cross-encoder; keep the top_n."""
        if not candidate_idx:
            return []
        if self.rerank_model is None:
            # No reranker loaded: fall back to the incoming candidate order.
            return candidate_idx[:int(top_n)]

        pairs = [(query, self.chunks[i].text) for i in candidate_idx]
        scores = self.rerank_model.predict(pairs)
        order = np.argsort(scores)[::-1]
        return [candidate_idx[i] for i in order[:int(top_n)]]

    def build_context(self, selected_idx: List[int]) -> str:
        """Format the selected chunks as numbered, citable context blocks."""
        blocks = []
        for j, i in enumerate(selected_idx, start=1):
            c = self.chunks[i]
            blocks.append(
                f"[{j}] Source: {c.source} | {c.chunk_id}\n{c.text}"
            )
        return "\n\n---\n\n".join(blocks)

    def answer_with_llm(self, query: str, context: str, model: str, api_key: str, temperature: float = 0.2) -> str:
        """Ask the LLM (via LiteLLM) to answer *query* using only *context*.

        Exports provider API keys to the environment because some LiteLLM
        providers read them from there; also passes api_key explicitly.
        Ollama models are routed to the local server via api_base.
        """
        model = (model or "").strip()
        api_key = (api_key or "").strip()
        if not model:
            return "Model is empty."

        if model.startswith("openai/") or model.startswith("gpt-"):
            if api_key:
                os.environ["OPENAI_API_KEY"] = api_key
        elif model.startswith("openrouter/"):
            if api_key:
                os.environ["OPENROUTER_API_KEY"] = api_key
        elif model.startswith("groq/"):
            if api_key:
                os.environ["GROQ_API_KEY"] = api_key

        system = (
            "You are a helpful QA assistant.\n"
            "Answer the user's question using ONLY the provided context.\n"
            "If the answer is not in the context, say you don't know.\n"
            "When you use facts from the context, add citations like [1] referring to the chunk numbers."
        )
        user = f"Question: {query}\n\nContext:\n{context}"

        extra = {}
        if model.startswith("ollama/"):
            extra["api_base"] = OLLAMA_BASE_URL

        resp = completion(
            model=model,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            temperature=temperature,
            api_key=api_key if api_key else None,
            **extra
        )
        return resp["choices"][0]["message"]["content"]
321
+
322
+
323
+ # -----------------------------
324
+ # Global engine + lock
325
+ # -----------------------------
326
# Single shared engine instance for the whole app.
ENGINE = RAGEngine()
# Serializes index rebuilds and queries across Gradio worker threads.
ENGINE_LOCK = threading.Lock()

# build once on startup
with ENGINE_LOCK:
    ENGINE.build_from_dataset(DEFAULT_N_RECORDS, streaming=True)
332
+
333
+
334
+ # -----------------------------
335
+ # Gradio UI callbacks
336
+ # -----------------------------
337
def rebuild_index(n_records: int, streaming: bool) -> str:
    """Gradio callback: rebuild the global index and report the build status."""
    count = int(n_records)
    use_streaming = bool(streaming)
    with ENGINE_LOCK:
        ENGINE.build_from_dataset(count, use_streaming)
        return ENGINE.last_build_info
341
+
342
+
343
def qa(
    question: str,
    use_bm25: bool,
    use_dense: bool,
    use_rerank: bool,
    model: str,
    api_key: str,
    topk_bm25: int,
    topk_dense: int,
    topk_final: int
):
    """Gradio callback: retrieve context for *question* and ask the LLM.

    Returns a pair (answer_markdown, context_debug_text).
    """
    question = (question or "").strip()
    if not question:
        return "Type a question.", ""

    if not use_bm25 and not use_dense:
        return "Enable BM25 and/or Dense retrieval (otherwise there is no context).", ""

    # Hold the lock only while touching the shared index. The LLM call below
    # is a network round-trip that can take seconds; keeping it inside the
    # lock (as the original did) blocks every other query and rebuild.
    with ENGINE_LOCK:
        if not ENGINE.chunks:
            return "Index is empty. Click 'Rebuild index' with N>0.", ""

        cands = ENGINE.retrieve_candidates(
            question,
            use_bm25=use_bm25,
            use_dense=use_dense,
            topk_bm25=int(topk_bm25),
            topk_dense=int(topk_dense)
        )
        if not cands:
            return "No candidates retrieved.", ""

        if use_rerank:
            selected = ENGINE.rerank(question, cands, top_n=int(topk_final))
        else:
            selected = cands[:int(topk_final)]

        context = ENGINE.build_context(selected)

    # LLM call happens outside the lock, on an immutable context string.
    try:
        answer = ENGINE.answer_with_llm(question, context, model=model, api_key=api_key)
    except Exception as e:
        answer = f"LLM call failed: {type(e).__name__}: {e}"

    return answer, context
388
+
389
+
390
+ # -----------------------------
391
+ # Launch UI
392
+ # -----------------------------
393
def build_demo() -> gr.Blocks:
    """Assemble the Gradio Blocks UI.

    NOTE: component creation order matters — widgets attach to the active
    Blocks/Row context in the order they are constructed.
    """
    with gr.Blocks(title="RAG QA on CookingRecipes (BM25 + Dense + Rerank)") as demo:
        gr.Markdown(
            "# RAG QA (CookingRecipes)\n"
            f"Dataset: `{HF_DATASET_NAME}`. Індексуємо **перші N рецептів**.\n\n"
        )

        # --- Index controls ---
        with gr.Row():
            n_records = gr.Slider(50, 5000, value=DEFAULT_N_RECORDS, step=50, label="N recipes to index (first N)")
            streaming = gr.Checkbox(value=True, label="Use streaming (recommended)")

        build_btn = gr.Button("Rebuild index")
        build_status = gr.Markdown(value=f"**Status:** {ENGINE.last_build_info}")

        build_btn.click(fn=rebuild_index, inputs=[n_records, streaming], outputs=[build_status])

        gr.Markdown("---")

        # --- Question input and retrieval toggles ---
        with gr.Row():
            question = gr.Textbox(label="Question", placeholder="Ask about recipes...", lines=2)

        with gr.Row():
            use_bm25 = gr.Checkbox(value=True, label="Use BM25 (keyword)")
            use_dense = gr.Checkbox(value=True, label="Use Dense (embeddings)")
            use_rerank = gr.Checkbox(value=True, label="Use Cross-Encoder Reranker")

        # --- LLM settings (LiteLLM model string + optional API key) ---
        with gr.Row():
            model = gr.Textbox(
                label="LLM model (LiteLLM)",
                value="openai/gpt-4o-mini",
                placeholder="e.g. openai/gpt-4o-mini OR groq/... OR openrouter/..."
            )
            api_key = gr.Textbox(
                label="API key (leave empty for Ollama)",
                placeholder="Empty for local ollama",
                type="password"
            )

        # --- Retrieval depth sliders ---
        with gr.Row():
            topk_bm25 = gr.Slider(5, 80, value=TOPK_BM25, step=1, label="Top-K BM25 candidates")
            topk_dense = gr.Slider(5, 80, value=TOPK_DENSE, step=1, label="Top-K Dense candidates")
            topk_final = gr.Slider(1, 12, value=TOPK_AFTER_RERANK, step=1, label="Chunks to LLM (final)")

        run_btn = gr.Button("Answer")

        answer = gr.Markdown(label="Answer")
        context = gr.Textbox(label="Retrieved context (debug)", lines=16)

        run_btn.click(
            fn=qa,
            inputs=[question, use_bm25, use_dense, use_rerank, model, api_key, topk_bm25, topk_dense, topk_final],
            outputs=[answer, context]
        )

    return demo
448
+
449
+
450
if __name__ == "__main__":
    # Build the UI and start the Gradio server (blocks until shutdown).
    demo = build_demo()
    demo.launch()
    # for local run with fixed port:
    # demo.launch(server_name="127.0.0.1", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ numpy>=1.24.0
3
+ rank-bm25>=0.2.2
4
+ sentence-transformers>=2.6.0
5
+ litellm>=1.40.0
6
+ pypdf>=4.0.0
7
+ datasets>=2.18.0