DolAr1610 committed on
Commit
a5c9fa3
·
1 Parent(s): cde7325

add new logic

Browse files
ingestion/ingest_text.py CHANGED
@@ -3,6 +3,7 @@ from tqdm import tqdm
3
  from db.text_db import init_chroma, add_document_text
4
  from embeddings.text_embedder import get_text_embedding
5
  from ingestion.config import JSON_PATH
 
6
 
7
 
8
  def chunk_text(text, chunk_size=400, overlap=50):
@@ -23,6 +24,8 @@ def ingest_texts():
23
  articles = json.load(f)
24
  print(f"Found {len(articles)} articles.")
25
 
 
 
26
  for article in tqdm(articles, desc="Indexing texts"):
27
  full_text = f"{article.get('title', '')}\n{article.get('description', '')}\n{article.get('content', '')}"
28
 
@@ -38,12 +41,21 @@ def ingest_texts():
38
  chunks = chunk_text(full_text, chunk_size=400, overlap=50)
39
 
40
  for i, chunk in enumerate(chunks):
 
 
 
 
 
 
 
 
 
41
  emb = get_text_embedding(chunk)
42
  if emb:
43
- doc_id = f"{metadata['source_url']}#chunk{i}" if metadata[
44
- "source_url"] else f"{metadata['title']}#chunk{i}"
45
  add_document_text(vectordb, doc_id, emb, chunk, metadata)
46
  else:
47
  print(f"Failed to embed chunk {i} of {metadata['title']}")
48
 
49
- print("Done indexing texts.")
 
 
 
3
  from db.text_db import init_chroma, add_document_text
4
  from embeddings.text_embedder import get_text_embedding
5
  from ingestion.config import JSON_PATH
6
+ from search.bm_25_index import build_and_save_bm25
7
 
8
 
9
  def chunk_text(text, chunk_size=400, overlap=50):
 
24
  articles = json.load(f)
25
  print(f"Found {len(articles)} articles.")
26
 
27
+ bm25_chunks = []
28
+
29
  for article in tqdm(articles, desc="Indexing texts"):
30
  full_text = f"{article.get('title', '')}\n{article.get('description', '')}\n{article.get('content', '')}"
31
 
 
41
  chunks = chunk_text(full_text, chunk_size=400, overlap=50)
42
 
43
  for i, chunk in enumerate(chunks):
44
+ doc_id = f"{metadata['source_url']}#chunk{i}" if metadata["source_url"] else f"{metadata['title']}#chunk{i}"
45
+
46
+
47
+ bm25_chunks.append({
48
+ "chunk_id": doc_id,
49
+ "chunk_text": chunk,
50
+ "metadata": metadata,
51
+ })
52
+
53
  emb = get_text_embedding(chunk)
54
  if emb:
 
 
55
  add_document_text(vectordb, doc_id, emb, chunk, metadata)
56
  else:
57
  print(f"Failed to embed chunk {i} of {metadata['title']}")
58
 
59
+ build_and_save_bm25(bm25_chunks)
60
+
61
+ print("Done indexing texts.")
llm.py CHANGED
@@ -7,22 +7,43 @@ OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
7
 
8
 
9
  def generate_response(question, retrieved_docs, model="meta-llama/llama-3-8b-instruct"):
10
- context = "\n\n".join(
11
- f"Title: {doc.get('title', 'N/A')}\n"
12
- f"Description: {doc.get('description', 'N/A')}\n"
13
- f"Content: {doc.get('content', 'N/A')}\n"
14
- for doc in retrieved_docs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  )
16
 
17
- prompt = (
18
- "You are a polite assistant who provides clear and detailed answers based solely on the information from The Batch articles.\n\n"
19
- "Rules:\n"
20
- "- Answer only using the knowledge from The Batch articles.\n"
21
- "- Do not mention other sources or questions; provide only accurate, detailed, and understandable answers.\n"
22
- "- If the information is present in the context, give a clear answer.\n"
23
- "- If the information is missing, respond with: 'Sorry, I could not find the answer in the provided context.'\n"
24
- "- Do not guess, fabricate information, or go beyond the given context.\n\n"
25
- f"Context for the answer:\n{context}"
26
  )
27
 
28
  headers = {
@@ -33,13 +54,18 @@ def generate_response(question, retrieved_docs, model="meta-llama/llama-3-8b-ins
33
  data = {
34
  "model": model,
35
  "messages": [
36
- {"role": "system", "content": prompt},
37
- {"role": "user", "content": question}
38
  ],
39
- "temperature": 0.3
40
  }
41
 
42
- response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=data)
 
 
 
 
 
43
 
44
  if response.status_code == 200:
45
  return response.json()['choices'][0]['message']['content'].strip()
 
7
 
8
 
9
  def generate_response(question, retrieved_docs, model="meta-llama/llama-3-8b-instruct"):
10
+ # 1) Number the sources so the model can cite them as [1], [2], ...
11
+ sources_lines = []
12
+ for i, doc in enumerate(retrieved_docs, start=1):
13
+ title = doc.get("title", "N/A")
14
+ desc = doc.get("description", "")
15
+ content = doc.get("content", "")
16
+
17
+ # cap the content length a bit (so we don't blow up the context window)
18
+ content = (content or "")[:2000]
19
+
20
+ sources_lines.append(
21
+ f"[{i}] Title: {title}\n"
22
+ f"Description: {desc}\n"
23
+ f"Content: {content}\n"
24
+ )
25
+
26
+ sources = "\n\n".join(sources_lines).strip()
27
+
28
+ system_prompt = (
29
+ "You are a Retrieval-Augmented Question Answering assistant for The Batch articles.\n"
30
+ "Answer the user ONLY using the SOURCES provided.\n\n"
31
+ "Hard rules:\n"
32
+ "1) Use ONLY facts that appear in the SOURCES. Do NOT use outside knowledge.\n"
33
+ "2) Every factual claim MUST have a citation like [1] or [2].\n"
34
+ " - If a sentence contains multiple facts from different sources, cite all relevant sources: [1][3].\n"
35
+ "3) If the SOURCES do not contain enough information to answer, say:\n"
36
+ " \"Sorry, I could not find the answer in the provided sources.\" (and do not add citations)\n"
37
+ "4) Do not invent titles, dates, links, or quotes.\n"
38
+ "5) Keep the answer concise and clear.\n\n"
39
+ "Output format:\n"
40
+ "Answer: <your answer with citations>\n"
41
+ "Used sources: <list of source numbers you actually cited, e.g. [1], [3]>\n"
42
  )
43
 
44
+ user_prompt = (
45
+ f"SOURCES:\n{sources}\n\n"
46
+ f"QUESTION:\n{question}\n"
 
 
 
 
 
 
47
  )
48
 
49
  headers = {
 
54
  data = {
55
  "model": model,
56
  "messages": [
57
+ {"role": "system", "content": system_prompt},
58
+ {"role": "user", "content": user_prompt}
59
  ],
60
+ "temperature": 0.2
61
  }
62
 
63
+ response = requests.post(
64
+ "https://openrouter.ai/api/v1/chat/completions",
65
+ headers=headers,
66
+ json=data,
67
+ timeout=60
68
+ )
69
 
70
  if response.status_code == 200:
71
  return response.json()['choices'][0]['message']['content'].strip()
main.py CHANGED
@@ -1,8 +1,18 @@
1
  import streamlit as st
2
- from search.search_classical import classical_search
 
3
  from search.search_best_pair import best_pair_search
4
  from llm import generate_response
5
 
 
 
 
 
 
 
 
 
 
6
  st.set_page_config(page_title="🔍 Multimodal Search The Batch")
7
  st.image("data/the-batch-logo.webp", width=300)
8
  st.title("Multimodal Assistant")
@@ -10,9 +20,24 @@ st.title("Multimodal Assistant")
10
  mode = st.selectbox("🔎 Select the search mode:", ["Classical RAG", "Multimodal RAG"])
11
  query = st.text_input("📝 Enter the text query:")
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  if query:
14
  if mode == "Classical RAG":
15
- results = classical_search(query, k=3)
 
16
  else:
17
  results = best_pair_search(query, k=3)
18
 
@@ -35,16 +60,86 @@ if query:
35
  st.markdown(f"[🔗 Read the full article →]({meta['source_url']})")
36
  st.markdown("---")
37
 
38
- if st.button("🧠 Generate a response to a query"):
39
- docs = [
40
- {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  "title": meta.get("title", ""),
42
  "description": meta.get("description", ""),
43
- "content": meta.get("content", "")
44
- }
45
- for meta in results
46
- ]
 
47
 
48
  response = generate_response(query, docs)
49
  st.markdown("### 🤖 Generated Response:")
50
- st.success(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+
3
+ from search.search_classical import classical_search, classical_retrieve_chunks
4
  from search.search_best_pair import best_pair_search
5
  from llm import generate_response
6
 
7
+
8
def pick_mode(label: str) -> str:
    """Map a UI retriever label to the internal search-mode key.

    Labels starting with "Semantic" map to "semantic", labels starting
    with "Keyword" map to "bm25"; anything else falls back to "hybrid".
    """
    prefix_to_mode = (
        ("Semantic", "semantic"),
        ("Keyword", "bm25"),
    )
    for prefix, mode in prefix_to_mode:
        if label.startswith(prefix):
            return mode
    return "hybrid"
14
+
15
+
16
  st.set_page_config(page_title="🔍 Multimodal Search The Batch")
17
  st.image("data/the-batch-logo.webp", width=300)
18
  st.title("Multimodal Assistant")
 
20
  mode = st.selectbox("🔎 Select the search mode:", ["Classical RAG", "Multimodal RAG"])
21
  query = st.text_input("📝 Enter the text query:")
22
 
23
+ # --- Classical controls ---
24
+ classical_retriever = "Semantic (embeddings)"
25
+ use_reranker = True
26
+
27
+ if mode == "Classical RAG":
28
+ classical_retriever = st.radio(
29
+ "🧩 Classical retrieval:",
30
+ ["Semantic (embeddings)", "Keyword (BM25)", "Hybrid (BM25 + Semantic)"],
31
+ horizontal=True
32
+ )
33
+ use_reranker = st.checkbox("✨ Use reranker (cross-encoder)", value=True)
34
+
35
+ # --- Preview results ---
36
+ results = []
37
  if query:
38
  if mode == "Classical RAG":
39
+ search_mode = pick_mode(classical_retriever)
40
+ results = classical_search(query, k=3, mode=search_mode)
41
  else:
42
  results = best_pair_search(query, k=3)
43
 
 
60
  st.markdown(f"[🔗 Read the full article →]({meta['source_url']})")
61
  st.markdown("---")
62
 
63
+ # --- Generate answer ---
64
+ if query and st.button("🧠 Generate a response to a query"):
65
+
66
+ if mode == "Classical RAG":
67
+ search_mode = pick_mode(classical_retriever)
68
+
69
+ chunks = classical_retrieve_chunks(
70
+ query=query,
71
+ mode=search_mode,
72
+ fetch_k=50,
73
+ rerank_k=5,
74
+ use_reranker=use_reranker
75
+ )
76
+
77
+ docs = []
78
+ for idx, c in enumerate(chunks, start=1):
79
+ meta = c.get("metadata", {})
80
+ docs.append({
81
+ "id": idx,
82
  "title": meta.get("title", ""),
83
  "description": meta.get("description", ""),
84
+ "source_url": meta.get("source_url", ""),
85
+ "content": c.get("chunk_text", ""),
86
+ "retriever": c.get("retriever", ""),
87
+ "rerank_score": c.get("rerank_score", None),
88
+ })
89
 
90
  response = generate_response(query, docs)
91
  st.markdown("### 🤖 Generated Response:")
92
+ st.success(response)
93
+
94
+ st.markdown("### 📌 Sources")
95
+ for d in docs:
96
+ st.markdown(f"**[{d['id']}] {d.get('title','')}**")
97
+ if d.get("source_url"):
98
+ st.markdown(d["source_url"])
99
+ st.write((d.get("content") or "")[:450] + "...")
100
+ if d.get("retriever"):
101
+ st.caption(f"retriever: {d['retriever']}")
102
+ if d.get("rerank_score") is not None:
103
+ st.caption(f"rerank_score: {d['rerank_score']:.4f}")
104
+ st.markdown("---")
105
+
106
+ else:
107
+ # ✅ Multimodal mode:
108
+ # Preview stays multimodal (best_pair_search),
109
+ # but the ANSWER is generated from TEXT chunks (hybrid) for reliable QA + citations.
110
+ chunks = classical_retrieve_chunks(
111
+ query=query,
112
+ mode="hybrid",
113
+ fetch_k=50,
114
+ rerank_k=5,
115
+ use_reranker=True
116
+ )
117
+
118
+ docs = []
119
+ for idx, c in enumerate(chunks, start=1):
120
+ meta = c.get("metadata", {})
121
+ docs.append({
122
+ "id": idx,
123
+ "title": meta.get("title", ""),
124
+ "description": meta.get("description", ""),
125
+ "source_url": meta.get("source_url", ""),
126
+ "content": c.get("chunk_text", ""),
127
+ "retriever": c.get("retriever", ""),
128
+ "rerank_score": c.get("rerank_score", None),
129
+ })
130
+
131
+ response = generate_response(query, docs)
132
+ st.markdown("### 🤖 Generated Response:")
133
+ st.success(response)
134
+
135
+ st.markdown("### 📌 Sources (text chunks)")
136
+ for d in docs:
137
+ st.markdown(f"**[{d['id']}] {d.get('title','')}**")
138
+ if d.get("source_url"):
139
+ st.markdown(d["source_url"])
140
+ st.write((d.get("content") or "")[:450] + "...")
141
+ if d.get("retriever"):
142
+ st.caption(f"retriever: {d['retriever']}")
143
+ if d.get("rerank_score") is not None:
144
+ st.caption(f"rerank_score: {d['rerank_score']:.4f}")
145
+ st.markdown("---")
requirements.txt CHANGED
@@ -3,7 +3,7 @@ langchain
3
  sentence-transformers
4
  transformers
5
  torch
6
- chromadb==0.4.22
7
  nltk
8
  requests
9
  tqdm
@@ -13,4 +13,5 @@ selenium
13
  webdriver-manager
14
  langchain-community
15
  emoji
16
- numpy==1.26.4
 
 
3
  sentence-transformers
4
  transformers
5
  torch
6
+ chromadb==1.3.6
7
  nltk
8
  requests
9
  tqdm
 
13
  webdriver-manager
14
  langchain-community
15
  emoji
16
+ numpy==1.26.4
17
+ rank-bm25
search/bm_25_index.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import re
4
+ from rank_bm25 import BM25Okapi
5
+
6
+ BM25_PATH = "bm_25_index/bm25.pkl"
7
+
8
def tokenize(text: str):
    """Lowercase *text* and return its alphanumeric tokens.

    A None or empty input yields an empty token list.
    """
    normalized = text.lower() if text else ""
    return re.findall(r"[a-z0-9]+", normalized)
11
+
12
def build_and_save_bm25(chunks: list[dict], path: str = BM25_PATH) -> None:
    """Build a BM25 index over *chunks* and pickle it to *path*.

    Args:
        chunks: list of dicts shaped like
            {"chunk_id": str, "chunk_text": str, "metadata": dict}.
        path: destination pickle file; parent directories are created
            as needed.

    Raises:
        ValueError: if *chunks* is empty — BM25Okapi cannot index an
            empty corpus and would otherwise fail with an opaque
            ZeroDivisionError.
    """
    if not chunks:
        raise ValueError("cannot build a BM25 index from an empty chunk list")

    # os.path.dirname() returns "" for a bare filename, and makedirs("")
    # raises — only create a directory when the path actually has one.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    corpus_tokens = [tokenize(c["chunk_text"]) for c in chunks]
    bm25 = BM25Okapi(corpus_tokens)

    # Keep the raw chunks next to the index so a search can return
    # text + metadata without a second lookup.
    payload = {
        "bm25": bm25,
        "chunks": chunks,
    }

    with open(path, "wb") as f:
        pickle.dump(payload, f)
search/bm_25_search.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import re
3
+
4
+ BM25_PATH = "bm_25_index/bm25.pkl"
5
+
6
def tokenize(text: str):
    """Split *text* into lowercase alphanumeric tokens ([] for None/empty)."""
    lowered = (text or "").lower()
    return re.findall(r"[a-z0-9]+", lowered)
8
+
9
# Process-level cache: the pickled index is immutable at query time, so
# there is no reason to re-read it from disk on every search.
_bm25_payload_cache = None


def _load_bm25_payload(path: str = BM25_PATH) -> dict:
    """Load and memoize the pickled BM25 payload ({"bm25", "chunks"})."""
    global _bm25_payload_cache
    if _bm25_payload_cache is None:
        with open(path, "rb") as f:
            _bm25_payload_cache = pickle.load(f)
    return _bm25_payload_cache


def bm25_search(query: str, k: int = 50):
    """Return the top-*k* chunks for *query* ranked by BM25 score.

    Args:
        query: free-text query; tokenized the same way as the corpus.
        k: maximum number of results to return.

    Returns:
        list[dict]: [{"chunk_id", "chunk_text", "metadata", "score"}, ...]
        sorted by descending BM25 score.
    """
    payload = _load_bm25_payload()
    bm25 = payload["bm25"]
    chunks = payload["chunks"]

    scores = bm25.get_scores(tokenize(query))
    # sorted() is stable, so equal scores keep their corpus order.
    top_idx = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]

    return [{
        "chunk_id": chunks[i]["chunk_id"],
        "chunk_text": chunks[i]["chunk_text"],
        "metadata": chunks[i]["metadata"],
        "score": float(scores[i]),
    } for i in top_idx]
search/reranker.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import CrossEncoder
2
+
3
+ # a lightweight, widely used cross-encoder model for reranking
4
+ _MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
5
+ _ce = None
6
+
7
def rerank(query: str, chunks: list[dict], top_k: int = 5) -> list[dict]:
    """Re-score *chunks* against *query* with a cross-encoder.

    chunks: [{ "chunk_id":..., "chunk_text":..., "metadata":..., ... }, ...]
    Returns the *top_k* best chunks, each annotated with "rerank_score".

    NOTE: mutates the caller's list — scores are written into the dicts
    and the list is sorted in place — matching the original contract.
    """
    global _ce
    # Lazily instantiate the model so importing this module stays cheap.
    if _ce is None:
        _ce = CrossEncoder(_MODEL_NAME)

    query_passage_pairs = [
        (query, chunk.get("chunk_text", "")) for chunk in chunks
    ]
    predictions = _ce.predict(query_passage_pairs)

    for chunk, score in zip(chunks, predictions):
        chunk["rerank_score"] = float(score)

    chunks.sort(key=lambda chunk: chunk.get("rerank_score", 0.0), reverse=True)
    return chunks[:top_k]
search/search_classical.py CHANGED
@@ -1,18 +1,85 @@
1
  from db.text_db import init_chroma
2
  from embeddings.text_embedder import get_text_embedding
 
 
 
3
 
4
 
5
- def classical_search(query, k=5):
6
- db = init_chroma()
7
- emb = get_text_embedding(query)
8
- results = db.similarity_search_by_vector(emb, k=k)
9
- articles = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  seen = set()
 
 
 
 
 
 
 
11
 
12
- for r in results:
13
- meta = r.metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  aid = meta.get("source_url") or meta.get("title")
15
- if aid not in seen:
16
  seen.add(aid)
17
  articles.append({
18
  "title": meta.get("title"),
@@ -22,5 +89,4 @@ def classical_search(query, k=5):
22
  "content": meta.get("content"),
23
  "source_url": meta.get("source_url"),
24
  })
25
-
26
- return articles
 
1
  from db.text_db import init_chroma
2
  from embeddings.text_embedder import get_text_embedding
3
+ from search.bm_25_search import bm25_search
4
+ from search.reranker import rerank
5
+ import re
6
 
7
 
8
def classical_retrieve_chunks(
    query: str,
    mode: str = "semantic",
    fetch_k: int = 50,
    rerank_k: int = 5,
    use_reranker: bool = True,
    year_filter: int | None = None
) -> list[dict]:
    """Retrieve text chunks for *query* via semantic, BM25, or hybrid search.

    Args:
        query: user query string.
        mode: "semantic" (Chroma embeddings), "bm25" (keyword index), or
            "hybrid" (union of both, deduplicated).
        fetch_k: number of candidates pulled from each retriever.
        rerank_k: number of chunks returned after (optional) reranking.
        use_reranker: when True, re-score candidates with the cross-encoder.
        year_filter: reserved for future metadata filtering.

    Returns:
        [{chunk_id, chunk_text, metadata, retriever, score?, rerank_score?}, ...]

    Raises:
        NotImplementedError: if *year_filter* is given — it was previously
            accepted but silently ignored; fail loudly instead.
    """
    if year_filter is not None:
        raise NotImplementedError("year_filter is not implemented yet")

    chunks = []

    if mode in ("semantic", "hybrid"):
        db = init_chroma()
        emb = get_text_embedding(query)
        dense_res = db.similarity_search_by_vector(emb, k=fetch_k)

        for i, r in enumerate(dense_res):
            chunks.append({
                "chunk_id": f"semantic_{i}",
                "chunk_text": r.page_content,
                "metadata": r.metadata,
                "retriever": "semantic",
                "score": None,  # Chroma's raw similarity is not surfaced here
            })

    if mode in ("bm25", "hybrid"):
        bm25_res = bm25_search(query, k=fetch_k)
        for r in bm25_res:
            r["retriever"] = "bm25"
            chunks.append(r)

    # Deduplicate near-identical chunks: the first 200 characters of the
    # text serve as a cheap fingerprint (semantic and BM25 hits of the
    # same chunk carry different ids, so text is the common key).
    seen = set()
    unique_chunks = []
    for c in chunks:
        key = c["chunk_text"][:200]
        if key not in seen:
            seen.add(key)
            unique_chunks.append(c)
    chunks = unique_chunks

    if use_reranker and chunks:
        chunks = rerank(query, chunks, top_k=rerank_k)
    else:
        chunks = chunks[:rerank_k]

    return chunks
61
+
62
+
63
+ def classical_search(query, k=5, mode="semantic"):
64
+ """
65
+ Article-level results for the UI (same shape as before).
66
+ """
67
+ chunks = classical_retrieve_chunks(
68
+ query=query,
69
+ mode=mode,
70
+ fetch_k=max(50, k * 20),
71
+ rerank_k=max(10, k * 5),
72
+ use_reranker=False, # no reranker needed for the article list
73
+ year_filter=None
74
+ )
75
+
76
+ # deduplicate by article
77
+ articles = []
78
+ seen = set()
79
+ for c in chunks:
80
+ meta = c["metadata"]
81
  aid = meta.get("source_url") or meta.get("title")
82
+ if aid and aid not in seen:
83
  seen.add(aid)
84
  articles.append({
85
  "title": meta.get("title"),

89
  "content": meta.get("content"),
90
  "source_url": meta.get("source_url"),
91
  })
92
+ return articles[:k]