Spaces:
Sleeping
Sleeping
Pipeline added
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- pipeline/__init__.py +1 -0
- pipeline/__pycache__/__init__.cpython-313.pyc +0 -0
- pipeline/chunking/__init__.py +10 -0
- pipeline/chunking/__pycache__/__init__.cpython-313.pyc +0 -0
- pipeline/chunking/__pycache__/chunk_benchmark.cpython-313.pyc +0 -0
- pipeline/chunking/__pycache__/fixed_chunker.cpython-313.pyc +0 -0
- pipeline/chunking/__pycache__/semantic_chunker.cpython-313.pyc +0 -0
- pipeline/chunking/__pycache__/splitter_base.cpython-313.pyc +0 -0
- pipeline/chunking/chunk_benchmark.py +24 -0
- pipeline/chunking/fixed_chunker.py +20 -0
- pipeline/chunking/semantic_chunker.py +103 -0
- pipeline/chunking/splitter_base.py +7 -0
- pipeline/embeddings/__init__.py +1 -0
- pipeline/embeddings/__pycache__/__init__.cpython-313.pyc +0 -0
- pipeline/embeddings/__pycache__/embedder_base.cpython-313.pyc +0 -0
- pipeline/embeddings/__pycache__/sentence_transformer_embed.cpython-313.pyc +0 -0
- pipeline/embeddings/embedder_base.py +18 -0
- pipeline/embeddings/sentence_transformer_embed.py +10 -0
- pipeline/ingest/__init__.py +0 -0
- pipeline/ingest/__pycache__/__init__.cpython-313.pyc +0 -0
- pipeline/ingest/__pycache__/docx_parser.cpython-313.pyc +0 -0
- pipeline/ingest/__pycache__/html_parser.cpython-313.pyc +0 -0
- pipeline/ingest/__pycache__/parser_base.cpython-313.pyc +0 -0
- pipeline/ingest/__pycache__/pdf_parser.cpython-313.pyc +0 -0
- pipeline/ingest/__pycache__/txt_parser.cpython-313.pyc +0 -0
- pipeline/ingest/docx_parser.py +18 -0
- pipeline/ingest/html_parser.py +20 -0
- pipeline/ingest/parser_base.py +6 -0
- pipeline/ingest/pdf_parser.py +26 -0
- pipeline/ingest/txt_parser.py +15 -0
- pipeline/monitoring/__init__.py +0 -0
- pipeline/monitoring/drift_detection.py +0 -0
- pipeline/monitoring/feedback.py +0 -0
- pipeline/rag/__init__.py +0 -0
- pipeline/rag/__pycache__/__init__.cpython-313.pyc +0 -0
- pipeline/rag/__pycache__/prompt_templates.cpython-313.pyc +0 -0
- pipeline/rag/__pycache__/retrieval_engine.cpython-313.pyc +0 -0
- pipeline/rag/prompt_templates.py +12 -0
- pipeline/rag/retrieval_engine.py +58 -0
- pipeline/vector_store/__init__.py +1 -0
- pipeline/vector_store/__pycache__/__init__.cpython-313.pyc +0 -0
- pipeline/vector_store/__pycache__/bm25_keyword_store.cpython-313.pyc +0 -0
- pipeline/vector_store/__pycache__/faiss_store.cpython-313.pyc +0 -0
- pipeline/vector_store/__pycache__/hybrid_retriever.cpython-313.pyc +0 -0
- pipeline/vector_store/__pycache__/store_base.cpython-313.pyc +0 -0
- pipeline/vector_store/__pycache__/store_registry.cpython-313.pyc +0 -0
- pipeline/vector_store/bm25_keyword_store.py +21 -0
- pipeline/vector_store/faiss_store.py +59 -0
- pipeline/vector_store/hybrid_retriever.py +32 -0
- pipeline/vector_store/store_base.py +6 -0
pipeline/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
pipeline/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (153 Bytes). View file
|
|
|
pipeline/chunking/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .fixed_chunker import FixedChunker
|
| 2 |
+
from .semantic_chunker import SemanticChunker
|
| 3 |
+
|
| 4 |
+
def chunk_text(text: str, chunk_size: int, overlap: int, method="fixed"):
    """Split *text* using the named chunking strategy.

    Args:
        text: source text to split.
        chunk_size: target chunk length in characters.
        overlap: characters shared between consecutive chunks.
        method: "fixed" or "semantic".

    Returns:
        The list of chunk dicts produced by the selected chunker.

    Raises:
        ValueError: if *method* does not name a known strategy.
    """
    for name, chunker_cls in (("fixed", FixedChunker), ("semantic", SemanticChunker)):
        if method == name:
            return chunker_cls().chunk(text, chunk_size, overlap)
    raise ValueError("Unknown chunking method: " + str(method))
|
pipeline/chunking/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (814 Bytes). View file
|
|
|
pipeline/chunking/__pycache__/chunk_benchmark.cpython-313.pyc
ADDED
|
Binary file (2.25 kB). View file
|
|
|
pipeline/chunking/__pycache__/fixed_chunker.cpython-313.pyc
ADDED
|
Binary file (1.22 kB). View file
|
|
|
pipeline/chunking/__pycache__/semantic_chunker.cpython-313.pyc
ADDED
|
Binary file (5.51 kB). View file
|
|
|
pipeline/chunking/__pycache__/splitter_base.cpython-313.pyc
ADDED
|
Binary file (791 Bytes). View file
|
|
|
pipeline/chunking/chunk_benchmark.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
from . import chunk_text
|
| 3 |
+
|
| 4 |
+
def benchmark_chunker(text, chunk_size, overlap, method):
    """Run one chunker over *text* and print timing and size statistics.

    Fix: the original computed sum(lens)/len(lens) and min/max unconditionally,
    which raised ZeroDivisionError/ValueError whenever the chunker returned no
    chunks; size statistics are now only printed for a non-empty result.
    """
    print(f"Benchmarking {method} chunker...")
    t0 = time.time()
    chunks = chunk_text(text, chunk_size, overlap, method)
    t1 = time.time()
    print(f"Total Chunks: {len(chunks)}")
    if chunks:
        lens = [len(c["text"]) for c in chunks]
        print(f"Avg Chunk Size: {sum(lens)/len(lens):.1f}")
        print(f"Min/Max Chunk Size: {min(lens)}/{max(lens)}")
    print(f"Time Taken: {t1-t0:.4f}s")
    print("Sample metadata:", chunks[0]["meta"] if chunks else None)
    print("--- Sample chunk ---")
    if chunks:
        print(chunks[0]["text"][:200])
    print("-" * 40)

if __name__ == "__main__":
    # Synthetic document: 100 "pages" of 20 repeated sentences each.
    text = ("This is a sample paragraph. " * 20 + "\n\n") * 100
    benchmark_chunker(text, chunk_size=300, overlap=50, method="fixed")
    benchmark_chunker(text, chunk_size=300, overlap=0, method="semantic")
|
pipeline/chunking/fixed_chunker.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Dict
|
| 2 |
+
from .splitter_base import SplitterBase
|
| 3 |
+
|
| 4 |
+
class FixedChunker(SplitterBase):
    """Splits text into fixed-size character windows with overlap."""

    def chunk(self, text: str, chunk_size: int, overlap: int) -> List[Dict]:
        """Return chunk dicts with character offsets "start"/"end".

        Fix: the original advanced the cursor by ``chunk_size - overlap``
        without validation, looping forever when overlap >= chunk_size or
        chunk_size <= 0. Invalid arguments now raise immediately.

        Raises:
            ValueError: if chunk_size <= 0, or overlap is negative or
                >= chunk_size.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if overlap < 0 or overlap >= chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
        chunks = []
        idx = 0
        while idx < len(text):
            end = min(idx + chunk_size, len(text))
            chunks.append({
                "text": text[idx:end],
                "start": idx,
                "end": end,
                "meta": {"source": "fixed"}
            })
            if end == len(text):
                break
            idx += chunk_size - overlap
        return chunks
|
pipeline/chunking/semantic_chunker.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import List, Dict
|
| 3 |
+
from .splitter_base import SplitterBase
|
| 4 |
+
|
| 5 |
+
HEADING_PATTERNS = [
|
| 6 |
+
r"^(CHAPTER|Chapter|Section)\s+\d+",
|
| 7 |
+
r"^[A-Z][A-Z ]{5,}$",
|
| 8 |
+
r"^(\d+\.){1,3}\s+\w+",
|
| 9 |
+
]
|
| 10 |
+
PAGE_PATTERN = re.compile(r"\b[Pp]age\s+(\d+)\b|\f")
|
| 11 |
+
FIGURE_PATTERN = re.compile(r"^(Figure|Table|Image)[ .:]+\d+[ .:]+", re.IGNORECASE)
|
| 12 |
+
|
| 13 |
+
def find_headings(lines):
|
| 14 |
+
headings = []
|
| 15 |
+
for i, line in enumerate(lines):
|
| 16 |
+
for pat in HEADING_PATTERNS:
|
| 17 |
+
if re.match(pat, line.strip()):
|
| 18 |
+
headings.append((i, line.strip()))
|
| 19 |
+
break
|
| 20 |
+
return headings
|
| 21 |
+
|
| 22 |
+
def split_by_size(text, chunk_size, overlap):
    """Slice *text* into (start, end, substring) windows of at most
    chunk_size characters, stepping by chunk_size - overlap; windows that
    are only whitespace are dropped.

    Fix: the original stepped by ``chunk_size - overlap`` unchecked, so
    overlap >= chunk_size (or chunk_size <= 0) looped forever. Invalid
    arguments now raise.

    Raises:
        ValueError: if chunk_size <= 0, or overlap is negative or
            >= chunk_size.
    """
    if chunk_size <= 0 or overlap < 0 or overlap >= chunk_size:
        raise ValueError("require chunk_size > 0 and 0 <= overlap < chunk_size")
    subsections = []
    i = 0
    while i < len(text):
        end_i = min(i + chunk_size, len(text))
        chunk = text[i:end_i]
        if chunk.strip():
            subsections.append((i, end_i, chunk))
        if end_i == len(text):
            break
        i += chunk_size - overlap
    return subsections
|
| 35 |
+
|
| 36 |
+
class SemanticChunker(SplitterBase):
    """Structure-aware chunker: tracks headings and page markers, emits
    figure/table captions as standalone chunks, and size-limits each
    paragraph via split_by_size().

    NOTE(review): "start"/"end" here are LINE indices, while FixedChunker
    uses character offsets for the same keys — confirm consumers expect
    this mismatch.
    """

    def chunk(self, text: str, chunk_size: int, overlap: int) -> List[Dict]:
        """Split *text* into chunk dicts with section/page metadata."""
        lines = text.splitlines()
        cur_section = None   # most recent heading text; None before any heading
        cur_page = 1         # page number in effect while scanning
        chunks = []

        # First pass: map every line index to the page number in effect there.
        line_pages = {}
        for i, line in enumerate(lines):
            m = PAGE_PATTERN.search(line)
            if m and m.group(1):   # group(1) is None for bare form-feed matches
                cur_page = int(m.group(1))
            line_pages[i] = cur_page

        # Second pass: walk the lines and emit chunks.
        i = 0
        while i < len(lines):
            line = lines[i]

            # Headings update the current section but are not emitted.
            if any(re.match(pat, line.strip()) for pat in HEADING_PATTERNS):
                cur_section = line.strip()
                i += 1
                continue

            # Figure/table captions become standalone single-line chunks.
            if FIGURE_PATTERN.match(line):
                chunks.append({
                    "text": line.strip(),
                    "start": i,
                    "end": i + 1,
                    "meta": {
                        "section": cur_section or "NO_SECTION",
                        "page": line_pages.get(i, 1),
                        "type": "figure"
                    }
                })
                i += 1
                continue

            # Page-marker lines are dropped from the output entirely.
            if PAGE_PATTERN.search(line):
                i += 1
                continue

            # Accumulate a paragraph: consecutive non-blank lines that are
            # not headings, captions, or page markers.
            para_lines = []
            para_start = i
            while (i < len(lines) and lines[i].strip() and
                    not any(re.match(pat, lines[i].strip()) for pat in HEADING_PATTERNS) and
                    not FIGURE_PATTERN.match(lines[i]) and
                    not PAGE_PATTERN.search(lines[i])):
                para_lines.append(lines[i])
                i += 1
            para_text = "\n".join(para_lines).strip()

            if para_text:
                # Size-limit the paragraph; every sub-chunk shares the whole
                # paragraph's line range and starting page.
                subchunks = split_by_size(para_text, chunk_size, overlap)
                for substart, subend, chunk_str in subchunks:
                    chunks.append({
                        "text": chunk_str,
                        "start": para_start,
                        "end": i,
                        "meta": {
                            "section": cur_section or "NO_SECTION",
                            "page": line_pages.get(para_start, 1),
                            "source": "semantic"
                        }
                    })

            # Skip blank separator lines before the next paragraph.
            while i < len(lines) and not lines[i].strip():
                i += 1
        return chunks
|
pipeline/chunking/splitter_base.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
from typing import Dict, List

class SplitterBase(ABC):
    """Common interface implemented by all text chunking strategies."""

    @abstractmethod
    def chunk(self, text: str, chunk_size: int, overlap: int) -> List[Dict]:
        """Split *text* into a list of chunk dicts."""
        ...
|
pipeline/embeddings/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .embedder_base import embed_chunks
|
pipeline/embeddings/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (217 Bytes). View file
|
|
|
pipeline/embeddings/__pycache__/embedder_base.cpython-313.pyc
ADDED
|
Binary file (1.35 kB). View file
|
|
|
pipeline/embeddings/__pycache__/sentence_transformer_embed.cpython-313.pyc
ADDED
|
Binary file (836 Bytes). View file
|
|
|
pipeline/embeddings/embedder_base.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from . import sentence_transformer_embed
|
| 2 |
+
|
| 3 |
+
# Registry mapping backend name -> module exposing embed(texts, model_name).
EMBEDDING_BACKENDS = {
    "sentence_transformers": sentence_transformer_embed
}

def embed_chunks(chunks, backend: str, model_name: str, version: str | None = None):
    """Embed chunks (dicts carrying "text"/"meta", or plain strings).

    Returns one {"embedding", "meta", "version"} dict per chunk; *version*
    defaults to "<backend>:<model_name>" for provenance tracking.

    Raises:
        ValueError: if *backend* is not registered in EMBEDDING_BACKENDS.
    """
    mod = EMBEDDING_BACKENDS.get(backend)
    if not mod:
        raise ValueError(f"Unknown backend: {backend}")
    # Accept either chunk dicts or bare strings.
    texts = [c["text"] if isinstance(c, dict) else c for c in chunks]
    metas = [c.get("meta", {}) if isinstance(c, dict) else {} for c in chunks]
    embeddings = mod.embed(texts, model_name)
    version = version or f"{backend}:{model_name}"
    return [
        {"embedding": emb, "meta": meta, "version": version}
        for emb, meta in zip(embeddings, metas)
    ]
|
pipeline/embeddings/sentence_transformer_embed.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer

# Fix: the original constructed a SentenceTransformer inside embed(), which
# reloads model weights from disk on every call. Loaded models are now cached
# by name for the life of the process.
_MODEL_CACHE = {}

def _get_model(model_name):
    """Return a cached SentenceTransformer, loading it on first use."""
    model = _MODEL_CACHE.get(model_name)
    if model is None:
        model = _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return model

def embed(texts, model_name="all-MiniLM-L6-v2"):
    """Embed a list of strings; returns a list of list[float] vectors."""
    model = _get_model(model_name)
    return model.encode(texts, show_progress_bar=False, convert_to_numpy=True).tolist()

def embed_chunks(chunks, model_name="all-MiniLM-L6-v2"):
    """Embed chunk dicts using their "text" field."""
    texts = [chunk['text'] for chunk in chunks]
    return embed(texts, model_name=model_name)
|
pipeline/ingest/__init__.py
ADDED
|
File without changes
|
pipeline/ingest/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (160 Bytes). View file
|
|
|
pipeline/ingest/__pycache__/docx_parser.cpython-313.pyc
ADDED
|
Binary file (1.33 kB). View file
|
|
|
pipeline/ingest/__pycache__/html_parser.cpython-313.pyc
ADDED
|
Binary file (1.5 kB). View file
|
|
|
pipeline/ingest/__pycache__/parser_base.cpython-313.pyc
ADDED
|
Binary file (710 Bytes). View file
|
|
|
pipeline/ingest/__pycache__/pdf_parser.cpython-313.pyc
ADDED
|
Binary file (1.48 kB). View file
|
|
|
pipeline/ingest/__pycache__/txt_parser.cpython-313.pyc
ADDED
|
Binary file (1.18 kB). View file
|
|
|
pipeline/ingest/docx_parser.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from docx import Document
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict

class DOCXParser(ParserBase):
    """Extracts plain text and basic metadata from Word (.docx) files."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return (full_text, metadata) for the document at *filepath*."""
        document = Document(filepath)
        # One output line per paragraph, in document order.
        text = "\n".join(paragraph.text for paragraph in document.paragraphs)
        metadata = {
            "filetype": "docx",
            "filename": str(Path(filepath).name),
            "num_paragraphs": len(document.paragraphs)
        }
        return text, metadata
|
pipeline/ingest/html_parser.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict

class HTMLParser(ParserBase):
    """Extracts visible text and basic metadata from HTML files."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return (visible_text, metadata) for the HTML file at *filepath*."""
        html = Path(filepath).read_text(encoding="utf-8")
        soup = BeautifulSoup(html, "html.parser")
        # Remove non-visible content before extracting text.
        for hidden in soup(["script", "style"]):
            hidden.decompose()
        text = soup.get_text(separator="\n", strip=True)
        return text, {
            "filetype": "html",
            "filename": str(Path(filepath).name),
            "length": len(text)
        }
|
pipeline/ingest/parser_base.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod

class ParserBase(ABC):
    """Abstract interface implemented by every document parser."""

    @abstractmethod
    def extract_text_and_metadata(self, filepath: str) -> tuple[str, dict]:
        """Return (text, metadata) extracted from the file at *filepath*.

        Fix: the original annotation ``-> (str, dict)`` is a runtime tuple of
        classes, not a valid type annotation; ``tuple[str, dict]`` is the
        correct form.
        """
        pass
|
pipeline/ingest/pdf_parser.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict

class PDFParser(ParserBase):
    """Extracts text and per-page metadata from PDFs via PyMuPDF (fitz)."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return (full_text, metadata); a newline follows every page's text.

        Fixes: the document handle was never closed (resource leak) — now
        managed with a context manager — and per-page string concatenation
        (quadratic ``text +=``) is replaced by a single join.
        """
        with fitz.open(filepath) as doc:
            page_texts = []
            pages_metadata = []
            for i, page in enumerate(doc):
                page_text = page.get_text()
                page_texts.append(page_text)
                pages_metadata.append({
                    "page_num": i + 1,
                    "length": len(page_text),
                    "first_100_chars": page_text[:100],
                })
            metadata = {
                "filetype": "pdf",
                "n_pages": doc.page_count,
                "pages": pages_metadata,
                "filename": str(Path(filepath).name)
            }
        # Matches the original output: every page text followed by "\n".
        text = "".join(t + "\n" for t in page_texts)
        return text, metadata
|
pipeline/ingest/txt_parser.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict

class TXTParser(ParserBase):
    """Reads plain UTF-8 text files verbatim."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return (file_contents, metadata) for the text file at *filepath*."""
        text = Path(filepath).read_text(encoding="utf-8")
        metadata = {
            "filetype": "txt",
            "filename": str(Path(filepath).name),
            "length": len(text)
        }
        return text, metadata
|
| 15 |
+
|
pipeline/monitoring/__init__.py
ADDED
|
File without changes
|
pipeline/monitoring/drift_detection.py
ADDED
|
File without changes
|
pipeline/monitoring/feedback.py
ADDED
|
File without changes
|
pipeline/rag/__init__.py
ADDED
|
File without changes
|
pipeline/rag/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (157 Bytes). View file
|
|
|
pipeline/rag/__pycache__/prompt_templates.cpython-313.pyc
ADDED
|
Binary file (488 Bytes). View file
|
|
|
pipeline/rag/__pycache__/retrieval_engine.cpython-313.pyc
ADDED
|
Binary file (2.2 kB). View file
|
|
|
pipeline/rag/prompt_templates.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
DEFAULT_PROMPT_TEMPLATE = """
|
| 2 |
+
You are an AI assistant helping answer book/document-based questions.
|
| 3 |
+
|
| 4 |
+
Use ONLY the provided context to answer the user's question. If the answer is not found in the context, say "I cannot answer based on the provided information."
|
| 5 |
+
|
| 6 |
+
Context:
|
| 7 |
+
{context}
|
| 8 |
+
|
| 9 |
+
Question: {question}
|
| 10 |
+
|
| 11 |
+
Answer:
|
| 12 |
+
"""
|
pipeline/rag/retrieval_engine.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pipeline.embeddings import embed_chunks
from pipeline.vector_store import get_store
from llm import get_llm
from pipeline.rag.prompt_templates import DEFAULT_PROMPT_TEMPLATE

def answer_question(
    question: str,
    embed_model: str = "all-MiniLM-L6-v2",
    store_type: str = "faiss",
    store_kwargs: dict | None = None,
    llm_name: str = "mistralai/Mistral-7B-Instruct-v0.2",
    prompt_template: str | None = None,
    top_k: int = 5,
    rerank_fn=None,
):
    """End-to-end RAG query: embed the question, retrieve top-k chunks from
    the configured vector store, build a prompt, and generate an answer.

    Args:
        question: user query to answer.
        embed_model: sentence-transformers model used to embed the question.
        store_type: store registry key ("faiss", "hybrid", ...).
        store_kwargs: constructor kwargs for the store; defaults to
            {"dim": 384} (the all-MiniLM-L6-v2 embedding size).
        llm_name: model handed to get_llm().
        prompt_template: format string with {context}/{question}; defaults to
            DEFAULT_PROMPT_TEMPLATE.
        top_k: number of chunks to retrieve (and keep after reranking).
        rerank_fn: optional callable (question, results) -> reordered results.

    Returns:
        Dict with "answer", "chunks", "question", "context", "prompt".
    """
    # Wrap the question as a chunk dict so embed_chunks accepts it.
    q_chunk = {"text": question}
    q_embeds = embed_chunks([q_chunk], backend="sentence_transformers", model_name=embed_model)
    # embed_chunks returns enriched dicts; tolerate raw vectors as well.
    if isinstance(q_embeds[0], dict):
        q_embed = q_embeds[0]["embedding"]
    else:
        q_embed = q_embeds[0]

    if store_kwargs is None:
        store_kwargs = {"dim": 384}
    vector_store = get_store(store_type, **store_kwargs)
    # Restore a previously persisted index when the store supports it.
    if hasattr(vector_store, "load"):
        vector_store.load()
    # Hybrid stores need the raw query text alongside the embedding.
    if store_type == "hybrid":
        results = vector_store.search(q_embed, question, k=top_k)
    else:
        results = vector_store.search(q_embed, k=top_k)
    print("answer_question: top-k results:", [r["text"][:60] for r in results])  # debug output

    if rerank_fn:
        results = rerank_fn(question, results)[:top_k]

    context = "\n\n".join([r["text"] for r in results])
    if prompt_template is None:
        prompt_template = DEFAULT_PROMPT_TEMPLATE
    prompt = prompt_template.format(context=context, question=question)

    llm = get_llm(llm_name)
    answer = llm.generate(prompt)

    return {
        "answer": answer,
        "chunks": results,
        "question": question,
        "context": context,
        "prompt": prompt
    }
|
pipeline/vector_store/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .store_registry import get_store
|
pipeline/vector_store/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (217 Bytes). View file
|
|
|
pipeline/vector_store/__pycache__/bm25_keyword_store.cpython-313.pyc
ADDED
|
Binary file (2.27 kB). View file
|
|
|
pipeline/vector_store/__pycache__/faiss_store.cpython-313.pyc
ADDED
|
Binary file (4.09 kB). View file
|
|
|
pipeline/vector_store/__pycache__/hybrid_retriever.cpython-313.pyc
ADDED
|
Binary file (2.62 kB). View file
|
|
|
pipeline/vector_store/__pycache__/store_base.cpython-313.pyc
ADDED
|
Binary file (787 Bytes). View file
|
|
|
pipeline/vector_store/__pycache__/store_registry.cpython-313.pyc
ADDED
|
Binary file (901 Bytes). View file
|
|
|
pipeline/vector_store/bm25_keyword_store.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rank_bm25 import BM25Okapi
from .store_base import VectorStoreBase

class BM25KeywordStore(VectorStoreBase):
    """Sparse keyword retrieval over chunk texts using BM25."""

    def __init__(self):
        self.corpus = []      # chunk texts, in insertion order
        self.bm25 = None      # rebuilt after every add_documents call
        self.metadatas = []   # parallel to corpus

    def add_documents(self, chunks, embeddings=None, metadatas=None):
        """Index chunk texts; *embeddings* are ignored (BM25 is keyword-only)."""
        self.corpus.extend([chunk["text"] for chunk in chunks])
        self.metadatas.extend(metadatas or [{} for _ in chunks])
        # NOTE: BM25Okapi has no incremental update, so the whole index is
        # rebuilt on every call (cost grows with total corpus size).
        self.bm25 = BM25Okapi([doc.split(" ") for doc in self.corpus])

    def search(self, query_text, k=5, method=None):
        """Return up to k best chunks as {text, meta, score} dicts.

        Fix: the original raised AttributeError when called before any
        add_documents() (self.bm25 still None); an empty store now returns [].
        """
        if self.bm25 is None:
            return []
        scores = self.bm25.get_scores(query_text.split(" "))
        best_idx = sorted(range(len(scores)), key=lambda i: -scores[i])[:k]
        return [
            {"text": self.corpus[i], "meta": self.metadatas[i], "score": scores[i]}
            for i in best_idx
        ]
|
pipeline/vector_store/faiss_store.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import faiss
import numpy as np
import pickle
from .store_base import VectorStoreBase

class VectorStoreFAISS(VectorStoreBase):
    """Dense vector store backed by a flat (exact) L2 FAISS index.

    Texts, embeddings and metadata live in parallel Python lists and are
    persisted with pickle next to the FAISS index file.
    """

    def __init__(self, dim, index_path=None, metadata_path=None):
        # dim: embedding dimensionality expected by the index.
        self.dim = dim
        self.index = faiss.IndexFlatL2(dim)  # exact L2 search, no training step
        self.embeddings = []  # parallel to texts/metadatas
        self.metadatas = []
        self.texts = []
        self.index_path = index_path or "faiss.index"
        self.metadata_path = metadata_path or "faiss.meta.pkl"

    def add_documents(self, chunks, embeddings, metadatas):
        """Append chunks + embeddings to the index and persist immediately."""
        arr = np.array(embeddings).astype('float32')  # FAISS requires float32
        self.index.add(arr)
        self.embeddings.extend(embeddings)
        self.texts.extend([chunk["text"] for chunk in chunks])
        self.metadatas.extend(metadatas)
        # NOTE: full save() on every call — disk write cost grows with store size.
        self.save()

    def search(self, query_embed, k=5, method=None, max_distance=0.8):
        """Return up to k nearest chunks whose L2 distance <= max_distance.

        NOTE(review): 0.8 as an L2 cutoff presumably assumes normalized
        embeddings — confirm against the embedding backend.
        """
        query = np.array(query_embed).reshape(1, -1).astype('float32')
        D, I = self.index.search(query, k)
        results = []
        for score, idx in zip(D[0], I[0]):
            print(f"Chunk idx: {idx}, L2 distance: {score:.4f}")  # debug output
            # FAISS pads missing neighbours with index -1; also guard against
            # an index/metadata mismatch after a partial load.
            if idx < 0 or idx >= len(self.texts):
                continue
            if score <= max_distance:
                results.append({
                    "text": self.texts[idx],
                    "embedding": self.embeddings[idx],
                    "meta": self.metadatas[idx],
                    "distance": score
                })
        return results


    def save(self):
        """Persist the FAISS index and the parallel metadata lists."""
        faiss.write_index(self.index, self.index_path)
        with open(self.metadata_path, "wb") as f:
            pickle.dump({
                "texts": self.texts,
                "embeddings": self.embeddings,
                "metadatas": self.metadatas
            }, f)

    def load(self):
        """Restore the index + metadata written by save().

        SECURITY: pickle.load executes arbitrary code during deserialization —
        only load files this application wrote itself.
        """
        self.index = faiss.read_index(self.index_path)
        with open(self.metadata_path, "rb") as f:
            data = pickle.load(f)
        self.texts = data["texts"]
        self.embeddings = data["embeddings"]
        self.metadatas = data["metadatas"]

# Backwards-compatible alias.
FaissStore = VectorStoreFAISS
|
pipeline/vector_store/hybrid_retriever.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# NOTE(review): VectorStoreFAISS and BM25KeywordStore are imported but unused
# here — the constructor takes already-built store instances. Possibly kept
# for re-export; confirm before removing.
from .faiss_store import VectorStoreFAISS
from .bm25_keyword_store import BM25KeywordStore
from .store_base import VectorStoreBase

class HybridRetriever(VectorStoreBase):
    """Fuses dense (FAISS) and sparse (BM25) retrieval by rank.

    alpha weights the dense ranking; (1 - alpha) weights the keyword ranking.
    """

    def __init__(self, faiss_store, bm25_store, alpha=0.5):
        self.faiss_store = faiss_store
        self.bm25_store = bm25_store
        self.alpha = alpha  # in [0, 1]; 1.0 = dense only, 0.0 = keyword only

    def add_documents(self, chunks, embeddings, metadatas):
        """Index the chunks in both underlying stores."""
        self.faiss_store.add_documents(chunks, embeddings, metadatas)
        self.bm25_store.add_documents(chunks, None, metadatas)  # BM25 ignores embeddings

    def search(self, query_embed, query_text, k=5, method=None):
        """Return the top-k chunks by alpha-weighted reciprocal rank.

        A chunk absent from one store's top-k gets rank k (i.e. zero credit)
        from that store. NOTE(review): ties are broken by set iteration
        order, which is not deterministic across runs.
        """
        faiss_hits = self.faiss_store.search(query_embed, k)
        bm25_hits = self.bm25_store.search(query_text, k)

        # Simple hybrid: combine and sort by average rank/score (tune as desired)
        faiss_ids = {hit["text"]: i for i, hit in enumerate(faiss_hits)}
        bm25_ids = {hit["text"]: i for i, hit in enumerate(bm25_hits)}
        all_texts = set(faiss_ids) | set(bm25_ids)

        hybrid = []
        for text in all_texts:
            f_rank = faiss_ids.get(text, k)
            b_rank = bm25_ids.get(text, k)
            # Higher is better: rank 0 contributes k, rank k-1 contributes 1.
            joint_score = self.alpha * (k - f_rank) + (1 - self.alpha) * (k - b_rank)
            # Prefer faiss meta but fallback to bm25
            meta = faiss_hits[faiss_ids[text]]["meta"] if text in faiss_ids else bm25_hits[bm25_ids[text]]["meta"]
            hybrid.append({"text": text, "meta": meta, "score": joint_score})
        return sorted(hybrid, key=lambda x: -x["score"])[:k]
|
pipeline/vector_store/store_base.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class VectorStoreBase:
    """Minimal duck-typed interface shared by all vector/keyword stores."""

    def add_documents(self, chunks, embeddings, metadatas):
        """Index chunks with their embeddings and metadata (subclasses override)."""
        raise NotImplementedError

    def search(self, query_embed=None, query_text=None, k=5, method=None):
        """Return the top-k hits for a query (subclasses override)."""
        raise NotImplementedError
|