diff --git a/pipeline/__init__.py b/pipeline/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d3f5a12faa99758192ecc4ed3fc22c9249232e86 --- /dev/null +++ b/pipeline/__init__.py @@ -0,0 +1 @@ + diff --git a/pipeline/__pycache__/__init__.cpython-313.pyc b/pipeline/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..200a80cd1777cecc8d0fc17d91473e04df42c9a9 Binary files /dev/null and b/pipeline/__pycache__/__init__.cpython-313.pyc differ diff --git a/pipeline/chunking/__init__.py b/pipeline/chunking/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..92f7bfc1812306b65644d4de1cce2eb3085265e6 --- /dev/null +++ b/pipeline/chunking/__init__.py @@ -0,0 +1,10 @@ +from .fixed_chunker import FixedChunker +from .semantic_chunker import SemanticChunker + +def chunk_text(text: str, chunk_size: int, overlap: int, method="fixed"): + if method == "fixed": + return FixedChunker().chunk(text, chunk_size, overlap) + elif method == "semantic": + return SemanticChunker().chunk(text, chunk_size, overlap) + else: + raise ValueError("Unknown chunking method: " + str(method)) \ No newline at end of file diff --git a/pipeline/chunking/__pycache__/__init__.cpython-313.pyc b/pipeline/chunking/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc868a007d4e49666c869bd8efe92bf77bcead4d Binary files /dev/null and b/pipeline/chunking/__pycache__/__init__.cpython-313.pyc differ diff --git a/pipeline/chunking/__pycache__/chunk_benchmark.cpython-313.pyc b/pipeline/chunking/__pycache__/chunk_benchmark.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04548f8d9cd369480a5a083b2bb27a273cb98851 Binary files /dev/null and b/pipeline/chunking/__pycache__/chunk_benchmark.cpython-313.pyc differ diff --git a/pipeline/chunking/__pycache__/fixed_chunker.cpython-313.pyc 
def benchmark_chunker(text, chunk_size, overlap, method):
    """Run one chunking strategy over *text* and print timing/size statistics.

    Args:
        text: Input document to split.
        chunk_size: Target chunk length in characters.
        overlap: Character overlap between consecutive chunks.
        method: Chunker name understood by chunk_text ("fixed" or "semantic").
    """
    print(f"Benchmarking {method} chunker...")
    # perf_counter is monotonic and higher-resolution than time.time(),
    # so short runs are timed reliably.
    t0 = time.perf_counter()
    chunks = chunk_text(text, chunk_size, overlap, method)
    t1 = time.perf_counter()
    lens = [len(c["text"]) for c in chunks]
    print(f"Total Chunks: {len(chunks)}")
    # Guard: an empty result previously crashed on sum()/len(), min(), max().
    if lens:
        print(f"Avg Chunk Size: {sum(lens)/len(lens):.1f}")
        print(f"Min/Max Chunk Size: {min(lens)}/{max(lens)}")
    print(f"Time Taken: {t1-t0:.4f}s")
    print("Sample metadata:", chunks[0]["meta"] if chunks else None)
    print("--- Sample chunk ---")
    if chunks:
        print(chunks[0]["text"][:200])
    print("-" * 40)
" * 20 + "\n\n") * 100 + benchmark_chunker(text, chunk_size=300, overlap=50, method="fixed") + benchmark_chunker(text, chunk_size=300, overlap=0, method="semantic") \ No newline at end of file diff --git a/pipeline/chunking/fixed_chunker.py b/pipeline/chunking/fixed_chunker.py new file mode 100644 index 0000000000000000000000000000000000000000..6c35536ed2d1c8f8f8827a826173301f1ace5ddb --- /dev/null +++ b/pipeline/chunking/fixed_chunker.py @@ -0,0 +1,20 @@ +from typing import List, Dict +from .splitter_base import SplitterBase + +class FixedChunker(SplitterBase): + def chunk(self, text: str, chunk_size: int, overlap: int) -> List[Dict]: + chunks = [] + idx = 0 + while idx < len(text): + end = min(idx + chunk_size, len(text)) + chunk_text = text[idx:end] + chunks.append({ + "text": chunk_text, + "start": idx, + "end": end, + "meta": {"source": "fixed"} + }) + if end == len(text): + break + idx += chunk_size - overlap + return chunks diff --git a/pipeline/chunking/semantic_chunker.py b/pipeline/chunking/semantic_chunker.py new file mode 100644 index 0000000000000000000000000000000000000000..97a6a87fa59c2d92902032740644294a31ad47a7 --- /dev/null +++ b/pipeline/chunking/semantic_chunker.py @@ -0,0 +1,103 @@ +import re +from typing import List, Dict +from .splitter_base import SplitterBase + +HEADING_PATTERNS = [ + r"^(CHAPTER|Chapter|Section)\s+\d+", + r"^[A-Z][A-Z ]{5,}$", + r"^(\d+\.){1,3}\s+\w+", +] +PAGE_PATTERN = re.compile(r"\b[Pp]age\s+(\d+)\b|\f") +FIGURE_PATTERN = re.compile(r"^(Figure|Table|Image)[ .:]+\d+[ .:]+", re.IGNORECASE) + +def find_headings(lines): + headings = [] + for i, line in enumerate(lines): + for pat in HEADING_PATTERNS: + if re.match(pat, line.strip()): + headings.append((i, line.strip())) + break + return headings + +def split_by_size(text, chunk_size, overlap): + + subsections = [] + i = 0 + while i < len(text): + end_i = min(i + chunk_size, len(text)) + chunk = text[i:end_i] + if chunk.strip(): + subsections.append((i, end_i, chunk)) + if 
class SemanticChunker(SplitterBase):
    """Structure-aware chunker.

    Walks the text line by line, tracking section headings and page markers,
    and emits:
      * one chunk per figure/table caption line, and
      * size-bounded sub-chunks for each blank-line-delimited paragraph,
    each tagged with the section and page number in effect at that line.
    """

    def chunk(self, text: str, chunk_size: int, overlap: int) -> List[Dict]:
        """Split *text* into metadata-tagged chunks.

        Args:
            text: Full document text.
            chunk_size: Max characters per paragraph sub-chunk.
            overlap: Character overlap between consecutive sub-chunks.

        Returns:
            List of dicts with "text", "start"/"end" (LINE indices, not
            character offsets), and "meta" (section, page, source/type).
        """
        lines = text.splitlines()
        cur_section = None   # most recent heading text seen, or None
        cur_page = 1         # running page counter for the pre-scan below
        chunks = []

        # Pass 1: record the page number in effect at every line index,
        # advancing whenever a "Page N" marker is found. A bare form feed
        # matches PAGE_PATTERN but has no group(1), so it does not advance.
        line_pages = {}
        for i, line in enumerate(lines):
            m = PAGE_PATTERN.search(line)
            if m and m.group(1):
                cur_page = int(m.group(1))
            line_pages[i] = cur_page

        # Pass 2: classify each line as heading, figure, page marker, or
        # paragraph content.
        i = 0
        while i < len(lines):
            line = lines[i]

            # Heading: update the current section; emits no chunk itself.
            if any(re.match(pat, line.strip()) for pat in HEADING_PATTERNS):
                cur_section = line.strip()
                i += 1
                continue

            # Figure/table caption: emitted as its own single-line chunk.
            if FIGURE_PATTERN.match(line):
                chunks.append({
                    "text": line.strip(),
                    "start": i,
                    "end": i + 1,
                    "meta": {
                        "section": cur_section or "NO_SECTION",
                        "page": line_pages.get(i, 1),
                        "type": "figure"
                    }
                })
                i += 1
                continue

            # Page-marker lines carry no content; skip them.
            if PAGE_PATTERN.search(line):
                i += 1
                continue

            # Paragraph: consume consecutive non-blank lines that are not
            # headings, figures, or page markers.
            para_lines = []
            para_start = i
            while (i < len(lines) and lines[i].strip() and
                   not any(re.match(pat, lines[i].strip()) for pat in HEADING_PATTERNS) and
                   not FIGURE_PATTERN.match(lines[i]) and
                   not PAGE_PATTERN.search(lines[i])):
                para_lines.append(lines[i])
                i += 1
            para_text = "\n".join(para_lines).strip()

            if para_text:
                # Bound each paragraph by chunk_size. Note: every sub-chunk
                # of a paragraph shares the SAME start/end line range; the
                # tuple's char offsets from split_by_size are discarded.
                subchunks = split_by_size(para_text, chunk_size, overlap)
                for substart, subend, chunk_str in subchunks:
                    chunks.append({
                        "text": chunk_str,
                        "start": para_start,
                        "end": i,
                        "meta": {
                            "section": cur_section or "NO_SECTION",
                            "page": line_pages.get(para_start, 1),
                            "source": "semantic"
                        }
                    })

            # Skip blank separator lines before the next paragraph.
            while i < len(lines) and not lines[i].strip():
                i += 1
        return chunks
# Registry of embedding backends; each module must expose embed(texts, model_name).
EMBEDDING_BACKENDS = {
    "sentence_transformers": sentence_transformer_embed
}

def embed_chunks(chunks, backend: str, model_name: str, version: str = None):
    """Embed chunk texts with the selected backend.

    Args:
        chunks: Sequence of chunk dicts (with "text" and optional "meta")
            or raw strings.
        backend: Key into EMBEDDING_BACKENDS.
        model_name: Model identifier forwarded to the backend.
        version: Optional version tag; defaults to "backend:model_name".

    Returns:
        List of {"embedding", "meta", "version"} records, one per chunk.

    Raises:
        ValueError: if *backend* is not registered.
    """
    backend_module = EMBEDDING_BACKENDS.get(backend)
    if not backend_module:
        raise ValueError(f"Unknown backend: {backend}")

    # Build parallel text/meta lists, tolerating raw-string chunks.
    texts = []
    metas = []
    for chunk in chunks:
        if isinstance(chunk, dict):
            texts.append(chunk["text"])
            metas.append(chunk.get("meta", {}))
        else:
            texts.append(chunk)
            metas.append({})

    vectors = backend_module.embed(texts, model_name)
    tag = version or f"{backend}:{model_name}"
    records = []
    for vec, meta in zip(vectors, metas):
        records.append({"embedding": vec, "meta": meta, "version": tag})
    return records
class DOCXParser(ParserBase):
    """Parser for .docx files backed by python-docx."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return the document's paragraph text joined by newlines, plus metadata.

        Args:
            filepath: Path to a .docx file.

        Returns:
            (text, metadata) where metadata holds filetype, filename, and
            the paragraph count.
        """
        document = Document(filepath)
        paragraphs = document.paragraphs
        text = "\n".join(para.text for para in paragraphs)
        metadata = {
            "filetype": "docx",
            "filename": str(Path(filepath).name),
            "num_paragraphs": len(paragraphs)
        }
        return text, metadata
class TXTParser(ParserBase):
    """Parser for plain-text files."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Read a UTF-8 text file and return its contents plus metadata.

        Args:
            filepath: Path to a .txt file.

        Returns:
            (text, metadata) where metadata holds filetype, filename, and
            the character count.
        """
        path = Path(filepath)
        text = path.read_text(encoding="utf-8")
        metadata = {
            "filetype": "txt",
            "filename": str(path.name),
            "length": len(text)
        }
        return text, metadata
a/pipeline/rag/__pycache__/__init__.cpython-313.pyc b/pipeline/rag/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29e0a03bef2668c1d981f4fd18af7c3619317c80 Binary files /dev/null and b/pipeline/rag/__pycache__/__init__.cpython-313.pyc differ diff --git a/pipeline/rag/__pycache__/prompt_templates.cpython-313.pyc b/pipeline/rag/__pycache__/prompt_templates.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1965090938a93f980df2aeb088a86a50cad3e881 Binary files /dev/null and b/pipeline/rag/__pycache__/prompt_templates.cpython-313.pyc differ diff --git a/pipeline/rag/__pycache__/retrieval_engine.cpython-313.pyc b/pipeline/rag/__pycache__/retrieval_engine.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e07406f369617504184646f89520609c23274ed Binary files /dev/null and b/pipeline/rag/__pycache__/retrieval_engine.cpython-313.pyc differ diff --git a/pipeline/rag/prompt_templates.py b/pipeline/rag/prompt_templates.py new file mode 100644 index 0000000000000000000000000000000000000000..b88dcd6a2685e3d82607da3d9782a6553ccc01d7 --- /dev/null +++ b/pipeline/rag/prompt_templates.py @@ -0,0 +1,12 @@ +DEFAULT_PROMPT_TEMPLATE = """ +You are an AI assistant helping answer book/document-based questions. + +Use ONLY the provided context to answer the user's question. If the answer is not found in the context, say "I cannot answer based on the provided information." 
def answer_question(
    question: str,
    embed_model: str = "all-MiniLM-L6-v2",
    store_type: str = "faiss",
    store_kwargs: dict = None,
    llm_name: str = "mistralai/Mistral-7B-Instruct-v0.2",
    prompt_template: str = None,
    top_k: int = 5,
    rerank_fn=None,
) -> dict:
    """End-to-end RAG answer: embed the question, retrieve top-k chunks,
    build a prompt, and generate an answer with the configured LLM.

    Args:
        question: User question to answer.
        embed_model: Embedding model name for the query embedding.
        store_type: Store key understood by get_store ("faiss", "bm25", "hybrid").
        store_kwargs: Keyword args for the store factory; defaults to {"dim": 384}.
        llm_name: LLM identifier passed to get_llm.
        prompt_template: Optional template with {context} and {question} slots;
            falls back to DEFAULT_PROMPT_TEMPLATE.
        top_k: Number of chunks to retrieve (and keep after reranking).
        rerank_fn: Optional callable (question, results) -> reordered results.

    Returns:
        Dict with "answer", "chunks", "question", "context", and "prompt".
    """

    # Wrap the question as a pseudo-chunk so it goes through the same
    # embedding path as indexed documents.
    q_chunk = {"text": question}

    q_embeds = embed_chunks([q_chunk], backend="sentence_transformers", model_name=embed_model)

    # embed_chunks returns dicts with an "embedding" key; tolerate a raw
    # vector in case a different backend returns one.
    if isinstance(q_embeds[0], dict):
        q_embed = q_embeds[0]["embedding"]
    else:
        q_embed = q_embeds[0]

    # NOTE(review): the default dim of 384 presumably matches the
    # all-MiniLM-L6-v2 output size — must be kept in sync with embed_model;
    # confirm before changing either default.
    if store_kwargs is None:
        store_kwargs = {"dim": 384}
    vector_store = get_store(store_type, **store_kwargs)
    # Stores that persist to disk (e.g. FAISS) expose load(); others don't.
    if hasattr(vector_store, "load"):
        vector_store.load()
    # The hybrid retriever also needs the raw query text for its keyword leg.
    if store_type == "hybrid":
        results = vector_store.search(q_embed, question, k=top_k)
    else:
        results = vector_store.search(q_embed, k=top_k)
    print("answer_question: top-k results:", [r["text"][:60] for r in results])

    # Optional caller-supplied reranking, truncated back to top_k.
    if rerank_fn:
        results = rerank_fn(question, results)[:top_k]

    # Assemble the prompt: retrieved chunks become the context block.
    context = "\n\n".join([r["text"] for r in results])
    if prompt_template is None:
        prompt_template = DEFAULT_PROMPT_TEMPLATE
    prompt = prompt_template.format(context=context, question=question)

    llm = get_llm(llm_name)
    answer = llm.generate(prompt)

    return {
        "answer": answer,
        "chunks": results,
        "question": question,
        "context": context,
        "prompt": prompt
    }
0000000000000000000000000000000000000000..4360a418a16ba7c272c4eb9502001e9306092aa0 --- /dev/null +++ b/pipeline/vector_store/__init__.py @@ -0,0 +1 @@ +from .store_registry import get_store \ No newline at end of file diff --git a/pipeline/vector_store/__pycache__/__init__.cpython-313.pyc b/pipeline/vector_store/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25ab957a1b65d7752f2c2c7a34986115227c240d Binary files /dev/null and b/pipeline/vector_store/__pycache__/__init__.cpython-313.pyc differ diff --git a/pipeline/vector_store/__pycache__/bm25_keyword_store.cpython-313.pyc b/pipeline/vector_store/__pycache__/bm25_keyword_store.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad5f94e30e860b03050a112ea64a3ce073c856ee Binary files /dev/null and b/pipeline/vector_store/__pycache__/bm25_keyword_store.cpython-313.pyc differ diff --git a/pipeline/vector_store/__pycache__/faiss_store.cpython-313.pyc b/pipeline/vector_store/__pycache__/faiss_store.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..900228f71e024b62e4c3d6d3644a4792911404bc Binary files /dev/null and b/pipeline/vector_store/__pycache__/faiss_store.cpython-313.pyc differ diff --git a/pipeline/vector_store/__pycache__/hybrid_retriever.cpython-313.pyc b/pipeline/vector_store/__pycache__/hybrid_retriever.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd913ec67e2968bac5e978a904ccc70ec154f7e1 Binary files /dev/null and b/pipeline/vector_store/__pycache__/hybrid_retriever.cpython-313.pyc differ diff --git a/pipeline/vector_store/__pycache__/store_base.cpython-313.pyc b/pipeline/vector_store/__pycache__/store_base.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4afb0ae5b63d627ceb215c3bc95985cd52759bc Binary files /dev/null and b/pipeline/vector_store/__pycache__/store_base.cpython-313.pyc differ diff --git 
class BM25KeywordStore(VectorStoreBase):
    """Keyword (sparse) store backed by rank_bm25's BM25Okapi.

    Tokenization is a plain split on single spaces; queries are tokenized
    the same way documents were indexed.
    """

    def __init__(self):
        self.corpus = []      # raw chunk texts, in insertion order
        self.bm25 = None      # rebuilt from the full corpus on every add
        self.metadatas = []   # parallel to self.corpus

    def add_documents(self, chunks, embeddings=None, metadatas=None):
        """Index chunk texts; *embeddings* is ignored (keyword-only store)."""
        self.corpus.extend([chunk["text"] for chunk in chunks])
        self.metadatas.extend(metadatas or [{} for _ in chunks])
        # BM25Okapi is immutable once built, so re-index the whole corpus.
        self.bm25 = BM25Okapi([doc.split(" ") for doc in self.corpus])

    def search(self, query_text, k=5, method=None):
        """Return up to *k* best-scoring chunks for *query_text*.

        Fix: calling search() before any documents were added used to crash
        with AttributeError (self.bm25 is None); now returns an empty list.
        """
        if self.bm25 is None:
            return []
        scores = self.bm25.get_scores(query_text.split(" "))
        best_idx = sorted(range(len(scores)), key=lambda i: -scores[i])[:k]
        return [
            {"text": self.corpus[i], "meta": self.metadatas[i], "score": scores[i]}
            for i in best_idx
        ]
"faiss.index" + self.metadata_path = metadata_path or "faiss.meta.pkl" + + def add_documents(self, chunks, embeddings, metadatas): + arr = np.array(embeddings).astype('float32') + self.index.add(arr) + self.embeddings.extend(embeddings) + self.texts.extend([chunk["text"] for chunk in chunks]) + self.metadatas.extend(metadatas) + self.save() + + def search(self, query_embed, k=5, method=None, max_distance=0.8): + query = np.array(query_embed).reshape(1, -1).astype('float32') + D, I = self.index.search(query, k) + results = [] + for score, idx in zip(D[0], I[0]): + print(f"Chunk idx: {idx}, L2 distance: {score:.4f}") + if idx < 0 or idx >= len(self.texts): + continue + if score <= max_distance: + results.append({ + "text": self.texts[idx], + "embedding": self.embeddings[idx], + "meta": self.metadatas[idx], + "distance": score + }) + return results + + + def save(self): + faiss.write_index(self.index, self.index_path) + with open(self.metadata_path, "wb") as f: + pickle.dump({ + "texts": self.texts, + "embeddings": self.embeddings, + "metadatas": self.metadatas + }, f) + + def load(self): + self.index = faiss.read_index(self.index_path) + with open(self.metadata_path, "rb") as f: + data = pickle.load(f) + self.texts = data["texts"] + self.embeddings = data["embeddings"] + self.metadatas = data["metadatas"] + +FaissStore = VectorStoreFAISS \ No newline at end of file diff --git a/pipeline/vector_store/hybrid_retriever.py b/pipeline/vector_store/hybrid_retriever.py new file mode 100644 index 0000000000000000000000000000000000000000..c0019419c352f3ed53e15784462aa5667ac9c5da --- /dev/null +++ b/pipeline/vector_store/hybrid_retriever.py @@ -0,0 +1,32 @@ +from .faiss_store import VectorStoreFAISS +from .bm25_keyword_store import BM25KeywordStore +from .store_base import VectorStoreBase + +class HybridRetriever(VectorStoreBase): + def __init__(self, faiss_store, bm25_store, alpha=0.5): + self.faiss_store = faiss_store + self.bm25_store = bm25_store + self.alpha = alpha + + 
class HybridRetriever(VectorStoreBase):
    """Combines dense (FAISS) and keyword (BM25) retrieval via rank fusion.

    alpha weights the dense ranking; (1 - alpha) weights the keyword ranking.
    """

    def __init__(self, faiss_store, bm25_store, alpha=0.5):
        self.faiss_store = faiss_store
        self.bm25_store = bm25_store
        self.alpha = alpha  # 1.0 = dense only, 0.0 = keyword only

    def add_documents(self, chunks, embeddings, metadatas):
        """Index into both legs; BM25 ignores the embeddings."""
        self.faiss_store.add_documents(chunks, embeddings, metadatas)
        self.bm25_store.add_documents(chunks, None, metadatas)

    def search(self, query_embed, query_text, k=5, method=None):
        """Return top-k chunks fused from both stores.

        Fusion is rank-based: each hit earns (k - rank) points per leg,
        weighted by alpha; a text missing from a leg is treated as rank k,
        contributing zero for that leg. Deduplication is by exact chunk text.
        """
        faiss_hits = self.faiss_store.search(query_embed, k)
        bm25_hits = self.bm25_store.search(query_text, k)

        # Simple hybrid: combine and sort by average rank/score (tune as desired)
        faiss_ids = {hit["text"]: i for i, hit in enumerate(faiss_hits)}
        bm25_ids = {hit["text"]: i for i, hit in enumerate(bm25_hits)}
        all_texts = set(faiss_ids) | set(bm25_ids)

        hybrid = []
        for text in all_texts:
            # Missing from a leg => rank k => (k - rank) = 0 contribution.
            f_rank = faiss_ids.get(text, k)
            b_rank = bm25_ids.get(text, k)
            joint_score = self.alpha * (k - f_rank) + (1 - self.alpha) * (k - b_rank)
            # Prefer faiss meta but fallback to bm25
            meta = faiss_hits[faiss_ids[text]]["meta"] if text in faiss_ids else bm25_hits[bm25_ids[text]]["meta"]
            hybrid.append({"text": text, "meta": meta, "score": joint_score})
        return sorted(hybrid, key=lambda x: -x["score"])[:k]
def get_store(store_type, **kwargs):
    """Factory for retrieval stores.

    Args:
        store_type: One of "faiss", "bm25", or "hybrid".
        **kwargs: Forwarded to the store constructor; "hybrid" requires
            "dim" and accepts an optional "alpha" fusion weight.

    Returns:
        A configured store instance.

    Raises:
        ValueError: if *store_type* is not recognized.
    """
    if store_type == "faiss":
        return VectorStoreFAISS(**kwargs)
    if store_type == "bm25":
        return BM25KeywordStore()
    if store_type == "hybrid":
        # 'dim' is required; 'alpha' weights dense vs. keyword ranks.
        dense_store = VectorStoreFAISS(kwargs["dim"])
        keyword_store = BM25KeywordStore()
        return HybridRetriever(dense_store, keyword_store, kwargs.get("alpha", 0.5))
    raise ValueError(f"Unknown store type: {store_type}")