samithcs committed on
Commit
63105da
·
verified ·
1 Parent(s): a1ff2af

Pipeline added

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. pipeline/__init__.py +1 -0
  2. pipeline/__pycache__/__init__.cpython-313.pyc +0 -0
  3. pipeline/chunking/__init__.py +10 -0
  4. pipeline/chunking/__pycache__/__init__.cpython-313.pyc +0 -0
  5. pipeline/chunking/__pycache__/chunk_benchmark.cpython-313.pyc +0 -0
  6. pipeline/chunking/__pycache__/fixed_chunker.cpython-313.pyc +0 -0
  7. pipeline/chunking/__pycache__/semantic_chunker.cpython-313.pyc +0 -0
  8. pipeline/chunking/__pycache__/splitter_base.cpython-313.pyc +0 -0
  9. pipeline/chunking/chunk_benchmark.py +24 -0
  10. pipeline/chunking/fixed_chunker.py +20 -0
  11. pipeline/chunking/semantic_chunker.py +103 -0
  12. pipeline/chunking/splitter_base.py +7 -0
  13. pipeline/embeddings/__init__.py +1 -0
  14. pipeline/embeddings/__pycache__/__init__.cpython-313.pyc +0 -0
  15. pipeline/embeddings/__pycache__/embedder_base.cpython-313.pyc +0 -0
  16. pipeline/embeddings/__pycache__/sentence_transformer_embed.cpython-313.pyc +0 -0
  17. pipeline/embeddings/embedder_base.py +18 -0
  18. pipeline/embeddings/sentence_transformer_embed.py +10 -0
  19. pipeline/ingest/__init__.py +0 -0
  20. pipeline/ingest/__pycache__/__init__.cpython-313.pyc +0 -0
  21. pipeline/ingest/__pycache__/docx_parser.cpython-313.pyc +0 -0
  22. pipeline/ingest/__pycache__/html_parser.cpython-313.pyc +0 -0
  23. pipeline/ingest/__pycache__/parser_base.cpython-313.pyc +0 -0
  24. pipeline/ingest/__pycache__/pdf_parser.cpython-313.pyc +0 -0
  25. pipeline/ingest/__pycache__/txt_parser.cpython-313.pyc +0 -0
  26. pipeline/ingest/docx_parser.py +18 -0
  27. pipeline/ingest/html_parser.py +20 -0
  28. pipeline/ingest/parser_base.py +6 -0
  29. pipeline/ingest/pdf_parser.py +26 -0
  30. pipeline/ingest/txt_parser.py +15 -0
  31. pipeline/monitoring/__init__.py +0 -0
  32. pipeline/monitoring/drift_detection.py +0 -0
  33. pipeline/monitoring/feedback.py +0 -0
  34. pipeline/rag/__init__.py +0 -0
  35. pipeline/rag/__pycache__/__init__.cpython-313.pyc +0 -0
  36. pipeline/rag/__pycache__/prompt_templates.cpython-313.pyc +0 -0
  37. pipeline/rag/__pycache__/retrieval_engine.cpython-313.pyc +0 -0
  38. pipeline/rag/prompt_templates.py +12 -0
  39. pipeline/rag/retrieval_engine.py +58 -0
  40. pipeline/vector_store/__init__.py +1 -0
  41. pipeline/vector_store/__pycache__/__init__.cpython-313.pyc +0 -0
  42. pipeline/vector_store/__pycache__/bm25_keyword_store.cpython-313.pyc +0 -0
  43. pipeline/vector_store/__pycache__/faiss_store.cpython-313.pyc +0 -0
  44. pipeline/vector_store/__pycache__/hybrid_retriever.cpython-313.pyc +0 -0
  45. pipeline/vector_store/__pycache__/store_base.cpython-313.pyc +0 -0
  46. pipeline/vector_store/__pycache__/store_registry.cpython-313.pyc +0 -0
  47. pipeline/vector_store/bm25_keyword_store.py +21 -0
  48. pipeline/vector_store/faiss_store.py +59 -0
  49. pipeline/vector_store/hybrid_retriever.py +32 -0
  50. pipeline/vector_store/store_base.py +6 -0
pipeline/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
pipeline/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (153 Bytes). View file
 
pipeline/chunking/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
from .fixed_chunker import FixedChunker
from .semantic_chunker import SemanticChunker

# Registry of available chunking strategies, keyed by method name.
_CHUNKERS = {
    "fixed": FixedChunker,
    "semantic": SemanticChunker,
}


def chunk_text(text: str, chunk_size: int, overlap: int, method="fixed"):
    """Split *text* into chunk dicts using the named strategy.

    Args:
        text: Document text to split.
        chunk_size: Target chunk length in characters.
        overlap: Character overlap between consecutive chunks.
        method: Strategy name, either "fixed" or "semantic".

    Raises:
        ValueError: if *method* names no registered chunker.
    """
    try:
        chunker_cls = _CHUNKERS[method]
    except KeyError:
        raise ValueError("Unknown chunking method: " + str(method)) from None
    return chunker_cls().chunk(text, chunk_size, overlap)
pipeline/chunking/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (814 Bytes). View file
 
pipeline/chunking/__pycache__/chunk_benchmark.cpython-313.pyc ADDED
Binary file (2.25 kB). View file
 
pipeline/chunking/__pycache__/fixed_chunker.cpython-313.pyc ADDED
Binary file (1.22 kB). View file
 
pipeline/chunking/__pycache__/semantic_chunker.cpython-313.pyc ADDED
Binary file (5.51 kB). View file
 
pipeline/chunking/__pycache__/splitter_base.cpython-313.pyc ADDED
Binary file (791 Bytes). View file
 
pipeline/chunking/chunk_benchmark.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import time
from . import chunk_text


def benchmark_chunker(text, chunk_size, overlap, method):
    """Run one chunking strategy over *text* and print timing/size statistics.

    Args:
        text: Input document to split.
        chunk_size: Target chunk length in characters.
        overlap: Character overlap between consecutive chunks.
        method: Strategy name forwarded to ``chunk_text`` ("fixed"/"semantic").
    """
    print(f"Benchmarking {method} chunker...")
    t0 = time.time()
    chunks = chunk_text(text, chunk_size, overlap, method)
    t1 = time.time()
    lens = [len(c["text"]) for c in chunks]
    print(f"Total Chunks: {len(chunks)}")
    # Guard: an empty result previously crashed on sum()/len() and min()/max().
    if lens:
        print(f"Avg Chunk Size: {sum(lens)/len(lens):.1f}")
        print(f"Min/Max Chunk Size: {min(lens)}/{max(lens)}")
    print(f"Time Taken: {t1-t0:.4f}s")
    print("Sample metadata:", chunks[0]["meta"] if chunks else None)
    print("--- Sample chunk ---")
    if chunks:
        print(chunks[0]["text"][:200])
    print("-" * 40)


if __name__ == "__main__":
    # Synthetic document: 100 blocks of 20 repeated sentences each.
    text = ("This is a sample paragraph. " * 20 + "\n\n") * 100
    benchmark_chunker(text, chunk_size=300, overlap=50, method="fixed")
    benchmark_chunker(text, chunk_size=300, overlap=0, method="semantic")
pipeline/chunking/fixed_chunker.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List, Dict

from .splitter_base import SplitterBase


class FixedChunker(SplitterBase):
    """Split text into fixed-size character windows with optional overlap."""

    def chunk(self, text: str, chunk_size: int, overlap: int) -> List[Dict]:
        """Return chunk dicts with "text", character "start"/"end", and "meta".

        Args:
            text: Document text to split.
            chunk_size: Window length in characters; must be positive.
            overlap: Characters shared between consecutive windows; must be
                strictly smaller than *chunk_size*.

        Raises:
            ValueError: on a non-positive chunk_size or overlap >= chunk_size
                (the step ``chunk_size - overlap`` would be <= 0 and the loop
                would never terminate).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")
        chunks = []
        idx = 0
        while idx < len(text):
            end = min(idx + chunk_size, len(text))
            chunks.append({
                "text": text[idx:end],
                "start": idx,
                "end": end,
                "meta": {"source": "fixed"},
            })
            if end == len(text):
                break
            idx += chunk_size - overlap
        return chunks
pipeline/chunking/semantic_chunker.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
from typing import List, Dict
from .splitter_base import SplitterBase

# Regexes that mark the start of a new document section.
HEADING_PATTERNS = [
    r"^(CHAPTER|Chapter|Section)\s+\d+",  # e.g. "Chapter 3", "Section 12"
    r"^[A-Z][A-Z ]{5,}$",                 # ALL-CAPS title lines (6+ chars)
    r"^(\d+\.){1,3}\s+\w+",               # numbered headings: "1. Intro", "2.3.1 ..."
]
# Matches "Page 12"/"page 12" markers or a bare form-feed character;
# group(1) carries the page number (None for form-feed-only matches).
PAGE_PATTERN = re.compile(r"\b[Pp]age\s+(\d+)\b|\f")
# Matches caption lines such as "Figure 1:", "Table 2.", case-insensitive.
FIGURE_PATTERN = re.compile(r"^(Figure|Table|Image)[ .:]+\d+[ .:]+", re.IGNORECASE)


def find_headings(lines):
    """Return (line_index, stripped_heading_text) pairs for heading lines.

    A line counts as a heading when it matches any pattern in
    HEADING_PATTERNS after stripping surrounding whitespace.
    """
    headings = []
    for i, line in enumerate(lines):
        for pat in HEADING_PATTERNS:
            if re.match(pat, line.strip()):
                headings.append((i, line.strip()))
                break  # one pattern match per line is enough
    return headings
21
+
22
def split_by_size(text, chunk_size, overlap):
    """Split *text* into (start, end, substring) windows of at most *chunk_size* chars.

    Consecutive windows overlap by *overlap* characters. Windows that are
    entirely whitespace are skipped, but their positions still advance.

    Args:
        text: String to split.
        chunk_size: Window length in characters; must be positive.
        overlap: Characters shared between consecutive windows; must be
            strictly smaller than *chunk_size*.

    Returns:
        List of (start_index, end_index, chunk) tuples.

    Raises:
        ValueError: if chunk_size <= 0 or overlap >= chunk_size (the step
            ``chunk_size - overlap`` would be <= 0 — previously an infinite loop).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    subsections = []
    i = 0
    while i < len(text):
        end_i = min(i + chunk_size, len(text))
        chunk = text[i:end_i]
        if chunk.strip():
            subsections.append((i, end_i, chunk))
        if end_i == len(text):
            break
        i += chunk_size - overlap
    return subsections
35
+
36
class SemanticChunker(SplitterBase):
    """Structure-aware chunker.

    Tracks section headings and page markers while scanning, emits
    figure/table caption lines as standalone chunks, and size-splits each
    paragraph via split_by_size(). Chunk "start"/"end" values are LINE
    indices into the split input, not character offsets.
    """

    def chunk(self, text: str, chunk_size: int, overlap: int) -> List[Dict]:
        """Split *text* into chunk dicts carrying section/page metadata.

        Args:
            text: Document text; scanned line by line.
            chunk_size: Max characters per paragraph sub-chunk.
            overlap: Character overlap between paragraph sub-chunks.

        Returns:
            List of dicts with keys "text", "start", "end" (line indices)
            and "meta" ("section", "page", plus "type"/"source").
        """
        lines = text.splitlines()
        cur_section = None
        cur_page = 1
        chunks = []

        # First pass: record the page number in effect at every line index.
        line_pages = {}
        for i, line in enumerate(lines):
            m = PAGE_PATTERN.search(line)
            if m and m.group(1):  # group(1) is None for bare form-feed matches
                cur_page = int(m.group(1))
            line_pages[i] = cur_page

        # Second pass: classify each line as heading, figure caption,
        # page marker, or paragraph text.
        i = 0
        while i < len(lines):
            line = lines[i]

            # Heading: update the current section label; emits no chunk.
            if any(re.match(pat, line.strip()) for pat in HEADING_PATTERNS):
                cur_section = line.strip()
                i += 1
                continue

            # Figure/table caption: emit as its own single-line chunk.
            if FIGURE_PATTERN.match(line):
                chunks.append({
                    "text": line.strip(),
                    "start": i,
                    "end": i + 1,
                    "meta": {
                        "section": cur_section or "NO_SECTION",
                        "page": line_pages.get(i, 1),
                        "type": "figure"
                    }
                })
                i += 1
                continue

            # Page-marker line: already consumed by the first pass; skip it.
            if PAGE_PATTERN.search(line):
                i += 1
                continue

            # Accumulate consecutive plain lines into one paragraph,
            # stopping at blanks, headings, captions, or page markers.
            para_lines = []
            para_start = i
            while (i < len(lines) and lines[i].strip() and
                   not any(re.match(pat, lines[i].strip()) for pat in HEADING_PATTERNS) and
                   not FIGURE_PATTERN.match(lines[i]) and
                   not PAGE_PATTERN.search(lines[i])):
                para_lines.append(lines[i])
                i += 1
            para_text = "\n".join(para_lines).strip()

            if para_text:
                # NOTE: the sub-chunk character offsets are discarded, so
                # every sub-chunk reports the whole paragraph's line span.
                subchunks = split_by_size(para_text, chunk_size, overlap)
                for substart, subend, chunk_str in subchunks:
                    chunks.append({
                        "text": chunk_str,
                        "start": para_start,
                        "end": i,
                        "meta": {
                            "section": cur_section or "NO_SECTION",
                            "page": line_pages.get(para_start, 1),
                            "source": "semantic"
                        }
                    })

            # Skip blank separator lines between paragraphs.
            while i < len(lines) and not lines[i].strip():
                i += 1
        return chunks
pipeline/chunking/splitter_base.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
from abc import ABC, abstractmethod
from typing import List, Dict


class SplitterBase(ABC):
    """Abstract interface that every text chunker must implement."""

    @abstractmethod
    def chunk(self, text: str, chunk_size: int, overlap: int) -> List[Dict]:
        """Split *text* into a list of chunk dicts (with "text" and "meta")."""
pipeline/embeddings/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .embedder_base import embed_chunks
pipeline/embeddings/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (217 Bytes). View file
 
pipeline/embeddings/__pycache__/embedder_base.cpython-313.pyc ADDED
Binary file (1.35 kB). View file
 
pipeline/embeddings/__pycache__/sentence_transformer_embed.cpython-313.pyc ADDED
Binary file (836 Bytes). View file
 
pipeline/embeddings/embedder_base.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from . import sentence_transformer_embed

# Registry of embedding backends, keyed by backend name.
EMBEDDING_BACKENDS = {
    "sentence_transformers": sentence_transformer_embed
}


def embed_chunks(chunks, backend: str, model_name: str, version: str = None):
    """Embed chunk texts with the named backend.

    Args:
        chunks: Iterable of chunk dicts (with "text" and optional "meta")
            or plain strings.
        backend: Key into EMBEDDING_BACKENDS.
        model_name: Model identifier passed to the backend's embed().
        version: Optional version tag; defaults to "<backend>:<model_name>".

    Returns:
        List of {"embedding", "meta", "version"} dicts, one per input chunk.

    Raises:
        ValueError: when *backend* is not registered.
    """
    mod = EMBEDDING_BACKENDS.get(backend)
    if not mod:
        raise ValueError(f"Unknown backend: {backend}")
    texts = []
    metas = []
    for item in chunks:
        if isinstance(item, dict):
            texts.append(item["text"])
            metas.append(item.get("meta", {}))
        else:
            texts.append(item)
            metas.append({})
    embeddings = mod.embed(texts, model_name)
    tag = version or f"{backend}:{model_name}"
    records = []
    for emb, meta in zip(embeddings, metas):
        records.append({"embedding": emb, "meta": meta, "version": tag})
    return records
pipeline/embeddings/sentence_transformer_embed.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
from sentence_transformers import SentenceTransformer

# Cache loaded models so repeated embed() calls don't reload weights from
# disk each time (model construction is by far the dominant cost).
_MODEL_CACHE = {}


def _get_model(model_name):
    """Return a cached SentenceTransformer instance for *model_name*."""
    if model_name not in _MODEL_CACHE:
        _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _MODEL_CACHE[model_name]


def embed(texts, model_name="all-MiniLM-L6-v2"):
    """Encode *texts* with the given SentenceTransformer model.

    Returns a list of embedding vectors (lists of floats), one per text.
    """
    model = _get_model(model_name)
    return model.encode(texts, show_progress_bar=False, convert_to_numpy=True).tolist()


def embed_chunks(chunks, model_name="all-MiniLM-L6-v2"):
    """Embed the "text" field of each chunk dict; returns one vector per chunk."""
    texts = [chunk['text'] for chunk in chunks]
    return embed(texts, model_name=model_name)
pipeline/ingest/__init__.py ADDED
File without changes
pipeline/ingest/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (160 Bytes). View file
 
pipeline/ingest/__pycache__/docx_parser.cpython-313.pyc ADDED
Binary file (1.33 kB). View file
 
pipeline/ingest/__pycache__/html_parser.cpython-313.pyc ADDED
Binary file (1.5 kB). View file
 
pipeline/ingest/__pycache__/parser_base.cpython-313.pyc ADDED
Binary file (710 Bytes). View file
 
pipeline/ingest/__pycache__/pdf_parser.cpython-313.pyc ADDED
Binary file (1.48 kB). View file
 
pipeline/ingest/__pycache__/txt_parser.cpython-313.pyc ADDED
Binary file (1.18 kB). View file
 
pipeline/ingest/docx_parser.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from docx import Document
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict


class DOCXParser(ParserBase):
    """Extract plain text and basic metadata from a .docx file."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return (full_text, metadata) for the document at *filepath*.

        Text is the newline-joined paragraph texts; metadata reports the
        filetype, base filename, and paragraph count.
        """
        doc = Document(filepath)
        paragraphs = [para.text for para in doc.paragraphs]
        metadata = {
            "filetype": "docx",
            "filename": str(Path(filepath).name),
            "num_paragraphs": len(doc.paragraphs)
        }
        return "\n".join(paragraphs), metadata
pipeline/ingest/html_parser.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from bs4 import BeautifulSoup
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict


class HTMLParser(ParserBase):
    """Extract visible text and basic metadata from an HTML file."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return (visible_text, metadata) for the HTML document at *filepath*.

        Script and style elements are removed before text extraction so only
        user-visible content is kept.
        """
        with open(filepath, "r", encoding="utf-8") as handle:
            soup = BeautifulSoup(handle.read(), "html.parser")
        # Drop non-visible content before extracting text.
        for tag in soup(["script", "style"]):
            tag.decompose()
        text = soup.get_text(separator="\n", strip=True)
        metadata = {
            "filetype": "html",
            "filename": str(Path(filepath).name),
            "length": len(text)
        }
        return text, metadata
pipeline/ingest/parser_base.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
from abc import ABC, abstractmethod
from typing import Dict, Tuple


class ParserBase(ABC):
    """Abstract interface for file parsers that yield text plus metadata."""

    @abstractmethod
    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return (text, metadata) extracted from the file at *filepath*.

        The original annotation ``(str, dict)`` was a literal tuple of types,
        which type checkers cannot interpret; subclasses already annotate
        ``Tuple[str, Dict]``, so the base now matches them.
        """
pipeline/ingest/pdf_parser.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import fitz
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict


class PDFParser(ParserBase):
    """Extract text and per-page metadata from a PDF via PyMuPDF (fitz)."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return (full_text, metadata) for the PDF at *filepath*.

        full_text is each page's text followed by a newline, concatenated in
        page order. metadata includes per-page length and a 100-char preview.
        """
        doc = fitz.open(filepath)
        try:
            page_texts = []
            pages_metadata = []
            for i, page in enumerate(doc):
                page_text = page.get_text()
                page_texts.append(page_text)
                pages_metadata.append({
                    "page_num": i + 1,
                    "length": len(page_text),
                    "first_100_chars": page_text[:100],
                })
            # join() is linear; the previous per-page "text +=" was quadratic
            # for large documents.
            text = "".join(pt + "\n" for pt in page_texts)
            metadata = {
                "filetype": "pdf",
                "n_pages": doc.page_count,
                "pages": pages_metadata,
                "filename": str(Path(filepath).name)
            }
        finally:
            # Release the underlying file handle even if extraction fails.
            doc.close()
        return text, metadata
pipeline/ingest/txt_parser.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict


class TXTParser(ParserBase):
    """Read a UTF-8 plain-text file and report basic metadata."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return (file_contents, metadata) for the text file at *filepath*."""
        path = Path(filepath)
        with open(path, "r", encoding="utf-8") as handle:
            text = handle.read()
        metadata = {
            "filetype": "txt",
            "filename": str(path.name),
            "length": len(text)
        }
        return text, metadata
15
+
pipeline/monitoring/__init__.py ADDED
File without changes
pipeline/monitoring/drift_detection.py ADDED
File without changes
pipeline/monitoring/feedback.py ADDED
File without changes
pipeline/rag/__init__.py ADDED
File without changes
pipeline/rag/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (157 Bytes). View file
 
pipeline/rag/__pycache__/prompt_templates.cpython-313.pyc ADDED
Binary file (488 Bytes). View file
 
pipeline/rag/__pycache__/retrieval_engine.cpython-313.pyc ADDED
Binary file (2.2 kB). View file
 
pipeline/rag/prompt_templates.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Default RAG prompt: instructs the LLM to answer strictly from the retrieved
# context. The {context} and {question} placeholders are filled via
# str.format() by the retrieval engine.
DEFAULT_PROMPT_TEMPLATE = """
You are an AI assistant helping answer book/document-based questions.

Use ONLY the provided context to answer the user's question. If the answer is not found in the context, say "I cannot answer based on the provided information."

Context:
{context}

Question: {question}

Answer:
"""
pipeline/rag/retrieval_engine.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pipeline.embeddings import embed_chunks
from pipeline.vector_store import get_store
from llm import get_llm
from pipeline.rag.prompt_templates import DEFAULT_PROMPT_TEMPLATE

def answer_question(
    question: str,
    embed_model: str = "all-MiniLM-L6-v2",
    store_type: str = "faiss",
    store_kwargs: dict | None = None,
    llm_name: str = "mistralai/Mistral-7B-Instruct-v0.2",
    prompt_template: str | None = None,
    top_k: int = 5,
    rerank_fn=None,
):
    """Answer *question* with retrieval-augmented generation.

    Embeds the question, retrieves top_k chunks from the configured store,
    optionally reranks them, formats the prompt, and queries the LLM.

    Args:
        question: Natural-language question.
        embed_model: SentenceTransformer model name; presumably must match
            the model the store was built with — TODO confirm.
        store_type: Store registry key (e.g. "faiss", "hybrid").
        store_kwargs: Keyword args for the store constructor; defaults to
            {"dim": 384}.
        llm_name: Model identifier passed to get_llm().
        prompt_template: str.format() template with {context} and {question};
            falls back to DEFAULT_PROMPT_TEMPLATE.
        top_k: Number of chunks to retrieve (and keep after reranking).
        rerank_fn: Optional callable (question, results) -> reordered results.

    Returns:
        Dict with "answer", "chunks", "question", "context", and "prompt".
    """
    # Wrap the question as a chunk so it goes through the same embedding path
    # as the documents.
    q_chunk = {"text": question}

    q_embeds = embed_chunks([q_chunk], backend="sentence_transformers", model_name=embed_model)

    # embed_chunks returns rich dicts; accept raw vectors too for safety.
    if isinstance(q_embeds[0], dict):
        q_embed = q_embeds[0]["embedding"]
    else:
        q_embed = q_embeds[0]


    if store_kwargs is None:
        store_kwargs = {"dim": 384}  # presumably all-MiniLM-L6-v2's width — verify
    vector_store = get_store(store_type, **store_kwargs)
    if hasattr(vector_store, "load"):
        vector_store.load()  # restore a previously persisted index, if supported
    # Hybrid stores also need the raw query text for the keyword (BM25) leg.
    if store_type == "hybrid":
        results = vector_store.search(q_embed, question, k=top_k)
    else:
        results = vector_store.search(q_embed, k=top_k)
    print("answer_question: top-k results:", [r["text"][:60] for r in results])


    if rerank_fn:
        results = rerank_fn(question, results)[:top_k]


    # Assemble the retrieved chunk texts into the prompt context.
    context = "\n\n".join([r["text"] for r in results])
    if prompt_template is None:
        prompt_template = DEFAULT_PROMPT_TEMPLATE
    prompt = prompt_template.format(context=context, question=question)


    llm = get_llm(llm_name)
    answer = llm.generate(prompt)

    # Return everything needed for display and debugging, not just the answer.
    return {
        "answer": answer,
        "chunks": results,
        "question": question,
        "context": context,
        "prompt": prompt
    }
pipeline/vector_store/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .store_registry import get_store
pipeline/vector_store/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (217 Bytes). View file
 
pipeline/vector_store/__pycache__/bm25_keyword_store.cpython-313.pyc ADDED
Binary file (2.27 kB). View file
 
pipeline/vector_store/__pycache__/faiss_store.cpython-313.pyc ADDED
Binary file (4.09 kB). View file
 
pipeline/vector_store/__pycache__/hybrid_retriever.cpython-313.pyc ADDED
Binary file (2.62 kB). View file
 
pipeline/vector_store/__pycache__/store_base.cpython-313.pyc ADDED
Binary file (787 Bytes). View file
 
pipeline/vector_store/__pycache__/store_registry.cpython-313.pyc ADDED
Binary file (901 Bytes). View file
 
pipeline/vector_store/bm25_keyword_store.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from rank_bm25 import BM25Okapi
from .store_base import VectorStoreBase


class BM25KeywordStore(VectorStoreBase):
    """Keyword (lexical) store backed by the BM25 Okapi ranking function."""

    def __init__(self):
        self.corpus = []     # raw chunk texts, in insertion order
        self.bm25 = None     # BM25Okapi index; rebuilt on every add_documents()
        self.metadatas = []  # metadata dicts, parallel to self.corpus

    def add_documents(self, chunks, embeddings=None, metadatas=None):
        """Index chunk texts for keyword search; *embeddings* are ignored.

        Tokenization is a simple split on single spaces, matching search().
        """
        self.corpus.extend([chunk["text"] for chunk in chunks])
        self.metadatas.extend(metadatas or [{} for _ in chunks])
        # Rebuild the whole index: BM25Okapi has no incremental-add API.
        self.bm25 = BM25Okapi([doc.split(" ") for doc in self.corpus])

    def search(self, query_text, k=5, method=None):
        """Return the top-k hits for *query_text* as {"text", "meta", "score"} dicts.

        Returns [] when no documents have been added yet (previously this
        raised AttributeError on the unset index).
        """
        if self.bm25 is None:
            return []
        scores = self.bm25.get_scores(query_text.split(" "))
        best_idx = sorted(range(len(scores)), key=lambda i: -scores[i])[:k]
        return [
            {"text": self.corpus[i], "meta": self.metadatas[i], "score": scores[i]}
            for i in best_idx
        ]
pipeline/vector_store/faiss_store.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import faiss
import numpy as np
import pickle
from .store_base import VectorStoreBase


class VectorStoreFAISS(VectorStoreBase):
    """Dense vector store backed by a FAISS IndexFlatL2, persisted via pickle."""

    def __init__(self, dim, index_path=None, metadata_path=None):
        """Create an empty L2 index of width *dim*.

        Args:
            dim: Embedding dimensionality.
            index_path: Where to persist the FAISS index (default "faiss.index").
            metadata_path: Where to persist texts/embeddings/metadata
                (default "faiss.meta.pkl").
        """
        self.dim = dim
        self.index = faiss.IndexFlatL2(dim)
        self.embeddings = []  # kept alongside the index for result payloads
        self.metadatas = []
        self.texts = []
        self.index_path = index_path or "faiss.index"
        self.metadata_path = metadata_path or "faiss.meta.pkl"

    def add_documents(self, chunks, embeddings, metadatas):
        """Add chunk texts + embeddings; persists to disk after every call."""
        arr = np.array(embeddings).astype('float32')
        self.index.add(arr)
        self.embeddings.extend(embeddings)
        self.texts.extend([chunk["text"] for chunk in chunks])
        self.metadatas.extend(metadatas)
        self.save()

    def search(self, query_embed, k=5, method=None, max_distance=0.8):
        """Return up to k nearest chunks with L2 distance <= max_distance.

        The former per-hit debug print has been removed: library code should
        not write diagnostics to stdout on every query.
        """
        query = np.array(query_embed).reshape(1, -1).astype('float32')
        D, I = self.index.search(query, k)
        results = []
        for score, idx in zip(D[0], I[0]):
            # FAISS pads missing results with idx == -1; skip those.
            if idx < 0 or idx >= len(self.texts):
                continue
            if score <= max_distance:
                results.append({
                    "text": self.texts[idx],
                    "embedding": self.embeddings[idx],
                    "meta": self.metadatas[idx],
                    "distance": score
                })
        return results

    def save(self):
        """Write the FAISS index and the parallel text/metadata arrays to disk."""
        faiss.write_index(self.index, self.index_path)
        with open(self.metadata_path, "wb") as f:
            pickle.dump({
                "texts": self.texts,
                "embeddings": self.embeddings,
                "metadatas": self.metadatas
            }, f)

    def load(self):
        """Restore a previously saved index and metadata from disk.

        SECURITY NOTE: pickle.load executes arbitrary code from the file;
        only load metadata files this process itself wrote.
        """
        self.index = faiss.read_index(self.index_path)
        with open(self.metadata_path, "rb") as f:
            data = pickle.load(f)
        self.texts = data["texts"]
        self.embeddings = data["embeddings"]
        self.metadatas = data["metadatas"]


# Backwards-compatible alias used elsewhere in the project.
FaissStore = VectorStoreFAISS
pipeline/vector_store/hybrid_retriever.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from .faiss_store import VectorStoreFAISS
from .bm25_keyword_store import BM25KeywordStore
from .store_base import VectorStoreBase


class HybridRetriever(VectorStoreBase):
    """Combine dense (FAISS) and lexical (BM25) retrieval via rank fusion."""

    def __init__(self, faiss_store, bm25_store, alpha=0.5):
        """*alpha* weights the dense ranking; (1 - alpha) weights BM25."""
        self.faiss_store = faiss_store
        self.bm25_store = bm25_store
        self.alpha = alpha

    def add_documents(self, chunks, embeddings, metadatas):
        """Index chunks in both underlying stores (BM25 ignores embeddings)."""
        self.faiss_store.add_documents(chunks, embeddings, metadatas)
        self.bm25_store.add_documents(chunks, None, metadatas)

    def search(self, query_embed, query_text, k=5, method=None):
        """Return the top-k chunks by a weighted rank-fusion of both stores.

        Each candidate's score is alpha * (k - dense_rank) +
        (1 - alpha) * (k - bm25_rank); a text absent from one list gets
        rank k there (zero contribution).
        """
        faiss_hits = self.faiss_store.search(query_embed, k)
        bm25_hits = self.bm25_store.search(query_text, k)

        # Map each hit text to its rank within each result list.
        faiss_ids = {hit["text"]: i for i, hit in enumerate(faiss_hits)}
        bm25_ids = {hit["text"]: i for i, hit in enumerate(bm25_hits)}

        hybrid = []
        for candidate in set(faiss_ids) | set(bm25_ids):
            dense_rank = faiss_ids.get(candidate, k)
            lexical_rank = bm25_ids.get(candidate, k)
            fused = self.alpha * (k - dense_rank) + (1 - self.alpha) * (k - lexical_rank)
            # Prefer the FAISS hit's metadata, fall back to the BM25 hit's.
            if candidate in faiss_ids:
                meta = faiss_hits[faiss_ids[candidate]]["meta"]
            else:
                meta = bm25_hits[bm25_ids[candidate]]["meta"]
            hybrid.append({"text": candidate, "meta": meta, "score": fused})
        hybrid.sort(key=lambda item: -item["score"])
        return hybrid[:k]
pipeline/vector_store/store_base.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
class VectorStoreBase:
    """Minimal interface that every vector/keyword store must implement."""

    def add_documents(self, chunks, embeddings, metadatas):
        """Index the given chunks; concrete stores must override."""
        raise NotImplementedError

    def search(self, query_embed=None, query_text=None, k=5, method=None):
        """Query the store for up to *k* hits; concrete stores must override."""
        raise NotImplementedError