Spaces:
Sleeping
Sleeping
Pipeline added
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- pipeline/__init__.py +1 -0
- pipeline/__pycache__/__init__.cpython-313.pyc +0 -0
- pipeline/chunking/__init__.py +10 -0
- pipeline/chunking/__pycache__/__init__.cpython-313.pyc +0 -0
- pipeline/chunking/__pycache__/chunk_benchmark.cpython-313.pyc +0 -0
- pipeline/chunking/__pycache__/fixed_chunker.cpython-313.pyc +0 -0
- pipeline/chunking/__pycache__/semantic_chunker.cpython-313.pyc +0 -0
- pipeline/chunking/__pycache__/splitter_base.cpython-313.pyc +0 -0
- pipeline/chunking/chunk_benchmark.py +24 -0
- pipeline/chunking/fixed_chunker.py +20 -0
- pipeline/chunking/semantic_chunker.py +103 -0
- pipeline/chunking/splitter_base.py +7 -0
- pipeline/embeddings/__init__.py +1 -0
- pipeline/embeddings/__pycache__/__init__.cpython-313.pyc +0 -0
- pipeline/embeddings/__pycache__/embedder_base.cpython-313.pyc +0 -0
- pipeline/embeddings/__pycache__/sentence_transformer_embed.cpython-313.pyc +0 -0
- pipeline/embeddings/embedder_base.py +18 -0
- pipeline/embeddings/sentence_transformer_embed.py +10 -0
- pipeline/ingest/__init__.py +0 -0
- pipeline/ingest/__pycache__/__init__.cpython-313.pyc +0 -0
- pipeline/ingest/__pycache__/docx_parser.cpython-313.pyc +0 -0
- pipeline/ingest/__pycache__/html_parser.cpython-313.pyc +0 -0
- pipeline/ingest/__pycache__/parser_base.cpython-313.pyc +0 -0
- pipeline/ingest/__pycache__/pdf_parser.cpython-313.pyc +0 -0
- pipeline/ingest/__pycache__/txt_parser.cpython-313.pyc +0 -0
- pipeline/ingest/docx_parser.py +18 -0
- pipeline/ingest/html_parser.py +20 -0
- pipeline/ingest/parser_base.py +6 -0
- pipeline/ingest/pdf_parser.py +26 -0
- pipeline/ingest/txt_parser.py +15 -0
- pipeline/monitoring/__init__.py +0 -0
- pipeline/monitoring/drift_detection.py +0 -0
- pipeline/monitoring/feedback.py +0 -0
- pipeline/rag/__init__.py +0 -0
- pipeline/rag/__pycache__/__init__.cpython-313.pyc +0 -0
- pipeline/rag/__pycache__/prompt_templates.cpython-313.pyc +0 -0
- pipeline/rag/__pycache__/retrieval_engine.cpython-313.pyc +0 -0
- pipeline/rag/prompt_templates.py +12 -0
- pipeline/rag/retrieval_engine.py +58 -0
- pipeline/vector_store/__init__.py +1 -0
- pipeline/vector_store/__pycache__/__init__.cpython-313.pyc +0 -0
- pipeline/vector_store/__pycache__/bm25_keyword_store.cpython-313.pyc +0 -0
- pipeline/vector_store/__pycache__/faiss_store.cpython-313.pyc +0 -0
- pipeline/vector_store/__pycache__/hybrid_retriever.cpython-313.pyc +0 -0
- pipeline/vector_store/__pycache__/store_base.cpython-313.pyc +0 -0
- pipeline/vector_store/__pycache__/store_registry.cpython-313.pyc +0 -0
- pipeline/vector_store/bm25_keyword_store.py +21 -0
- pipeline/vector_store/faiss_store.py +59 -0
- pipeline/vector_store/hybrid_retriever.py +32 -0
- pipeline/vector_store/store_base.py +6 -0
pipeline/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
pipeline/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (153 Bytes). View file
|
|
|
pipeline/chunking/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .fixed_chunker import FixedChunker
|
| 2 |
+
from .semantic_chunker import SemanticChunker
|
| 3 |
+
|
| 4 |
+
def chunk_text(text: str, chunk_size: int, overlap: int, method="fixed"):
    """Split *text* using the named chunking strategy.

    Args:
        text: source text to split.
        chunk_size: target chunk length in characters.
        overlap: characters shared between consecutive chunks.
        method: "fixed" or "semantic".

    Returns:
        The list of chunk dicts produced by the selected chunker.

    Raises:
        ValueError: if *method* does not name a known strategy.
    """
    for name, chunker_cls in (("fixed", FixedChunker), ("semantic", SemanticChunker)):
        if method == name:
            return chunker_cls().chunk(text, chunk_size, overlap)
    raise ValueError("Unknown chunking method: " + str(method))
|
pipeline/chunking/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (814 Bytes). View file
|
|
|
pipeline/chunking/__pycache__/chunk_benchmark.cpython-313.pyc
ADDED
|
Binary file (2.25 kB). View file
|
|
|
pipeline/chunking/__pycache__/fixed_chunker.cpython-313.pyc
ADDED
|
Binary file (1.22 kB). View file
|
|
|
pipeline/chunking/__pycache__/semantic_chunker.cpython-313.pyc
ADDED
|
Binary file (5.51 kB). View file
|
|
|
pipeline/chunking/__pycache__/splitter_base.cpython-313.pyc
ADDED
|
Binary file (791 Bytes). View file
|
|
|
pipeline/chunking/chunk_benchmark.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
from . import chunk_text
|
| 3 |
+
|
| 4 |
+
def benchmark_chunker(text, chunk_size, overlap, method):
    """Run one chunker over *text* and print timing and size statistics.

    Fix: the original computed sum(lens)/len(lens) and min/max unconditionally,
    which raised ZeroDivisionError/ValueError whenever the chunker returned no
    chunks; size statistics are now only printed for a non-empty result.
    """
    print(f"Benchmarking {method} chunker...")
    t0 = time.time()
    chunks = chunk_text(text, chunk_size, overlap, method)
    t1 = time.time()
    print(f"Total Chunks: {len(chunks)}")
    if chunks:
        lens = [len(c["text"]) for c in chunks]
        print(f"Avg Chunk Size: {sum(lens)/len(lens):.1f}")
        print(f"Min/Max Chunk Size: {min(lens)}/{max(lens)}")
    print(f"Time Taken: {t1-t0:.4f}s")
    print("Sample metadata:", chunks[0]["meta"] if chunks else None)
    print("--- Sample chunk ---")
    if chunks:
        print(chunks[0]["text"][:200])
    print("-" * 40)

if __name__ == "__main__":
    # Synthetic document: 100 "pages" of 20 repeated sentences each.
    text = ("This is a sample paragraph. " * 20 + "\n\n") * 100
    benchmark_chunker(text, chunk_size=300, overlap=50, method="fixed")
    benchmark_chunker(text, chunk_size=300, overlap=0, method="semantic")
|
pipeline/chunking/fixed_chunker.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Dict
|
| 2 |
+
from .splitter_base import SplitterBase
|
| 3 |
+
|
| 4 |
+
class FixedChunker(SplitterBase):
    """Splits text into fixed-size character windows with overlap."""

    def chunk(self, text: str, chunk_size: int, overlap: int) -> List[Dict]:
        """Return chunk dicts with character offsets "start"/"end".

        Fix: the original advanced the cursor by ``chunk_size - overlap``
        without validation, looping forever when overlap >= chunk_size or
        chunk_size <= 0. Invalid arguments now raise immediately.

        Raises:
            ValueError: if chunk_size <= 0, or overlap is negative or
                >= chunk_size.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if overlap < 0 or overlap >= chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
        chunks = []
        idx = 0
        while idx < len(text):
            end = min(idx + chunk_size, len(text))
            chunks.append({
                "text": text[idx:end],
                "start": idx,
                "end": end,
                "meta": {"source": "fixed"}
            })
            if end == len(text):
                break
            idx += chunk_size - overlap
        return chunks
|
pipeline/chunking/semantic_chunker.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import List, Dict
|
| 3 |
+
from .splitter_base import SplitterBase
|
| 4 |
+
|
| 5 |
+
HEADING_PATTERNS = [
|
| 6 |
+
r"^(CHAPTER|Chapter|Section)\s+\d+",
|
| 7 |
+
r"^[A-Z][A-Z ]{5,}$",
|
| 8 |
+
r"^(\d+\.){1,3}\s+\w+",
|
| 9 |
+
]
|
| 10 |
+
PAGE_PATTERN = re.compile(r"\b[Pp]age\s+(\d+)\b|\f")
|
| 11 |
+
FIGURE_PATTERN = re.compile(r"^(Figure|Table|Image)[ .:]+\d+[ .:]+", re.IGNORECASE)
|
| 12 |
+
|
| 13 |
+
def find_headings(lines):
|
| 14 |
+
headings = []
|
| 15 |
+
for i, line in enumerate(lines):
|
| 16 |
+
for pat in HEADING_PATTERNS:
|
| 17 |
+
if re.match(pat, line.strip()):
|
| 18 |
+
headings.append((i, line.strip()))
|
| 19 |
+
break
|
| 20 |
+
return headings
|
| 21 |
+
|
| 22 |
+
def split_by_size(text, chunk_size, overlap):
    """Slice *text* into (start, end, substring) windows of at most
    chunk_size characters, stepping by chunk_size - overlap; windows that
    are only whitespace are dropped.

    Fix: the original stepped by ``chunk_size - overlap`` unchecked, so
    overlap >= chunk_size (or chunk_size <= 0) looped forever. Invalid
    arguments now raise.

    Raises:
        ValueError: if chunk_size <= 0, or overlap is negative or
            >= chunk_size.
    """
    if chunk_size <= 0 or overlap < 0 or overlap >= chunk_size:
        raise ValueError("require chunk_size > 0 and 0 <= overlap < chunk_size")
    subsections = []
    i = 0
    while i < len(text):
        end_i = min(i + chunk_size, len(text))
        chunk = text[i:end_i]
        if chunk.strip():
            subsections.append((i, end_i, chunk))
        if end_i == len(text):
            break
        i += chunk_size - overlap
    return subsections
|
| 35 |
+
|
| 36 |
+
class SemanticChunker(SplitterBase):
    """Structure-aware chunker: tracks headings and page markers, emits
    figure/table captions as standalone chunks, and size-limits each
    paragraph via split_by_size().

    NOTE(review): "start"/"end" here are LINE indices, while FixedChunker
    uses character offsets for the same keys — confirm consumers expect
    this mismatch.
    """

    def chunk(self, text: str, chunk_size: int, overlap: int) -> List[Dict]:
        """Split *text* into chunk dicts with section/page metadata."""
        lines = text.splitlines()
        cur_section = None   # most recent heading text; None before any heading
        cur_page = 1         # page number in effect while scanning
        chunks = []

        # First pass: map every line index to the page number in effect there.
        line_pages = {}
        for i, line in enumerate(lines):
            m = PAGE_PATTERN.search(line)
            if m and m.group(1):   # group(1) is None for bare form-feed matches
                cur_page = int(m.group(1))
            line_pages[i] = cur_page

        # Second pass: walk the lines and emit chunks.
        i = 0
        while i < len(lines):
            line = lines[i]

            # Headings update the current section but are not emitted.
            if any(re.match(pat, line.strip()) for pat in HEADING_PATTERNS):
                cur_section = line.strip()
                i += 1
                continue

            # Figure/table captions become standalone single-line chunks.
            if FIGURE_PATTERN.match(line):
                chunks.append({
                    "text": line.strip(),
                    "start": i,
                    "end": i + 1,
                    "meta": {
                        "section": cur_section or "NO_SECTION",
                        "page": line_pages.get(i, 1),
                        "type": "figure"
                    }
                })
                i += 1
                continue

            # Page-marker lines are dropped from the output entirely.
            if PAGE_PATTERN.search(line):
                i += 1
                continue

            # Accumulate a paragraph: consecutive non-blank lines that are
            # not headings, captions, or page markers.
            para_lines = []
            para_start = i
            while (i < len(lines) and lines[i].strip() and
                    not any(re.match(pat, lines[i].strip()) for pat in HEADING_PATTERNS) and
                    not FIGURE_PATTERN.match(lines[i]) and
                    not PAGE_PATTERN.search(lines[i])):
                para_lines.append(lines[i])
                i += 1
            para_text = "\n".join(para_lines).strip()

            if para_text:
                # Size-limit the paragraph; every sub-chunk shares the whole
                # paragraph's line range and starting page.
                subchunks = split_by_size(para_text, chunk_size, overlap)
                for substart, subend, chunk_str in subchunks:
                    chunks.append({
                        "text": chunk_str,
                        "start": para_start,
                        "end": i,
                        "meta": {
                            "section": cur_section or "NO_SECTION",
                            "page": line_pages.get(para_start, 1),
                            "source": "semantic"
                        }
                    })

            # Skip blank separator lines before the next paragraph.
            while i < len(lines) and not lines[i].strip():
                i += 1
        return chunks
|
pipeline/chunking/splitter_base.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
from typing import Dict, List

class SplitterBase(ABC):
    """Common interface implemented by all text chunking strategies."""

    @abstractmethod
    def chunk(self, text: str, chunk_size: int, overlap: int) -> List[Dict]:
        """Split *text* into a list of chunk dicts."""
        ...
|
pipeline/embeddings/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .embedder_base import embed_chunks
|
pipeline/embeddings/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (217 Bytes). View file
|
|
|
pipeline/embeddings/__pycache__/embedder_base.cpython-313.pyc
ADDED
|
Binary file (1.35 kB). View file
|
|
|
pipeline/embeddings/__pycache__/sentence_transformer_embed.cpython-313.pyc
ADDED
|
Binary file (836 Bytes). View file
|
|
|
pipeline/embeddings/embedder_base.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from . import sentence_transformer_embed
|
| 2 |
+
|
| 3 |
+
# Registry mapping backend name -> module exposing embed(texts, model_name).
EMBEDDING_BACKENDS = {
    "sentence_transformers": sentence_transformer_embed
}

def embed_chunks(chunks, backend: str, model_name: str, version: str | None = None):
    """Embed chunks (dicts carrying "text"/"meta", or plain strings).

    Returns one {"embedding", "meta", "version"} dict per chunk; *version*
    defaults to "<backend>:<model_name>" for provenance tracking.

    Raises:
        ValueError: if *backend* is not registered in EMBEDDING_BACKENDS.
    """
    mod = EMBEDDING_BACKENDS.get(backend)
    if not mod:
        raise ValueError(f"Unknown backend: {backend}")
    # Accept either chunk dicts or bare strings.
    texts = [c["text"] if isinstance(c, dict) else c for c in chunks]
    metas = [c.get("meta", {}) if isinstance(c, dict) else {} for c in chunks]
    embeddings = mod.embed(texts, model_name)
    version = version or f"{backend}:{model_name}"
    return [
        {"embedding": emb, "meta": meta, "version": version}
        for emb, meta in zip(embeddings, metas)
    ]
|
pipeline/embeddings/sentence_transformer_embed.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer

# Fix: the original constructed a SentenceTransformer inside embed(), which
# reloads model weights from disk on every call. Loaded models are now cached
# by name for the life of the process.
_MODEL_CACHE = {}

def _get_model(model_name):
    """Return a cached SentenceTransformer, loading it on first use."""
    model = _MODEL_CACHE.get(model_name)
    if model is None:
        model = _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return model

def embed(texts, model_name="all-MiniLM-L6-v2"):
    """Embed a list of strings; returns a list of list[float] vectors."""
    model = _get_model(model_name)
    return model.encode(texts, show_progress_bar=False, convert_to_numpy=True).tolist()

def embed_chunks(chunks, model_name="all-MiniLM-L6-v2"):
    """Embed chunk dicts using their "text" field."""
    texts = [chunk['text'] for chunk in chunks]
    return embed(texts, model_name=model_name)
|
pipeline/ingest/__init__.py
ADDED
|
File without changes
|
pipeline/ingest/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (160 Bytes). View file
|
|
|
pipeline/ingest/__pycache__/docx_parser.cpython-313.pyc
ADDED
|
Binary file (1.33 kB). View file
|
|
|
pipeline/ingest/__pycache__/html_parser.cpython-313.pyc
ADDED
|
Binary file (1.5 kB). View file
|
|
|
pipeline/ingest/__pycache__/parser_base.cpython-313.pyc
ADDED
|
Binary file (710 Bytes). View file
|
|
|
pipeline/ingest/__pycache__/pdf_parser.cpython-313.pyc
ADDED
|
Binary file (1.48 kB). View file
|
|
|
pipeline/ingest/__pycache__/txt_parser.cpython-313.pyc
ADDED
|
Binary file (1.18 kB). View file
|
|
|
pipeline/ingest/docx_parser.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from docx import Document
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict

class DOCXParser(ParserBase):
    """Extracts plain text and basic metadata from Word (.docx) files."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return (full_text, metadata) for the document at *filepath*."""
        document = Document(filepath)
        # One output line per paragraph, in document order.
        text = "\n".join(paragraph.text for paragraph in document.paragraphs)
        metadata = {
            "filetype": "docx",
            "filename": str(Path(filepath).name),
            "num_paragraphs": len(document.paragraphs)
        }
        return text, metadata
|
pipeline/ingest/html_parser.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict

class HTMLParser(ParserBase):
    """Extracts visible text and basic metadata from HTML files."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return (visible_text, metadata) for the HTML file at *filepath*."""
        html = Path(filepath).read_text(encoding="utf-8")
        soup = BeautifulSoup(html, "html.parser")
        # Remove non-visible content before extracting text.
        for hidden in soup(["script", "style"]):
            hidden.decompose()
        text = soup.get_text(separator="\n", strip=True)
        return text, {
            "filetype": "html",
            "filename": str(Path(filepath).name),
            "length": len(text)
        }
|
pipeline/ingest/parser_base.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod

class ParserBase(ABC):
    """Abstract interface implemented by every document parser."""

    @abstractmethod
    def extract_text_and_metadata(self, filepath: str) -> tuple[str, dict]:
        """Return (text, metadata) extracted from the file at *filepath*.

        Fix: the original annotation ``-> (str, dict)`` is a runtime tuple of
        classes, not a valid type annotation; ``tuple[str, dict]`` is the
        correct form.
        """
        pass
|
pipeline/ingest/pdf_parser.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict

class PDFParser(ParserBase):
    """Extracts text and per-page metadata from PDFs via PyMuPDF (fitz)."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return (full_text, metadata); a newline follows every page's text.

        Fixes: the document handle was never closed (resource leak) — now
        managed with a context manager — and per-page string concatenation
        (quadratic ``text +=``) is replaced by a single join.
        """
        with fitz.open(filepath) as doc:
            page_texts = []
            pages_metadata = []
            for i, page in enumerate(doc):
                page_text = page.get_text()
                page_texts.append(page_text)
                pages_metadata.append({
                    "page_num": i + 1,
                    "length": len(page_text),
                    "first_100_chars": page_text[:100],
                })
            metadata = {
                "filetype": "pdf",
                "n_pages": doc.page_count,
                "pages": pages_metadata,
                "filename": str(Path(filepath).name)
            }
        # Matches the original output: every page text followed by "\n".
        text = "".join(t + "\n" for t in page_texts)
        return text, metadata
|
pipeline/ingest/txt_parser.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
from .parser_base import ParserBase
from typing import Tuple, Dict

class TXTParser(ParserBase):
    """Reads plain UTF-8 text files verbatim."""

    def extract_text_and_metadata(self, filepath: str) -> Tuple[str, Dict]:
        """Return (file_contents, metadata) for the text file at *filepath*."""
        text = Path(filepath).read_text(encoding="utf-8")
        metadata = {
            "filetype": "txt",
            "filename": str(Path(filepath).name),
            "length": len(text)
        }
        return text, metadata
|
| 15 |
+
|
pipeline/monitoring/__init__.py
ADDED
|
File without changes
|
pipeline/monitoring/drift_detection.py
ADDED
|
File without changes
|
pipeline/monitoring/feedback.py
ADDED
|
File without changes
|
pipeline/rag/__init__.py
ADDED
|
File without changes
|
pipeline/rag/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (157 Bytes). View file
|
|
|
pipeline/rag/__pycache__/prompt_templates.cpython-313.pyc
ADDED
|
Binary file (488 Bytes). View file
|
|
|
pipeline/rag/__pycache__/retrieval_engine.cpython-313.pyc
ADDED
|
Binary file (2.2 kB). View file
|
|
|
pipeline/rag/prompt_templates.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
DEFAULT_PROMPT_TEMPLATE = """
|
| 2 |
+
You are an AI assistant helping answer book/document-based questions.
|
| 3 |
+
|
| 4 |
+
Use ONLY the provided context to answer the user's question. If the answer is not found in the context, say "I cannot answer based on the provided information."
|
| 5 |
+
|
| 6 |
+
Context:
|
| 7 |
+
{context}
|
| 8 |
+
|
| 9 |
+
Question: {question}
|
| 10 |
+
|
| 11 |
+
Answer:
|
| 12 |
+
"""
|
pipeline/rag/retrieval_engine.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pipeline.embeddings import embed_chunks
from pipeline.vector_store import get_store
from llm import get_llm
from pipeline.rag.prompt_templates import DEFAULT_PROMPT_TEMPLATE

def answer_question(
    question: str,
    embed_model: str = "all-MiniLM-L6-v2",
    store_type: str = "faiss",
    store_kwargs: dict | None = None,
    llm_name: str = "mistralai/Mistral-7B-Instruct-v0.2",
    prompt_template: str | None = None,
    top_k: int = 5,
    rerank_fn=None,
):
    """End-to-end RAG query: embed the question, retrieve top-k chunks from
    the configured vector store, build a prompt, and generate an answer.

    Args:
        question: user query to answer.
        embed_model: sentence-transformers model used to embed the question.
        store_type: store registry key ("faiss", "hybrid", ...).
        store_kwargs: constructor kwargs for the store; defaults to
            {"dim": 384} (the all-MiniLM-L6-v2 embedding size).
        llm_name: model handed to get_llm().
        prompt_template: format string with {context}/{question}; defaults to
            DEFAULT_PROMPT_TEMPLATE.
        top_k: number of chunks to retrieve (and keep after reranking).
        rerank_fn: optional callable (question, results) -> reordered results.

    Returns:
        Dict with "answer", "chunks", "question", "context", "prompt".
    """
    # Wrap the question as a chunk dict so embed_chunks accepts it.
    q_chunk = {"text": question}
    q_embeds = embed_chunks([q_chunk], backend="sentence_transformers", model_name=embed_model)
    # embed_chunks returns enriched dicts; tolerate raw vectors as well.
    if isinstance(q_embeds[0], dict):
        q_embed = q_embeds[0]["embedding"]
    else:
        q_embed = q_embeds[0]

    if store_kwargs is None:
        store_kwargs = {"dim": 384}
    vector_store = get_store(store_type, **store_kwargs)
    # Restore a previously persisted index when the store supports it.
    if hasattr(vector_store, "load"):
        vector_store.load()
    # Hybrid stores need the raw query text alongside the embedding.
    if store_type == "hybrid":
        results = vector_store.search(q_embed, question, k=top_k)
    else:
        results = vector_store.search(q_embed, k=top_k)
    print("answer_question: top-k results:", [r["text"][:60] for r in results])  # debug output

    if rerank_fn:
        results = rerank_fn(question, results)[:top_k]

    context = "\n\n".join([r["text"] for r in results])
    if prompt_template is None:
        prompt_template = DEFAULT_PROMPT_TEMPLATE
    prompt = prompt_template.format(context=context, question=question)

    llm = get_llm(llm_name)
    answer = llm.generate(prompt)

    return {
        "answer": answer,
        "chunks": results,
        "question": question,
        "context": context,
        "prompt": prompt
    }
|
pipeline/vector_store/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .store_registry import get_store
|
pipeline/vector_store/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (217 Bytes). View file
|
|
|
pipeline/vector_store/__pycache__/bm25_keyword_store.cpython-313.pyc
ADDED
|
Binary file (2.27 kB). View file
|
|
|
pipeline/vector_store/__pycache__/faiss_store.cpython-313.pyc
ADDED
|
Binary file (4.09 kB). View file
|
|
|
pipeline/vector_store/__pycache__/hybrid_retriever.cpython-313.pyc
ADDED
|
Binary file (2.62 kB). View file
|
|
|
pipeline/vector_store/__pycache__/store_base.cpython-313.pyc
ADDED
|
Binary file (787 Bytes). View file
|
|
|
pipeline/vector_store/__pycache__/store_registry.cpython-313.pyc
ADDED
|
Binary file (901 Bytes). View file
|
|
|
pipeline/vector_store/bm25_keyword_store.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rank_bm25 import BM25Okapi
from .store_base import VectorStoreBase

class BM25KeywordStore(VectorStoreBase):
    """Sparse keyword retrieval over chunk texts using BM25."""

    def __init__(self):
        self.corpus = []      # chunk texts, in insertion order
        self.bm25 = None      # rebuilt after every add_documents call
        self.metadatas = []   # parallel to corpus

    def add_documents(self, chunks, embeddings=None, metadatas=None):
        """Index chunk texts; *embeddings* are ignored (BM25 is keyword-only)."""
        self.corpus.extend([chunk["text"] for chunk in chunks])
        self.metadatas.extend(metadatas or [{} for _ in chunks])
        # NOTE: BM25Okapi has no incremental update, so the whole index is
        # rebuilt on every call (cost grows with total corpus size).
        self.bm25 = BM25Okapi([doc.split(" ") for doc in self.corpus])

    def search(self, query_text, k=5, method=None):
        """Return up to k best chunks as {text, meta, score} dicts.

        Fix: the original raised AttributeError when called before any
        add_documents() (self.bm25 still None); an empty store now returns [].
        """
        if self.bm25 is None:
            return []
        scores = self.bm25.get_scores(query_text.split(" "))
        best_idx = sorted(range(len(scores)), key=lambda i: -scores[i])[:k]
        return [
            {"text": self.corpus[i], "meta": self.metadatas[i], "score": scores[i]}
            for i in best_idx
        ]
|
pipeline/vector_store/faiss_store.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import faiss
import numpy as np
import pickle
from .store_base import VectorStoreBase

class VectorStoreFAISS(VectorStoreBase):
    """Dense vector store backed by a flat (exact) L2 FAISS index.

    Texts, embeddings and metadata live in parallel Python lists and are
    persisted with pickle next to the FAISS index file.
    """

    def __init__(self, dim, index_path=None, metadata_path=None):
        # dim: embedding dimensionality expected by the index.
        self.dim = dim
        self.index = faiss.IndexFlatL2(dim)  # exact L2 search, no training step
        self.embeddings = []  # parallel to texts/metadatas
        self.metadatas = []
        self.texts = []
        self.index_path = index_path or "faiss.index"
        self.metadata_path = metadata_path or "faiss.meta.pkl"

    def add_documents(self, chunks, embeddings, metadatas):
        """Append chunks + embeddings to the index and persist immediately."""
        arr = np.array(embeddings).astype('float32')  # FAISS requires float32
        self.index.add(arr)
        self.embeddings.extend(embeddings)
        self.texts.extend([chunk["text"] for chunk in chunks])
        self.metadatas.extend(metadatas)
        # NOTE: full save() on every call — disk write cost grows with store size.
        self.save()

    def search(self, query_embed, k=5, method=None, max_distance=0.8):
        """Return up to k nearest chunks whose L2 distance <= max_distance.

        NOTE(review): 0.8 as an L2 cutoff presumably assumes normalized
        embeddings — confirm against the embedding backend.
        """
        query = np.array(query_embed).reshape(1, -1).astype('float32')
        D, I = self.index.search(query, k)
        results = []
        for score, idx in zip(D[0], I[0]):
            print(f"Chunk idx: {idx}, L2 distance: {score:.4f}")  # debug output
            # FAISS pads missing neighbours with index -1; also guard against
            # an index/metadata mismatch after a partial load.
            if idx < 0 or idx >= len(self.texts):
                continue
            if score <= max_distance:
                results.append({
                    "text": self.texts[idx],
                    "embedding": self.embeddings[idx],
                    "meta": self.metadatas[idx],
                    "distance": score
                })
        return results


    def save(self):
        """Persist the FAISS index and the parallel metadata lists."""
        faiss.write_index(self.index, self.index_path)
        with open(self.metadata_path, "wb") as f:
            pickle.dump({
                "texts": self.texts,
                "embeddings": self.embeddings,
                "metadatas": self.metadatas
            }, f)

    def load(self):
        """Restore the index + metadata written by save().

        SECURITY: pickle.load executes arbitrary code during deserialization —
        only load files this application wrote itself.
        """
        self.index = faiss.read_index(self.index_path)
        with open(self.metadata_path, "rb") as f:
            data = pickle.load(f)
        self.texts = data["texts"]
        self.embeddings = data["embeddings"]
        self.metadatas = data["metadatas"]

# Backwards-compatible alias.
FaissStore = VectorStoreFAISS
|
pipeline/vector_store/hybrid_retriever.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# NOTE(review): VectorStoreFAISS and BM25KeywordStore are imported but unused
# here — the constructor takes already-built store instances. Possibly kept
# for re-export; confirm before removing.
from .faiss_store import VectorStoreFAISS
from .bm25_keyword_store import BM25KeywordStore
from .store_base import VectorStoreBase

class HybridRetriever(VectorStoreBase):
    """Fuses dense (FAISS) and sparse (BM25) retrieval by rank.

    alpha weights the dense ranking; (1 - alpha) weights the keyword ranking.
    """

    def __init__(self, faiss_store, bm25_store, alpha=0.5):
        self.faiss_store = faiss_store
        self.bm25_store = bm25_store
        self.alpha = alpha  # in [0, 1]; 1.0 = dense only, 0.0 = keyword only

    def add_documents(self, chunks, embeddings, metadatas):
        """Index the chunks in both underlying stores."""
        self.faiss_store.add_documents(chunks, embeddings, metadatas)
        self.bm25_store.add_documents(chunks, None, metadatas)  # BM25 ignores embeddings

    def search(self, query_embed, query_text, k=5, method=None):
        """Return the top-k chunks by alpha-weighted reciprocal rank.

        A chunk absent from one store's top-k gets rank k (i.e. zero credit)
        from that store. NOTE(review): ties are broken by set iteration
        order, which is not deterministic across runs.
        """
        faiss_hits = self.faiss_store.search(query_embed, k)
        bm25_hits = self.bm25_store.search(query_text, k)

        # Simple hybrid: combine and sort by average rank/score (tune as desired)
        faiss_ids = {hit["text"]: i for i, hit in enumerate(faiss_hits)}
        bm25_ids = {hit["text"]: i for i, hit in enumerate(bm25_hits)}
        all_texts = set(faiss_ids) | set(bm25_ids)

        hybrid = []
        for text in all_texts:
            f_rank = faiss_ids.get(text, k)
            b_rank = bm25_ids.get(text, k)
            # Higher is better: rank 0 contributes k, rank k-1 contributes 1.
            joint_score = self.alpha * (k - f_rank) + (1 - self.alpha) * (k - b_rank)
            # Prefer faiss meta but fallback to bm25
            meta = faiss_hits[faiss_ids[text]]["meta"] if text in faiss_ids else bm25_hits[bm25_ids[text]]["meta"]
            hybrid.append({"text": text, "meta": meta, "score": joint_score})
        return sorted(hybrid, key=lambda x: -x["score"])[:k]
|
pipeline/vector_store/store_base.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class VectorStoreBase:
    """Minimal duck-typed interface shared by all vector/keyword stores."""

    def add_documents(self, chunks, embeddings, metadatas):
        """Index chunks with their embeddings and metadata (subclasses override)."""
        raise NotImplementedError

    def search(self, query_embed=None, query_text=None, k=5, method=None):
        """Return the top-k hits for a query (subclasses override)."""
        raise NotImplementedError
|