VashuTheGreat2 commited on
Commit
1e6d8a7
·
verified ·
1 Parent(s): 5cf4ac9

Upload folder using huggingface_hub

Browse files
.github/workflows/test.yml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# CI workflow: run the project's pytest suite on every pull request into main.
name: Python Tests

on:
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout Code
        uses: actions/checkout@v4

      # uv manages the virtualenv and resolves from the committed uv.lock.
      - name: Install uv
        uses: astral-sh/setup-uv@v5

      - name: Install Dependencies
        run: uv sync

      - name: Run Tests
        # Provider credentials are injected from repository secrets.
        # NOTE(review): unset secrets expand to empty strings, so a missing
        # secret shows up as an auth failure inside the tests, not as a
        # workflow configuration error — confirm all nine are configured.
        env:
          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
          TAVILY_API_KEY: ${{ secrets.TAVILY_API_KEY }}
          Gemini_API_Key: ${{ secrets.Gemini_API_Key }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          HUGGINGFACEHUB_ACCESS_TOKEN: ${{ secrets.HUGGINGFACEHUB_ACCESS_TOKEN }}
          APP_API_KEY: ${{ secrets.APP_API_KEY }}
        run: uv run pytest -v
README.md CHANGED
@@ -58,12 +58,31 @@ The worker sub-graph is responsible for specialized information retrieval from v
58
  - **TXT**: Plain text analysis.
59
  - **Images (OCR)**: Extraction of text from PNG/JPG using specialized loaders.
60
  - **🤖 Autonomous Orchestration**: Uses a Llama-3.3-70B model on **AWS Bedrock** with a manual JSON fallback mechanism for 100% reliable structured output.
61
- - **🔍 Hybrid Retrieval**: Combines local FAISS vector stores with real-time Google Search integration.
 
 
 
 
62
  - **🧠 Persistence & Memory**: Full multi-turn conversation support with LangGraph checkpointers.
63
  - **⚡ Modern Tech Stack**: Built with `uv` for lightning-fast dependency management and `FastAPI` for a high-performance backend.
64
 
65
  ---
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  ## 🛠️ Tech Stack
68
 
69
  - **Core**: [Python 3.12](https://www.python.org/)
 
58
  - **TXT**: Plain text analysis.
59
  - **Images (OCR)**: Extraction of text from PNG/JPG using specialized loaders.
60
  - **🤖 Autonomous Orchestration**: Uses a Llama-3.3-70B model on **AWS Bedrock** with a manual JSON fallback mechanism for 100% reliable structured output.
61
+ - **🔍 Advanced Retrieval Pipeline**:
62
+ - **Hybrid Search**: Combines semantic vector search with keyword-based BM25 for maximum precision.
63
+ - **RRF (Reciprocal Rank Fusion)**: Merges multiple retrieval streams with mathematical rigor.
64
+ - **Reranking**: Uses `Flashrank` to re-score and filter the most relevant context before generation.
65
+ - **Multi-Query Expansion**: Generates multiple perspectives of a user query to capture hidden context.
66
  - **🧠 Persistence & Memory**: Full multi-turn conversation support with LangGraph checkpointers.
67
  - **⚡ Modern Tech Stack**: Built with `uv` for lightning-fast dependency management and `FastAPI` for a high-performance backend.
68
 
69
  ---
70
 
71
+ ## 🔍 Advanced Retrieval Pipeline
72
+
73
+ Multi-RAG doesn't just "search" — it employs a sophisticated multi-stage retrieval architecture to ensure the LLM receives the most accurate and relevant context possible.
74
+
75
+ | Technique | Description | Benefit |
76
+ | :--- | :--- | :--- |
77
+ | **Hybrid Search** | Dual-path retrieval using **FAISS (Dense)** and **BM25 (Sparse)**. | Captures both deep semantic meaning and exact keyword matches. |
78
+ | **Multi-Query** | The Orchestrator decomposes complex queries into multiple specialized sub-tasks. | Ensures no part of a complex request is overlooked. |
79
+ | **RRF** | **Reciprocal Rank Fusion** algorithm to merge results from different retrievers. | Provides a unified, unbiased ranking of candidates. |
80
+ | **Reranker** | **Flashrank-based cross-encoding** to re-evaluate the top-K results. | Drastically reduces "hallucinations" by filtering out low-relevance noise. |
81
+
82
+ ---
83
+
84
+
85
+
86
  ## 🛠️ Tech Stack
87
 
88
  - **Core**: [Python 3.12](https://www.python.org/)
debug_transformers.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
# Manual smoke-test: construct the GLM-OCR image-text-to-text pipeline to
# verify that the installed `transformers` version can resolve the model.
# NOTE(review): this downloads model weights on first run — keep it out of CI.
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="zai-org/GLM-OCR")
graph.png CHANGED
main.py CHANGED
@@ -15,11 +15,11 @@ app.mount("/blog/images", StaticFiles(directory="images"), name="blog_images")
15
  os.makedirs(DATA_FOLDER_PATH, exist_ok=True)
16
  os.makedirs(DB_FOLDER_PATH, exist_ok=True)
17
 
18
- # if __name__ == "__main__":
19
- # uv.run(
20
- # "main:app",
21
- # host="0.0.0.0",
22
- # port=7860,
23
- # reload=False,
24
- # reload_excludes=["db/*", "data/*", "logs/*", "vector_db/*", ".venv/*"],
25
- # )
 
15
  os.makedirs(DATA_FOLDER_PATH, exist_ok=True)
16
  os.makedirs(DB_FOLDER_PATH, exist_ok=True)
17
 
18
if __name__ == "__main__":
    # Local import: the ASGI server is only needed when this module is run
    # directly, not when an external server imports `main:app`.
    import uvicorn

    # BUG FIX(review): the original called `uv.run(...)`. `uv` is the package
    # manager CLI, not a Python module with a server API — calling it would
    # raise NameError/AttributeError at startup. `uvicorn.run` is the correct
    # way to serve the FastAPI app with these options.
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=7860,
        reload=False,
        reload_excludes=["db/*", "data/*", "logs/*", "vector_db/*", ".venv/*"],
    )
pyproject.toml CHANGED
@@ -11,6 +11,7 @@ dependencies = [
11
  "easyocr>=1.7.2",
12
  "faiss-cpu>=1.13.2",
13
  "fastapi>=0.135.1",
 
14
  "keybert>=0.9.0",
15
  "langchain>=1.2.10",
16
  "langchain-aws>=1.3.1",
@@ -28,7 +29,9 @@ dependencies = [
28
  "pi-heif>=1.3.0",
29
  "pillow>=12.1.1",
30
  "pytesseract>=0.3.13",
 
31
  "python-multipart>=0.0.22",
 
32
  "sentence-transformers>=5.2.3",
33
  "transformers>=5.3.0",
34
  "unstructured>=0.21.5",
 
11
  "easyocr>=1.7.2",
12
  "faiss-cpu>=1.13.2",
13
  "fastapi>=0.135.1",
14
+ "flashrank>=0.2.10",
15
  "keybert>=0.9.0",
16
  "langchain>=1.2.10",
17
  "langchain-aws>=1.3.1",
 
29
  "pi-heif>=1.3.0",
30
  "pillow>=12.1.1",
31
  "pytesseract>=0.3.13",
32
+ "pytest>=9.0.3",
33
  "python-multipart>=0.0.22",
34
+ "rank-bm25>=0.2.2",
35
  "sentence-transformers>=5.2.3",
36
  "transformers>=5.3.0",
37
  "unstructured>=0.21.5",
src/MultiRag/components/content_embedder.py CHANGED
@@ -21,7 +21,15 @@ class ContentRetreiver(Retreiver):
21
  self.retriever = retriever
22
 
23
  async def retreive(self, query: str):
24
- return await self.retriever.ainvoke(query)
 
 
 
 
 
 
 
 
25
  class ContentEmbedder:
26
  def __init__(self, content_embedder_config: ContentEmbedderConfig):
27
  self.content_embedder_config = content_embedder_config
 
21
  self.retriever = retriever
22
 
23
  async def retreive(self, query: str):
24
+ docs = await self.retriever.ainvoke(query)
25
+ # Ensure metadata is serializable (convert np.float32 to float)
26
+ for doc in docs:
27
+ if hasattr(doc, "metadata") and "relevance_score" in doc.metadata:
28
+ try:
29
+ doc.metadata["relevance_score"] = float(doc.metadata["relevance_score"])
30
+ except (TypeError, ValueError):
31
+ pass
32
+ return docs
33
  class ContentEmbedder:
34
  def __init__(self, content_embedder_config: ContentEmbedderConfig):
35
  self.content_embedder_config = content_embedder_config
src/MultiRag/tests/run_pipeline_test.py CHANGED
@@ -1,137 +1,137 @@
1
- import os
2
- import sys
3
- import asyncio
4
- sys.path.append(os.getcwd())
5
 
6
- from dotenv import load_dotenv
7
- load_dotenv()
8
- from logger import *
9
- import logging
10
 
11
- from src.MultiRag.pipeline.run_pipeline import RunPipeline
12
 
13
 
14
- from src.MultiRag.models.rag_model import Content
15
- from src.MultiRag.components.content_embedder import ContentEmbedder
16
- from src.MultiRag.entity.config_entity import ContentEmbedderConfig
17
- import os
18
 
19
- # ============= generating retreivers ===========================
20
 
21
- async def generate_retreivers(thread_id):
22
- for file in os.listdir("docs"):
23
- logging.info(f"Processing file: {file}")
24
 
25
- content_embedder_config = ContentEmbedderConfig(
26
- file_path=f"docs/{file}",
27
- vector_store_path=f"db/{thread_id}/{file}", # Updated path structure
28
- )
29
- component = ContentEmbedder(content_embedder_config=content_embedder_config)
30
- retreiver = await component.embed_content()
31
- logging.info(f"Generated retreiver for {file}: {retreiver}")
32
 
33
 
34
- # ============= testing pdf query loading =======================
35
- async def pdf_test():
36
 
37
- run_pipeline = RunPipeline()
38
 
39
- # Mocking user uploaded files
40
- temp_user_content = [
41
- Content(
42
- name="AI_Intro.pdf",
43
- about="An introductory document about Artificial Intelligence and Machine Learning.",
44
- path="docs/AI_Intro.pdf"
45
- )
46
- ]
47
-
48
- res = await run_pipeline.initiate(
49
- thread_id="1",
50
- query="What does the AI_Intro.pdf say about Neural Networks? Use the pdf",
51
- userContent=temp_user_content
52
- )
53
-
54
- logging.info(f"Final Pipeline Response: {res}")
55
-
56
- # ============= testing txt query loading =======================
57
- async def txt_test():
58
- run_pipeline = RunPipeline()
59
 
60
- # Mocking user uploaded files
61
- temp_user_content = [
62
- Content(
63
- name="growing_ai_tools.txt",
64
- about="General notes about growing AI tools.",
65
- path="docs/growing_ai_tools.txt"
66
- )
67
- ]
68
-
69
- res = await run_pipeline.initiate(
70
- thread_id="1",
71
- query="What does the growing_ai_tools.txt say about AI tools? use the txt file",
72
- userContent=temp_user_content
73
- )
74
-
75
- logging.info(f"Final Pipeline Response: {res}")
76
-
77
-
78
- # ============= testing docs query loading =======================
79
- async def docx_test():
80
- run_pipeline = RunPipeline()
81
 
82
- # Mocking user uploaded files
83
- temp_user_content = [
84
- Content(
85
- name="google.docx",
86
- about="General notes about company Google.",
87
- path="docs/google.docx"
88
- )
89
- ]
90
-
91
- res = await run_pipeline.initiate(
92
- thread_id="1",
93
- query="What does the google.docx say about Google? use the docx file",
94
- userContent=temp_user_content
95
- )
96
-
97
- logging.info(f"Final Pipeline Response: {res}")
98
-
99
-
100
- # ============= testing image query loading =======================
101
- async def image_test():
102
- run_pipeline = RunPipeline()
103
 
104
- # Mocking user uploaded files
105
- temp_user_content = [
106
- Content(
107
- name="lena.png",
108
- about="An image of a girl.",
109
- path="docs/lena.png"
110
- )
111
- ]
112
 
113
- res = await run_pipeline.initiate(
114
- thread_id="1",
115
- query="What does the lena.png say about the girl? use the image file",
116
- userContent=temp_user_content
117
- )
118
 
119
- logging.info(f"Final Pipeline Response: {res}")
120
 
121
 
122
 
123
 
124
- # ============== Running all the tests =============================
125
- async def main():
126
- logging.info("Starting generating retreivers...")
127
- await generate_retreivers(thread_id="1")
128
- logging.info("Retreivers generated successfully. Starting pipeline tests...")
129
- logging.info("Starting pipeline tests...")
130
- await pdf_test()
131
- await txt_test()
132
- await docx_test()
133
- await image_test()
134
- logging.info("Pipeline tests completed.")
135
 
136
 
137
- asyncio.run(main())
 
1
+ # import os
2
+ # import sys
3
+ # import asyncio
4
+ # sys.path.append(os.getcwd())
5
 
6
+ # from dotenv import load_dotenv
7
+ # load_dotenv()
8
+ # from logger import *
9
+ # import logging
10
 
11
+ # from src.MultiRag.pipeline.run_pipeline import RunPipeline
12
 
13
 
14
+ # from src.MultiRag.models.rag_model import Content
15
+ # from src.MultiRag.components.content_embedder import ContentEmbedder
16
+ # from src.MultiRag.entity.config_entity import ContentEmbedderConfig
17
+ # import os
18
 
19
+ # # ============= generating retreivers ===========================
20
 
21
+ # async def generate_retreivers(thread_id):
22
+ # for file in os.listdir("docs"):
23
+ # logging.info(f"Processing file: {file}")
24
 
25
+ # content_embedder_config = ContentEmbedderConfig(
26
+ # file_path=f"docs/{file}",
27
+ # vector_store_path=f"db/{thread_id}/{file}", # Updated path structure
28
+ # )
29
+ # component = ContentEmbedder(content_embedder_config=content_embedder_config)
30
+ # retreiver = await component.embed_content()
31
+ # logging.info(f"Generated retreiver for {file}: {retreiver}")
32
 
33
 
34
+ # # ============= testing pdf query loading =======================
35
+ # async def pdf_test():
36
 
37
+ # run_pipeline = RunPipeline()
38
 
39
+ # # Mocking user uploaded files
40
+ # temp_user_content = [
41
+ # Content(
42
+ # name="AI_Intro.pdf",
43
+ # about="An introductory document about Artificial Intelligence and Machine Learning.",
44
+ # path="docs/AI_Intro.pdf"
45
+ # )
46
+ # ]
47
+
48
+ # res = await run_pipeline.initiate(
49
+ # thread_id="1",
50
+ # query="What does the AI_Intro.pdf say about Neural Networks? Use the pdf",
51
+ # userContent=temp_user_content
52
+ # )
53
+
54
+ # logging.info(f"Final Pipeline Response: {res}")
55
+
56
+ # # ============= testing txt query loading =======================
57
+ # async def txt_test():
58
+ # run_pipeline = RunPipeline()
59
 
60
+ # # Mocking user uploaded files
61
+ # temp_user_content = [
62
+ # Content(
63
+ # name="growing_ai_tools.txt",
64
+ # about="General notes about growing AI tools.",
65
+ # path="docs/growing_ai_tools.txt"
66
+ # )
67
+ # ]
68
+
69
+ # res = await run_pipeline.initiate(
70
+ # thread_id="1",
71
+ # query="What does the growing_ai_tools.txt say about AI tools? use the txt file",
72
+ # userContent=temp_user_content
73
+ # )
74
+
75
+ # logging.info(f"Final Pipeline Response: {res}")
76
+
77
+
78
+ # # ============= testing docs query loading =======================
79
+ # async def docx_test():
80
+ # run_pipeline = RunPipeline()
81
 
82
+ # # Mocking user uploaded files
83
+ # temp_user_content = [
84
+ # Content(
85
+ # name="google.docx",
86
+ # about="General notes about company Google.",
87
+ # path="docs/google.docx"
88
+ # )
89
+ # ]
90
+
91
+ # res = await run_pipeline.initiate(
92
+ # thread_id="1",
93
+ # query="What does the google.docx say about Google? use the docx file",
94
+ # userContent=temp_user_content
95
+ # )
96
+
97
+ # logging.info(f"Final Pipeline Response: {res}")
98
+
99
+
100
+ # # ============= testing image query loading =======================
101
+ # async def image_test():
102
+ # run_pipeline = RunPipeline()
103
 
104
+ # # Mocking user uploaded files
105
+ # temp_user_content = [
106
+ # Content(
107
+ # name="lena.png",
108
+ # about="An image of a girl.",
109
+ # path="docs/lena.png"
110
+ # )
111
+ # ]
112
 
113
+ # res = await run_pipeline.initiate(
114
+ # thread_id="1",
115
+ # query="What does the lena.png say about the girl? use the image file",
116
+ # userContent=temp_user_content
117
+ # )
118
 
119
+ # logging.info(f"Final Pipeline Response: {res}")
120
 
121
 
122
 
123
 
124
+ # # ============== Running all the tests =============================
125
+ # async def main():
126
+ # logging.info("Starting generating retreivers...")
127
+ # await generate_retreivers(thread_id="1")
128
+ # logging.info("Retreivers generated successfully. Starting pipeline tests...")
129
+ # logging.info("Starting pipeline tests...")
130
+ # await pdf_test()
131
+ # await txt_test()
132
+ # await docx_test()
133
+ # await image_test()
134
+ # logging.info("Pipeline tests completed.")
135
 
136
 
137
+ # asyncio.run(main())
src/MultiRag/tests/test_run_pipeline.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import asyncio
4
+ sys.path.append(os.getcwd())
5
+
6
+ import logging
7
+ import pytest
8
+ from dotenv import load_dotenv
9
+ from logger import *
10
+
11
+ from src.MultiRag.pipeline.run_pipeline import RunPipeline
12
+ from src.MultiRag.models.rag_model import Content
13
+ from src.MultiRag.components.content_embedder import ContentEmbedder
14
+ from src.MultiRag.entity.config_entity import ContentEmbedderConfig
15
+
16
+ load_dotenv()
17
+
18
+ THREAD_ID = "1"
19
+
20
+
21
+ @pytest.fixture(scope="session", autouse=True)
22
+ def generate_retreivers():
23
+ async def _generate():
24
+ for file in os.listdir("docs"):
25
+ logging.info(f"Processing file: {file}")
26
+ content_embedder_config = ContentEmbedderConfig(
27
+ file_path=f"docs/{file}",
28
+ vector_store_path=f"db/{THREAD_ID}/{file}",
29
+ )
30
+ component = ContentEmbedder(content_embedder_config=content_embedder_config)
31
+ retreiver = await component.embed_content()
32
+ logging.info(f"Generated retreiver for {file}: {retreiver}")
33
+
34
+ asyncio.run(_generate())
35
+
36
+
37
+ def test_pdf_query():
38
+ async def _run():
39
+ run_pipeline = RunPipeline()
40
+ temp_user_content = [
41
+ Content(
42
+ name="AI_Intro.pdf",
43
+ about="An introductory document about Artificial Intelligence and Machine Learning.",
44
+ path="docs/AI_Intro.pdf"
45
+ )
46
+ ]
47
+ res = await run_pipeline.initiate(
48
+ thread_id=THREAD_ID,
49
+ query="What does the AI_Intro.pdf say about Neural Networks? Use the pdf",
50
+ userContent=temp_user_content
51
+ )
52
+ logging.info(f"Final Pipeline Response: {res}")
53
+ return res
54
+
55
+ result = asyncio.run(_run())
56
+ assert result is not None
57
+
58
+
59
+ def test_txt_query():
60
+ async def _run():
61
+ run_pipeline = RunPipeline()
62
+ temp_user_content = [
63
+ Content(
64
+ name="growing_ai_tools.txt",
65
+ about="General notes about growing AI tools.",
66
+ path="docs/growing_ai_tools.txt"
67
+ )
68
+ ]
69
+ res = await run_pipeline.initiate(
70
+ thread_id=THREAD_ID,
71
+ query="What does the growing_ai_tools.txt say about AI tools? use the txt file",
72
+ userContent=temp_user_content
73
+ )
74
+ logging.info(f"Final Pipeline Response: {res}")
75
+ return res
76
+
77
+ result = asyncio.run(_run())
78
+ assert result is not None
79
+
80
+
81
+ def test_docx_query():
82
+ async def _run():
83
+ run_pipeline = RunPipeline()
84
+ temp_user_content = [
85
+ Content(
86
+ name="google.docx",
87
+ about="General notes about company Google.",
88
+ path="docs/google.docx"
89
+ )
90
+ ]
91
+ res = await run_pipeline.initiate(
92
+ thread_id=THREAD_ID,
93
+ query="What does the google.docx say about Google? use the docx file",
94
+ userContent=temp_user_content
95
+ )
96
+ logging.info(f"Final Pipeline Response: {res}")
97
+ return res
98
+
99
+ result = asyncio.run(_run())
100
+ assert result is not None
101
+
102
+
103
+ def test_image_query():
104
+ async def _run():
105
+ run_pipeline = RunPipeline()
106
+ temp_user_content = [
107
+ Content(
108
+ name="lena.png",
109
+ about="An image of a girl.",
110
+ path="docs/lena.png"
111
+ )
112
+ ]
113
+ res = await run_pipeline.initiate(
114
+ thread_id=THREAD_ID,
115
+ query="What does the lena.png say about the girl? use the image file",
116
+ userContent=temp_user_content
117
+ )
118
+ logging.info(f"Final Pipeline Response: {res}")
119
+ return res
120
+
121
+ result = asyncio.run(_run())
122
+ assert result is not None
src/MultiRag/utils/ingestion_utils.py CHANGED
@@ -6,6 +6,10 @@ from langchain_huggingface import HuggingFaceEmbeddings
6
  from utils.asyncHandler import asyncHandler
7
  from src.MultiRag.constants import EMBEDDING_MODEL
8
  from src.MultiRag.constants import EXCEPTED_FILE_TYPE, RETREIVER_DEFAULT_K
 
 
 
 
9
  import logging
10
 
11
  # ---------------- Embedding Model ----------------
@@ -140,8 +144,36 @@ async def create_vector_store(path: str = "db", docs: str = "data"):
140
  # ---------------- Retriever ----------------
141
  @asyncHandler
142
  async def create_retreiver(vectorstore, k: int = RETREIVER_DEFAULT_K):
143
- retriever = vectorstore.as_retriever(search_kwargs={"k": k})
144
- return retriever
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
 
147
  # ---------------- Get Raw Documents ----------------
 
6
  from utils.asyncHandler import asyncHandler
7
  from src.MultiRag.constants import EMBEDDING_MODEL
8
  from src.MultiRag.constants import EXCEPTED_FILE_TYPE, RETREIVER_DEFAULT_K
9
+ from langchain_classic.retrievers import EnsembleRetriever
10
+ from langchain_community.retrievers import BM25Retriever
11
+ from langchain_classic.retrievers.contextual_compression import ContextualCompressionRetriever
12
+ from langchain_community.document_compressors import FlashrankRerank
13
  import logging
14
 
15
  # ---------------- Embedding Model ----------------
 
144
# ---------------- Retriever ----------------
@asyncHandler
async def create_retreiver(vectorstore, k: int = RETREIVER_DEFAULT_K):
    """Build a hybrid retrieval chain over *vectorstore*.

    Pipeline: FAISS dense search + BM25 sparse search, merged with an
    EnsembleRetriever (RRF-style fusion), then reranked down to *k*
    documents with Flashrank.

    Args:
        vectorstore: a FAISS vector store holding the embedded documents.
        k: number of documents the final retriever should return.

    Returns:
        A ContextualCompressionRetriever combining hybrid search + reranking.
    """
    # 1. Extract documents from the FAISS store for the BM25 corpus.
    # NOTE(review): `docstore._dict` is a private langchain implementation
    # detail — there is no public accessor, so confirm this on upgrades.
    logging.info("Extracting documents from vectorstore for BM25...")
    documents = list(vectorstore.docstore._dict.values())

    # 2. Give the base retrievers a wider candidate pool than the final k
    #    so the reranker has real choices to score.
    base_k = max(k * 2, 20)
    vector_retriever = vectorstore.as_retriever(search_kwargs={"k": base_k})

    # 3/4. Hybrid search. ROBUSTNESS FIX: BM25Retriever.from_documents
    # raises on an empty corpus; fall back to pure vector search so an
    # empty store still yields a working (reranked) retriever.
    if documents:
        bm25_retriever = BM25Retriever.from_documents(documents)
        bm25_retriever.k = base_k
        base_retriever = EnsembleRetriever(
            retrievers=[vector_retriever, bm25_retriever],
            weights=[0.7, 0.3],
        )
    else:
        logging.warning("Vectorstore has no documents; skipping BM25 branch.")
        base_retriever = vector_retriever

    # 5. Flashrank cross-encoder rescoring, keeping the top-k.
    compressor = FlashrankRerank(top_n=k)

    # 6. Final compression retriever: hybrid candidates -> reranked top-k.
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=base_retriever,
    )

    return compression_retriever
177
 
178
 
179
  # ---------------- Get Raw Documents ----------------
uv.lock CHANGED
@@ -51,6 +51,7 @@ dependencies = [
51
  { name = "easyocr" },
52
  { name = "faiss-cpu" },
53
  { name = "fastapi" },
 
54
  { name = "keybert" },
55
  { name = "langchain" },
56
  { name = "langchain-aws" },
@@ -68,7 +69,9 @@ dependencies = [
68
  { name = "pi-heif" },
69
  { name = "pillow" },
70
  { name = "pytesseract" },
 
71
  { name = "python-multipart" },
 
72
  { name = "sentence-transformers" },
73
  { name = "transformers" },
74
  { name = "unstructured" },
@@ -85,6 +88,7 @@ requires-dist = [
85
  { name = "easyocr", specifier = ">=1.7.2" },
86
  { name = "faiss-cpu", specifier = ">=1.13.2" },
87
  { name = "fastapi", specifier = ">=0.135.1" },
 
88
  { name = "keybert", specifier = ">=0.9.0" },
89
  { name = "langchain", specifier = ">=1.2.10" },
90
  { name = "langchain-aws", specifier = ">=1.3.1" },
@@ -102,7 +106,9 @@ requires-dist = [
102
  { name = "pi-heif", specifier = ">=1.3.0" },
103
  { name = "pillow", specifier = ">=12.1.1" },
104
  { name = "pytesseract", specifier = ">=0.3.13" },
 
105
  { name = "python-multipart", specifier = ">=0.0.22" },
 
106
  { name = "sentence-transformers", specifier = ">=5.2.3" },
107
  { name = "transformers", specifier = ">=5.3.0" },
108
  { name = "unstructured", specifier = ">=0.21.5" },
@@ -1007,6 +1013,22 @@ wheels = [
1007
  { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" },
1008
  ]
1009
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1010
  [[package]]
1011
  name = "flatbuffers"
1012
  version = "25.12.19"
@@ -1486,6 +1508,15 @@ wheels = [
1486
  { url = "https://files.pythonhosted.org/packages/a4/ed/1f1afb2e9e7f38a545d628f864d562a5ae64fe6f7a10e28ffb9b185b4e89/importlib_resources-6.5.2-py3-none-any.whl", hash = "sha256:789cfdc3ed28c78b67a06acb8126751ced69a3d5f79c095a98298cd8a760ccec", size = 37461, upload-time = "2025-01-03T18:51:54.306Z" },
1487
  ]
1488
 
 
 
 
 
 
 
 
 
 
1489
  [[package]]
1490
  name = "installer"
1491
  version = "0.7.0"
@@ -3269,6 +3300,15 @@ wheels = [
3269
  { url = "https://files.pythonhosted.org/packages/ec/d2/de599c95ba0a973b94410477f8bf0b6f0b5e67360eb89bcb1ad365258beb/pillow-12.1.1-cp314-cp314t-win_arm64.whl", hash = "sha256:7b03048319bfc6170e93bd60728a1af51d3dd7704935feb228c4d4faab35d334", size = 2546446, upload-time = "2026-02-11T04:22:50.342Z" },
3270
  ]
3271
 
 
 
 
 
 
 
 
 
 
3272
  [[package]]
3273
  name = "posthog"
3274
  version = "5.4.0"
@@ -3821,6 +3861,22 @@ wheels = [
3821
  { url = "https://files.pythonhosted.org/packages/7a/33/8312d7ce74670c9d39a532b2c246a853861120486be9443eebf048043637/pytesseract-0.3.13-py3-none-any.whl", hash = "sha256:7a99c6c2ac598360693d83a416e36e0b33a67638bb9d77fdcac094a3589d4b34", size = 14705, upload-time = "2024-08-16T02:36:10.09Z" },
3822
  ]
3823
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3824
  [[package]]
3825
  name = "python-bidi"
3826
  version = "0.6.7"
@@ -3973,6 +4029,18 @@ wheels = [
3973
  { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" },
3974
  ]
3975
 
 
 
 
 
 
 
 
 
 
 
 
 
3976
  [[package]]
3977
  name = "rapidfuzz"
3978
  version = "3.14.3"
 
51
  { name = "easyocr" },
52
  { name = "faiss-cpu" },
53
  { name = "fastapi" },
54
+ { name = "flashrank" },
55
  { name = "keybert" },
56
  { name = "langchain" },
57
  { name = "langchain-aws" },
 
69
  { name = "pi-heif" },
70
  { name = "pillow" },
71
  { name = "pytesseract" },
72
+ { name = "pytest" },
73
  { name = "python-multipart" },
74
+ { name = "rank-bm25" },
75
  { name = "sentence-transformers" },
76
  { name = "transformers" },
77
  { name = "unstructured" },
 
88
  { name = "easyocr", specifier = ">=1.7.2" },
89
  { name = "faiss-cpu", specifier = ">=1.13.2" },
90
  { name = "fastapi", specifier = ">=0.135.1" },
91
+ { name = "flashrank", specifier = ">=0.2.10" },
92
  { name = "keybert", specifier = ">=0.9.0" },
93
  { name = "langchain", specifier = ">=1.2.10" },
94
  { name = "langchain-aws", specifier = ">=1.3.1" },
 
106
  { name = "pi-heif", specifier = ">=1.3.0" },
107
  { name = "pillow", specifier = ">=12.1.1" },
108
  { name = "pytesseract", specifier = ">=0.3.13" },
109
+ { name = "pytest", specifier = ">=9.0.3" },
110
  { name = "python-multipart", specifier = ">=0.0.22" },
111
+ { name = "rank-bm25", specifier = ">=0.2.2" },
112
  { name = "sentence-transformers", specifier = ">=5.2.3" },
113
  { name = "transformers", specifier = ">=5.3.0" },
114
  { name = "unstructured", specifier = ">=0.21.5" },
 
1013
  { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" },
1014
  ]
1015
 
1016
+ [[package]]
1017
+ name = "flashrank"
1018
+ version = "0.2.10"
1019
+ source = { registry = "https://pypi.org/simple" }
1020
+ dependencies = [
1021
+ { name = "numpy" },
1022
+ { name = "onnxruntime" },
1023
+ { name = "requests" },
1024
+ { name = "tokenizers" },
1025
+ { name = "tqdm" },
1026
+ ]
1027
+ sdist = { url = "https://files.pythonhosted.org/packages/55/1f/176cb4a857a70c3538f637e19389ab6aed21548a1ba1d1424fccc8bba108/FlashRank-0.2.10.tar.gz", hash = "sha256:f8f82a25c32fdfc668a09dc4089421d6aab8e7f71308424b541f40bb3f01d9db", size = 18905, upload-time = "2025-01-06T13:33:01.657Z" }
1028
+ wheels = [
1029
+ { url = "https://files.pythonhosted.org/packages/ec/99/72639cc1c9221c5bc77a2df1c2d352fe11965553bdf7d3e0856e7fcc8fd6/FlashRank-0.2.10-py3-none-any.whl", hash = "sha256:5d3272ae657d793c132d1e7917ed9e2adf49e0e1c60735583a67b051c6f0434a", size = 14511, upload-time = "2025-01-06T13:32:59.42Z" },
1030
+ ]
1031
+
1032
  [[package]]
1033
  name = "flatbuffers"
1034
  version = "25.12.19"
 
1508
  { url = "https://files.pythonhosted.org/packages/a4/ed/1f1afb2e9e7f38a545d628f864d562a5ae64fe6f7a10e28ffb9b185b4e89/importlib_resources-6.5.2-py3-none-any.whl", hash = "sha256:789cfdc3ed28c78b67a06acb8126751ced69a3d5f79c095a98298cd8a760ccec", size = 37461, upload-time = "2025-01-03T18:51:54.306Z" },
1509
  ]
1510
 
1511
+ [[package]]
1512
+ name = "iniconfig"
1513
+ version = "2.3.0"
1514
+ source = { registry = "https://pypi.org/simple" }
1515
+ sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" }
1516
+ wheels = [
1517
+ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
1518
+ ]
1519
+
1520
  [[package]]
1521
  name = "installer"
1522
  version = "0.7.0"
 
3300
  { url = "https://files.pythonhosted.org/packages/ec/d2/de599c95ba0a973b94410477f8bf0b6f0b5e67360eb89bcb1ad365258beb/pillow-12.1.1-cp314-cp314t-win_arm64.whl", hash = "sha256:7b03048319bfc6170e93bd60728a1af51d3dd7704935feb228c4d4faab35d334", size = 2546446, upload-time = "2026-02-11T04:22:50.342Z" },
3301
  ]
3302
 
3303
+ [[package]]
3304
+ name = "pluggy"
3305
+ version = "1.6.0"
3306
+ source = { registry = "https://pypi.org/simple" }
3307
+ sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" }
3308
+ wheels = [
3309
+ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
3310
+ ]
3311
+
3312
  [[package]]
3313
  name = "posthog"
3314
  version = "5.4.0"
 
3861
  { url = "https://files.pythonhosted.org/packages/7a/33/8312d7ce74670c9d39a532b2c246a853861120486be9443eebf048043637/pytesseract-0.3.13-py3-none-any.whl", hash = "sha256:7a99c6c2ac598360693d83a416e36e0b33a67638bb9d77fdcac094a3589d4b34", size = 14705, upload-time = "2024-08-16T02:36:10.09Z" },
3862
  ]
3863
 
3864
+ [[package]]
3865
+ name = "pytest"
3866
+ version = "9.0.3"
3867
+ source = { registry = "https://pypi.org/simple" }
3868
+ dependencies = [
3869
+ { name = "colorama", marker = "sys_platform == 'win32'" },
3870
+ { name = "iniconfig" },
3871
+ { name = "packaging" },
3872
+ { name = "pluggy" },
3873
+ { name = "pygments" },
3874
+ ]
3875
+ sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" }
3876
+ wheels = [
3877
+ { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" },
3878
+ ]
3879
+
3880
  [[package]]
3881
  name = "python-bidi"
3882
  version = "0.6.7"
 
4029
  { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" },
4030
  ]
4031
 
4032
+ [[package]]
4033
+ name = "rank-bm25"
4034
+ version = "0.2.2"
4035
+ source = { registry = "https://pypi.org/simple" }
4036
+ dependencies = [
4037
+ { name = "numpy" },
4038
+ ]
4039
+ sdist = { url = "https://files.pythonhosted.org/packages/fc/0a/f9579384aa017d8b4c15613f86954b92a95a93d641cc849182467cf0bb3b/rank_bm25-0.2.2.tar.gz", hash = "sha256:096ccef76f8188563419aaf384a02f0ea459503fdf77901378d4fd9d87e5e51d", size = 8347, upload-time = "2022-02-16T12:10:52.196Z" }
4040
+ wheels = [
4041
+ { url = "https://files.pythonhosted.org/packages/2a/21/f691fb2613100a62b3fa91e9988c991e9ca5b89ea31c0d3152a3210344f9/rank_bm25-0.2.2-py3-none-any.whl", hash = "sha256:7bd4a95571adadfc271746fa146a4bcfd89c0cf731e49c3d1ad863290adbe8ae", size = 8584, upload-time = "2022-02-16T12:10:50.626Z" },
4042
+ ]
4043
+
4044
  [[package]]
4045
  name = "rapidfuzz"
4046
  version = "3.14.3"
worker_sub_graph.png CHANGED