alihaiderscholar committed on
Commit
aabd1d8
·
verified ·
1 Parent(s): 063466c

Upload 19 files

Browse files
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (170 Bytes). View file
 
src/__pycache__/cache.cpython-313.pyc ADDED
Binary file (4.43 kB). View file
 
src/__pycache__/chunking.cpython-313.pyc ADDED
Binary file (2.1 kB). View file
 
src/__pycache__/database.cpython-313.pyc ADDED
Binary file (2.87 kB). View file
 
src/__pycache__/embeddings.cpython-313.pyc ADDED
Binary file (840 Bytes). View file
 
src/__pycache__/indexing.cpython-313.pyc ADDED
Binary file (2.57 kB). View file
 
src/__pycache__/ingestion.cpython-313.pyc ADDED
Binary file (7.16 kB). View file
 
src/__pycache__/retrieval.cpython-313.pyc ADDED
Binary file (4.92 kB). View file
 
src/__pycache__/vision_processor.cpython-313.pyc ADDED
Binary file (3.65 kB). View file
 
src/cache.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ from qdrant_client import QdrantClient
4
+ from qdrant_client.http import models
5
+ from src.embeddings import get_embedding_model
6
+ from dotenv import load_dotenv
7
+
8
+ # Load secrets
9
+ load_dotenv()
10
+
11
class SemanticCache:
    """Semantic query cache backed by a dedicated Qdrant collection.

    Stores (question embedding, answer) pairs. A new query whose embedding is
    cosine-similar to a cached question above ``self.threshold`` is answered
    from the cache instead of re-running the full RAG pipeline.
    """

    def __init__(self, collection_name: str = "pro_rag_cache"):
        self.collection_name = collection_name

        # --- CONNECTION LOGIC ---
        # Prefer Qdrant Cloud when credentials are present, else local Docker.
        qdrant_url = os.getenv("QDRANT_URL")
        qdrant_key = os.getenv("QDRANT_API_KEY")

        if qdrant_url and qdrant_key:
            print("☁️ [Cache] Connecting to Qdrant Cloud...")
            self.client = QdrantClient(url=qdrant_url, api_key=qdrant_key)
        else:
            print("🏠 [Cache] Connecting to Local Docker...")
            self.client = QdrantClient(url="http://localhost:6333")

        self.embedding_model = get_embedding_model()
        # Minimum cosine similarity for a cache hit.
        self.threshold = 0.92

        # Initialize the cache collection (idempotent). Warn instead of
        # crashing so the app still works with the cache disabled.
        try:
            if not self.client.collection_exists(collection_name):
                print(f"⚙️ Initializing Semantic Cache: '{collection_name}'...")
                self.client.create_collection(
                    collection_name=collection_name,
                    vectors_config=models.VectorParams(
                        size=3072,  # must match text-embedding-3-large
                        distance=models.Distance.COSINE
                    )
                )
        except Exception as e:
            print(f"⚠️ Cache Initialization Warning: {e}")

    def search_cache(self, query: str):
        """Return the cached answer for a semantically similar question, or None.

        Uses the modern 'query_points' method.
        """
        try:
            # 1. Embed the query
            vector = self.embedding_model.embed_query(query)

            # 2. Search the Qdrant cache collection
            search_result = self.client.query_points(
                collection_name=self.collection_name,
                query=vector,
                limit=1,
                with_payload=True
            ).points

            # 3. Check threshold
            if search_result:
                best_match = search_result[0]
                if best_match.score >= self.threshold:
                    print(f"⚡ CACHE HIT! (Similarity: {best_match.score:.4f})")
                    return best_match.payload["answer"]

            score = search_result[0].score if search_result else 0
            print(f"🐢 CACHE MISS (Best match: {score:.4f})")
            return None

        except Exception as e:
            # Cache is best-effort: report the error but never crash the app.
            print(f"⚠️ Cache Search Error: {e}")
            return None

    def add_to_cache(self, query: str, answer: str):
        """Save a (query, answer) pair. Best-effort: failures are logged, not raised."""
        import uuid  # stdlib; local import keeps the file-level imports untouched

        try:
            vector = self.embedding_model.embed_query(query)
            # FIX: the previous int(time.time() * 1000) ID silently overwrote
            # an earlier entry when two answers were cached within the same
            # millisecond. A random UUID (a valid Qdrant point ID) is unique.
            point_id = str(uuid.uuid4())

            self.client.upsert(
                collection_name=self.collection_name,
                points=[
                    models.PointStruct(
                        id=point_id,
                        vector=vector,
                        payload={
                            "question": query,
                            "answer": answer,
                            "timestamp": time.time()
                        }
                    )
                ]
            )
        except Exception as e:
            print(f"⚠️ Failed to save to cache: {e}")
src/chunking.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
2
+ from langchain_core.documents import Document
3
+
4
class ChunkingManager:
    """Splits unstructured documents into overlapping, vector-ready chunks."""

    def __init__(self):
        # 1000-char chunks are large enough to capture context; the 200-char
        # overlap keeps sentences intact across chunk borders.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", " ", ""]
        )

    def chunk_documents(self, documents: list[Document]):
        """Split large documents; pass structured CSV rows through untouched.

        CSV rows ("structured_data") were already formatted as single
        sentences at ingestion time, so splitting them would only hurt.
        """
        print(f"✂️ Starting Chunking Process on {len(documents)} documents...")

        preserved = []    # CSV rows and other already-atomic records
        split_chunks = []  # everything that went through the splitter

        for document in documents:
            if document.metadata.get("category", "") == "structured_data":
                # Structured data -> keep as-is.
                preserved.append(document)
            else:
                # Text/PDF/Word pages can be very long -> split.
                split_chunks.extend(self.text_splitter.split_documents([document]))

        combined = preserved + split_chunks

        print(f" - CSV Rows preserved: {len(preserved)}")
        print(f" - Text Pages split into: {len(split_chunks)} chunks")
        print(f"✅ Total Vector-Ready Chunks: {len(combined)}")

        return combined
src/database.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from qdrant_client import QdrantClient
3
+ from qdrant_client.http import models
4
+ from dotenv import load_dotenv
5
+
6
+ # Load environment variables
7
+ load_dotenv()
8
+
9
class VectorDB:
    """Thin wrapper around the Qdrant collection that stores RAG chunks."""

    def __init__(self, collection_name: str = "pro_rag_container"):
        self.collection_name = collection_name

        # Cloud vs local: use Qdrant Cloud when both credentials are set,
        # otherwise fall back to the local Docker instance.
        url = os.getenv("QDRANT_URL")
        key = os.getenv("QDRANT_API_KEY")

        if url and key:
            print("☁️ Connecting to Qdrant Cloud...")
            self.client = QdrantClient(url=url, api_key=key)
        else:
            print("🏠 Connecting to Local Docker...")
            self.client = QdrantClient(url="http://localhost:6333")

    def create_collection(self, vector_size: int = 3072):
        """Create the collection (cosine distance) unless it already exists.

        The 3072-dimension default matches OpenAI text-embedding-3-large.
        """
        if self.client.collection_exists(collection_name=self.collection_name):
            print(f"ℹ️ Collection '{self.collection_name}' already exists. Skipping creation.")
            return

        print(f"⚙️ Creating collection '{self.collection_name}' with size {vector_size}...")

        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config=models.VectorParams(
                size=vector_size,
                distance=models.Distance.COSINE
            )
        )
        print(f"✅ Collection '{self.collection_name}' created successfully!")

    def reset_database(self):
        """Delete the whole collection. Destructive — use with care."""
        self.client.delete_collection(collection_name=self.collection_name)
        print(f"⚠️ Collection '{self.collection_name}' has been DELETED.")
53
+
54
+
55
+ # import os
56
+ # from qdrant_client import QdrantClient
57
+ # from qdrant_client.http import models
58
+ # from dotenv import load_dotenv
59
+
60
+ # # Load environment variables (API Keys, etc.)
61
+ # load_dotenv()
62
+
63
+ # class VectorDB:
64
+ # def __init__(self, collection_name: str = "pro_rag_v1"):
65
+ # """
66
+ # Initialize connection to Qdrant (Docker).
67
+ # """
68
+ # self.collection_name = collection_name
69
+ # self.client = QdrantClient(url="http://localhost:6333")
70
+
71
+ # # Verify connection immediately
72
+ # try:
73
+ # self.client.get_collections()
74
+ # print(f"✅ Connected to Qdrant Database at http://localhost:6333")
75
+ # except Exception as e:
76
+ # print(f"❌ Could not connect to Qdrant. Is Docker running? Error: {e}")
77
+
78
+ # def create_collection(self, vector_size: int = 3072):
79
+ # """
80
+ # Creates the collection if it doesn't exist.
81
+ # Using 3072 dimensions for OpenAI text-embedding-3-large.
82
+ # """
83
+ # # Check if collection exists
84
+ # if self.client.collection_exists(collection_name=self.collection_name):
85
+ # print(f"ℹ️ Collection '{self.collection_name}' already exists. Skipping creation.")
86
+ # return
87
+
88
+ # print(f"⚙️ Creating collection '{self.collection_name}' with size {vector_size}...")
89
+
90
+ # # Create Collection with Cosine Similarity
91
+ # self.client.create_collection(
92
+ # collection_name=self.collection_name,
93
+ # vectors_config=models.VectorParams(
94
+ # size=vector_size,
95
+ # distance=models.Distance.COSINE
96
+ # )
97
+ # )
98
+ # print(f"✅ Collection '{self.collection_name}' created successfully!")
99
+
100
+ # def reset_database(self):
101
+ # """
102
+ # DANGEROUS: Deletes the collection. Used for restarting the POC.
103
+ # """
104
+ # self.client.delete_collection(collection_name=self.collection_name)
105
+ # print(f"⚠️ Collection '{self.collection_name}' has been DELETED.")
src/embeddings.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain_openai import OpenAIEmbeddings
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
def get_embedding_model():
    """Build the embedding model shared by the whole pipeline.

    Returns an OpenAIEmbeddings instance for text-embedding-3-large with
    3072 dimensions — this must match the Qdrant collection configuration.

    Raises:
        ValueError: if OPENAI_API_KEY is missing from the environment.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("❌ OPENAI_API_KEY not found in .env file!")

    return OpenAIEmbeddings(
        model="text-embedding-3-large",
        dimensions=3072,  # must match the Qdrant vector size
        openai_api_key=api_key,
    )
src/fix_qdrant.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ from qdrant_client import QdrantClient
4
+ from qdrant_client.http import models
5
+ from dotenv import load_dotenv
6
+
7
+ load_dotenv()
8
+
9
def fix_database():
    """Create the payload indexes Qdrant Cloud needs for filtered search."""
    print("🔧 Starting Database Repair for Qdrant Cloud...")

    # 1. Connect (cloud credentials are required for this repair script)
    qdrant_url = os.getenv("QDRANT_URL")
    qdrant_key = os.getenv("QDRANT_API_KEY")

    if not qdrant_url:
        print("❌ Error: QDRANT_URL not found in .env")
        return

    client = QdrantClient(url=qdrant_url, api_key=qdrant_key)
    collection_name = "pro_rag_container"

    # 2. Index 'metadata.category' — solves the "Index required but not
    #    found" error (400 Bad Request) — and 'metadata.source' (good
    #    practice). Same KEYWORD schema for both, so one loop handles them.
    print(f"⚙️ Optimizing collection '{collection_name}'...")

    for field in ("metadata.category", "metadata.source"):
        try:
            client.create_payload_index(
                collection_name=collection_name,
                field_name=field,
                field_schema=models.PayloadSchemaType.KEYWORD
            )
            print(f"✅ Success: Index created on '{field}'.")
        except Exception as e:
            # Typically "index already exists" — informational only.
            print(f"ℹ️ Note: {e}")

    print("\n🎉 Database Optimization Complete. Filters will now work.")

if __name__ == "__main__":
    fix_database()
src/indexing.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain_qdrant import QdrantVectorStore
3
+ from langchain_core.documents import Document
4
+ from src.embeddings import get_embedding_model
5
+ from qdrant_client import QdrantClient
6
+ from dotenv import load_dotenv
7
+
8
+ # Load environment variables
9
+ load_dotenv()
10
+
11
class IndexerManager:
    """Embeds chunked Documents and uploads them to Qdrant in batches."""

    def __init__(self, collection_name: str = "pro_rag_container"):
        self.collection_name = collection_name

        # --- 1. CONNECT TO QDRANT (Cloud or Local) ---
        qdrant_url = os.getenv("QDRANT_URL")
        qdrant_key = os.getenv("QDRANT_API_KEY")

        if qdrant_url and qdrant_key:
            print("☁️ [Indexer] Connecting to Qdrant Cloud...")
            self.client = QdrantClient(url=qdrant_url, api_key=qdrant_key)
        else:
            print("🏠 [Indexer] Connecting to Local Docker...")
            self.client = QdrantClient(url="http://localhost:6333")

        self.embedding_model = get_embedding_model()

        # --- 2. INITIALIZE VECTOR STORE ---
        # LangChain wrapper that lets the framework talk to our Qdrant client.
        self.vector_store = QdrantVectorStore(
            client=self.client,
            collection_name=self.collection_name,
            embedding=self.embedding_model
        )

    def index_documents(self, documents: list[Document], batch_size: int = 100):
        """Push `documents` to Qdrant in batches of `batch_size`.

        Args:
            documents: vector-ready chunks to embed and upsert.
            batch_size: upload batch size (default 100, as before).

        FIX: the old progress check `(i + batch_size) % 500 == 0` only fired
        when a batch boundary happened to align with a multiple of 500 and
        could report a count larger than the total on the final partial
        batch. The count is now clamped to `total` and completion is always
        reported.
        """
        print(f"🚀 Starting Indexing of {len(documents)} chunks to Qdrant...")

        total = len(documents)

        for i in range(0, total, batch_size):
            # Upload one batch
            self.vector_store.add_documents(documents[i : i + batch_size])

            done = min(i + batch_size, total)  # never overstate progress
            # Progress roughly every 500 docs, plus a final completion line.
            if done % 500 == 0 or done == total:
                print(f" ... Indexed {done}/{total} chunks")

        print("✅ Indexing Complete! Data is now searchable.")
src/ingestion.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.vision_processor import VisionProcessor
2
+ import os
3
+ import pandas as pd
4
+ from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
5
+ from langchain_core.documents import Document
6
+
7
class IngestionManager:
    """Scans the data folders and turns every supported file into Documents.

    Folder layout (under `data_path`):
      text_pdfs/      -> text-dense PDFs, loaded page-by-page with PyPDF
      visual_pdfs/    -> image-heavy PDFs, described by GPT-4o Vision
      excel_csv/      -> CSV files, each row converted to one sentence
      reference_docs/ -> Word documents, loaded via Unstructured
    """

    def __init__(self, data_path: str = "data"):
        self.data_path = data_path

        # Mapping of content type -> folder on disk.
        self.folders = {
            "text_pdfs": os.path.join(data_path, "text_pdfs"),
            "visual_pdfs": os.path.join(data_path, "visual_pdfs"),
            "excel_csv": os.path.join(data_path, "excel_csv"),
            "reference_docs": os.path.join(data_path, "reference_docs"),
        }

    def process_all_data(self):
        """Run every loader and return the combined list of Documents."""
        all_documents = []
        print(f"\n📂 Starting Ingestion Scan in '{self.data_path}'...")

        # 1. Text PDFs
        all_documents.extend(self._load_text_pdfs())

        # 2. Word Docs
        all_documents.extend(self._load_word_docs())

        # 3. CSVs
        all_documents.extend(self._load_structured_data())

        # 4. Visual PDFs (each page described by GPT-4o Vision)
        folder = self.folders["visual_pdfs"]
        if os.path.exists(folder):
            vp = VisionProcessor()
            for filename in os.listdir(folder):
                if filename.endswith(".pdf"):
                    filepath = os.path.join(folder, filename)
                    all_documents.extend(vp.process_visual_pdf(filepath))

        print(f"✅ Ingestion Complete. Total Documents Processed: {len(all_documents)}")
        return all_documents

    def _load_text_pdfs(self):
        """Load text-dense PDFs with PyPDF (fast; keeps page metadata)."""
        folder = self.folders["text_pdfs"]
        documents = []

        if not os.path.exists(folder):
            print(f"⚠️ Folder not found: {folder}")
            return []

        print(f" scan: {folder}...")
        for filename in os.listdir(folder):
            if filename.endswith(".pdf"):
                filepath = os.path.join(folder, filename)
                try:
                    # PyPDF extracts page numbers and other metadata for us.
                    loader = PyPDFLoader(filepath)
                    pages = loader.load()

                    # Tag every page for category-filtered retrieval later.
                    for page in pages:
                        page.metadata["category"] = "financial_report"
                        page.metadata["source_type"] = "pdf_text"

                    documents.extend(pages)
                    # FIX: logs previously printed the literal "(unknown)"
                    # instead of the file actually being loaded.
                    print(f" -> Loaded: {filename} ({len(pages)} pages)")
                except Exception as e:
                    print(f" ❌ Error loading {filename}: {e}")
        return documents

    def _load_word_docs(self):
        """Load .docx/.doc reference documents via Unstructured."""
        folder = self.folders["reference_docs"]
        documents = []

        if not os.path.exists(folder):
            return []

        print(f" scan: {folder}...")
        for filename in os.listdir(folder):
            # endswith accepts a tuple — one call covers both extensions.
            if filename.endswith((".docx", ".doc")):
                filepath = os.path.join(folder, filename)
                try:
                    loader = UnstructuredWordDocumentLoader(filepath)
                    docs = loader.load()
                    for d in docs:
                        d.metadata["category"] = "reference"
                    documents.extend(docs)
                    print(f" -> Loaded: {filename}")
                except Exception as e:
                    print(f" ❌ Error loading {filename}: {e}")
        return documents

    def _load_structured_data(self):
        """Convert CSV rows into natural-language sentence Documents.

        Vector databases embed sentences far better than raw CSV rows, so
        each row becomes e.g. "Data Record from sales.csv: ID: 1, Sales: 500".
        """
        folder = self.folders["excel_csv"]
        documents = []

        if not os.path.exists(folder):
            return []

        print(f" scan: {folder}...")
        for filename in os.listdir(folder):
            if filename.endswith(".csv"):
                filepath = os.path.join(folder, filename)
                try:
                    df = pd.read_csv(filepath)

                    for index, row in df.iterrows():
                        # FIX: the sentence previously embedded the literal
                        # "(unknown)" instead of the source filename.
                        content = f"Data Record from {filename}: " + ", ".join(
                            f"{col}: {val}" for col, val in row.items()
                        )

                        documents.append(Document(
                            page_content=content,
                            metadata={
                                "source": filename,
                                "row_index": index,
                                "category": "structured_data"
                            }
                        ))

                    print(f" -> Loaded: {filename} ({len(df)} rows converted to vectors)")
                except Exception as e:
                    print(f" ❌ Error loading {filename}: {e}")
        return documents
src/retrieval.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain_qdrant import QdrantVectorStore
3
+ from langchain_openai import ChatOpenAI
4
+ from langchain_core.prompts import ChatPromptTemplate
5
+ from langchain_core.runnables import RunnablePassthrough
6
+ from langchain_core.output_parsers import StrOutputParser
7
+ from qdrant_client import QdrantClient
8
+ from qdrant_client.http import models
9
+ from src.embeddings import get_embedding_model
10
+ from src.cache import SemanticCache
11
+ from dotenv import load_dotenv
12
+
13
+ # Load environment variables
14
+ load_dotenv()
15
+
16
class RetrievalEngine:
    """RAG query engine: semantic cache -> filtered vector search -> GPT-4o."""

    # Maps a UI filter name to the metadata.category value set at ingestion.
    _CATEGORY_BY_FILTER = {
        "pdf": "financial_report",
        "visual": "visual_data",
        "csv": "structured_data",
    }

    def __init__(self, collection_name: str = "pro_rag_container"):

        # --- 1. CONNECT TO QDRANT (Cloud or Local) ---
        qdrant_url = os.getenv("QDRANT_URL")
        qdrant_key = os.getenv("QDRANT_API_KEY")

        if qdrant_url and qdrant_key:
            print("☁️ [Retrieval] Connecting to Qdrant Cloud...")
            self.client = QdrantClient(url=qdrant_url, api_key=qdrant_key)
        else:
            print("🏠 [Retrieval] Connecting to Local Docker...")
            self.client = QdrantClient(url="http://localhost:6333")

        embedding_model = get_embedding_model()

        # --- 2. INITIALIZE VECTOR STORE ---
        self.vector_store = QdrantVectorStore(
            client=self.client,
            collection_name=collection_name,
            embedding=embedding_model
        )

        # --- 3. SETUP LLM & CACHE ---
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0)
        self.cache = SemanticCache()

        self.prompt_template = ChatPromptTemplate.from_template("""
    You are an expert Enterprise Assistant.

    STRICT INSTRUCTIONS:
    1. Use the Context below to answer the user.
    2. If the user asks for a specific "ID" or "Row", look for it in the context.
    3. If the user asks for a "Summary" or "Total" (like Revenue), explain that you can only see a sample of the data, but summarize what you see in the provided rows.
    4. Ignore irrelevant chunks.

    Context:
    {context}

    User Question:
    {question}

    Answer:
    """)

    def query(self, question: str, filter_type: str = "all"):
        """Answer `question`, optionally restricted to one data category.

        Args:
            question: the user's natural-language question.
            filter_type: "pdf", "visual", "csv", or "all" (no filter).

        Returns:
            The generated (or cached) answer, or a not-found message.
        """
        print(f"\n🔎 Processing: '{question}' (Filter: {filter_type.upper()})...")

        # 1. CHECK CACHE FIRST
        cached_answer = self.cache.search_cache(question)
        if cached_answer:
            return f"{cached_answer} \n\n(🚀 Served from Cache)"

        # 2. CONSTRUCT FILTER (dispatch table instead of an if/elif chain)
        qdrant_filter = None
        category = self._CATEGORY_BY_FILTER.get(filter_type)
        if category:
            qdrant_filter = models.Filter(must=[models.FieldCondition(
                key="metadata.category",
                match=models.MatchValue(value=category)
            )])

        # 3. PERFORM SEARCH (exactly once)
        retriever = self.vector_store.as_retriever(
            search_kwargs={
                "k": 10,
                "filter": qdrant_filter
            }
        )

        docs = retriever.invoke(question)
        print(f" -> Found {len(docs)} chunks.")

        if not docs:
            return "❌ No relevant data found in this category."

        # 4. GENERATE ANSWER
        # FIX: the old chain put the retriever itself in the LCEL pipeline,
        # which re-embedded and re-searched the question a second time and
        # fed a stringified Document list into the prompt. Reuse the docs we
        # already fetched and pass their text straight in.
        context = "\n\n".join(doc.page_content for doc in docs)
        chain = self.prompt_template | self.llm | StrOutputParser()

        print("🤖 Generating Answer via GPT-4o...")
        answer = chain.invoke({"context": context, "question": question})

        # 5. SAVE TO CACHE
        self.cache.add_to_cache(question, answer)

        return answer
src/vision_processor.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ from pdf2image import convert_from_path
4
+ from langchain_core.documents import Document
5
+ from langchain_openai import ChatOpenAI
6
+ from langchain_core.messages import HumanMessage
7
+
8
class VisionProcessor:
    """Turns pages of image-heavy PDFs into text Documents via GPT-4o Vision."""

    def __init__(self):
        self.vision_model = ChatOpenAI(model="gpt-4o", max_tokens=1024)

        # Local Poppler binaries. Assumes a 'poppler' folder in the project
        # root (Windows layout: poppler/Library/bin) — TODO confirm on other
        # platforms.
        self.poppler_path = os.path.join(os.getcwd(), "poppler", "Library", "bin")

    def process_visual_pdf(self, pdf_path, max_pages: int = 3):
        """Describe up to `max_pages` pages of `pdf_path` with GPT-4o Vision.

        Args:
            pdf_path: path to the PDF to process.
            max_pages: page cap (default 3, as before) that bounds the
                Vision API cost per document.

        Returns:
            list[Document]: one document per analyzed page; empty on error.
        """
        import io  # FIX: hoisted out of the per-page loop

        print(f" 👁️ Processing Visual PDF: {os.path.basename(pdf_path)}...")
        documents = []

        try:
            # Bail out early if the bundled poppler is missing.
            if not os.path.exists(self.poppler_path):
                print(f" ❌ Error: Poppler not found at {self.poppler_path}")
                return []

            # 1. Convert PDF pages to JPEG images (using local poppler)
            images = convert_from_path(pdf_path, fmt="jpeg", poppler_path=self.poppler_path)

            print(f" -> Extracted {len(images)} images (pages) from PDF.")

            # 2. Analyze only the first `max_pages` pages (cost-saving mode)
            for i, img in enumerate(images[:max_pages]):
                print(f" -> Analyzing Page {i+1} with GPT-4o Vision...")

                # Base64-encode the page image for the data: URL payload.
                buffered = io.BytesIO()
                img.save(buffered, format="JPEG")
                img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")

                # 3. Send to GPT-4o
                response = self.vision_model.invoke(
                    [
                        HumanMessage(
                            content=[
                                {"type": "text", "text": "Describe this image in detail. If it is a graph, extract the data points. If it is a table, transcribe it."},
                                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_str}"}},
                            ]
                        )
                    ]
                )

                documents.append(Document(
                    page_content=f"IMAGE DESCRIPTION (Page {i+1}): {response.content}",
                    metadata={
                        "source": os.path.basename(pdf_path),
                        "page": i + 1,
                        "category": "visual_data"
                    }
                ))

            if len(images) > max_pages:
                print(f" ℹ️ Limited to first {max_pages} pages for POC cost safety.")

        except Exception as e:
            # Best-effort: a vision failure should not abort ingestion.
            print(f" ❌ Vision Error: {e}")

        return documents
+ return documents