Spaces:
Sleeping
Sleeping
Upload 19 files
Browse files- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-313.pyc +0 -0
- src/__pycache__/cache.cpython-313.pyc +0 -0
- src/__pycache__/chunking.cpython-313.pyc +0 -0
- src/__pycache__/database.cpython-313.pyc +0 -0
- src/__pycache__/embeddings.cpython-313.pyc +0 -0
- src/__pycache__/indexing.cpython-313.pyc +0 -0
- src/__pycache__/ingestion.cpython-313.pyc +0 -0
- src/__pycache__/retrieval.cpython-313.pyc +0 -0
- src/__pycache__/vision_processor.cpython-313.pyc +0 -0
- src/cache.py +99 -0
- src/chunking.py +46 -0
- src/database.py +105 -0
- src/embeddings.py +21 -0
- src/fix_qdrant.py +51 -0
- src/indexing.py +56 -0
- src/ingestion.py +139 -0
- src/retrieval.py +106 -0
- src/vision_processor.py +71 -0
src/__init__.py
ADDED
|
File without changes
|
src/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (170 Bytes). View file
|
|
|
src/__pycache__/cache.cpython-313.pyc
ADDED
|
Binary file (4.43 kB). View file
|
|
|
src/__pycache__/chunking.cpython-313.pyc
ADDED
|
Binary file (2.1 kB). View file
|
|
|
src/__pycache__/database.cpython-313.pyc
ADDED
|
Binary file (2.87 kB). View file
|
|
|
src/__pycache__/embeddings.cpython-313.pyc
ADDED
|
Binary file (840 Bytes). View file
|
|
|
src/__pycache__/indexing.cpython-313.pyc
ADDED
|
Binary file (2.57 kB). View file
|
|
|
src/__pycache__/ingestion.cpython-313.pyc
ADDED
|
Binary file (7.16 kB). View file
|
|
|
src/__pycache__/retrieval.cpython-313.pyc
ADDED
|
Binary file (4.92 kB). View file
|
|
|
src/__pycache__/vision_processor.cpython-313.pyc
ADDED
|
Binary file (3.65 kB). View file
|
|
|
src/cache.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
from qdrant_client import QdrantClient
|
| 4 |
+
from qdrant_client.http import models
|
| 5 |
+
from src.embeddings import get_embedding_model
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
# Load secrets
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
class SemanticCache:
    """Semantic question->answer cache backed by a Qdrant collection.

    Instead of exact string matching, cached questions are embedded and
    looked up by cosine similarity, so paraphrased questions can reuse a
    previously generated answer.
    """

    def __init__(self, collection_name: str = "pro_rag_cache"):
        self.collection_name = collection_name

        # --- CONNECTION LOGIC ---
        # Prefer Qdrant Cloud when both URL and API key are configured;
        # otherwise fall back to a local Docker instance.
        qdrant_url = os.getenv("QDRANT_URL")
        qdrant_key = os.getenv("QDRANT_API_KEY")

        if qdrant_url and qdrant_key:
            print(f"☁️ [Cache] Connecting to Qdrant Cloud...")
            self.client = QdrantClient(url=qdrant_url, api_key=qdrant_key)
        else:
            print(f"🏠 [Cache] Connecting to Local Docker...")
            self.client = QdrantClient(url="http://localhost:6333")

        self.embedding_model = get_embedding_model()
        # Minimum cosine similarity for a cached answer to be reused.
        self.threshold = 0.92

        # Initialize the cache collection. 3072 dims matches the
        # text-embedding-3-large model returned by get_embedding_model().
        try:
            if not self.client.collection_exists(collection_name):
                print(f"⚙️ Initializing Semantic Cache: '{collection_name}'...")
                self.client.create_collection(
                    collection_name=collection_name,
                    vectors_config=models.VectorParams(
                        size=3072,
                        distance=models.Distance.COSINE
                    )
                )
        except Exception as e:
            # Cache setup failure must not prevent the app from starting.
            print(f"⚠️ Cache Initialization Warning: {e}")

    def search_cache(self, query: str):
        """
        Checks if a similar question has already been answered.
        Uses the modern 'query_points' method.

        Returns the stored answer string on a hit (similarity >= threshold),
        or None on a miss or on any lookup error.
        """
        try:
            # 1. Embed the query
            vector = self.embedding_model.embed_query(query)

            # 2. Search Qdrant Cache Collection for the closest stored question
            search_result = self.client.query_points(
                collection_name=self.collection_name,
                query=vector,
                limit=1,
                with_payload=True
            ).points

            # 3. Check Threshold
            if search_result:
                best_match = search_result[0]
                if best_match.score >= self.threshold:
                    print(f"⚡ CACHE HIT! (Similarity: {best_match.score:.4f})")
                    return best_match.payload["answer"]

            score = search_result[0].score if search_result else 0
            print(f"🐢 CACHE MISS (Best match: {score:.4f})")
            return None

        except Exception as e:
            # Print error but don't crash the app — cache failures must
            # never break the main query path.
            print(f"⚠️ Cache Search Error: {e}")
            return None

    def add_to_cache(self, query: str, answer: str):
        """
        Saves the Query + Answer pair.

        Best-effort: failures are logged and swallowed so callers are
        never interrupted by cache errors.
        """
        import uuid  # local import: only needed on the write path

        try:
            vector = self.embedding_model.embed_query(query)
            # FIX: millisecond timestamps collide when two answers are cached
            # within the same millisecond, silently overwriting an entry.
            # A random UUID string is collision-free and accepted by Qdrant.
            point_id = str(uuid.uuid4())

            self.client.upsert(
                collection_name=self.collection_name,
                points=[
                    models.PointStruct(
                        id=point_id,
                        vector=vector,
                        payload={
                            "question": query,
                            "answer": answer,
                            "timestamp": time.time()
                        }
                    )
                ]
            )
        except Exception as e:
            print(f"⚠️ Failed to save to cache: {e}")
src/chunking.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 2 |
+
from langchain_core.documents import Document
|
| 3 |
+
|
| 4 |
+
class ChunkingManager:
    """Splits ingested documents into retrieval-sized chunks."""

    def __init__(self):
        # PRO STRATEGY:
        # 1000-char chunks are large enough to capture context, and the
        # 200-char overlap avoids cutting a sentence in half at a border.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", " ", ""]
        )

    def chunk_documents(self, documents: list[Document]):
        """
        Splits large documents into smaller vector-ready chunks.
        INTELLIGENCE: Skips CSV rows (structured_data) as they are already perfect.
        """
        print(f"✂️ Starting Chunking Process on {len(documents)} documents...")

        preserved = []   # structured rows pass through untouched
        splittable = []  # everything else goes through the splitter

        # Partition first: structured CSV rows were already formatted as
        # single sentences, so they must not be split.
        for doc in documents:
            if doc.metadata.get("category", "") == "structured_data":
                preserved.append(doc)
            else:
                splittable.append(doc)

        # Split text/PDF/Word pages, which can be thousands of tokens long.
        chunks = []
        for doc in splittable:
            chunks.extend(self.text_splitter.split_documents([doc]))

        # Merge results
        combined = preserved + chunks

        print(f" - CSV Rows preserved: {len(preserved)}")
        print(f" - Text Pages split into: {len(chunks)} chunks")
        print(f"✅ Total Vector-Ready Chunks: {len(combined)}")

        return combined
|
src/database.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from qdrant_client import QdrantClient
|
| 3 |
+
from qdrant_client.http import models
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
|
| 6 |
+
# Load environment variables
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
class VectorDB:
    """Thin admin wrapper around a Qdrant collection (create / reset)."""

    def __init__(self, collection_name: str = "pro_rag_container"):
        self.collection_name = collection_name

        # --- 1. CLOUD vs LOCAL LOGIC ---
        # Use Qdrant Cloud when fully configured, else local Docker.
        qdrant_url = os.getenv("QDRANT_URL")
        qdrant_key = os.getenv("QDRANT_API_KEY")

        if qdrant_url and qdrant_key:
            print("☁️ Connecting to Qdrant Cloud...")
            self.client = QdrantClient(url=qdrant_url, api_key=qdrant_key)
        else:
            print("🏠 Connecting to Local Docker...")
            self.client = QdrantClient(url="http://localhost:6333")

    def create_collection(self, vector_size: int = 3072):
        """
        Creates the collection if it doesn't exist.
        Using 3072 dimensions for OpenAI text-embedding-3-large.
        """
        # Idempotent: skip when the collection is already present.
        if self.client.collection_exists(collection_name=self.collection_name):
            print(f"ℹ️ Collection '{self.collection_name}' already exists. Skipping creation.")
            return

        print(f"⚙️ Creating collection '{self.collection_name}' with size {vector_size}...")

        # Create Collection with Cosine Similarity
        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config=models.VectorParams(
                size=vector_size,
                distance=models.Distance.COSINE
            )
        )
        print(f"✅ Collection '{self.collection_name}' created successfully!")

    def reset_database(self):
        """
        DANGEROUS: Deletes the collection and every vector in it.
        """
        self.client.delete_collection(collection_name=self.collection_name)
        print(f"⚠️ Collection '{self.collection_name}' has been DELETED.")
|
src/embeddings.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from langchain_openai import OpenAIEmbeddings
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
load_dotenv()
|
| 6 |
+
|
| 7 |
+
def get_embedding_model():
    """
    Returns the Pro-Level Embedding Model.
    Using: text-embedding-3-large (3072 dimensions)

    Raises ValueError when OPENAI_API_KEY is not configured.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if api_key is None or api_key == "":
        raise ValueError("❌ OPENAI_API_KEY not found in .env file!")

    return OpenAIEmbeddings(
        model="text-embedding-3-large",
        dimensions=3072,  # Must match Qdrant config
        openai_api_key=api_key,
    )
|
src/fix_qdrant.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
from qdrant_client import QdrantClient
|
| 4 |
+
from qdrant_client.http import models
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
def fix_database():
    """Create the payload indexes Qdrant Cloud requires for filtered search.

    Qdrant returns "Index required but not found" (400 Bad Request) when a
    filter targets a payload field that has no keyword index; this one-off
    repair script adds the indexes used by the retrieval filters.
    """
    print("🔧 Starting Database Repair for Qdrant Cloud...")

    # 1. Connect using the cloud credentials from .env
    qdrant_url = os.getenv("QDRANT_URL")
    qdrant_key = os.getenv("QDRANT_API_KEY")

    if not qdrant_url:
        print("❌ Error: QDRANT_URL not found in .env")
        return

    client = QdrantClient(url=qdrant_url, api_key=qdrant_key)
    collection_name = "pro_rag_container"

    print(f"⚙️ Optimizing collection '{collection_name}'...")

    # 2. Create a KEYWORD index for each payload field our filters use.
    #    (Refactored from two copy-pasted try blocks into one loop.)
    for field_name in ("metadata.category", "metadata.source"):
        try:
            client.create_payload_index(
                collection_name=collection_name,
                field_name=field_name,
                field_schema=models.PayloadSchemaType.KEYWORD
            )
            print(f"✅ Success: Index created on '{field_name}'.")
        except Exception as e:
            # Typically "index already exists" — safe to continue.
            print(f"ℹ️ Note: {e}")

    print("\n🎉 Database Optimization Complete. Filters will now work.")

if __name__ == "__main__":
    fix_database()
|
src/indexing.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from langchain_qdrant import QdrantVectorStore
|
| 3 |
+
from langchain_core.documents import Document
|
| 4 |
+
from src.embeddings import get_embedding_model
|
| 5 |
+
from qdrant_client import QdrantClient
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
# Load environment variables
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
class IndexerManager:
    """Embeds document chunks and uploads them into a Qdrant collection."""

    def __init__(self, collection_name: str = "pro_rag_container"):
        self.collection_name = collection_name

        # --- 1. CONNECT TO QDRANT (Cloud or Local) ---
        qdrant_url = os.getenv("QDRANT_URL")
        qdrant_key = os.getenv("QDRANT_API_KEY")

        if qdrant_url and qdrant_key:
            print("☁️ [Indexer] Connecting to Qdrant Cloud...")
            self.client = QdrantClient(url=qdrant_url, api_key=qdrant_key)
        else:
            print("🏠 [Indexer] Connecting to Local Docker...")
            self.client = QdrantClient(url="http://localhost:6333")

        self.embedding_model = get_embedding_model()

        # --- 2. INITIALIZE VECTOR STORE ---
        # LangChain wrapper that embeds documents through our model and
        # writes them through our Qdrant client.
        self.vector_store = QdrantVectorStore(
            client=self.client,
            collection_name=self.collection_name,
            embedding=self.embedding_model
        )

    def index_documents(self, documents: "list[Document]"):
        """
        Pushes documents to Qdrant in batches of 100.
        """
        print(f"🚀 Starting Indexing of {len(documents)} chunks to Qdrant...")

        batch_size = 100
        total = len(documents)

        # BATCHING LOOP
        for start in range(0, total, batch_size):
            self.vector_store.add_documents(documents[start : start + batch_size])

            # FIX: the old progress line printed `i + batch_size` unclamped,
            # so it could claim more chunks than exist (e.g. "600/550") and
            # never reported the final partial batch.
            done = min(start + batch_size, total)
            if done % 500 == 0 or done == total:
                print(f" ... Indexed {done}/{total} chunks")

        print("✅ Indexing Complete! Data is now searchable.")
|
src/ingestion.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.vision_processor import VisionProcessor
|
| 2 |
+
import os
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
|
| 5 |
+
from langchain_core.documents import Document
|
| 6 |
+
|
| 7 |
+
class IngestionManager:
    """Loads every supported file type from the data folders into Documents."""

    def __init__(self, data_path: str = "data"):
        self.data_path = data_path

        # Map each ingestion strategy to its folder on disk.
        self.folders = {
            "text_pdfs": os.path.join(data_path, "text_pdfs"),
            "visual_pdfs": os.path.join(data_path, "visual_pdfs"),
            "excel_csv": os.path.join(data_path, "excel_csv"),
            "reference_docs": os.path.join(data_path, "reference_docs"),
        }

    def process_all_data(self):
        """Run every loader and return the combined list of Documents."""
        all_documents = []
        print(f"\n📂 Starting Ingestion Scan in '{self.data_path}'...")

        # 1. Text PDFs
        all_documents.extend(self._load_text_pdfs())

        # 2. Word Docs
        all_documents.extend(self._load_word_docs())

        # 3. CSVs
        all_documents.extend(self._load_structured_data())

        # 4. Visual PDFs (GPT-4o vision descriptions)
        folder = self.folders["visual_pdfs"]
        if os.path.exists(folder):
            vp = VisionProcessor()
            for filename in os.listdir(folder):
                if filename.endswith(".pdf"):
                    filepath = os.path.join(folder, filename)
                    all_documents.extend(vp.process_visual_pdf(filepath))

        print(f"✅ Ingestion Complete. Total Documents Processed: {len(all_documents)}")
        return all_documents

    def _load_text_pdfs(self):
        """
        Strategy: Use PyPDF for high-speed text extraction.
        Good for Annual Reports where text density is high.
        """
        folder = self.folders["text_pdfs"]
        documents = []

        if not os.path.exists(folder):
            print(f"⚠️ Folder not found: {folder}")
            return []

        print(f" scan: {folder}...")
        for filename in os.listdir(folder):
            if filename.endswith(".pdf"):
                filepath = os.path.join(folder, filename)
                try:
                    # PyPDF attaches page-number metadata automatically.
                    loader = PyPDFLoader(filepath)
                    pages = loader.load()

                    # Add custom metadata tags used by retrieval filters.
                    for page in pages:
                        page.metadata["category"] = "financial_report"
                        page.metadata["source_type"] = "pdf_text"

                    documents.extend(pages)
                    # FIX: these messages printed the literal "(unknown)"
                    # instead of the file being processed.
                    print(f" -> Loaded: {filename} ({len(pages)} pages)")
                except Exception as e:
                    print(f" ❌ Error loading {filename}: {e}")
        return documents

    def _load_word_docs(self):
        """
        Strategy: Use Unstructured for .docx files.
        """
        folder = self.folders["reference_docs"]
        documents = []

        if not os.path.exists(folder):
            return []

        print(f" scan: {folder}...")
        for filename in os.listdir(folder):
            if filename.endswith((".docx", ".doc")):
                filepath = os.path.join(folder, filename)
                try:
                    loader = UnstructuredWordDocumentLoader(filepath)
                    docs = loader.load()
                    for d in docs:
                        d.metadata["category"] = "reference"
                    documents.extend(docs)
                    # FIX: report the actual filename, not "(unknown)".
                    print(f" -> Loaded: {filename}")
                except Exception as e:
                    print(f" ❌ Error loading {filename}: {e}")
        return documents

    def _load_structured_data(self):
        """
        Strategy: Convert CSV Rows to 'Natural Language Sentences'.
        Why: Vector DBs understand sentences, not raw CSV rows.
        """
        folder = self.folders["excel_csv"]
        documents = []

        if not os.path.exists(folder):
            return []

        print(f" scan: {folder}...")
        for filename in os.listdir(folder):
            if filename.endswith(".csv"):
                filepath = os.path.join(folder, filename)
                try:
                    # Load CSV into Pandas
                    df = pd.read_csv(filepath)

                    # PRO LOGIC: Convert Row to Sentence
                    # Example: Row(ID=1, Sales=500) -> "Record ID 1 has Sales of 500."
                    for index, row in df.iterrows():
                        # FIX: embed the source filename (was the literal
                        # "(unknown)") so the sentence identifies its origin.
                        content = f"Data Record from {filename}: " + ", ".join(
                            [f"{col}: {val}" for col, val in row.items()]
                        )

                        documents.append(Document(
                            page_content=content,
                            metadata={
                                "source": filename,
                                "row_index": index,
                                "category": "structured_data"
                            }
                        ))

                    print(f" -> Loaded: {filename} ({len(df)} rows converted to vectors)")
                except Exception as e:
                    print(f" ❌ Error loading {filename}: {e}")
        return documents
|
src/retrieval.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from langchain_qdrant import QdrantVectorStore
|
| 3 |
+
from langchain_openai import ChatOpenAI
|
| 4 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 5 |
+
from langchain_core.runnables import RunnablePassthrough
|
| 6 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 7 |
+
from qdrant_client import QdrantClient
|
| 8 |
+
from qdrant_client.http import models
|
| 9 |
+
from src.embeddings import get_embedding_model
|
| 10 |
+
from src.cache import SemanticCache
|
| 11 |
+
from dotenv import load_dotenv
|
| 12 |
+
|
| 13 |
+
# Load environment variables
|
| 14 |
+
load_dotenv()
|
| 15 |
+
|
| 16 |
+
class RetrievalEngine:
    """RAG query engine: semantic cache -> filtered vector search -> GPT-4o."""

    def __init__(self, collection_name: str = "pro_rag_container"):

        # --- 1. CONNECT TO QDRANT (Cloud or Local) ---
        qdrant_url = os.getenv("QDRANT_URL")
        qdrant_key = os.getenv("QDRANT_API_KEY")

        if qdrant_url and qdrant_key:
            print("☁️ [Retrieval] Connecting to Qdrant Cloud...")
            self.client = QdrantClient(url=qdrant_url, api_key=qdrant_key)
        else:
            print("🏠 [Retrieval] Connecting to Local Docker...")
            self.client = QdrantClient(url="http://localhost:6333")

        embedding_model = get_embedding_model()

        # --- 2. INITIALIZE VECTOR STORE ---
        self.vector_store = QdrantVectorStore(
            client=self.client,
            collection_name=collection_name,
            embedding=embedding_model
        )

        # --- 3. SETUP LLM & CACHE ---
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0)
        self.cache = SemanticCache()

        self.prompt_template = ChatPromptTemplate.from_template("""
        You are an expert Enterprise Assistant.

        STRICT INSTRUCTIONS:
        1. Use the Context below to answer the user.
        2. If the user asks for a specific "ID" or "Row", look for it in the context.
        3. If the user asks for a "Summary" or "Total" (like Revenue), explain that you can only see a sample of the data, but summarize what you see in the provided rows.
        4. Ignore irrelevant chunks.

        Context:
        {context}

        User Question:
        {question}

        Answer:
        """)

    def query(self, question: str, filter_type: str = "all"):
        """Answer a question, optionally restricted to one data category.

        filter_type: "all" (default), "pdf", "visual" or "csv".
        Returns the answer string (possibly served from the semantic cache).
        """
        print(f"\n🔎 Processing: '{question}' (Filter: {filter_type.upper()})...")

        # 1. CHECK CACHE FIRST
        cached_answer = self.cache.search_cache(question)
        if cached_answer:
            return f"{cached_answer} \n\n(🚀 Served from Cache)"

        # 2. CONSTRUCT FILTER — map the UI filter name to the metadata
        # category tag written at ingestion time.
        category_map = {
            "pdf": "financial_report",
            "visual": "visual_data",
            "csv": "structured_data",
        }
        qdrant_filter = None
        if filter_type in category_map:
            qdrant_filter = models.Filter(must=[
                models.FieldCondition(
                    key="metadata.category",
                    match=models.MatchValue(value=category_map[filter_type])
                )
            ])

        # 3. PERFORM SEARCH
        retriever = self.vector_store.as_retriever(
            search_kwargs={
                "k": 10,
                "filter": qdrant_filter
            }
        )

        docs = retriever.invoke(question)
        print(f" -> Found {len(docs)} chunks.")

        if not docs:
            return "❌ No relevant data found in this category."

        # 4. GENERATE ANSWER
        # FIX: the old chain passed the retriever itself as "context", which
        # ran a second, redundant vector search. Reuse the docs we already
        # retrieved and format them as plain text for the prompt.
        context = "\n\n".join(doc.page_content for doc in docs)
        chain = self.prompt_template | self.llm | StrOutputParser()

        print("🤖 Generating Answer via GPT-4o...")
        answer = chain.invoke({"context": context, "question": question})

        # 5. SAVE TO CACHE
        self.cache.add_to_cache(question, answer)

        return answer
|
src/vision_processor.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import base64
|
| 3 |
+
from pdf2image import convert_from_path
|
| 4 |
+
from langchain_core.documents import Document
|
| 5 |
+
from langchain_openai import ChatOpenAI
|
| 6 |
+
from langchain_core.messages import HumanMessage
|
| 7 |
+
|
| 8 |
+
class VisionProcessor:
    """Describes visual PDF pages with GPT-4o vision and wraps them as Documents."""

    def __init__(self):
        self.vision_model = ChatOpenAI(model="gpt-4o", max_tokens=1024)

        # PRO FIX: Point pdf2image at the bundled Poppler binaries.
        # This assumes the 'poppler' folder is in the project root.
        self.poppler_path = os.path.join(os.getcwd(), "poppler", "Library", "bin")

    def process_visual_pdf(self, pdf_path, max_pages: int = 3):
        """Convert up to `max_pages` PDF pages to images and describe each.

        Returns a list of Documents tagged category="visual_data"; returns
        an empty list when Poppler is missing or conversion fails.
        The page limit is configurable (default 3 for POC cost safety).
        """
        import io  # FIX: was re-imported on every loop iteration

        print(f" 👁️ Processing Visual PDF: {os.path.basename(pdf_path)}...")
        documents = []

        try:
            # Bail out early when the bundled Poppler is not present.
            if not os.path.exists(self.poppler_path):
                print(f" ❌ Error: Poppler not found at {self.poppler_path}")
                return []

            # 1. Convert PDF pages to JPEG images (using local poppler)
            images = convert_from_path(pdf_path, fmt="jpeg", poppler_path=self.poppler_path)
            print(f" -> Extracted {len(images)} images (pages) from PDF.")

            # 2. Analyze only the first pages (cost-saving mode).
            for i, img in enumerate(images[:max_pages]):
                print(f" -> Analyzing Page {i+1} with GPT-4o Vision...")

                # Base64-encode the rendered page for the vision API.
                buffered = io.BytesIO()
                img.save(buffered, format="JPEG")
                img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")

                # 3. Ask GPT-4o to describe / transcribe the page.
                response = self.vision_model.invoke(
                    [
                        HumanMessage(
                            content=[
                                {"type": "text", "text": "Describe this image in detail. If it is a graph, extract the data points. If it is a table, transcribe it."},
                                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_str}"}},
                            ]
                        )
                    ]
                )

                documents.append(Document(
                    page_content=f"IMAGE DESCRIPTION (Page {i+1}): {response.content}",
                    metadata={
                        "source": os.path.basename(pdf_path),
                        "page": i + 1,
                        "category": "visual_data"
                    }
                ))

            if len(images) > max_pages:
                print(f" ℹ️ Limited to first {max_pages} pages for POC cost safety.")

        except Exception as e:
            # Best-effort: a vision failure must not abort the whole ingest.
            print(f" ❌ Vision Error: {e}")

        return documents
|