Spaces:
Sleeping
Sleeping
Upload 19 files
Browse files- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-313.pyc +0 -0
- src/__pycache__/cache.cpython-313.pyc +0 -0
- src/__pycache__/chunking.cpython-313.pyc +0 -0
- src/__pycache__/database.cpython-313.pyc +0 -0
- src/__pycache__/embeddings.cpython-313.pyc +0 -0
- src/__pycache__/indexing.cpython-313.pyc +0 -0
- src/__pycache__/ingestion.cpython-313.pyc +0 -0
- src/__pycache__/retrieval.cpython-313.pyc +0 -0
- src/__pycache__/vision_processor.cpython-313.pyc +0 -0
- src/cache.py +99 -0
- src/chunking.py +46 -0
- src/database.py +105 -0
- src/embeddings.py +21 -0
- src/fix_qdrant.py +51 -0
- src/indexing.py +56 -0
- src/ingestion.py +139 -0
- src/retrieval.py +106 -0
- src/vision_processor.py +71 -0
src/__init__.py
ADDED
|
File without changes
|
src/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (170 Bytes). View file
|
|
|
src/__pycache__/cache.cpython-313.pyc
ADDED
|
Binary file (4.43 kB). View file
|
|
|
src/__pycache__/chunking.cpython-313.pyc
ADDED
|
Binary file (2.1 kB). View file
|
|
|
src/__pycache__/database.cpython-313.pyc
ADDED
|
Binary file (2.87 kB). View file
|
|
|
src/__pycache__/embeddings.cpython-313.pyc
ADDED
|
Binary file (840 Bytes). View file
|
|
|
src/__pycache__/indexing.cpython-313.pyc
ADDED
|
Binary file (2.57 kB). View file
|
|
|
src/__pycache__/ingestion.cpython-313.pyc
ADDED
|
Binary file (7.16 kB). View file
|
|
|
src/__pycache__/retrieval.cpython-313.pyc
ADDED
|
Binary file (4.92 kB). View file
|
|
|
src/__pycache__/vision_processor.cpython-313.pyc
ADDED
|
Binary file (3.65 kB). View file
|
|
|
src/cache.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
from qdrant_client import QdrantClient
|
| 4 |
+
from qdrant_client.http import models
|
| 5 |
+
from src.embeddings import get_embedding_model
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
# Load secrets
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
class SemanticCache:
    """Semantic question->answer cache backed by a Qdrant collection.

    Instead of exact string matching, cached questions are embedded and
    looked up by cosine similarity, so paraphrased questions can reuse a
    previously generated answer.
    """

    def __init__(self, collection_name: str = "pro_rag_cache"):
        self.collection_name = collection_name

        # --- CONNECTION LOGIC ---
        # Prefer Qdrant Cloud when both URL and API key are configured;
        # otherwise fall back to a local Docker instance.
        qdrant_url = os.getenv("QDRANT_URL")
        qdrant_key = os.getenv("QDRANT_API_KEY")

        if qdrant_url and qdrant_key:
            print(f"☁️ [Cache] Connecting to Qdrant Cloud...")
            self.client = QdrantClient(url=qdrant_url, api_key=qdrant_key)
        else:
            print(f"🏠 [Cache] Connecting to Local Docker...")
            self.client = QdrantClient(url="http://localhost:6333")

        self.embedding_model = get_embedding_model()
        # Minimum cosine similarity for a cached answer to be reused.
        self.threshold = 0.92

        # Initialize the cache collection. 3072 dims matches the
        # text-embedding-3-large model returned by get_embedding_model().
        try:
            if not self.client.collection_exists(collection_name):
                print(f"⚙️ Initializing Semantic Cache: '{collection_name}'...")
                self.client.create_collection(
                    collection_name=collection_name,
                    vectors_config=models.VectorParams(
                        size=3072,
                        distance=models.Distance.COSINE
                    )
                )
        except Exception as e:
            # Cache setup failure must not prevent the app from starting.
            print(f"⚠️ Cache Initialization Warning: {e}")

    def search_cache(self, query: str):
        """
        Checks if a similar question has already been answered.
        Uses the modern 'query_points' method.

        Returns the stored answer string on a hit (similarity >= threshold),
        or None on a miss or on any lookup error.
        """
        try:
            # 1. Embed the query
            vector = self.embedding_model.embed_query(query)

            # 2. Search Qdrant Cache Collection for the closest stored question
            search_result = self.client.query_points(
                collection_name=self.collection_name,
                query=vector,
                limit=1,
                with_payload=True
            ).points

            # 3. Check Threshold
            if search_result:
                best_match = search_result[0]
                if best_match.score >= self.threshold:
                    print(f"⚡ CACHE HIT! (Similarity: {best_match.score:.4f})")
                    return best_match.payload["answer"]

            score = search_result[0].score if search_result else 0
            print(f"🐢 CACHE MISS (Best match: {score:.4f})")
            return None

        except Exception as e:
            # Print error but don't crash the app — cache failures must
            # never break the main query path.
            print(f"⚠️ Cache Search Error: {e}")
            return None

    def add_to_cache(self, query: str, answer: str):
        """
        Saves the Query + Answer pair.

        Best-effort: failures are logged and swallowed so callers are
        never interrupted by cache errors.
        """
        import uuid  # local import: only needed on the write path

        try:
            vector = self.embedding_model.embed_query(query)
            # FIX: millisecond timestamps collide when two answers are cached
            # within the same millisecond, silently overwriting an entry.
            # A random UUID string is collision-free and accepted by Qdrant.
            point_id = str(uuid.uuid4())

            self.client.upsert(
                collection_name=self.collection_name,
                points=[
                    models.PointStruct(
                        id=point_id,
                        vector=vector,
                        payload={
                            "question": query,
                            "answer": answer,
                            "timestamp": time.time()
                        }
                    )
                ]
            )
        except Exception as e:
            print(f"⚠️ Failed to save to cache: {e}")
src/chunking.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 2 |
+
from langchain_core.documents import Document
|
| 3 |
+
|
| 4 |
+
class ChunkingManager:
    """Splits ingested documents into retrieval-sized chunks."""

    def __init__(self):
        # PRO STRATEGY:
        # 1000-char chunks are large enough to capture context, and the
        # 200-char overlap avoids cutting a sentence in half at a border.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", " ", ""]
        )

    def chunk_documents(self, documents: list[Document]):
        """
        Splits large documents into smaller vector-ready chunks.
        INTELLIGENCE: Skips CSV rows (structured_data) as they are already perfect.
        """
        print(f"✂️ Starting Chunking Process on {len(documents)} documents...")

        preserved = []   # structured rows pass through untouched
        splittable = []  # everything else goes through the splitter

        # Partition first: structured CSV rows were already formatted as
        # single sentences, so they must not be split.
        for doc in documents:
            if doc.metadata.get("category", "") == "structured_data":
                preserved.append(doc)
            else:
                splittable.append(doc)

        # Split text/PDF/Word pages, which can be thousands of tokens long.
        chunks = []
        for doc in splittable:
            chunks.extend(self.text_splitter.split_documents([doc]))

        # Merge results
        combined = preserved + chunks

        print(f" - CSV Rows preserved: {len(preserved)}")
        print(f" - Text Pages split into: {len(chunks)} chunks")
        print(f"✅ Total Vector-Ready Chunks: {len(combined)}")

        return combined
|
src/database.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from qdrant_client import QdrantClient
|
| 3 |
+
from qdrant_client.http import models
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
|
| 6 |
+
# Load environment variables
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
class VectorDB:
    """Thin admin wrapper around a Qdrant collection (create / reset)."""

    def __init__(self, collection_name: str = "pro_rag_container"):
        self.collection_name = collection_name

        # --- 1. CLOUD vs LOCAL LOGIC ---
        # Use Qdrant Cloud when fully configured, else local Docker.
        qdrant_url = os.getenv("QDRANT_URL")
        qdrant_key = os.getenv("QDRANT_API_KEY")

        if qdrant_url and qdrant_key:
            print("☁️ Connecting to Qdrant Cloud...")
            self.client = QdrantClient(url=qdrant_url, api_key=qdrant_key)
        else:
            print("🏠 Connecting to Local Docker...")
            self.client = QdrantClient(url="http://localhost:6333")

    def create_collection(self, vector_size: int = 3072):
        """
        Creates the collection if it doesn't exist.
        Using 3072 dimensions for OpenAI text-embedding-3-large.
        """
        # Idempotent: skip when the collection is already present.
        if self.client.collection_exists(collection_name=self.collection_name):
            print(f"ℹ️ Collection '{self.collection_name}' already exists. Skipping creation.")
            return

        print(f"⚙️ Creating collection '{self.collection_name}' with size {vector_size}...")

        # Create Collection with Cosine Similarity
        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config=models.VectorParams(
                size=vector_size,
                distance=models.Distance.COSINE
            )
        )
        print(f"✅ Collection '{self.collection_name}' created successfully!")

    def reset_database(self):
        """
        DANGEROUS: Deletes the collection and every vector in it.
        """
        self.client.delete_collection(collection_name=self.collection_name)
        print(f"⚠️ Collection '{self.collection_name}' has been DELETED.")
|
src/embeddings.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from langchain_openai import OpenAIEmbeddings
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
load_dotenv()
|
| 6 |
+
|
| 7 |
+
def get_embedding_model():
    """
    Returns the Pro-Level Embedding Model.
    Using: text-embedding-3-large (3072 dimensions)

    Raises ValueError when OPENAI_API_KEY is not configured.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if api_key is None or api_key == "":
        raise ValueError("❌ OPENAI_API_KEY not found in .env file!")

    return OpenAIEmbeddings(
        model="text-embedding-3-large",
        dimensions=3072,  # Must match Qdrant config
        openai_api_key=api_key,
    )
|
src/fix_qdrant.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
from qdrant_client import QdrantClient
|
| 4 |
+
from qdrant_client.http import models
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
def fix_database():
    """Create the payload indexes Qdrant Cloud requires for filtered search.

    Qdrant returns "Index required but not found" (400 Bad Request) when a
    filter targets a payload field that has no keyword index; this one-off
    repair script adds the indexes used by the retrieval filters.
    """
    print("🔧 Starting Database Repair for Qdrant Cloud...")

    # 1. Connect using the cloud credentials from .env
    qdrant_url = os.getenv("QDRANT_URL")
    qdrant_key = os.getenv("QDRANT_API_KEY")

    if not qdrant_url:
        print("❌ Error: QDRANT_URL not found in .env")
        return

    client = QdrantClient(url=qdrant_url, api_key=qdrant_key)
    collection_name = "pro_rag_container"

    print(f"⚙️ Optimizing collection '{collection_name}'...")

    # 2. Create a KEYWORD index for each payload field our filters use.
    #    (Refactored from two copy-pasted try blocks into one loop.)
    for field_name in ("metadata.category", "metadata.source"):
        try:
            client.create_payload_index(
                collection_name=collection_name,
                field_name=field_name,
                field_schema=models.PayloadSchemaType.KEYWORD
            )
            print(f"✅ Success: Index created on '{field_name}'.")
        except Exception as e:
            # Typically "index already exists" — safe to continue.
            print(f"ℹ️ Note: {e}")

    print("\n🎉 Database Optimization Complete. Filters will now work.")

if __name__ == "__main__":
    fix_database()
|
src/indexing.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from langchain_qdrant import QdrantVectorStore
|
| 3 |
+
from langchain_core.documents import Document
|
| 4 |
+
from src.embeddings import get_embedding_model
|
| 5 |
+
from qdrant_client import QdrantClient
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
# Load environment variables
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
class IndexerManager:
    """Embeds document chunks and uploads them into a Qdrant collection."""

    def __init__(self, collection_name: str = "pro_rag_container"):
        self.collection_name = collection_name

        # --- 1. CONNECT TO QDRANT (Cloud or Local) ---
        qdrant_url = os.getenv("QDRANT_URL")
        qdrant_key = os.getenv("QDRANT_API_KEY")

        if qdrant_url and qdrant_key:
            print("☁️ [Indexer] Connecting to Qdrant Cloud...")
            self.client = QdrantClient(url=qdrant_url, api_key=qdrant_key)
        else:
            print("🏠 [Indexer] Connecting to Local Docker...")
            self.client = QdrantClient(url="http://localhost:6333")

        self.embedding_model = get_embedding_model()

        # --- 2. INITIALIZE VECTOR STORE ---
        # LangChain wrapper that embeds documents through our model and
        # writes them through our Qdrant client.
        self.vector_store = QdrantVectorStore(
            client=self.client,
            collection_name=self.collection_name,
            embedding=self.embedding_model
        )

    def index_documents(self, documents: "list[Document]"):
        """
        Pushes documents to Qdrant in batches of 100.
        """
        print(f"🚀 Starting Indexing of {len(documents)} chunks to Qdrant...")

        batch_size = 100
        total = len(documents)

        # BATCHING LOOP
        for start in range(0, total, batch_size):
            self.vector_store.add_documents(documents[start : start + batch_size])

            # FIX: the old progress line printed `i + batch_size` unclamped,
            # so it could claim more chunks than exist (e.g. "600/550") and
            # never reported the final partial batch.
            done = min(start + batch_size, total)
            if done % 500 == 0 or done == total:
                print(f" ... Indexed {done}/{total} chunks")

        print("✅ Indexing Complete! Data is now searchable.")
|
src/ingestion.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.vision_processor import VisionProcessor
|
| 2 |
+
import os
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
|
| 5 |
+
from langchain_core.documents import Document
|
| 6 |
+
|
| 7 |
+
class IngestionManager:
    """Loads every supported file type from the data folders into Documents."""

    def __init__(self, data_path: str = "data"):
        self.data_path = data_path

        # Map each ingestion strategy to its folder on disk.
        self.folders = {
            "text_pdfs": os.path.join(data_path, "text_pdfs"),
            "visual_pdfs": os.path.join(data_path, "visual_pdfs"),
            "excel_csv": os.path.join(data_path, "excel_csv"),
            "reference_docs": os.path.join(data_path, "reference_docs"),
        }

    def process_all_data(self):
        """Run every loader and return the combined list of Documents."""
        all_documents = []
        print(f"\n📂 Starting Ingestion Scan in '{self.data_path}'...")

        # 1. Text PDFs
        all_documents.extend(self._load_text_pdfs())

        # 2. Word Docs
        all_documents.extend(self._load_word_docs())

        # 3. CSVs
        all_documents.extend(self._load_structured_data())

        # 4. Visual PDFs (GPT-4o vision descriptions)
        folder = self.folders["visual_pdfs"]
        if os.path.exists(folder):
            vp = VisionProcessor()
            for filename in os.listdir(folder):
                if filename.endswith(".pdf"):
                    filepath = os.path.join(folder, filename)
                    all_documents.extend(vp.process_visual_pdf(filepath))

        print(f"✅ Ingestion Complete. Total Documents Processed: {len(all_documents)}")
        return all_documents

    def _load_text_pdfs(self):
        """
        Strategy: Use PyPDF for high-speed text extraction.
        Good for Annual Reports where text density is high.
        """
        folder = self.folders["text_pdfs"]
        documents = []

        if not os.path.exists(folder):
            print(f"⚠️ Folder not found: {folder}")
            return []

        print(f" scan: {folder}...")
        for filename in os.listdir(folder):
            if filename.endswith(".pdf"):
                filepath = os.path.join(folder, filename)
                try:
                    # PyPDF attaches page-number metadata automatically.
                    loader = PyPDFLoader(filepath)
                    pages = loader.load()

                    # Add custom metadata tags used by retrieval filters.
                    for page in pages:
                        page.metadata["category"] = "financial_report"
                        page.metadata["source_type"] = "pdf_text"

                    documents.extend(pages)
                    # FIX: these messages printed the literal "(unknown)"
                    # instead of the file being processed.
                    print(f" -> Loaded: {filename} ({len(pages)} pages)")
                except Exception as e:
                    print(f" ❌ Error loading {filename}: {e}")
        return documents

    def _load_word_docs(self):
        """
        Strategy: Use Unstructured for .docx files.
        """
        folder = self.folders["reference_docs"]
        documents = []

        if not os.path.exists(folder):
            return []

        print(f" scan: {folder}...")
        for filename in os.listdir(folder):
            if filename.endswith((".docx", ".doc")):
                filepath = os.path.join(folder, filename)
                try:
                    loader = UnstructuredWordDocumentLoader(filepath)
                    docs = loader.load()
                    for d in docs:
                        d.metadata["category"] = "reference"
                    documents.extend(docs)
                    # FIX: report the actual filename, not "(unknown)".
                    print(f" -> Loaded: {filename}")
                except Exception as e:
                    print(f" ❌ Error loading {filename}: {e}")
        return documents

    def _load_structured_data(self):
        """
        Strategy: Convert CSV Rows to 'Natural Language Sentences'.
        Why: Vector DBs understand sentences, not raw CSV rows.
        """
        folder = self.folders["excel_csv"]
        documents = []

        if not os.path.exists(folder):
            return []

        print(f" scan: {folder}...")
        for filename in os.listdir(folder):
            if filename.endswith(".csv"):
                filepath = os.path.join(folder, filename)
                try:
                    # Load CSV into Pandas
                    df = pd.read_csv(filepath)

                    # PRO LOGIC: Convert Row to Sentence
                    # Example: Row(ID=1, Sales=500) -> "Record ID 1 has Sales of 500."
                    for index, row in df.iterrows():
                        # FIX: embed the source filename (was the literal
                        # "(unknown)") so the sentence identifies its origin.
                        content = f"Data Record from {filename}: " + ", ".join(
                            [f"{col}: {val}" for col, val in row.items()]
                        )

                        documents.append(Document(
                            page_content=content,
                            metadata={
                                "source": filename,
                                "row_index": index,
                                "category": "structured_data"
                            }
                        ))

                    print(f" -> Loaded: {filename} ({len(df)} rows converted to vectors)")
                except Exception as e:
                    print(f" ❌ Error loading {filename}: {e}")
        return documents
|
src/retrieval.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from langchain_qdrant import QdrantVectorStore
|
| 3 |
+
from langchain_openai import ChatOpenAI
|
| 4 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 5 |
+
from langchain_core.runnables import RunnablePassthrough
|
| 6 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 7 |
+
from qdrant_client import QdrantClient
|
| 8 |
+
from qdrant_client.http import models
|
| 9 |
+
from src.embeddings import get_embedding_model
|
| 10 |
+
from src.cache import SemanticCache
|
| 11 |
+
from dotenv import load_dotenv
|
| 12 |
+
|
| 13 |
+
# Load environment variables
|
| 14 |
+
load_dotenv()
|
| 15 |
+
|
| 16 |
+
class RetrievalEngine:
    """RAG query engine: semantic cache -> filtered vector search -> GPT-4o."""

    def __init__(self, collection_name: str = "pro_rag_container"):

        # --- 1. CONNECT TO QDRANT (Cloud or Local) ---
        qdrant_url = os.getenv("QDRANT_URL")
        qdrant_key = os.getenv("QDRANT_API_KEY")

        if qdrant_url and qdrant_key:
            print("☁️ [Retrieval] Connecting to Qdrant Cloud...")
            self.client = QdrantClient(url=qdrant_url, api_key=qdrant_key)
        else:
            print("🏠 [Retrieval] Connecting to Local Docker...")
            self.client = QdrantClient(url="http://localhost:6333")

        embedding_model = get_embedding_model()

        # --- 2. INITIALIZE VECTOR STORE ---
        self.vector_store = QdrantVectorStore(
            client=self.client,
            collection_name=collection_name,
            embedding=embedding_model
        )

        # --- 3. SETUP LLM & CACHE ---
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0)
        self.cache = SemanticCache()

        self.prompt_template = ChatPromptTemplate.from_template("""
        You are an expert Enterprise Assistant.

        STRICT INSTRUCTIONS:
        1. Use the Context below to answer the user.
        2. If the user asks for a specific "ID" or "Row", look for it in the context.
        3. If the user asks for a "Summary" or "Total" (like Revenue), explain that you can only see a sample of the data, but summarize what you see in the provided rows.
        4. Ignore irrelevant chunks.

        Context:
        {context}

        User Question:
        {question}

        Answer:
        """)

    def query(self, question: str, filter_type: str = "all"):
        """Answer a question, optionally restricted to one data category.

        filter_type: "all" (default), "pdf", "visual" or "csv".
        Returns the answer string (possibly served from the semantic cache).
        """
        print(f"\n🔎 Processing: '{question}' (Filter: {filter_type.upper()})...")

        # 1. CHECK CACHE FIRST
        cached_answer = self.cache.search_cache(question)
        if cached_answer:
            return f"{cached_answer} \n\n(🚀 Served from Cache)"

        # 2. CONSTRUCT FILTER — map the UI filter name to the metadata
        # category tag written at ingestion time.
        category_map = {
            "pdf": "financial_report",
            "visual": "visual_data",
            "csv": "structured_data",
        }
        qdrant_filter = None
        if filter_type in category_map:
            qdrant_filter = models.Filter(must=[
                models.FieldCondition(
                    key="metadata.category",
                    match=models.MatchValue(value=category_map[filter_type])
                )
            ])

        # 3. PERFORM SEARCH
        retriever = self.vector_store.as_retriever(
            search_kwargs={
                "k": 10,
                "filter": qdrant_filter
            }
        )

        docs = retriever.invoke(question)
        print(f" -> Found {len(docs)} chunks.")

        if not docs:
            return "❌ No relevant data found in this category."

        # 4. GENERATE ANSWER
        # FIX: the old chain passed the retriever itself as "context", which
        # ran a second, redundant vector search. Reuse the docs we already
        # retrieved and format them as plain text for the prompt.
        context = "\n\n".join(doc.page_content for doc in docs)
        chain = self.prompt_template | self.llm | StrOutputParser()

        print("🤖 Generating Answer via GPT-4o...")
        answer = chain.invoke({"context": context, "question": question})

        # 5. SAVE TO CACHE
        self.cache.add_to_cache(question, answer)

        return answer
|
src/vision_processor.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import base64
|
| 3 |
+
from pdf2image import convert_from_path
|
| 4 |
+
from langchain_core.documents import Document
|
| 5 |
+
from langchain_openai import ChatOpenAI
|
| 6 |
+
from langchain_core.messages import HumanMessage
|
| 7 |
+
|
| 8 |
+
class VisionProcessor:
    """Describes visual PDF pages with GPT-4o vision and wraps them as Documents."""

    def __init__(self):
        self.vision_model = ChatOpenAI(model="gpt-4o", max_tokens=1024)

        # PRO FIX: Point pdf2image at the bundled Poppler binaries.
        # This assumes the 'poppler' folder is in the project root.
        self.poppler_path = os.path.join(os.getcwd(), "poppler", "Library", "bin")

    def process_visual_pdf(self, pdf_path, max_pages: int = 3):
        """Convert up to `max_pages` PDF pages to images and describe each.

        Returns a list of Documents tagged category="visual_data"; returns
        an empty list when Poppler is missing or conversion fails.
        The page limit is configurable (default 3 for POC cost safety).
        """
        import io  # FIX: was re-imported on every loop iteration

        print(f" 👁️ Processing Visual PDF: {os.path.basename(pdf_path)}...")
        documents = []

        try:
            # Bail out early when the bundled Poppler is not present.
            if not os.path.exists(self.poppler_path):
                print(f" ❌ Error: Poppler not found at {self.poppler_path}")
                return []

            # 1. Convert PDF pages to JPEG images (using local poppler)
            images = convert_from_path(pdf_path, fmt="jpeg", poppler_path=self.poppler_path)
            print(f" -> Extracted {len(images)} images (pages) from PDF.")

            # 2. Analyze only the first pages (cost-saving mode).
            for i, img in enumerate(images[:max_pages]):
                print(f" -> Analyzing Page {i+1} with GPT-4o Vision...")

                # Base64-encode the rendered page for the vision API.
                buffered = io.BytesIO()
                img.save(buffered, format="JPEG")
                img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")

                # 3. Ask GPT-4o to describe / transcribe the page.
                response = self.vision_model.invoke(
                    [
                        HumanMessage(
                            content=[
                                {"type": "text", "text": "Describe this image in detail. If it is a graph, extract the data points. If it is a table, transcribe it."},
                                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_str}"}},
                            ]
                        )
                    ]
                )

                documents.append(Document(
                    page_content=f"IMAGE DESCRIPTION (Page {i+1}): {response.content}",
                    metadata={
                        "source": os.path.basename(pdf_path),
                        "page": i + 1,
                        "category": "visual_data"
                    }
                ))

            if len(images) > max_pages:
                print(f" ℹ️ Limited to first {max_pages} pages for POC cost safety.")

        except Exception as e:
            # Best-effort: a vision failure must not abort the whole ingest.
            print(f" ❌ Vision Error: {e}")

        return documents
|