Spaces:
Sleeping
Sleeping
Commit ·
51277f6
1
Parent(s): 37f0716
upgrade it to flask
Browse files- Dockerfile +15 -14
- app.py +250 -432
- git +0 -0
- requirements.txt +1 -0
Dockerfile
CHANGED
|
@@ -1,32 +1,33 @@
|
|
| 1 |
-
# Use a standard Python 3.10 image
|
| 2 |
FROM python:3.10-slim
|
| 3 |
|
| 4 |
-
# Install system dependencies for OpenCV
|
| 5 |
-
RUN apt-get update && apt-get install -y
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
# Set the working directory inside the container
|
| 10 |
WORKDIR /code
|
| 11 |
|
| 12 |
-
# Set
|
| 13 |
ENV DOCTR_CACHE_DIR="/code/.cache/doctr"
|
| 14 |
ENV HF_HOME="/code/.cache/huggingface"
|
| 15 |
|
| 16 |
-
#
|
| 17 |
RUN mkdir -p /code/.cache/doctr /code/.cache/huggingface && \
|
| 18 |
chmod 777 -R /code/.cache
|
| 19 |
|
| 20 |
-
# Copy
|
| 21 |
COPY requirements.txt .
|
| 22 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 23 |
|
| 24 |
-
# Copy
|
| 25 |
COPY . .
|
| 26 |
|
| 27 |
-
# Expose
|
| 28 |
EXPOSE 7860
|
| 29 |
|
| 30 |
-
#
|
| 31 |
-
|
| 32 |
-
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
|
|
|
| 1 |
FROM python:3.10-slim
|
| 2 |
|
| 3 |
+
# Install system dependencies for OpenCV
|
| 4 |
+
RUN apt-get update && apt-get install -y \
|
| 5 |
+
libgl1 \
|
| 6 |
+
libglib2.0-0 \
|
| 7 |
+
libsm6 \
|
| 8 |
+
libxext6 \
|
| 9 |
+
&& apt-get clean \
|
| 10 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
|
|
|
| 12 |
WORKDIR /code
|
| 13 |
|
| 14 |
+
# Set cache directories
|
| 15 |
ENV DOCTR_CACHE_DIR="/code/.cache/doctr"
|
| 16 |
ENV HF_HOME="/code/.cache/huggingface"
|
| 17 |
|
| 18 |
+
# Create and set permissions for cache directories
|
| 19 |
RUN mkdir -p /code/.cache/doctr /code/.cache/huggingface && \
|
| 20 |
chmod 777 -R /code/.cache
|
| 21 |
|
| 22 |
+
# Copy and install requirements
|
| 23 |
COPY requirements.txt .
|
| 24 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 25 |
|
| 26 |
+
# Copy application files
|
| 27 |
COPY . .
|
| 28 |
|
| 29 |
+
# Expose port
|
| 30 |
EXPOSE 7860
|
| 31 |
|
| 32 |
+
# Run the Flask+Gradio app
|
| 33 |
+
CMD ["python", "app.py"]
|
|
|
app.py
CHANGED
|
@@ -9,91 +9,56 @@ import os
|
|
| 9 |
from groq import Groq
|
| 10 |
import base64
|
| 11 |
from io import BytesIO
|
| 12 |
-
import fitz
|
| 13 |
-
from pathlib import Path
|
| 14 |
import time
|
| 15 |
import shutil
|
| 16 |
-
import tempfile
|
| 17 |
|
| 18 |
-
#
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
from typing import List, Dict, Any
|
| 25 |
from langchain_qdrant import Qdrant
|
| 26 |
from qdrant_client import QdrantClient
|
| 27 |
|
| 28 |
# -------------------------------
|
| 29 |
-
#
|
| 30 |
# -------------------------------
|
| 31 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 32 |
-
|
| 33 |
-
print(f"Loading OCR model to {device}...")
|
| 34 |
-
ocr_model = ocr_predictor(pretrained=True).to(device)
|
| 35 |
-
print("✅ OCR model loaded.")
|
| 36 |
-
except Exception as e:
|
| 37 |
-
print(f"❌ Failed to load OCR model: {e}")
|
| 38 |
-
ocr_model = None
|
| 39 |
-
|
| 40 |
-
print("Loading Embedding model...")
|
| 41 |
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
# Initialize Groq client
|
| 45 |
-
if not os.environ.get("GROQ_API_KEY"):
|
| 46 |
-
print("⚠️ WARNING: GROQ_API_KEY environment variable not set.")
|
| 47 |
-
groq_client = None
|
| 48 |
-
else:
|
| 49 |
-
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
|
| 50 |
|
| 51 |
-
# Model configurations
|
| 52 |
VISION_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
|
| 53 |
LLM_MODEL = "llama-3.3-70b-versatile"
|
| 54 |
|
| 55 |
-
|
| 56 |
-
# 1b ☁️ NEW: Qdrant Cloud Configuration
|
| 57 |
-
# -------------------------------
|
| 58 |
-
QDRANT_URL = os.environ.get("QDRANT_URL", "https://bdf142ef-7e2a-433b-87a0-301ff303e3af.us-east4-0.gcp.cloud.qdrant.io:6333")
|
| 59 |
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
|
| 60 |
COLLECTION_NAME = "multimodal_rag_store"
|
| 61 |
|
| 62 |
-
# --- Helper Functions (2 to 7) ---
|
| 63 |
-
|
| 64 |
# -------------------------------
|
| 65 |
-
#
|
| 66 |
# -------------------------------
|
| 67 |
def has_substantial_text(text, min_words=10):
|
| 68 |
-
"""
|
| 69 |
-
Determines if OCR extracted enough text to consider it a text-based image.
|
| 70 |
-
"""
|
| 71 |
words = text.split()
|
| 72 |
return len(words) >= min_words
|
| 73 |
|
| 74 |
-
|
| 75 |
-
# -------------------------------
|
| 76 |
-
# 3️⃣ Vision Analysis using Groq Llama 4 Scout
|
| 77 |
-
# -------------------------------
|
| 78 |
def analyze_image_with_vision(img_path=None, img_bytes=None, pil_image=None, max_retries=3):
|
| 79 |
-
if not groq_client:
|
| 80 |
-
return ""
|
| 81 |
-
|
| 82 |
for attempt in range(max_retries):
|
| 83 |
try:
|
| 84 |
-
img_data = None
|
| 85 |
-
img_format = "png"
|
| 86 |
-
|
| 87 |
if pil_image:
|
| 88 |
buffered = BytesIO()
|
| 89 |
pil_image.save(buffered, format="PNG")
|
| 90 |
img_data = buffered.getvalue()
|
|
|
|
| 91 |
elif img_path:
|
| 92 |
with open(img_path, "rb") as img_file:
|
| 93 |
img_data = img_file.read()
|
| 94 |
img_format = img_path.lower().split('.')[-1]
|
| 95 |
elif img_bytes:
|
| 96 |
img_data = img_bytes
|
|
|
|
| 97 |
else:
|
| 98 |
return ""
|
| 99 |
|
|
@@ -104,19 +69,19 @@ def analyze_image_with_vision(img_path=None, img_bytes=None, pil_image=None, max
|
|
| 104 |
vision_prompt = """Analyze this image carefully and provide a detailed description:
|
| 105 |
1. IDENTIFY THE TYPE: Is this a chart, graph, table, diagram, photograph, or text document?
|
| 106 |
2. IF IT'S A CHART/GRAPH/TABLE:
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
3. IF IT'S A PHOTOGRAPH/DIAGRAM:
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
4. IF IT'S A TEXT DOCUMENT:
|
| 117 |
-
|
| 118 |
Provide a comprehensive description suitable for semantic search. Be specific and detailed."""
|
| 119 |
-
|
| 120 |
chat_completion = groq_client.chat.completions.create(
|
| 121 |
messages=[
|
| 122 |
{
|
|
@@ -145,25 +110,13 @@ Provide a comprehensive description suitable for semantic search. Be specific an
|
|
| 145 |
continue
|
| 146 |
return ""
|
| 147 |
except Exception as e:
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
else:
|
| 153 |
-
if attempt < max_retries - 1:
|
| 154 |
-
time.sleep(2)
|
| 155 |
-
continue
|
| 156 |
-
return ""
|
| 157 |
return ""
|
| 158 |
|
| 159 |
-
|
| 160 |
-
# -------------------------------
|
| 161 |
-
# 4️⃣ Smart OCR/Vision Extraction for Images
|
| 162 |
-
# -------------------------------
|
| 163 |
def extract_text_from_image(img_path):
|
| 164 |
-
if not ocr_model:
|
| 165 |
-
return analyze_image_with_vision(img_path=img_path)
|
| 166 |
-
|
| 167 |
try:
|
| 168 |
image = Image.open(img_path).convert("RGB")
|
| 169 |
image_np = np.array(image)
|
|
@@ -177,90 +130,76 @@ def extract_text_from_image(img_path):
|
|
| 177 |
ocr_text = "\n".join(text)
|
| 178 |
|
| 179 |
if has_substantial_text(ocr_text, min_words=10):
|
| 180 |
-
print(f"π {os.path.basename(img_path)}: Using OCR
|
| 181 |
return ocr_text
|
| 182 |
else:
|
| 183 |
-
print(f"πΌοΈ {os.path.basename(img_path)}: Using Vision Model
|
| 184 |
vision_summary = analyze_image_with_vision(img_path=img_path)
|
| 185 |
-
return vision_summary if vision_summary else ocr_text
|
| 186 |
except Exception as e:
|
| 187 |
print(f"❌ Error processing {img_path}: {e}")
|
| 188 |
return ""
|
| 189 |
|
| 190 |
-
|
| 191 |
-
# -------------------------------
|
| 192 |
-
# 5️⃣ Extract Text from Plain Text Files
|
| 193 |
-
# -------------------------------
|
| 194 |
def extract_text_from_txt(file_path):
|
| 195 |
try:
|
| 196 |
with open(file_path, 'r', encoding='utf-8') as f:
|
| 197 |
text = f.read()
|
| 198 |
-
print(f"π {os.path.basename(file_path)}: Extracted text
|
| 199 |
return text
|
| 200 |
except Exception as e:
|
| 201 |
print(f"❌ Error reading text file {file_path}: {e}")
|
| 202 |
return ""
|
| 203 |
|
| 204 |
-
|
| 205 |
-
# -------------------------------
|
| 206 |
-
# 6️⃣ Extract Content from PDFs with Vision Analysis
|
| 207 |
-
# -------------------------------
|
| 208 |
def extract_content_from_pdf(pdf_path):
|
| 209 |
try:
|
| 210 |
doc = fitz.open(pdf_path)
|
| 211 |
all_content = []
|
|
|
|
| 212 |
for page_num, page in enumerate(doc, 1):
|
| 213 |
page_content = []
|
| 214 |
-
|
| 215 |
-
# 1. Extract text content
|
| 216 |
text = page.get_text()
|
|
|
|
| 217 |
if text.strip():
|
| 218 |
page_content.append(f"[Page {page_num} - Text Content]\n{text}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
-
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
try:
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
image_list = page.get_images(full=True)
|
| 238 |
-
for img_index, img_info in enumerate(image_list, 1):
|
| 239 |
-
try:
|
| 240 |
-
xref = img_info[0]
|
| 241 |
-
base_image = doc.extract_image(xref)
|
| 242 |
-
image_bytes = base_image["image"]
|
| 243 |
-
image = Image.open(BytesIO(image_bytes)).convert("RGB")
|
| 244 |
-
image_np = np.array(image)
|
| 245 |
-
result = ocr_model([image_np])
|
| 246 |
-
ocr_text = []
|
| 247 |
-
for ocr_page in result.pages:
|
| 248 |
-
for block in ocr_page.blocks:
|
| 249 |
-
for line in block.lines:
|
| 250 |
-
line_text = " ".join([word.value for word in line.words])
|
| 251 |
-
ocr_text.append(line_text)
|
| 252 |
-
extracted_text = "\n".join(ocr_text)
|
| 253 |
-
|
| 254 |
-
if has_substantial_text(extracted_text, min_words=10):
|
| 255 |
-
page_content.append(f"[Page {page_num} - Embedded Image {img_index} OCR]\n{extracted_text}")
|
| 256 |
-
else:
|
| 257 |
-
vision_summary = analyze_image_with_vision(img_bytes=image_bytes)
|
| 258 |
-
if vision_summary:
|
| 259 |
-
page_content.append(
|
| 260 |
-
f"[Page {page_num} - Embedded Image {img_index} Analysis]\n{vision_summary}")
|
| 261 |
-
|
| 262 |
-
except Exception:
|
| 263 |
-
continue
|
| 264 |
|
| 265 |
if page_content:
|
| 266 |
combined_page = "\n\n---SECTION BREAK---\n\n".join(page_content)
|
|
@@ -269,23 +208,18 @@ def extract_content_from_pdf(pdf_path):
|
|
| 269 |
doc.close()
|
| 270 |
final_content = "\n\n---PAGE BREAK---\n\n".join(all_content)
|
| 271 |
return final_content
|
| 272 |
-
|
| 273 |
except Exception as e:
|
| 274 |
print(f"❌ Error processing PDF {pdf_path}: {e}")
|
| 275 |
return ""
|
| 276 |
|
| 277 |
-
|
| 278 |
-
# -------------------------------
|
| 279 |
-
# 7️⃣ Process All Document Types for folder build
|
| 280 |
-
# -------------------------------
|
| 281 |
def create_documents_from_folder(folder_path):
|
| 282 |
docs = []
|
| 283 |
for root, dirs, files in os.walk(folder_path):
|
| 284 |
for filename in files:
|
| 285 |
full_path = os.path.join(root, filename)
|
| 286 |
file_ext = filename.lower().split('.')[-1]
|
| 287 |
-
text = ""
|
| 288 |
|
|
|
|
| 289 |
if file_ext in ["jpg", "jpeg", "png"]:
|
| 290 |
text = extract_text_from_image(full_path)
|
| 291 |
elif file_ext in ["txt", "md"]:
|
|
@@ -294,7 +228,7 @@ def create_documents_from_folder(folder_path):
|
|
| 294 |
text = extract_content_from_pdf(full_path)
|
| 295 |
else:
|
| 296 |
continue
|
| 297 |
-
|
| 298 |
if text.strip():
|
| 299 |
relative_path = os.path.relpath(full_path, folder_path)
|
| 300 |
doc = Document(
|
|
@@ -307,22 +241,16 @@ def create_documents_from_folder(folder_path):
|
|
| 307 |
}
|
| 308 |
)
|
| 309 |
docs.append(doc)
|
| 310 |
-
|
| 311 |
return docs
|
| 312 |
|
| 313 |
-
# --- Core RAG/DB Functions (8 to 12) ---
|
| 314 |
-
|
| 315 |
-
# -------------------------------
|
| 316 |
-
# 8️⃣ Build or Update QDRANT Store
|
| 317 |
-
# -------------------------------
|
| 318 |
def build_or_update_qdrant_store(folder_path):
|
| 319 |
-
|
| 320 |
-
return None
|
| 321 |
-
|
| 322 |
docs = create_documents_from_folder(folder_path)
|
| 323 |
if not docs:
|
|
|
|
| 324 |
return None
|
| 325 |
-
|
| 326 |
try:
|
| 327 |
vector_store = Qdrant.from_documents(
|
| 328 |
docs,
|
|
@@ -332,26 +260,15 @@ def build_or_update_qdrant_store(folder_path):
|
|
| 332 |
collection_name=COLLECTION_NAME,
|
| 333 |
force_recreate=True
|
| 334 |
)
|
| 335 |
-
print(f"β
|
| 336 |
return vector_store
|
| 337 |
except Exception as e:
|
| 338 |
-
print(f"β Error
|
| 339 |
return None
|
| 340 |
|
| 341 |
-
|
| 342 |
-
# -------------------------------
|
| 343 |
-
# 9️⃣ Query QDRANT Function with Chart-Aware Re-ranking
|
| 344 |
-
# -------------------------------
|
| 345 |
def query_qdrant_store(query_text, k=3):
|
| 346 |
-
if not QDRANT_API_KEY:
|
| 347 |
-
return []
|
| 348 |
-
|
| 349 |
try:
|
| 350 |
-
client = QdrantClient(
|
| 351 |
-
url=QDRANT_URL,
|
| 352 |
-
api_key=QDRANT_API_KEY,
|
| 353 |
-
timeout=20
|
| 354 |
-
)
|
| 355 |
vector_store = Qdrant(
|
| 356 |
client=client,
|
| 357 |
collection_name=COLLECTION_NAME,
|
|
@@ -360,7 +277,7 @@ def query_qdrant_store(query_text, k=3):
|
|
| 360 |
except Exception as e:
|
| 361 |
print(f"❌ Error connecting to Qdrant: {e}")
|
| 362 |
return []
|
| 363 |
-
|
| 364 |
initial_k = k * 3
|
| 365 |
results = vector_store.similarity_search_with_score(query_text, k=initial_k)
|
| 366 |
|
|
@@ -371,26 +288,15 @@ def query_qdrant_store(query_text, k=3):
|
|
| 371 |
reranked_results = []
|
| 372 |
for doc, score in results:
|
| 373 |
boost = 0.0
|
| 374 |
-
if "Visual Analysis]" in doc.page_content
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
if 'bar chart' in query_text.lower() and 'bar chart' in visual_content:
|
| 378 |
-
boost += 1.0
|
| 379 |
-
elif 'pie chart' in query_text.lower() and 'pie chart' in visual_content:
|
| 380 |
-
boost += 1.0
|
| 381 |
-
elif any(kw in query_text.lower() for kw in ['chart', 'graph']) and any(kw in visual_content for kw in ['chart', 'graph', 'plot', 'diagram', 'table']):
|
| 382 |
-
boost += 0.5
|
| 383 |
-
else:
|
| 384 |
-
boost += 0.2
|
| 385 |
-
|
| 386 |
adjusted_score = score - boost
|
| 387 |
-
reranked_results.append((doc, adjusted_score
|
| 388 |
-
|
| 389 |
reranked_results.sort(key=lambda x: x[1])
|
| 390 |
-
results =
|
| 391 |
else:
|
| 392 |
results = results[:k]
|
| 393 |
-
|
| 394 |
retrieved_docs = []
|
| 395 |
for doc, score in results:
|
| 396 |
retrieved_docs.append({
|
|
@@ -401,54 +307,34 @@ def query_qdrant_store(query_text, k=3):
|
|
| 401 |
})
|
| 402 |
return retrieved_docs
|
| 403 |
|
| 404 |
-
|
| 405 |
-
# -------------------------------
|
| 406 |
-
# 10️⃣ Answer Question using Llama 3.3 70B
|
| 407 |
-
# -------------------------------
|
| 408 |
def answer_question_with_llm(query_text, retrieved_docs, max_tokens=1000):
|
| 409 |
-
if not groq_client:
|
| 410 |
-
return "❌ Groq client not initialized. Cannot generate answer."
|
| 411 |
if not retrieved_docs:
|
| 412 |
-
return "β No relevant documents found
|
| 413 |
-
|
| 414 |
context_parts = []
|
| 415 |
for i, doc in enumerate(retrieved_docs, 1):
|
| 416 |
source = doc['source']
|
| 417 |
content = doc['content']
|
| 418 |
-
metadata = doc['metadata']
|
| 419 |
-
timestamp = metadata.get('upload_timestamp')
|
| 420 |
-
|
| 421 |
-
readable_time = time.ctime(float(timestamp)) if timestamp else "N/A"
|
| 422 |
-
|
| 423 |
-
metadata_str = (
|
| 424 |
-
f"Source: {source}\n"
|
| 425 |
-
f"File Type: {metadata.get('file_type', 'N/A')}\n"
|
| 426 |
-
f"Uploaded/Modified: {readable_time}"
|
| 427 |
-
)
|
| 428 |
|
| 429 |
max_content_length = 2500
|
| 430 |
if len(content) > max_content_length:
|
| 431 |
content = content[:max_content_length] + "...[truncated]"
|
| 432 |
-
|
| 433 |
-
context_parts.append(
|
| 434 |
-
f"--- Document {i} ---\n"
|
| 435 |
-
f"[METADATA]:\n{metadata_str}\n\n"
|
| 436 |
-
f"[CONTENT]:\n{content}\n"
|
| 437 |
-
)
|
| 438 |
|
|
|
|
|
|
|
| 439 |
context = "\n".join(context_parts)
|
| 440 |
|
| 441 |
system_prompt = """You are a concise AI assistant. Answer the user's question *only* using the provided documents.
|
| 442 |
- Be brief and to the point.
|
| 443 |
-
- If the answer is not in the documents
|
| 444 |
-
|
| 445 |
user_prompt = f"""DOCUMENTS:
|
| 446 |
{context}
|
| 447 |
|
| 448 |
QUESTION: {query_text}
|
| 449 |
|
| 450 |
-
ANSWER:
|
| 451 |
-
|
| 452 |
try:
|
| 453 |
response = groq_client.chat.completions.create(
|
| 454 |
model=LLM_MODEL,
|
|
@@ -458,45 +344,30 @@ ANSWER: (Provide a concise answer based *only* on the documents)"""
|
|
| 458 |
],
|
| 459 |
temperature=0.2,
|
| 460 |
max_tokens=max_tokens,
|
| 461 |
-
top_p=0.9,
|
| 462 |
)
|
| 463 |
-
|
| 464 |
-
return answer
|
| 465 |
except Exception as e:
|
| 466 |
-
return f"β Error
|
| 467 |
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
# -------------------------------
|
| 471 |
-
def get_rag_response(query_text: str, k: int = 3) -> Dict[str, Any]:
|
| 472 |
-
"""Core RAG pipeline: retrieves, generates, and formats response."""
|
| 473 |
-
print(f"β QUERY: {query_text}")
|
| 474 |
retrieved_docs = query_qdrant_store(query_text, k=k)
|
| 475 |
|
| 476 |
if not retrieved_docs:
|
| 477 |
return {
|
| 478 |
-
"answer": "β No relevant documents found
|
| 479 |
"sources": []
|
| 480 |
}
|
| 481 |
|
| 482 |
answer = answer_question_with_llm(query_text, retrieved_docs)
|
|
|
|
| 483 |
|
| 484 |
-
|
| 485 |
-
{"source": doc['source'], "score": doc['score']} for doc in retrieved_docs
|
| 486 |
-
]
|
| 487 |
-
|
| 488 |
-
response_data = {
|
| 489 |
"answer": answer,
|
| 490 |
"sources": sources_list
|
| 491 |
}
|
| 492 |
-
|
| 493 |
-
return response_data
|
| 494 |
|
| 495 |
-
|
| 496 |
-
# 12️⃣ Core File Processing & Qdrant Addition
|
| 497 |
-
# -------------------------------
|
| 498 |
-
def process_single_file(file_path: str, filename: str) -> Document:
|
| 499 |
-
"""Processes a single file and returns a LangChain Document."""
|
| 500 |
file_ext = filename.lower().split('.')[-1]
|
| 501 |
text = ""
|
| 502 |
|
|
@@ -506,6 +377,8 @@ def process_single_file(file_path: str, filename: str) -> Document:
|
|
| 506 |
text = extract_text_from_txt(file_path)
|
| 507 |
elif file_ext == "pdf":
|
| 508 |
text = extract_content_from_pdf(file_path)
|
|
|
|
|
|
|
| 509 |
|
| 510 |
if text.strip():
|
| 511 |
doc = Document(
|
|
@@ -517,14 +390,14 @@ def process_single_file(file_path: str, filename: str) -> Document:
|
|
| 517 |
"upload_timestamp": time.time()
|
| 518 |
}
|
| 519 |
)
|
|
|
|
| 520 |
return doc
|
| 521 |
return None
|
| 522 |
|
| 523 |
-
def add_documents_to_qdrant(docs
|
| 524 |
-
|
| 525 |
-
if not QDRANT_API_KEY or not docs:
|
| 526 |
return
|
| 527 |
-
|
| 528 |
try:
|
| 529 |
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
|
| 530 |
vector_store = Qdrant(
|
|
@@ -533,233 +406,178 @@ def add_documents_to_qdrant(docs: List[Document]):
|
|
| 533 |
embeddings=embedding_model
|
| 534 |
)
|
| 535 |
vector_store.add_documents(docs)
|
| 536 |
-
print(f"β
|
| 537 |
except Exception as e:
|
| 538 |
-
print(f"β Error adding
|
| 539 |
-
raise
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
# -------------------------------
|
| 543 |
-
# π 14. Gradio UI Setup
|
| 544 |
-
# -------------------------------
|
| 545 |
-
def create_gradio_ui():
|
| 546 |
-
"""
|
| 547 |
-
Creates the Gradio Blocks UI.
|
| 548 |
-
"""
|
| 549 |
-
|
| 550 |
-
def gradio_chat_response_func(message, history):
|
| 551 |
-
"""
|
| 552 |
-
The function that Gradio's ChatInterface will call.
|
| 553 |
-
"""
|
| 554 |
-
response_data = get_rag_response(message, k=3)
|
| 555 |
-
answer = response_data['answer']
|
| 556 |
-
sources = response_data['sources']
|
| 557 |
-
|
| 558 |
-
sources_md = "\n\n---\n**π Sources Used:**\n"
|
| 559 |
-
for i, doc in enumerate(sources, 1):
|
| 560 |
-
sources_md += f"* **{doc['source']}** (Score: {doc['score']:.4f})\n"
|
| 561 |
-
|
| 562 |
-
final_response = answer + sources_md
|
| 563 |
-
return final_response
|
| 564 |
-
|
| 565 |
-
def gradio_upload_func(file_list):
|
| 566 |
-
"""
|
| 567 |
-
The function that Gradio's Upload button will call.
|
| 568 |
-
"""
|
| 569 |
-
if not file_list:
|
| 570 |
-
return "No files uploaded."
|
| 571 |
-
|
| 572 |
-
print("\n" + "=" * 60)
|
| 573 |
-
print("NEW GRADIO UPLOAD DETECTED: Processing files...")
|
| 574 |
-
print("=" * 60)
|
| 575 |
-
|
| 576 |
-
docs_to_add = []
|
| 577 |
-
processed_count = 0
|
| 578 |
-
failed_count = 0
|
| 579 |
-
|
| 580 |
-
for file_obj in file_list:
|
| 581 |
-
full_path = file_obj.name
|
| 582 |
-
filename = os.path.basename(full_path)
|
| 583 |
-
|
| 584 |
-
try:
|
| 585 |
-
doc = process_single_file(full_path, filename)
|
| 586 |
-
if doc:
|
| 587 |
-
docs_to_add.append(doc)
|
| 588 |
-
processed_count += 1
|
| 589 |
-
else:
|
| 590 |
-
failed_count += 1
|
| 591 |
-
except Exception as e:
|
| 592 |
-
print(f"❌ Error processing file {filename} from Gradio: {e}")
|
| 593 |
-
failed_count += 1
|
| 594 |
-
|
| 595 |
-
if docs_to_add:
|
| 596 |
-
try:
|
| 597 |
-
add_documents_to_qdrant(docs_to_add)
|
| 598 |
-
except Exception as e:
|
| 599 |
-
return f"❌ Error adding documents to vector store: {e}"
|
| 600 |
-
|
| 601 |
-
return f"✅ Processing complete. Added {processed_count} files. Failed: {failed_count}."
|
| 602 |
-
|
| 603 |
-
# Create the Gradio UI using Blocks
|
| 604 |
-
with gr.Blocks(theme="soft") as demo:
|
| 605 |
-
gr.Markdown("# 🧠 Multimodal RAG System (Powered by Qdrant Cloud)")
|
| 606 |
-
|
| 607 |
-
with gr.Tabs():
|
| 608 |
-
# --- CHAT TAB ---
|
| 609 |
-
with gr.TabItem("Chat with Documents"):
|
| 610 |
-
gr.ChatInterface(
|
| 611 |
-
fn=gradio_chat_response_func,
|
| 612 |
-
title="Multimodal RAG Chat",
|
| 613 |
-
description="Ask questions about your documents (PDFs, images, text). The system uses Llama 4 Scout for vision and Llama 3.3 70B for answers.",
|
| 614 |
-
examples=[
|
| 615 |
-
"What documents contain bar charts?",
|
| 616 |
-
"Summarize the information about pollution",
|
| 617 |
-
"What are the key findings in the environmental report?",
|
| 618 |
-
"Describe the graphs showing water quality"
|
| 619 |
-
],
|
| 620 |
-
)
|
| 621 |
-
|
| 622 |
-
# --- UPLOAD TAB ---
|
| 623 |
-
with gr.TabItem("Upload New Documents"):
|
| 624 |
-
gr.Markdown("Upload new PDF, image, or text files to add them to the knowledge base.")
|
| 625 |
-
|
| 626 |
-
# Define components
|
| 627 |
-
file_uploader = gr.File(
|
| 628 |
-
label="Upload Documents",
|
| 629 |
-
file_count="multiple",
|
| 630 |
-
file_types=["image", ".pdf", ".txt", ".md"],
|
| 631 |
-
interactive=True
|
| 632 |
-
)
|
| 633 |
-
upload_button = gr.Button("Process and Add Documents", variant="primary")
|
| 634 |
-
status_output = gr.Markdown("Status: Ready to upload new documents.")
|
| 635 |
-
|
| 636 |
-
# Connect the upload button to the processing function
|
| 637 |
-
upload_button.click(
|
| 638 |
-
fn=gradio_upload_func,
|
| 639 |
-
inputs=[file_uploader],
|
| 640 |
-
outputs=[status_output]
|
| 641 |
-
)
|
| 642 |
-
|
| 643 |
-
return demo
|
| 644 |
-
|
| 645 |
|
| 646 |
# -------------------------------
|
| 647 |
-
#
|
| 648 |
# -------------------------------
|
|
|
|
| 649 |
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
class UploadResponse(BaseModel):
|
| 660 |
-
message: str
|
| 661 |
-
processed_files: List[str]
|
| 662 |
-
failed_files: List[str]
|
| 663 |
-
|
| 664 |
-
# --- FastAPI App ---
|
| 665 |
-
app = FastAPI(title="🧠 Multimodal RAG API")
|
| 666 |
-
|
| 667 |
-
@app.on_event("startup")
|
| 668 |
-
def on_startup():
|
| 669 |
-
"""Checks keys and builds the initial database on server startup."""
|
| 670 |
-
print("π FastAPI app starting up...")
|
| 671 |
-
|
| 672 |
-
if not os.environ.get("GROQ_API_KEY"):
|
| 673 |
-
print("⚠️ WARNING: GROQ_API_KEY not set!")
|
| 674 |
-
if not QDRANT_API_KEY:
|
| 675 |
-
print("⚠️ WARNING: QDRANT_API_KEY not set! Database functions will fail.")
|
| 676 |
-
|
| 677 |
-
folder = "data"
|
| 678 |
-
if os.path.exists(folder):
|
| 679 |
-
build_or_update_qdrant_store(folder)
|
| 680 |
-
else:
|
| 681 |
-
print("ℹ️ No 'data' folder found. Skipping initial build.")
|
| 682 |
-
|
| 683 |
-
# --- API Endpoints ---
|
| 684 |
-
|
| 685 |
-
@app.post("/query/", response_model=QueryResponse)
|
| 686 |
-
async def handle_query(request: QueryRequest):
|
| 687 |
-
"""Executes a RAG query against the vector database."""
|
| 688 |
try:
|
| 689 |
-
response_data = get_rag_response(
|
| 690 |
-
return response_data
|
| 691 |
except Exception as e:
|
| 692 |
-
|
| 693 |
|
| 694 |
-
@
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
processed_files = []
|
| 701 |
failed_files = []
|
| 702 |
docs_to_add = []
|
| 703 |
-
|
| 704 |
for file in files:
|
| 705 |
-
|
|
|
|
|
|
|
| 706 |
try:
|
| 707 |
-
|
| 708 |
-
|
|
|
|
| 709 |
tmp_path = tmp.name
|
| 710 |
|
| 711 |
-
doc = process_single_file(tmp_path,
|
| 712 |
|
| 713 |
if doc:
|
| 714 |
docs_to_add.append(doc)
|
| 715 |
-
processed_files.append(
|
| 716 |
else:
|
| 717 |
-
failed_files.append(
|
| 718 |
-
|
|
|
|
| 719 |
except Exception as e:
|
|
|
|
| 720 |
failed_files.append(file.filename)
|
| 721 |
-
|
| 722 |
-
finally:
|
| 723 |
-
if tmp_path and os.path.exists(tmp_path):
|
| 724 |
-
os.unlink(tmp_path)
|
| 725 |
-
file.file.close()
|
| 726 |
-
|
| 727 |
if docs_to_add:
|
| 728 |
try:
|
| 729 |
add_documents_to_qdrant(docs_to_add)
|
| 730 |
-
except
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
"message": f"Processing complete. Added {len(processed_files)} file(s) to the database.",
|
| 736 |
"processed_files": processed_files,
|
| 737 |
"failed_files": failed_files
|
| 738 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 739 |
|
| 740 |
# -------------------------------
|
| 741 |
-
#
|
| 742 |
# -------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 743 |
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
|
| 759 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 760 |
|
| 761 |
-
|
| 762 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 763 |
|
| 764 |
-
#
|
| 765 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
from groq import Groq
|
| 10 |
import base64
|
| 11 |
from io import BytesIO
|
| 12 |
+
import fitz # PyMuPDF
|
|
|
|
| 13 |
import time
|
| 14 |
import shutil
|
|
|
|
| 15 |
|
| 16 |
+
# Flask imports
|
| 17 |
+
from flask import Flask, request, jsonify
|
| 18 |
+
from werkzeug.utils import secure_filename
|
| 19 |
+
import tempfile
|
| 20 |
+
|
| 21 |
+
# Qdrant imports
|
|
|
|
| 22 |
from langchain_qdrant import Qdrant
|
| 23 |
from qdrant_client import QdrantClient
|
| 24 |
|
| 25 |
# -------------------------------
|
| 26 |
+
# Configuration
|
| 27 |
# -------------------------------
|
| 28 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 29 |
+
ocr_model = ocr_predictor(pretrained=True).to(device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 31 |
+
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
|
|
|
| 33 |
VISION_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
|
| 34 |
LLM_MODEL = "llama-3.3-70b-versatile"
|
| 35 |
|
| 36 |
+
QDRANT_URL = "https://bdf142ef-7e2a-433b-87a0-301ff303e3af.us-east4-0.gcp.cloud.qdrant.io:6333"
|
|
|
|
|
|
|
|
|
|
| 37 |
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
|
| 38 |
COLLECTION_NAME = "multimodal_rag_store"
|
| 39 |
|
|
|
|
|
|
|
| 40 |
# -------------------------------
|
| 41 |
+
# Helper Functions
|
| 42 |
# -------------------------------
|
| 43 |
def has_substantial_text(text, min_words=10):
|
|
|
|
|
|
|
|
|
|
| 44 |
words = text.split()
|
| 45 |
return len(words) >= min_words
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
def analyze_image_with_vision(img_path=None, img_bytes=None, pil_image=None, max_retries=3):
|
|
|
|
|
|
|
|
|
|
| 48 |
for attempt in range(max_retries):
|
| 49 |
try:
|
|
|
|
|
|
|
|
|
|
| 50 |
if pil_image:
|
| 51 |
buffered = BytesIO()
|
| 52 |
pil_image.save(buffered, format="PNG")
|
| 53 |
img_data = buffered.getvalue()
|
| 54 |
+
img_format = "png"
|
| 55 |
elif img_path:
|
| 56 |
with open(img_path, "rb") as img_file:
|
| 57 |
img_data = img_file.read()
|
| 58 |
img_format = img_path.lower().split('.')[-1]
|
| 59 |
elif img_bytes:
|
| 60 |
img_data = img_bytes
|
| 61 |
+
img_format = "png"
|
| 62 |
else:
|
| 63 |
return ""
|
| 64 |
|
|
|
|
| 69 |
vision_prompt = """Analyze this image carefully and provide a detailed description:
|
| 70 |
1. IDENTIFY THE TYPE: Is this a chart, graph, table, diagram, photograph, or text document?
|
| 71 |
2. IF IT'S A CHART/GRAPH/TABLE:
|
| 72 |
+
- Specify the exact type (bar chart, pie chart, line graph, scatter plot, table, etc.)
|
| 73 |
+
- List ALL categories/labels shown
|
| 74 |
+
- Describe the data values and trends
|
| 75 |
+
- Mention axis labels, title, legend if present
|
| 76 |
+
- Highlight key insights or patterns
|
| 77 |
3. IF IT'S A PHOTOGRAPH/DIAGRAM:
|
| 78 |
+
- Describe what you see in detail
|
| 79 |
+
- Identify key objects, people, or concepts
|
| 80 |
+
- Note any text visible in the image
|
| 81 |
4. IF IT'S A TEXT DOCUMENT:
|
| 82 |
+
- Summarize the main content and structure
|
| 83 |
Provide a comprehensive description suitable for semantic search. Be specific and detailed."""
|
| 84 |
+
|
| 85 |
chat_completion = groq_client.chat.completions.create(
|
| 86 |
messages=[
|
| 87 |
{
|
|
|
|
| 110 |
continue
|
| 111 |
return ""
|
| 112 |
except Exception as e:
|
| 113 |
+
if attempt < max_retries - 1:
|
| 114 |
+
time.sleep(2)
|
| 115 |
+
continue
|
| 116 |
+
return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
return ""
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
def extract_text_from_image(img_path):
|
|
|
|
|
|
|
|
|
|
| 120 |
try:
|
| 121 |
image = Image.open(img_path).convert("RGB")
|
| 122 |
image_np = np.array(image)
|
|
|
|
| 130 |
ocr_text = "\n".join(text)
|
| 131 |
|
| 132 |
if has_substantial_text(ocr_text, min_words=10):
|
| 133 |
+
print(f"π {os.path.basename(img_path)}: Using OCR")
|
| 134 |
return ocr_text
|
| 135 |
else:
|
| 136 |
+
print(f"🖼️ {os.path.basename(img_path)}: Using Vision Model")
|
| 137 |
vision_summary = analyze_image_with_vision(img_path=img_path)
|
| 138 |
+
return vision_summary if vision_summary else ocr_text
|
| 139 |
except Exception as e:
|
| 140 |
print(f"❌ Error processing {img_path}: {e}")
|
| 141 |
return ""
|
| 142 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
def extract_text_from_txt(file_path):
|
| 144 |
try:
|
| 145 |
with open(file_path, 'r', encoding='utf-8') as f:
|
| 146 |
text = f.read()
|
| 147 |
+
print(f"π {os.path.basename(file_path)}: Extracted text")
|
| 148 |
return text
|
| 149 |
except Exception as e:
|
| 150 |
print(f"❌ Error reading text file {file_path}: {e}")
|
| 151 |
return ""
|
| 152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
def extract_content_from_pdf(pdf_path):
|
| 154 |
try:
|
| 155 |
doc = fitz.open(pdf_path)
|
| 156 |
all_content = []
|
| 157 |
+
|
| 158 |
for page_num, page in enumerate(doc, 1):
|
| 159 |
page_content = []
|
|
|
|
|
|
|
| 160 |
text = page.get_text()
|
| 161 |
+
|
| 162 |
if text.strip():
|
| 163 |
page_content.append(f"[Page {page_num} - Text Content]\n{text}")
|
| 164 |
+
|
| 165 |
+
try:
|
| 166 |
+
mat = fitz.Matrix(2, 2)
|
| 167 |
+
pix = page.get_pixmap(matrix=mat)
|
| 168 |
+
img_data = pix.tobytes("png")
|
| 169 |
+
page_image = Image.open(BytesIO(img_data)).convert("RGB")
|
| 170 |
|
| 171 |
+
vision_analysis = analyze_image_with_vision(pil_image=page_image)
|
| 172 |
+
if vision_analysis and len(vision_analysis.strip()) > 30:
|
| 173 |
+
page_content.append(f"[Page {page_num} - Visual Analysis]\n{vision_analysis}")
|
| 174 |
+
except Exception as e:
|
| 175 |
+
print(f"β Error rendering page {page_num}: {e}")
|
| 176 |
+
|
| 177 |
+
image_list = page.get_images(full=True)
|
| 178 |
+
for img_index, img_info in enumerate(image_list, 1):
|
| 179 |
try:
|
| 180 |
+
xref = img_info[0]
|
| 181 |
+
base_image = doc.extract_image(xref)
|
| 182 |
+
image_bytes = base_image["image"]
|
| 183 |
+
image = Image.open(BytesIO(image_bytes)).convert("RGB")
|
| 184 |
+
image_np = np.array(image)
|
| 185 |
+
result = ocr_model([image_np])
|
| 186 |
+
ocr_text = []
|
| 187 |
+
for ocr_page in result.pages:
|
| 188 |
+
for block in ocr_page.blocks:
|
| 189 |
+
for line in block.lines:
|
| 190 |
+
line_text = " ".join([word.value for word in line.words])
|
| 191 |
+
ocr_text.append(line_text)
|
| 192 |
+
extracted_text = "\n".join(ocr_text)
|
| 193 |
|
| 194 |
+
if has_substantial_text(extracted_text, min_words=10):
|
| 195 |
+
page_content.append(f"[Page {page_num} - Embedded Image {img_index} OCR]\n{extracted_text}")
|
| 196 |
+
else:
|
| 197 |
+
vision_summary = analyze_image_with_vision(img_bytes=image_bytes)
|
| 198 |
+
if vision_summary:
|
| 199 |
+
page_content.append(f"[Page {page_num} - Embedded Image {img_index} Analysis]\n{vision_summary}")
|
| 200 |
+
except Exception as e:
|
| 201 |
+
print(f"β Error processing embedded image {img_index}: {e}")
|
| 202 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
if page_content:
|
| 205 |
combined_page = "\n\n---SECTION BREAK---\n\n".join(page_content)
|
|
|
|
| 208 |
doc.close()
|
| 209 |
final_content = "\n\n---PAGE BREAK---\n\n".join(all_content)
|
| 210 |
return final_content
|
|
|
|
| 211 |
except Exception as e:
|
| 212 |
print(f"β Error processing PDF {pdf_path}: {e}")
|
| 213 |
return ""
|
| 214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
def create_documents_from_folder(folder_path):
|
| 216 |
docs = []
|
| 217 |
for root, dirs, files in os.walk(folder_path):
|
| 218 |
for filename in files:
|
| 219 |
full_path = os.path.join(root, filename)
|
| 220 |
file_ext = filename.lower().split('.')[-1]
|
|
|
|
| 221 |
|
| 222 |
+
text = ""
|
| 223 |
if file_ext in ["jpg", "jpeg", "png"]:
|
| 224 |
text = extract_text_from_image(full_path)
|
| 225 |
elif file_ext in ["txt", "md"]:
|
|
|
|
| 228 |
text = extract_content_from_pdf(full_path)
|
| 229 |
else:
|
| 230 |
continue
|
| 231 |
+
|
| 232 |
if text.strip():
|
| 233 |
relative_path = os.path.relpath(full_path, folder_path)
|
| 234 |
doc = Document(
|
|
|
|
| 241 |
}
|
| 242 |
)
|
| 243 |
docs.append(doc)
|
| 244 |
+
print(f"β
Added {filename}")
|
| 245 |
return docs
|
| 246 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
def build_or_update_qdrant_store(folder_path):
|
| 248 |
+
print("\nπ Building Qdrant collection...")
|
|
|
|
|
|
|
| 249 |
docs = create_documents_from_folder(folder_path)
|
| 250 |
if not docs:
|
| 251 |
+
print("β οΈ No valid documents found!")
|
| 252 |
return None
|
| 253 |
+
|
| 254 |
try:
|
| 255 |
vector_store = Qdrant.from_documents(
|
| 256 |
docs,
|
|
|
|
| 260 |
collection_name=COLLECTION_NAME,
|
| 261 |
force_recreate=True
|
| 262 |
)
|
| 263 |
+
print(f"β
Created collection with {len(docs)} documents")
|
| 264 |
return vector_store
|
| 265 |
except Exception as e:
|
| 266 |
+
print(f"β Error with Qdrant: {e}")
|
| 267 |
return None
|
| 268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
def query_qdrant_store(query_text, k=3):
|
|
|
|
|
|
|
|
|
|
| 270 |
try:
|
| 271 |
+
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY, timeout=20)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
vector_store = Qdrant(
|
| 273 |
client=client,
|
| 274 |
collection_name=COLLECTION_NAME,
|
|
|
|
| 277 |
except Exception as e:
|
| 278 |
print(f"β Error connecting to Qdrant: {e}")
|
| 279 |
return []
|
| 280 |
+
|
| 281 |
initial_k = k * 3
|
| 282 |
results = vector_store.similarity_search_with_score(query_text, k=initial_k)
|
| 283 |
|
|
|
|
| 288 |
reranked_results = []
|
| 289 |
for doc, score in results:
|
| 290 |
boost = 0.0
|
| 291 |
+
if "Visual Analysis]" in doc.page_content:
|
| 292 |
+
boost += 0.5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
adjusted_score = score - boost
|
| 294 |
+
reranked_results.append((doc, adjusted_score))
|
|
|
|
| 295 |
reranked_results.sort(key=lambda x: x[1])
|
| 296 |
+
results = reranked_results[:k]
|
| 297 |
else:
|
| 298 |
results = results[:k]
|
| 299 |
+
|
| 300 |
retrieved_docs = []
|
| 301 |
for doc, score in results:
|
| 302 |
retrieved_docs.append({
|
|
|
|
| 307 |
})
|
| 308 |
return retrieved_docs
|
| 309 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
def answer_question_with_llm(query_text, retrieved_docs, max_tokens=1000):
|
|
|
|
|
|
|
| 311 |
if not retrieved_docs:
|
| 312 |
+
return "β No relevant documents found."
|
| 313 |
+
|
| 314 |
context_parts = []
|
| 315 |
for i, doc in enumerate(retrieved_docs, 1):
|
| 316 |
source = doc['source']
|
| 317 |
content = doc['content']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
|
| 319 |
max_content_length = 2500
|
| 320 |
if len(content) > max_content_length:
|
| 321 |
content = content[:max_content_length] + "...[truncated]"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
|
| 323 |
+
context_parts.append(f"--- Document {i} ---\nSource: {source}\n\n{content}\n")
|
| 324 |
+
|
| 325 |
context = "\n".join(context_parts)
|
| 326 |
|
| 327 |
system_prompt = """You are a concise AI assistant. Answer the user's question *only* using the provided documents.
|
| 328 |
- Be brief and to the point.
|
| 329 |
+
- If the answer is not in the documents, state 'That information is not available in the documents.'"""
|
| 330 |
+
|
| 331 |
user_prompt = f"""DOCUMENTS:
|
| 332 |
{context}
|
| 333 |
|
| 334 |
QUESTION: {query_text}
|
| 335 |
|
| 336 |
+
ANSWER:"""
|
| 337 |
+
|
| 338 |
try:
|
| 339 |
response = groq_client.chat.completions.create(
|
| 340 |
model=LLM_MODEL,
|
|
|
|
| 344 |
],
|
| 345 |
temperature=0.2,
|
| 346 |
max_tokens=max_tokens,
|
|
|
|
| 347 |
)
|
| 348 |
+
return response.choices[0].message.content
|
|
|
|
| 349 |
except Exception as e:
|
| 350 |
+
return f"β Error: {str(e)}"
|
| 351 |
|
| 352 |
+
def get_rag_response(query_text, k=3):
    """Run retrieval plus answer generation for one question.

    Args:
        query_text: The user's question.
        k: Number of documents requested from query_qdrant_store.

    Returns:
        A dict with two keys: "answer" (the text returned by
        answer_question_with_llm, or a fixed "not found" message when
        retrieval yields nothing) and "sources" (a list of
        {"source", "score"} dicts, one per retrieved document).
    """
    print(f"\nβ Query: {query_text}")
    hits = query_qdrant_store(query_text, k=k)

    if hits:
        reply = answer_question_with_llm(query_text, hits)
        cited = [{"source": hit['source'], "score": hit['score']} for hit in hits]
    else:
        # Nothing retrieved: skip the LLM call entirely.
        reply = "β No relevant documents found."
        cited = []

    return {"answer": reply, "sources": cited}
|
|
|
|
|
|
|
| 369 |
|
| 370 |
+
def process_single_file(file_path, filename):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
file_ext = filename.lower().split('.')[-1]
|
| 372 |
text = ""
|
| 373 |
|
|
|
|
| 377 |
text = extract_text_from_txt(file_path)
|
| 378 |
elif file_ext == "pdf":
|
| 379 |
text = extract_content_from_pdf(file_path)
|
| 380 |
+
else:
|
| 381 |
+
return None
|
| 382 |
|
| 383 |
if text.strip():
|
| 384 |
doc = Document(
|
|
|
|
| 390 |
"upload_timestamp": time.time()
|
| 391 |
}
|
| 392 |
)
|
| 393 |
+
print(f"β
Processed {filename}")
|
| 394 |
return doc
|
| 395 |
return None
|
| 396 |
|
| 397 |
+
def add_documents_to_qdrant(docs):
|
| 398 |
+
if not docs:
|
|
|
|
| 399 |
return
|
| 400 |
+
|
| 401 |
try:
|
| 402 |
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
|
| 403 |
vector_store = Qdrant(
|
|
|
|
| 406 |
embeddings=embedding_model
|
| 407 |
)
|
| 408 |
vector_store.add_documents(docs)
|
| 409 |
+
print(f"β
Added {len(docs)} documents to Qdrant")
|
| 410 |
except Exception as e:
|
| 411 |
+
print(f"β Error adding to Qdrant: {e}")
|
| 412 |
+
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
|
| 414 |
# -------------------------------
|
| 415 |
+
# Flask App Setup
|
| 416 |
# -------------------------------
|
| 417 |
+
flask_app = Flask(__name__)
|
| 418 |
|
| 419 |
+
@flask_app.route('/api/query', methods=['POST'])
def handle_query():
    """Answer a question posted as a JSON object.

    The body must be a JSON object with a non-empty "query" and an optional
    "k" (number of documents to retrieve, default 3).

    Returns:
        The JSON-encoded result of get_rag_response on success; a 400 JSON
        error when the body is not a JSON object, the query is missing, or
        k is not a positive integer; a 500 JSON error when the pipeline
        raises.
    """
    # silent=True makes a missing or malformed JSON body come back as None
    # instead of raising, so it can be reported as a JSON 400 below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    query = data.get('query', '')
    k = data.get('k', 3)

    if not query:
        return jsonify({"error": "No query provided"}), 400

    # bool is a subclass of int, so it has to be rejected explicitly.
    if isinstance(k, bool) or not isinstance(k, int) or k < 1:
        return jsonify({"error": "k must be a positive integer"}), 400

    try:
        response_data = get_rag_response(query, k)
        return jsonify(response_data)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
| 433 |
|
| 434 |
+
@flask_app.route('/api/upload', methods=['POST'])
def handle_upload():
    """Ingest files uploaded under the multipart field "files".

    Each upload is written to a temporary file, converted to a document by
    process_single_file, and the resulting documents are stored in one
    add_documents_to_qdrant call.

    Returns:
        A JSON summary with "message", "processed_files" and "failed_files";
        a 400 JSON error when the "files" field is absent; a 500 JSON error
        when storing the documents fails.
    """
    if 'files' not in request.files:
        return jsonify({"error": "No files provided"}), 400

    files = request.files.getlist('files')
    processed_files = []
    failed_files = []
    docs_to_add = []

    for file in files:
        if file.filename == '':
            continue

        tmp_path = None
        try:
            filename = secure_filename(file.filename)
            # Reserve a unique path, then close the handle before writing to
            # it by name; re-opening a still-open temp file is not portable.
            with tempfile.NamedTemporaryFile(delete=False, suffix=filename) as tmp:
                tmp_path = tmp.name
            file.save(tmp_path)

            doc = process_single_file(tmp_path, filename)

            if doc:
                docs_to_add.append(doc)
                processed_files.append(filename)
            else:
                failed_files.append(filename)
        except Exception as e:
            print(f"β Error: {e}")
            failed_files.append(file.filename)
        finally:
            # Always remove the temp file, including when processing raised;
            # previously it was leaked on any exception.
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError as cleanup_error:
                    print(f"Warning: could not remove temporary file {tmp_path}: {cleanup_error}")

    if docs_to_add:
        try:
            add_documents_to_qdrant(docs_to_add)
        except Exception as e:
            return jsonify({"error": f"Failed to add to database: {str(e)}"}), 500

    return jsonify({
        "message": f"Processed {len(processed_files)} files",
        "processed_files": processed_files,
        "failed_files": failed_files
    })
|
| 478 |
+
|
| 479 |
+
@flask_app.route('/api/health', methods=['GET'])
def health_check():
    """Liveness probe: return a fixed JSON payload saying the API is up."""
    payload = {"status": "ok", "message": "API is running"}
    return jsonify(payload)
|
| 482 |
|
| 483 |
# -------------------------------
|
| 484 |
+
# Gradio UI
|
| 485 |
# -------------------------------
|
| 486 |
+
def gradio_chat_response(message, history):
    """Produce the chat reply for one user message, with a source list.

    Args:
        message: The user's question; forwarded to get_rag_response.
        history: Not used by this function.

    Returns:
        The answer text. When at least one source was retrieved, a Markdown
        "Sources" section listing each source and its score (4 decimals) is
        appended; with no sources the answer is returned on its own rather
        than followed by an empty heading.
    """
    response_data = get_rag_response(message, k=3)
    answer = response_data['answer']
    sources = response_data['sources']

    if not sources:
        return answer

    source_lines = "".join(
        f"* {doc['source']} (Score: {doc['score']:.4f})\n" for doc in sources
    )
    return answer + "\n\n---\n**π Sources:**\n" + source_lines
|
| 496 |
|
| 497 |
+
def gradio_upload(file_list):
    """Process files chosen in the Gradio upload tab and store them.

    Args:
        file_list: Uploaded files, or None/empty when nothing was selected.
            Each item may be a path string or an object whose ``.name``
            attribute is the path on disk.

    Returns:
        A status string: a "no files" notice, an error message when storing
        the documents fails, or a summary of processed and failed counts.
    """
    if not file_list:
        return "No files uploaded."

    docs_to_add = []
    processed = 0
    failed = 0

    for file_obj in file_list:
        # NOTE(review): depending on the Gradio version/config the uploader
        # may hand over path strings rather than tempfile-like objects;
        # accept both so neither shape raises AttributeError.
        full_path = file_obj if isinstance(file_obj, str) else file_obj.name
        filename = os.path.basename(full_path)

        try:
            doc = process_single_file(full_path, filename)
            if doc:
                docs_to_add.append(doc)
                processed += 1
            else:
                failed += 1
        except Exception as e:
            print(f"β Error: {e}")
            failed += 1

    if docs_to_add:
        try:
            add_documents_to_qdrant(docs_to_add)
        except Exception as e:
            return f"β Error: {e}"

    return f"✅ Processed {processed} files. Failed: {failed}."
|
| 527 |
|
| 528 |
+
# Gradio front end with two tabs: a chat tab driven by gradio_chat_response
# and an upload tab whose button calls gradio_upload.
with gr.Blocks(theme="soft") as gradio_ui:
    gr.Markdown("# π§ Multimodal RAG System")

    with gr.Tabs():
        # Chat tab: question answering over the indexed documents.
        with gr.TabItem("π¬ Chat"):
            gr.ChatInterface(
                fn=gradio_chat_response,
                title="Chat with Documents",
                description="Ask questions about your documents",
                examples=[
                    "What documents contain bar charts?",
                    "Summarize the environmental report",
                    "What are the key findings?",
                ],
            )

        # Upload tab: pick files, then press the button to ingest them.
        with gr.TabItem("π€ Upload"):
            gr.Markdown("Upload new documents to the knowledge base")
            file_uploader = gr.File(
                label="Upload Documents",
                file_count="multiple",
                file_types=["image", ".pdf", ".txt", ".md"],
            )
            upload_btn = gr.Button("Process Documents", variant="primary")
            status = gr.Markdown("Ready to upload.")

            # The status Markdown shows whatever string gradio_upload returns.
            upload_btn.click(fn=gradio_upload, inputs=[file_uploader], outputs=[status])
|
| 555 |
|
| 556 |
+
# -------------------------------
|
| 557 |
+
# Initialize and Run
|
| 558 |
+
# -------------------------------
|
| 559 |
+
if __name__ == "__main__":
    print("π Starting Multimodal RAG System...")

    # Build initial database if data folder exists
    folder = "data"
    if os.path.exists(folder):
        print(f"\nπ Found '{folder}' folder, building database...")
        build_or_update_qdrant_store(folder)

    # Launch both Flask and Gradio
    from werkzeug.serving import run_simple
    from werkzeug.middleware.dispatcher import DispatcherMiddleware

    # Mount Gradio at root, Flask API at /api
    # NOTE(review): Blocks.launch() is documented to return a tuple
    # (app, local_url, share_url) whose app is an ASGI (FastAPI) application,
    # not a WSGI callable, and it starts its own server. Passing its return
    # value as the root WSGI app looks incorrect - confirm and rework.
    # NOTE(review): DispatcherMiddleware strips the mount prefix before
    # dispatching, while the Flask routes already start with /api, so the
    # endpoints presumably resolve at /api/api/... - confirm against the
    # URLs printed below.
    application = DispatcherMiddleware(
        gradio_ui.launch(prevent_thread_lock=True, show_error=True),
        {'/api': flask_app}
    )

    print("\n✅ Server starting on http://0.0.0.0:7860")
    print(" - Gradio UI: http://0.0.0.0:7860")
    print(" - Flask API: http://0.0.0.0:7860/api/query")
    print(" - Health Check: http://0.0.0.0:7860/api/health")

    # The interactive Werkzeug debugger allows arbitrary code execution from
    # the browser, so it must stay off for a server bound to 0.0.0.0.
    run_simple('0.0.0.0', 7860, application, use_reloader=False, use_debugger=False)
|
git
ADDED
|
File without changes
|
requirements.txt
CHANGED
|
@@ -12,6 +12,7 @@ sentence-transformers==5.1.2
|
|
| 12 |
langchain-qdrant==1.1.0
|
| 13 |
qdrant-client==1.15.1
|
| 14 |
|
|
|
|
| 15 |
|
| 16 |
fastapi
|
| 17 |
uvicorn
|
|
|
|
| 12 |
langchain-qdrant==1.1.0
|
| 13 |
qdrant-client==1.15.1
|
| 14 |
|
| 15 |
+
flask
|
| 16 |
|
| 17 |
fastapi
|
| 18 |
uvicorn
|