Spaces:
Runtime error
Runtime error
logging
Browse files- .huggingface-spaces +15 -0
- README.md +2 -3
- app.py +420 -213
- colpali_manager.py +3 -3
- middleware.py +25 -5
- milvus_manager.py +115 -115
- pdf_manager.py +19 -4
- rag.py +174 -29
.huggingface-spaces
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hugging Face Spaces Configuration
|
| 2 |
+
# This file helps ensure proper deployment and configuration
|
| 3 |
+
|
| 4 |
+
# Environment variables for Hugging Face Spaces
|
| 5 |
+
SPACE_ID=${SPACE_ID}
|
| 6 |
+
HF_SPACE_ID=${HF_SPACE_ID}
|
| 7 |
+
|
| 8 |
+
# File path configuration
|
| 9 |
+
BASE_DIR=/tmp/pages
|
| 10 |
+
FALLBACK_DIR=pages
|
| 11 |
+
|
| 12 |
+
# Ensure proper permissions
|
| 13 |
+
chmod 755 /tmp
|
| 14 |
+
mkdir -p /tmp/pages
|
| 15 |
+
chmod 755 /tmp/pages
|
README.md
CHANGED
|
@@ -4,7 +4,7 @@ emoji: π
|
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
|
@@ -339,5 +339,4 @@ For support and questions:
|
|
| 339 |
|
| 340 |
---
|
| 341 |
|
| 342 |
-
**Made by Collar** - Enhanced with Team Management & Chat History
|
| 343 |
-
|
|
|
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.44.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
|
|
|
| 339 |
|
| 340 |
---
|
| 341 |
|
| 342 |
+
**Made by Collar** - Enhanced with Team Management & Chat History
|
|
|
app.py
CHANGED
|
@@ -17,6 +17,7 @@ import requests
|
|
| 17 |
import base64
|
| 18 |
from PIL import Image
|
| 19 |
import io
|
|
|
|
| 20 |
|
| 21 |
from middleware import Middleware
|
| 22 |
from rag import Rag
|
|
@@ -28,7 +29,14 @@ from dotenv import load_dotenv, dotenv_values
|
|
| 28 |
import dotenv
|
| 29 |
import platform
|
| 30 |
import time
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
# Import libraries for DOC and Excel export
|
| 34 |
try:
|
|
@@ -378,31 +386,29 @@ class PDFSearchApp:
|
|
| 378 |
self.db_manager = db_manager
|
| 379 |
self.session_manager = session_manager
|
| 380 |
|
| 381 |
-
def upload_and_convert(self,
|
| 382 |
-
"""Upload and convert files
|
| 383 |
|
| 384 |
if files is None:
|
| 385 |
return "No file uploaded"
|
| 386 |
|
| 387 |
try:
|
| 388 |
-
# Get user info from session if available
|
| 389 |
-
user_info = None
|
| 390 |
-
team = "default"
|
| 391 |
-
if session_id:
|
| 392 |
-
session = self.session_manager.get_session(session_id)
|
| 393 |
-
if session:
|
| 394 |
-
user_info = session['user_info']
|
| 395 |
-
team = user_info['team']
|
| 396 |
-
|
| 397 |
total_pages = 0
|
| 398 |
uploaded_files = []
|
| 399 |
|
| 400 |
-
# Create
|
| 401 |
if folder_name:
|
| 402 |
folder_name = folder_name.replace(" ", "_").replace("-", "_")
|
| 403 |
-
collection_name = f"{
|
| 404 |
else:
|
| 405 |
-
collection_name = f"{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
|
| 407 |
for file in files[:]:
|
| 408 |
# Extract the last part of the path (file name)
|
|
@@ -412,13 +418,16 @@ class PDFSearchApp:
|
|
| 412 |
|
| 413 |
# Convert PPT to PDF if needed
|
| 414 |
if ext.lower() in [".ppt", ".pptx"]:
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
|
|
|
|
|
|
|
|
|
| 422 |
|
| 423 |
# Create unique document ID
|
| 424 |
doc_id = f"{collection_name}_{name.replace(' ', '_').replace('-', '_')}"
|
|
@@ -426,31 +435,93 @@ class PDFSearchApp:
|
|
| 426 |
print(f"Uploading file: {doc_id}")
|
| 427 |
middleware = Middleware(collection_name, create_collection=True)
|
| 428 |
|
| 429 |
-
|
|
|
|
| 430 |
total_pages += len(pages) if pages else 0
|
| 431 |
uploaded_files.append(doc_id)
|
| 432 |
-
|
| 433 |
-
self.indexed_docs[doc_id] = True
|
| 434 |
|
| 435 |
-
#
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
len(uploaded_files)
|
| 442 |
-
)
|
| 443 |
|
| 444 |
-
return
|
| 445 |
|
| 446 |
except Exception as e:
|
| 447 |
return f"Error processing files: {str(e)}"
|
| 448 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
|
| 450 |
-
def display_file_list(text):
|
| 451 |
try:
|
| 452 |
# Retrieve all entries in the specified directory
|
| 453 |
-
|
|
|
|
|
|
|
| 454 |
current_working_directory = os.getcwd()
|
| 455 |
directory_path = os.path.join(current_working_directory, directory_path)
|
| 456 |
entries = os.listdir(directory_path)
|
|
@@ -465,39 +536,134 @@ class PDFSearchApp:
|
|
| 465 |
return str(e)
|
| 466 |
|
| 467 |
|
| 468 |
-
def search_documents(self,
|
| 469 |
print(f"Searching for query: {query}")
|
| 470 |
|
| 471 |
if not query:
|
| 472 |
print("Please enter a search query")
|
| 473 |
-
return "Please enter a search query", "--", "Please enter a search query", [], None
|
| 474 |
|
| 475 |
try:
|
| 476 |
-
#
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 484 |
|
| 485 |
# Enhanced multi-page retrieval with vision-guided chunking approach
|
| 486 |
# Get more results than requested to allow for intelligent filtering
|
| 487 |
# Request 3x the number of results for better selection
|
| 488 |
search_results = middleware.search([query], topk=max(num_results * 3, 20))[0]
|
| 489 |
|
| 490 |
-
#
|
| 491 |
-
print(f"π
|
|
|
|
| 492 |
if len(search_results) > 0:
|
| 493 |
-
print(f"
|
| 494 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
|
| 496 |
if not search_results:
|
| 497 |
-
return "No search results found", "--", "No search results found for your query", [], None
|
| 498 |
|
| 499 |
# Implement intelligent multi-page selection based on research
|
| 500 |
-
selected_results = self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
|
| 502 |
# Process selected results
|
| 503 |
cited_pages = []
|
|
@@ -507,13 +673,22 @@ class PDFSearchApp:
|
|
| 507 |
|
| 508 |
print(f"π Processing {len(selected_results)} selected results...")
|
| 509 |
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 515 |
|
| 516 |
-
if
|
| 517 |
img_paths.append(img_path)
|
| 518 |
all_paths.append(path)
|
| 519 |
page_scores.append(score)
|
|
@@ -521,16 +696,78 @@ class PDFSearchApp:
|
|
| 521 |
print(f"β
Retrieved page {i+1}: {img_path} (Score: {score:.3f})")
|
| 522 |
else:
|
| 523 |
print(f"β Image file not found: {img_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 524 |
|
| 525 |
print(f"π Final count: {len(img_paths)} valid pages out of {len(selected_results)} selected")
|
| 526 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
if not img_paths:
|
| 528 |
-
return "No valid image files found", "--", "Error: No valid image files found for the search results", [], None
|
| 529 |
|
| 530 |
# Generate RAG response with multiple pages using enhanced approach
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 534 |
|
| 535 |
# Prepare downloads
|
| 536 |
csv_download = self._prepare_csv_download(csv_filepath)
|
|
@@ -556,81 +793,42 @@ class PDFSearchApp:
|
|
| 556 |
|
| 557 |
except Exception as e:
|
| 558 |
error_msg = f"Error during search: {str(e)}"
|
|
|
|
|
|
|
| 559 |
return error_msg, "--", error_msg, [], None, None, None, None
|
| 560 |
-
|
| 561 |
-
def
|
| 562 |
"""
|
| 563 |
-
Intelligent page selection
|
| 564 |
-
Based on research from M3DocRAG and multi-modal retrieval models
|
| 565 |
"""
|
| 566 |
if len(search_results) <= num_results:
|
| 567 |
return search_results
|
| 568 |
|
| 569 |
-
# Detect if query needs multiple pages
|
| 570 |
-
multi_page_keywords = [
|
| 571 |
-
'compare', 'difference', 'similarities', 'both', 'multiple', 'various',
|
| 572 |
-
'different', 'types', 'kinds', 'categories', 'procedures', 'methods',
|
| 573 |
-
'approaches', 'techniques', 'safety', 'protocols', 'guidelines',
|
| 574 |
-
'overview', 'summary', 'comprehensive', 'complete', 'all', 'everything'
|
| 575 |
-
]
|
| 576 |
-
|
| 577 |
-
query_lower = query.lower()
|
| 578 |
-
needs_multiple_pages = any(keyword in query_lower for keyword in multi_page_keywords)
|
| 579 |
-
|
| 580 |
# Sort by relevance score
|
| 581 |
sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
|
| 582 |
|
| 583 |
-
#
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
# Strategy 1: Include highest scoring result from each collection (diversity)
|
| 587 |
-
selected = []
|
| 588 |
-
seen_collections = set()
|
| 589 |
-
|
| 590 |
-
# First pass: get one page from each collection for diversity
|
| 591 |
-
for score, page_num, coll_num in sorted_results:
|
| 592 |
-
if coll_num not in seen_collections and len(selected) < min(num_results // 2, len(search_results)):
|
| 593 |
-
selected.append((score, page_num, coll_num))
|
| 594 |
-
seen_collections.add(coll_num)
|
| 595 |
-
|
| 596 |
-
# Strategy 2: Fill remaining slots with highest scoring results
|
| 597 |
-
for score, page_num, coll_num in sorted_results:
|
| 598 |
-
if (score, page_num, coll_num) not in selected and len(selected) < num_results:
|
| 599 |
-
selected.append((score, page_num, coll_num))
|
| 600 |
-
|
| 601 |
-
# Strategy 3: If we still don't have enough, add more from any collection
|
| 602 |
-
if len(selected) < num_results:
|
| 603 |
-
for score, page_num, coll_num in sorted_results:
|
| 604 |
-
if (score, page_num, coll_num) not in selected and len(selected) < num_results:
|
| 605 |
-
selected.append((score, page_num, coll_num))
|
| 606 |
-
|
| 607 |
-
# Strategy 4: If we have too many, trim to exact number requested
|
| 608 |
-
if len(selected) > num_results:
|
| 609 |
-
selected = selected[:num_results]
|
| 610 |
-
|
| 611 |
-
# Strategy 5: If we have too few, add more from the sorted results
|
| 612 |
-
if len(selected) < num_results and len(sorted_results) >= num_results:
|
| 613 |
-
for score, page_num, coll_num in sorted_results:
|
| 614 |
-
if (score, page_num, coll_num) not in selected and len(selected) < num_results:
|
| 615 |
-
selected.append((score, page_num, coll_num))
|
| 616 |
-
|
| 617 |
-
# Sort selected results by score for consistency
|
| 618 |
-
selected.sort(key=lambda x: x[0], reverse=True)
|
| 619 |
-
|
| 620 |
-
print(f"Requested {num_results} pages, selected {len(selected)} pages from {len(seen_collections)} collections")
|
| 621 |
|
| 622 |
-
|
| 623 |
-
if len(selected) != num_results:
|
| 624 |
-
print(f"β οΈ Warning: Requested {num_results} pages but selected {len(selected)} pages")
|
| 625 |
-
if len(selected) < num_results and len(sorted_results) >= num_results:
|
| 626 |
-
# Add more pages to reach the target
|
| 627 |
-
for score, page_num, coll_num in sorted_results:
|
| 628 |
-
if (score, page_num, coll_num) not in selected and len(selected) < num_results:
|
| 629 |
-
selected.append((score, page_num, coll_num))
|
| 630 |
-
print(f"Added more pages to reach target: {len(selected)} pages")
|
| 631 |
|
| 632 |
return selected
|
| 633 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 634 |
def _optimize_consecutive_pages(self, selected, all_results, target_count=None):
|
| 635 |
"""
|
| 636 |
Optimize selection to include consecutive pages when beneficial
|
|
@@ -1167,7 +1365,7 @@ The system detected you requested tabular data, but the current response doesn't
|
|
| 1167 |
cell_str = str(cell)
|
| 1168 |
if ',' in cell_str or '"' in cell_str or '\n' in cell_str:
|
| 1169 |
# Escape quotes and wrap in quotes
|
| 1170 |
-
cell_str =
|
| 1171 |
escaped_row.append(cell_str)
|
| 1172 |
csv_lines.append(','.join(escaped_row))
|
| 1173 |
|
|
@@ -2798,76 +2996,113 @@ The system detected you requested tabular data, but the current response doesn't
|
|
| 2798 |
# Fallback to simple response with enhanced prompt
|
| 2799 |
return rag.get_answer_from_gemini(detailed_prompt, img_paths), None, None, None
|
| 2800 |
|
| 2801 |
-
|
| 2802 |
-
|
| 2803 |
-
|
| 2804 |
-
if
|
| 2805 |
-
|
| 2806 |
-
|
| 2807 |
-
|
| 2808 |
-
|
| 2809 |
-
|
| 2810 |
-
def logout_user(self, session_id):
|
| 2811 |
-
"""Logout user and remove session"""
|
| 2812 |
-
if session_id:
|
| 2813 |
-
self.session_manager.remove_session(session_id)
|
| 2814 |
-
return "Logged out successfully", None, None
|
| 2815 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2816 |
|
| 2817 |
-
def
|
| 2818 |
-
"""
|
| 2819 |
-
|
| 2820 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2821 |
|
| 2822 |
-
|
| 2823 |
-
|
| 2824 |
-
return "Session expired. Please log in again."
|
| 2825 |
|
| 2826 |
-
|
| 2827 |
-
|
|
|
|
| 2828 |
|
| 2829 |
-
|
| 2830 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2831 |
|
| 2832 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2833 |
|
| 2834 |
def create_ui():
|
| 2835 |
app = PDFSearchApp()
|
| 2836 |
|
| 2837 |
with gr.Blocks(theme=gr.themes.Ocean(), css="footer{display:none !important}") as demo:
|
| 2838 |
-
# Session state management
|
| 2839 |
-
session_state = gr.State(value=None)
|
| 2840 |
-
user_info_state = gr.State(value=None)
|
| 2841 |
-
|
| 2842 |
gr.Markdown("# Collar Multimodal RAG Demo - Streamlined")
|
| 2843 |
-
gr.Markdown("
|
| 2844 |
-
|
| 2845 |
-
#
|
| 2846 |
-
with gr.Tab("
|
| 2847 |
-
with gr.Row():
|
| 2848 |
-
with gr.Column(scale=1):
|
| 2849 |
-
gr.Markdown("### Login")
|
| 2850 |
-
username_input = gr.Textbox(label="Username", placeholder="Enter username")
|
| 2851 |
-
password_input = gr.Textbox(label="Password", type="password", placeholder="Enter password")
|
| 2852 |
-
login_btn = gr.Button("Login", variant="primary")
|
| 2853 |
-
logout_btn = gr.Button("Logout")
|
| 2854 |
-
auth_status = gr.Textbox(label="Authentication Status", interactive=False)
|
| 2855 |
-
current_team = gr.Textbox(label="Current Team", interactive=False)
|
| 2856 |
-
|
| 2857 |
-
with gr.Column(scale=1):
|
| 2858 |
-
gr.Markdown("### Default Users")
|
| 2859 |
-
gr.Markdown("""
|
| 2860 |
-
**Team A:** admin_team_a / admin123_team_a
|
| 2861 |
-
**Team B:** admin_team_b / admin123_team_b
|
| 2862 |
-
""")
|
| 2863 |
-
|
| 2864 |
-
# Document Management Tab
|
| 2865 |
-
with gr.Tab("π Document Management"):
|
| 2866 |
with gr.Column():
|
| 2867 |
-
gr.Markdown("### Upload Documents
|
| 2868 |
folder_name_input = gr.Textbox(
|
| 2869 |
-
label="
|
| 2870 |
-
placeholder="
|
| 2871 |
)
|
| 2872 |
max_pages_input = gr.Slider(
|
| 2873 |
minimum=1,
|
|
@@ -2877,19 +3112,11 @@ def create_ui():
|
|
| 2877 |
label="Max pages to extract and index per document"
|
| 2878 |
)
|
| 2879 |
file_input = gr.Files(
|
| 2880 |
-
label="Upload PPTs/PDFs (Multiple files supported)",
|
| 2881 |
file_count="multiple"
|
| 2882 |
)
|
| 2883 |
-
upload_btn = gr.Button("Upload
|
| 2884 |
upload_status = gr.Textbox(label="Upload Status", interactive=False)
|
| 2885 |
-
|
| 2886 |
-
gr.Markdown("### Team Collections")
|
| 2887 |
-
refresh_collections_btn = gr.Button("Refresh Collections")
|
| 2888 |
-
team_collections_display = gr.Textbox(
|
| 2889 |
-
label="Available Collections",
|
| 2890 |
-
interactive=False,
|
| 2891 |
-
lines=5
|
| 2892 |
-
)
|
| 2893 |
|
| 2894 |
# Enhanced Query Tab
|
| 2895 |
with gr.Tab("π Advanced Query"):
|
|
@@ -2958,36 +3185,16 @@ def create_ui():
|
|
| 2958 |
|
| 2959 |
|
| 2960 |
# Event handlers
|
| 2961 |
-
# Authentication events
|
| 2962 |
-
login_btn.click(
|
| 2963 |
-
fn=app.authenticate_user,
|
| 2964 |
-
inputs=[username_input, password_input],
|
| 2965 |
-
outputs=[auth_status, session_state, current_team]
|
| 2966 |
-
)
|
| 2967 |
-
|
| 2968 |
-
logout_btn.click(
|
| 2969 |
-
fn=app.logout_user,
|
| 2970 |
-
inputs=[session_state],
|
| 2971 |
-
outputs=[auth_status, session_state, current_team]
|
| 2972 |
-
)
|
| 2973 |
-
|
| 2974 |
-
# Document management events
|
| 2975 |
upload_btn.click(
|
| 2976 |
fn=app.upload_and_convert,
|
| 2977 |
-
inputs=[
|
| 2978 |
outputs=[upload_status]
|
| 2979 |
)
|
| 2980 |
|
| 2981 |
-
refresh_collections_btn.click(
|
| 2982 |
-
fn=app.get_team_collections,
|
| 2983 |
-
inputs=[session_state],
|
| 2984 |
-
outputs=[team_collections_display]
|
| 2985 |
-
)
|
| 2986 |
-
|
| 2987 |
# Query events
|
| 2988 |
search_btn.click(
|
| 2989 |
fn=app.search_documents,
|
| 2990 |
-
inputs=[
|
| 2991 |
outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
|
| 2992 |
)
|
| 2993 |
|
|
|
|
| 17 |
import base64
|
| 18 |
from PIL import Image
|
| 19 |
import io
|
| 20 |
+
import traceback
|
| 21 |
|
| 22 |
from middleware import Middleware
|
| 23 |
from rag import Rag
|
|
|
|
| 29 |
import dotenv
|
| 30 |
import platform
|
| 31 |
import time
|
| 32 |
+
# Only enable PPT/PPTX conversion on Windows where COM is available
|
| 33 |
+
PPT_CONVERT_AVAILABLE = False
|
| 34 |
+
if platform.system() == 'Windows':
|
| 35 |
+
try:
|
| 36 |
+
from pptxtopdf import convert
|
| 37 |
+
PPT_CONVERT_AVAILABLE = True
|
| 38 |
+
except Exception:
|
| 39 |
+
PPT_CONVERT_AVAILABLE = False
|
| 40 |
|
| 41 |
# Import libraries for DOC and Excel export
|
| 42 |
try:
|
|
|
|
| 386 |
self.db_manager = db_manager
|
| 387 |
self.session_manager = session_manager
|
| 388 |
|
| 389 |
+
def upload_and_convert(self, files, max_pages, folder_name=None):
|
| 390 |
+
"""Upload and convert files without authentication or team scoping"""
|
| 391 |
|
| 392 |
if files is None:
|
| 393 |
return "No file uploaded"
|
| 394 |
|
| 395 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
total_pages = 0
|
| 397 |
uploaded_files = []
|
| 398 |
|
| 399 |
+
# Create simple collection name
|
| 400 |
if folder_name:
|
| 401 |
folder_name = folder_name.replace(" ", "_").replace("-", "_")
|
| 402 |
+
collection_name = f"{folder_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
| 403 |
else:
|
| 404 |
+
collection_name = f"documents_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
| 405 |
+
|
| 406 |
+
# Store the collection name in indexed_docs BEFORE processing files
|
| 407 |
+
self.indexed_docs[collection_name] = True
|
| 408 |
+
print(f"π Created collection: {collection_name}")
|
| 409 |
+
|
| 410 |
+
# Clear old collections to ensure only the latest upload is referenced
|
| 411 |
+
self._clear_old_collections(collection_name)
|
| 412 |
|
| 413 |
for file in files[:]:
|
| 414 |
# Extract the last part of the path (file name)
|
|
|
|
| 418 |
|
| 419 |
# Convert PPT to PDF if needed
|
| 420 |
if ext.lower() in [".ppt", ".pptx"]:
|
| 421 |
+
if PPT_CONVERT_AVAILABLE:
|
| 422 |
+
output_file = os.path.splitext(file.name)[0] + '.pdf'
|
| 423 |
+
output_directory = os.path.dirname(file.name)
|
| 424 |
+
outfile = os.path.join(output_directory, output_file)
|
| 425 |
+
convert(file.name, outfile)
|
| 426 |
+
pdf_path = outfile
|
| 427 |
+
name = os.path.basename(outfile)
|
| 428 |
+
name, ext = os.path.splitext(name)
|
| 429 |
+
else:
|
| 430 |
+
return "PPT/PPTX conversion is only supported on Windows. Please upload PDFs instead."
|
| 431 |
|
| 432 |
# Create unique document ID
|
| 433 |
doc_id = f"{collection_name}_{name.replace(' ', '_').replace('-', '_')}"
|
|
|
|
| 435 |
print(f"Uploading file: {doc_id}")
|
| 436 |
middleware = Middleware(collection_name, create_collection=True)
|
| 437 |
|
| 438 |
+
# Pass collection_name as id to ensure images are saved to the right directory
|
| 439 |
+
pages = middleware.index(pdf_path, id=collection_name, max_pages=max_pages)
|
| 440 |
total_pages += len(pages) if pages else 0
|
| 441 |
uploaded_files.append(doc_id)
|
|
|
|
|
|
|
| 442 |
|
| 443 |
+
# Get the current active collection after cleanup
|
| 444 |
+
current_collection = self.get_current_collection()
|
| 445 |
+
status_message = f"Uploaded {len(uploaded_files)} files with {total_pages} total pages to collection: {collection_name}"
|
| 446 |
+
|
| 447 |
+
if current_collection:
|
| 448 |
+
status_message += f"\nβ
This is now your active collection for searches."
|
|
|
|
|
|
|
| 449 |
|
| 450 |
+
return status_message
|
| 451 |
|
| 452 |
except Exception as e:
|
| 453 |
return f"Error processing files: {str(e)}"
|
| 454 |
|
| 455 |
+
def _clear_old_collections(self, current_collection_name):
|
| 456 |
+
"""Clear old collections to ensure only the latest upload is referenced"""
|
| 457 |
+
try:
|
| 458 |
+
# Get all collections except the current one
|
| 459 |
+
collections_to_remove = [name for name in self.indexed_docs.keys() if name != current_collection_name]
|
| 460 |
+
|
| 461 |
+
if collections_to_remove:
|
| 462 |
+
print(f"ποΈ Clearing {len(collections_to_remove)} old collections to maintain latest upload reference")
|
| 463 |
+
|
| 464 |
+
for old_collection in collections_to_remove:
|
| 465 |
+
# Remove from indexed_docs
|
| 466 |
+
del self.indexed_docs[old_collection]
|
| 467 |
+
|
| 468 |
+
# Try to drop the collection from Milvus
|
| 469 |
+
try:
|
| 470 |
+
middleware = Middleware(old_collection, create_collection=False)
|
| 471 |
+
if middleware.drop_collection():
|
| 472 |
+
print(f"ποΈ Successfully dropped Milvus collection '{old_collection}'")
|
| 473 |
+
else:
|
| 474 |
+
print(f"β οΈ Failed to drop Milvus collection '{old_collection}'")
|
| 475 |
+
except Exception as e:
|
| 476 |
+
print(f"β οΈ Warning: Could not clean up Milvus collection '{old_collection}': {e}")
|
| 477 |
+
|
| 478 |
+
print(f"β
Kept only the latest collection: {current_collection_name}")
|
| 479 |
+
else:
|
| 480 |
+
print(f"β
No old collections to clear. Current collection: {current_collection_name}")
|
| 481 |
+
|
| 482 |
+
except Exception as e:
|
| 483 |
+
print(f"β οΈ Warning: Error clearing old collections: {e}")
|
| 484 |
+
# Don't fail the upload if cleanup fails
|
| 485 |
+
|
| 486 |
+
def get_current_collection_status(self):
|
| 487 |
+
"""Get a user-friendly status message about the current collection"""
|
| 488 |
+
current_collection = self.get_current_collection()
|
| 489 |
+
if current_collection:
|
| 490 |
+
return f"β
Currently active collection: {current_collection}"
|
| 491 |
+
else:
|
| 492 |
+
return "β No documents uploaded yet. Please upload a document to get started."
|
| 493 |
+
|
| 494 |
+
def get_current_collection(self):
|
| 495 |
+
"""Get the name of the currently active collection (most recent upload)"""
|
| 496 |
+
if not self.indexed_docs:
|
| 497 |
+
return None
|
| 498 |
+
|
| 499 |
+
available_collections = list(self.indexed_docs.keys())
|
| 500 |
+
if not available_collections:
|
| 501 |
+
return None
|
| 502 |
+
|
| 503 |
+
# Sort by timestamp to get the most recent one
|
| 504 |
+
def extract_timestamp(collection_name):
|
| 505 |
+
try:
|
| 506 |
+
parts = collection_name.split('_')
|
| 507 |
+
if len(parts) >= 3:
|
| 508 |
+
date_part = parts[-2]
|
| 509 |
+
time_part = parts[-1]
|
| 510 |
+
timestamp = f"{date_part}_{time_part}"
|
| 511 |
+
return timestamp
|
| 512 |
+
return collection_name
|
| 513 |
+
except:
|
| 514 |
+
return collection_name
|
| 515 |
+
|
| 516 |
+
available_collections.sort(key=extract_timestamp, reverse=True)
|
| 517 |
+
return available_collections[0]
|
| 518 |
|
| 519 |
+
def display_file_list(self, text):
|
| 520 |
try:
|
| 521 |
# Retrieve all entries in the specified directory
|
| 522 |
+
# Use the same base directory logic as PdfManager
|
| 523 |
+
base_output_dir = self._ensure_base_directory()
|
| 524 |
+
directory_path = base_output_dir
|
| 525 |
current_working_directory = os.getcwd()
|
| 526 |
directory_path = os.path.join(current_working_directory, directory_path)
|
| 527 |
entries = os.listdir(directory_path)
|
|
|
|
| 536 |
return str(e)
|
| 537 |
|
| 538 |
|
| 539 |
+
def search_documents(self, query, num_results):
|
| 540 |
print(f"Searching for query: {query}")
|
| 541 |
|
| 542 |
if not query:
|
| 543 |
print("Please enter a search query")
|
| 544 |
+
return "Please enter a search query", "--", "Please enter a search query", [], None, None, None, None
|
| 545 |
|
| 546 |
try:
|
| 547 |
+
# First, check if there are any indexed documents
|
| 548 |
+
if not self.indexed_docs:
|
| 549 |
+
return "No documents have been uploaded yet. Please upload some documents first.", "--", "No documents available for search", [], None, None, None, None
|
| 550 |
+
|
| 551 |
+
# Clean up any invalid collections first
|
| 552 |
+
print("π§Ή Cleaning up invalid collections...")
|
| 553 |
+
removed_count = self._cleanup_invalid_collections()
|
| 554 |
+
if removed_count > 0:
|
| 555 |
+
print(f"ποΈ Removed {removed_count} invalid collections")
|
| 556 |
+
|
| 557 |
+
# Check again after cleanup
|
| 558 |
+
if not self.indexed_docs:
|
| 559 |
+
return "No valid collections found after cleanup. Please re-upload your documents.", "--", "No valid collections available", [], None, None, None, None
|
| 560 |
+
|
| 561 |
+
# Get the most recent collection name from indexed docs (latest upload)
|
| 562 |
+
available_collections = list(self.indexed_docs.keys())
|
| 563 |
+
print(f"π Available collections after cleanup: {available_collections}")
|
| 564 |
+
|
| 565 |
+
if not available_collections:
|
| 566 |
+
return "No collections available for search. Please upload some documents first.", "--", "No collections available", [], None, None, None, None
|
| 567 |
+
|
| 568 |
+
# Sort collections by timestamp to get the most recent one
|
| 569 |
+
# Collections are named like "documents_20250101_120000" or "folder_20250101_120000"
|
| 570 |
+
def extract_timestamp(collection_name):
|
| 571 |
+
try:
|
| 572 |
+
# Extract the timestamp part after the last underscore
|
| 573 |
+
parts = collection_name.split('_')
|
| 574 |
+
if len(parts) >= 3:
|
| 575 |
+
# Get the last two parts which should be date and time
|
| 576 |
+
date_part = parts[-2]
|
| 577 |
+
time_part = parts[-1]
|
| 578 |
+
timestamp = f"{date_part}_{time_part}"
|
| 579 |
+
return timestamp
|
| 580 |
+
return collection_name
|
| 581 |
+
except:
|
| 582 |
+
return collection_name
|
| 583 |
+
|
| 584 |
+
# Sort by timestamp in descending order (most recent first)
|
| 585 |
+
available_collections.sort(key=extract_timestamp, reverse=True)
|
| 586 |
+
collection_name = available_collections[0]
|
| 587 |
+
print(f"π Available collections sorted by timestamp: {available_collections}")
|
| 588 |
+
print(f"π Searching in most recent collection: {collection_name}")
|
| 589 |
+
|
| 590 |
+
# Add collection info to the search results for user clarity
|
| 591 |
+
collection_info = f"π Searching in collection: {collection_name}"
|
| 592 |
+
|
| 593 |
+
middleware = Middleware(collection_name, create_collection=False)
|
| 594 |
|
| 595 |
# Enhanced multi-page retrieval with vision-guided chunking approach
|
| 596 |
# Get more results than requested to allow for intelligent filtering
|
| 597 |
# Request 3x the number of results for better selection
|
| 598 |
search_results = middleware.search([query], topk=max(num_results * 3, 20))[0]
|
| 599 |
|
| 600 |
+
# π COMPREHENSIVE SEARCH RESULTS LOGGING
|
| 601 |
+
print(f"\nπ SEARCH RESULTS SUMMARY")
|
| 602 |
+
print(f"π Retrieved {len(search_results)} total results from search")
|
| 603 |
if len(search_results) > 0:
|
| 604 |
+
print(f"π Top result score: {search_results[0][0]:.4f}")
|
| 605 |
+
print(f"π Bottom result score: {search_results[-1][0]:.4f}")
|
| 606 |
+
print(f"π Score range: {search_results[-1][0]:.4f} - {search_results[0][0]:.4f}")
|
| 607 |
+
|
| 608 |
+
# Show top 5 results with page numbers
|
| 609 |
+
print(f"\nπ TOP 5 HIGHEST SCORING PAGES:")
|
| 610 |
+
for i, (score, doc_id) in enumerate(search_results[:5], 1):
|
| 611 |
+
page_num = doc_id + 1 # Convert to 1-based page numbering
|
| 612 |
+
print(f" {i}. Page {page_num} (doc_id: {doc_id}) - Score: {score:.4f}")
|
| 613 |
+
|
| 614 |
+
# Calculate and display score statistics
|
| 615 |
+
scores = [result[0] for result in search_results]
|
| 616 |
+
avg_score = sum(scores) / len(scores)
|
| 617 |
+
print(f"\nπ SCORE STATISTICS:")
|
| 618 |
+
print(f" Average Score: {avg_score:.4f}")
|
| 619 |
+
print(f" Score Variance: {sum((s - avg_score) ** 2 for s in scores) / len(scores):.4f}")
|
| 620 |
+
|
| 621 |
+
# Count pages by relevance level
|
| 622 |
+
excellent = sum(1 for s in scores if s >= 0.90)
|
| 623 |
+
very_good = sum(1 for s in scores if 0.80 <= s < 0.90)
|
| 624 |
+
good = sum(1 for s in scores if 0.70 <= s < 0.80)
|
| 625 |
+
moderate = sum(1 for s in scores if 0.60 <= s < 0.70)
|
| 626 |
+
basic = sum(1 for s in scores if 0.50 <= s < 0.60)
|
| 627 |
+
poor = sum(1 for s in scores if s < 0.50)
|
| 628 |
+
|
| 629 |
+
print(f"\nπ RELEVANCE DISTRIBUTION:")
|
| 630 |
+
print(f" π’ Excellent (β₯0.90): {excellent} pages")
|
| 631 |
+
print(f" π‘ Very Good (0.80-0.89): {very_good} pages")
|
| 632 |
+
print(f" π Good (0.70-0.79): {good} pages")
|
| 633 |
+
print(f" π΅ Moderate (0.60-0.69): {moderate} pages")
|
| 634 |
+
print(f" π£ Basic (0.50-0.59): {basic} pages")
|
| 635 |
+
print(f" π΄ Poor (<0.50): {poor} pages")
|
| 636 |
+
print("-" * 60)
|
| 637 |
|
| 638 |
if not search_results:
|
| 639 |
+
return "No search results found", "--", "No search results found for your query", [], None, None, None, None
|
| 640 |
|
| 641 |
# Implement intelligent multi-page selection based on research
|
| 642 |
+
selected_results = self._select_relevant_pages_new_format(search_results, query, num_results)
|
| 643 |
+
|
| 644 |
+
# π SELECTION LOGGING - Show which pages were selected
|
| 645 |
+
print(f"\nπ― PAGE SELECTION RESULTS")
|
| 646 |
+
print(f"π Requested: {num_results} pages")
|
| 647 |
+
print(f"π Selected: {len(selected_results)} pages")
|
| 648 |
+
print(f"π Selection rate: {len(selected_results)/len(search_results)*100:.1f}% of available results")
|
| 649 |
+
print("-" * 60)
|
| 650 |
+
|
| 651 |
+
print(f"π SELECTED PAGES WITH SCORES:")
|
| 652 |
+
for i, (score, doc_id) in enumerate(selected_results, 1):
|
| 653 |
+
page_num = doc_id + 1
|
| 654 |
+
relevance_level = self._get_relevance_level(score)
|
| 655 |
+
print(f" {i}. Page {page_num:2d} (doc_id: {doc_id:2d}) | Score: {score:8.4f} | {relevance_level}")
|
| 656 |
+
|
| 657 |
+
# Calculate selection statistics
|
| 658 |
+
if selected_results:
|
| 659 |
+
selected_scores = [result[0] for result in selected_results]
|
| 660 |
+
avg_selected_score = sum(selected_scores) / len(selected_scores)
|
| 661 |
+
print(f"\nπ SELECTION STATISTICS:")
|
| 662 |
+
print(f" Average selected score: {avg_selected_score:.4f}")
|
| 663 |
+
print(f" Highest selected score: {selected_scores[0]:.4f}")
|
| 664 |
+
print(f" Lowest selected score: {selected_scores[-1]:.4f}")
|
| 665 |
+
print(f" Score improvement over average: {avg_selected_score - avg_score:.4f}")
|
| 666 |
+
print("-" * 60)
|
| 667 |
|
| 668 |
# Process selected results
|
| 669 |
cited_pages = []
|
|
|
|
| 673 |
|
| 674 |
print(f"π Processing {len(selected_results)} selected results...")
|
| 675 |
|
| 676 |
+
# Ensure base directory exists and get the correct path
|
| 677 |
+
base_output_dir = self._ensure_base_directory()
|
| 678 |
+
print(f"π Using base directory: {base_output_dir}")
|
| 679 |
+
print(f"π Collection name: {collection_name}")
|
| 680 |
+
print(f"π Environment: {'Hugging Face Spaces' if self._is_huggingface_spaces() else 'Local Development'}")
|
| 681 |
+
|
| 682 |
+
for i, (score, doc_id) in enumerate(selected_results):
|
| 683 |
+
# Use the index as page number since doc_id is just an identifier
|
| 684 |
+
# This ensures we look for page_1.png, page_2.png, etc.
|
| 685 |
+
display_page_num = i + 1
|
| 686 |
+
coll_num = collection_name # Use the current collection name
|
| 687 |
+
|
| 688 |
+
# Use debug function to get paths and check existence
|
| 689 |
+
img_path, path, file_exists = self._debug_file_paths(base_output_dir, coll_num, display_page_num)
|
| 690 |
|
| 691 |
+
if file_exists:
|
| 692 |
img_paths.append(img_path)
|
| 693 |
all_paths.append(path)
|
| 694 |
page_scores.append(score)
|
|
|
|
| 696 |
print(f"β
Retrieved page {i+1}: {img_path} (Score: {score:.3f})")
|
| 697 |
else:
|
| 698 |
print(f"β Image file not found: {img_path}")
|
| 699 |
+
# Try alternative paths with better fallback logic
|
| 700 |
+
alt_paths = [
|
| 701 |
+
# Primary path (should work in Hugging Face Spaces)
|
| 702 |
+
img_path,
|
| 703 |
+
# Relative paths from app directory
|
| 704 |
+
os.path.join(os.path.dirname(os.path.abspath(__file__)), "pages", coll_num, f"page_{display_page_num}.png"),
|
| 705 |
+
# Current working directory paths
|
| 706 |
+
f"pages/{coll_num}/page_{display_page_num}.png",
|
| 707 |
+
f"./pages/{coll_num}/page_{display_page_num}.png",
|
| 708 |
+
os.path.join(os.getcwd(), "pages", coll_num, f"page_{display_page_num}.png"),
|
| 709 |
+
# Alternative base directories
|
| 710 |
+
os.path.join("/tmp", "pages", coll_num, f"page_{display_page_num}.png"),
|
| 711 |
+
os.path.join("/home/user", "pages", coll_num, f"page_{display_page_num}.png")
|
| 712 |
+
]
|
| 713 |
+
|
| 714 |
+
print(f"π Trying alternative paths for page {display_page_num}:")
|
| 715 |
+
for alt_path in alt_paths:
|
| 716 |
+
print(f" π Checking: {alt_path}")
|
| 717 |
+
if os.path.exists(alt_path):
|
| 718 |
+
print(f"β
Found alternative path: {alt_path}")
|
| 719 |
+
img_paths.append(alt_path)
|
| 720 |
+
all_paths.append(alt_path.replace(".png", ""))
|
| 721 |
+
page_scores.append(score)
|
| 722 |
+
cited_pages.append(f"Page {display_page_num} from {coll_num}")
|
| 723 |
+
break
|
| 724 |
+
else:
|
| 725 |
+
print(f"β No alternative path found for page {display_page_num}")
|
| 726 |
|
| 727 |
print(f"π Final count: {len(img_paths)} valid pages out of {len(selected_results)} selected")
|
| 728 |
|
| 729 |
+
# π FINAL RESULTS SUMMARY
|
| 730 |
+
if img_paths:
|
| 731 |
+
print(f"\nπ FINAL RETRIEVAL SUMMARY")
|
| 732 |
+
print(f"π Successfully retrieved: {len(img_paths)} pages")
|
| 733 |
+
print(f"π Final page scores:")
|
| 734 |
+
for i, (img_path, score) in enumerate(zip(img_paths, page_scores), 1):
|
| 735 |
+
# Extract page number from path
|
| 736 |
+
page_num = img_path.split('page_')[1].split('.png')[0] if 'page_' in img_path else f"Page {i}"
|
| 737 |
+
print(f" {i}. {page_num} - Score: {score:.4f}")
|
| 738 |
+
|
| 739 |
+
if page_scores:
|
| 740 |
+
final_avg_score = sum(page_scores) / len(page_scores)
|
| 741 |
+
print(f"\nπ FINAL STATISTICS:")
|
| 742 |
+
print(f" Average final score: {final_avg_score:.4f}")
|
| 743 |
+
print(f" Highest final score: {max(page_scores):.4f}")
|
| 744 |
+
print(f" Lowest final score: {min(page_scores):.4f}")
|
| 745 |
+
print("=" * 60)
|
| 746 |
+
|
| 747 |
if not img_paths:
|
| 748 |
+
return "No valid image files found", "--", "Error: No valid image files found for the search results", [], None, None, None, None
|
| 749 |
|
| 750 |
# Generate RAG response with multiple pages using enhanced approach
|
| 751 |
+
try:
|
| 752 |
+
print("π€ Generating RAG response...")
|
| 753 |
+
rag_response, csv_filepath, doc_filepath, excel_filepath = self._generate_multi_page_response(query, img_paths, cited_pages, page_scores)
|
| 754 |
+
print("β
RAG response generated successfully")
|
| 755 |
+
except Exception as e:
|
| 756 |
+
error_code = "RAG001"
|
| 757 |
+
error_msg = f"β **Error {error_code}**: Failed to generate RAG response"
|
| 758 |
+
print(f"{error_msg}: {str(e)}")
|
| 759 |
+
print(f"β Traceback: {traceback.format_exc()}")
|
| 760 |
+
|
| 761 |
+
# Return error response with proper format
|
| 762 |
+
return (
|
| 763 |
+
error_msg, # path
|
| 764 |
+
"--", # images
|
| 765 |
+
f"{error_msg}\n\n**Details**: {str(e)}\n\n**Error Code**: {error_code}", # llm_answer
|
| 766 |
+
cited_pages, # cited_pages_display
|
| 767 |
+
None, # csv_download
|
| 768 |
+
None, # doc_download
|
| 769 |
+
None # excel_download
|
| 770 |
+
)
|
| 771 |
|
| 772 |
# Prepare downloads
|
| 773 |
csv_download = self._prepare_csv_download(csv_filepath)
|
|
|
|
| 793 |
|
| 794 |
except Exception as e:
|
| 795 |
error_msg = f"Error during search: {str(e)}"
|
| 796 |
+
print(f"β Search error: {error_msg}")
|
| 797 |
+
# Return exactly 7 outputs to match Gradio expectations
|
| 798 |
return error_msg, "--", error_msg, [], None, None, None, None
|
| 799 |
+
|
| 800 |
+
def _select_relevant_pages_new_format(self, search_results, query, num_results):
|
| 801 |
"""
|
| 802 |
+
Intelligent page selection for new Milvus format: (score, doc_id)
|
|
|
|
| 803 |
"""
|
| 804 |
if len(search_results) <= num_results:
|
| 805 |
return search_results
|
| 806 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 807 |
# Sort by relevance score
|
| 808 |
sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
|
| 809 |
|
| 810 |
+
# Simple strategy: take top N results
|
| 811 |
+
selected = sorted_results[:num_results]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
|
| 813 |
+
print(f"Requested {num_results} pages, selected {len(selected)} pages")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 814 |
|
| 815 |
return selected
|
| 816 |
|
| 817 |
+
def _get_relevance_level(self, score):
|
| 818 |
+
"""Get human-readable relevance level based on score"""
|
| 819 |
+
if score >= 0.90:
|
| 820 |
+
return "π’ EXCELLENT - Highly relevant"
|
| 821 |
+
elif score >= 0.80:
|
| 822 |
+
return "π‘ VERY GOOD - Very relevant"
|
| 823 |
+
elif score >= 0.70:
|
| 824 |
+
return "π GOOD - Relevant"
|
| 825 |
+
elif score >= 0.60:
|
| 826 |
+
return "π΅ MODERATE - Somewhat relevant"
|
| 827 |
+
elif score >= 0.50:
|
| 828 |
+
return "π£ BASIC - Minimally relevant"
|
| 829 |
+
else:
|
| 830 |
+
return "π΄ POOR - Not relevant"
|
| 831 |
+
|
| 832 |
def _optimize_consecutive_pages(self, selected, all_results, target_count=None):
|
| 833 |
"""
|
| 834 |
Optimize selection to include consecutive pages when beneficial
|
|
|
|
| 1365 |
cell_str = str(cell)
|
| 1366 |
if ',' in cell_str or '"' in cell_str or '\n' in cell_str:
|
| 1367 |
# Escape quotes and wrap in quotes
|
| 1368 |
+
cell_str = '"' + cell_str.replace('"', '""') + '"'
|
| 1369 |
escaped_row.append(cell_str)
|
| 1370 |
csv_lines.append(','.join(escaped_row))
|
| 1371 |
|
|
|
|
| 2996 |
# Fallback to simple response with enhanced prompt
|
| 2997 |
return rag.get_answer_from_gemini(detailed_prompt, img_paths), None, None, None
|
| 2998 |
|
| 2999 |
+
# Authentication and team collection methods removed for simplified app
|
| 3000 |
+
|
| 3001 |
+
def _is_huggingface_spaces(self):
|
| 3002 |
+
"""Check if running in Hugging Face Spaces environment"""
|
| 3003 |
+
return (
|
| 3004 |
+
os.path.exists("/tmp") and
|
| 3005 |
+
os.access("/tmp", os.W_OK) and
|
| 3006 |
+
(os.getenv('SPACE_ID') or os.getenv('HF_SPACE_ID'))
|
| 3007 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3008 |
|
| 3009 |
+
def _get_optimal_base_dir(self):
|
| 3010 |
+
"""Get the optimal base directory based on environment"""
|
| 3011 |
+
if self._is_huggingface_spaces():
|
| 3012 |
+
base_dir = "/tmp/pages"
|
| 3013 |
+
print(f"π Detected Hugging Face Spaces environment, using: {base_dir}")
|
| 3014 |
+
else:
|
| 3015 |
+
# Use relative path from app directory
|
| 3016 |
+
app_dir = os.path.dirname(os.path.abspath(__file__))
|
| 3017 |
+
base_dir = os.path.join(app_dir, "pages")
|
| 3018 |
+
print(f"π» Using local development path: {base_dir}")
|
| 3019 |
+
|
| 3020 |
+
# Ensure directory exists
|
| 3021 |
+
os.makedirs(base_dir, exist_ok=True)
|
| 3022 |
+
return base_dir
|
| 3023 |
+
|
| 3024 |
+
def _ensure_base_directory(self):
|
| 3025 |
+
"""Ensure the base directory for storing pages exists"""
|
| 3026 |
+
base_output_dir = self._get_optimal_base_dir()
|
| 3027 |
+
|
| 3028 |
+
# Create the base directory if it doesn't exist
|
| 3029 |
+
if not os.path.exists(base_output_dir):
|
| 3030 |
+
try:
|
| 3031 |
+
os.makedirs(base_output_dir, exist_ok=True)
|
| 3032 |
+
print(f"β
Created base directory: {base_output_dir}")
|
| 3033 |
+
except Exception as e:
|
| 3034 |
+
print(f"β Failed to create base directory {base_output_dir}: {e}")
|
| 3035 |
+
# Fallback to current working directory
|
| 3036 |
+
base_output_dir = os.path.join(os.getcwd(), "pages")
|
| 3037 |
+
os.makedirs(base_output_dir, exist_ok=True)
|
| 3038 |
+
print(f"β
Using fallback directory: {base_output_dir}")
|
| 3039 |
+
|
| 3040 |
+
return base_output_dir
|
| 3041 |
|
| 3042 |
+
def _debug_file_paths(self, base_output_dir, coll_num, display_page_num):
|
| 3043 |
+
"""Helper function to debug file path issues"""
|
| 3044 |
+
img_path = os.path.join(base_output_dir, coll_num, f"page_{display_page_num}.png")
|
| 3045 |
+
path = os.path.join(base_output_dir, coll_num, f"page_{display_page_num}")
|
| 3046 |
+
|
| 3047 |
+
# Check if directory exists
|
| 3048 |
+
dir_path = os.path.dirname(img_path)
|
| 3049 |
+
dir_exists = os.path.exists(dir_path)
|
| 3050 |
|
| 3051 |
+
# Check if file exists
|
| 3052 |
+
file_exists = os.path.exists(img_path)
|
|
|
|
| 3053 |
|
| 3054 |
+
# Get absolute paths for debugging
|
| 3055 |
+
abs_img_path = os.path.abspath(img_path)
|
| 3056 |
+
abs_dir_path = os.path.abspath(dir_path)
|
| 3057 |
|
| 3058 |
+
print(f"π Path Debug for {coll_num}/page_{display_page_num}:")
|
| 3059 |
+
print(f" Base dir: {base_output_dir}")
|
| 3060 |
+
print(f" Directory: {dir_path} (exists: {dir_exists})")
|
| 3061 |
+
print(f" File: {img_path} (exists: {file_exists})")
|
| 3062 |
+
print(f" Abs dir: {abs_dir_path}")
|
| 3063 |
+
print(f" Abs file: {abs_img_path}")
|
| 3064 |
|
| 3065 |
+
return img_path, path, file_exists
|
| 3066 |
+
|
| 3067 |
+
def _cleanup_invalid_collections(self):
|
| 3068 |
+
"""Remove collections that no longer exist in Milvus from indexed_docs"""
|
| 3069 |
+
invalid_collections = []
|
| 3070 |
+
|
| 3071 |
+
for collection_name in list(self.indexed_docs.keys()):
|
| 3072 |
+
try:
|
| 3073 |
+
# Try to create a middleware instance to check if collection exists
|
| 3074 |
+
middleware = Middleware(collection_name, create_collection=False)
|
| 3075 |
+
print(f"οΏ½οΏ½ Collection {collection_name} is valid")
|
| 3076 |
+
except Exception as e:
|
| 3077 |
+
print(f"β οΈ Collection {collection_name} not accessible: {e}")
|
| 3078 |
+
invalid_collections.append(collection_name)
|
| 3079 |
+
|
| 3080 |
+
# Remove invalid collections
|
| 3081 |
+
for collection_name in invalid_collections:
|
| 3082 |
+
if collection_name in self.indexed_docs:
|
| 3083 |
+
del self.indexed_docs[collection_name]
|
| 3084 |
+
print(f"ποΈ Removed invalid collection: {collection_name}")
|
| 3085 |
+
|
| 3086 |
+
return len(invalid_collections)
|
| 3087 |
+
|
| 3088 |
+
def _check_collections_exist(self):
|
| 3089 |
+
# This method should be implemented to check if collections exist in Milvus
|
| 3090 |
+
pass
|
| 3091 |
|
| 3092 |
def create_ui():
|
| 3093 |
app = PDFSearchApp()
|
| 3094 |
|
| 3095 |
with gr.Blocks(theme=gr.themes.Ocean(), css="footer{display:none !important}") as demo:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3096 |
gr.Markdown("# Collar Multimodal RAG Demo - Streamlined")
|
| 3097 |
+
gr.Markdown("Basic document upload and search (no authentication)")
|
| 3098 |
+
|
| 3099 |
+
# Document Upload
|
| 3100 |
+
with gr.Tab("π Document Upload"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3101 |
with gr.Column():
|
| 3102 |
+
gr.Markdown("### Upload Documents")
|
| 3103 |
folder_name_input = gr.Textbox(
|
| 3104 |
+
label="Collection Name (Optional)",
|
| 3105 |
+
placeholder="Optional name for this document collection"
|
| 3106 |
)
|
| 3107 |
max_pages_input = gr.Slider(
|
| 3108 |
minimum=1,
|
|
|
|
| 3112 |
label="Max pages to extract and index per document"
|
| 3113 |
)
|
| 3114 |
file_input = gr.Files(
|
| 3115 |
+
label="Upload PPTs/PDFs (Multiple files supported)",
|
| 3116 |
file_count="multiple"
|
| 3117 |
)
|
| 3118 |
+
upload_btn = gr.Button("Upload", variant="primary")
|
| 3119 |
upload_status = gr.Textbox(label="Upload Status", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3120 |
|
| 3121 |
# Enhanced Query Tab
|
| 3122 |
with gr.Tab("π Advanced Query"):
|
|
|
|
| 3185 |
|
| 3186 |
|
| 3187 |
# Event handlers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3188 |
upload_btn.click(
|
| 3189 |
fn=app.upload_and_convert,
|
| 3190 |
+
inputs=[file_input, max_pages_input, folder_name_input],
|
| 3191 |
outputs=[upload_status]
|
| 3192 |
)
|
| 3193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3194 |
# Query events
|
| 3195 |
search_btn.click(
|
| 3196 |
fn=app.search_documents,
|
| 3197 |
+
inputs=[query_input, num_results],
|
| 3198 |
outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
|
| 3199 |
)
|
| 3200 |
|
colpali_manager.py
CHANGED
|
@@ -25,7 +25,7 @@ import dotenv
|
|
| 25 |
dotenv_file = dotenv.find_dotenv()
|
| 26 |
dotenv.load_dotenv(dotenv_file)
|
| 27 |
|
| 28 |
-
model_name =
|
| 29 |
device = get_torch_device("cuda") #try using cpu instead of cuda?
|
| 30 |
|
| 31 |
#switch to locally downloading models & loading locally rather than from hf
|
|
@@ -97,7 +97,7 @@ class ColpaliManager:
|
|
| 97 |
return [Image.open(path) for path in paths]
|
| 98 |
|
| 99 |
@spaces.GPU
|
| 100 |
-
def process_images(self, image_paths:list[str], batch_size=
|
| 101 |
model.to("cuda")
|
| 102 |
print(f"Processing {len(image_paths)} image_paths")
|
| 103 |
|
|
@@ -161,7 +161,7 @@ class ColpaliManager:
|
|
| 161 |
|
| 162 |
dataloader = DataLoader(
|
| 163 |
dataset=ListDataset[str](texts),
|
| 164 |
-
batch_size=
|
| 165 |
shuffle=False,
|
| 166 |
collate_fn=lambda x: processor.process_queries(x),
|
| 167 |
)
|
|
|
|
| 25 |
dotenv_file = dotenv.find_dotenv()
|
| 26 |
dotenv.load_dotenv(dotenv_file)
|
| 27 |
|
| 28 |
+
model_name = 'vidore/colpali-v1.3' #"vidore/colSmol-256M"
|
| 29 |
device = get_torch_device("cuda") #try using cpu instead of cuda?
|
| 30 |
|
| 31 |
#switch to locally downloading models & loading locally rather than from hf
|
|
|
|
| 97 |
return [Image.open(path) for path in paths]
|
| 98 |
|
| 99 |
@spaces.GPU
|
| 100 |
+
def process_images(self, image_paths:list[str], batch_size=5):
|
| 101 |
model.to("cuda")
|
| 102 |
print(f"Processing {len(image_paths)} image_paths")
|
| 103 |
|
|
|
|
| 161 |
|
| 162 |
dataloader = DataLoader(
|
| 163 |
dataset=ListDataset[str](texts),
|
| 164 |
+
batch_size=5, #OG is 5, try reducing batch size to maximise gpu use
|
| 165 |
shuffle=False,
|
| 166 |
collate_fn=lambda x: processor.process_queries(x),
|
| 167 |
)
|
middleware.py
CHANGED
|
@@ -43,20 +43,40 @@ class Middleware:
|
|
| 43 |
print("Indexing completed")
|
| 44 |
|
| 45 |
return image_paths
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
def search(self, search_queries: list[str], topk: int = 10):
|
| 50 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
final_res = []
|
| 53 |
|
| 54 |
-
for query in search_queries:
|
| 55 |
-
print(f"
|
|
|
|
|
|
|
| 56 |
query_vec = colpali_manager.process_text([query])[0]
|
|
|
|
|
|
|
|
|
|
| 57 |
search_res = self.milvus_manager.search(query_vec, topk=topk)
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
final_res.append(search_res)
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
return final_res
|
| 62 |
|
|
|
|
| 43 |
print("Indexing completed")
|
| 44 |
|
| 45 |
return image_paths
|
| 46 |
+
|
| 47 |
+
def drop_collection(self):
|
| 48 |
+
"""Drop the current collection from Milvus"""
|
| 49 |
+
return self.milvus_manager.drop_collection()
|
| 50 |
|
| 51 |
|
| 52 |
def search(self, search_queries: list[str], topk: int = 10):
|
| 53 |
+
print(f"\nπ MIDDLEWARE SEARCH INITIATED")
|
| 54 |
+
print(f"π Queries to process: {len(search_queries)}")
|
| 55 |
+
print(f"π― Top-k requested: {topk}")
|
| 56 |
+
print("-" * 60)
|
| 57 |
|
| 58 |
final_res = []
|
| 59 |
|
| 60 |
+
for i, query in enumerate(search_queries, 1):
|
| 61 |
+
print(f"\nπ Processing Query {i}/{len(search_queries)}: '{query}'")
|
| 62 |
+
print(f"π Converting query to vector representation...")
|
| 63 |
+
|
| 64 |
query_vec = colpali_manager.process_text([query])[0]
|
| 65 |
+
print(f"β
Query vector generated (dimension: {len(query_vec)})")
|
| 66 |
+
|
| 67 |
+
print(f"π Executing vector search in Milvus...")
|
| 68 |
search_res = self.milvus_manager.search(query_vec, topk=topk)
|
| 69 |
+
|
| 70 |
+
print(f"β
Search completed: {len(search_res)} results retrieved")
|
| 71 |
+
if search_res:
|
| 72 |
+
print(f"π Score range: {search_res[0][0]:.4f} (highest) to {search_res[-1][0]:.4f} (lowest)")
|
| 73 |
+
|
| 74 |
final_res.append(search_res)
|
| 75 |
|
| 76 |
+
print(f"\nπ MIDDLEWARE SEARCH COMPLETED")
|
| 77 |
+
print(f"π Total queries processed: {len(search_queries)}")
|
| 78 |
+
print(f"π Total results across all queries: {sum(len(res) for res in final_res)}")
|
| 79 |
+
print("=" * 60)
|
| 80 |
+
|
| 81 |
return final_res
|
| 82 |
|
milvus_manager.py
CHANGED
|
@@ -1,49 +1,24 @@
|
|
| 1 |
from pymilvus import MilvusClient, DataType
|
| 2 |
-
try:
|
| 3 |
-
from milvus import default_server # Milvus Lite
|
| 4 |
-
except Exception:
|
| 5 |
-
default_server = None
|
| 6 |
import numpy as np
|
| 7 |
import concurrent.futures
|
| 8 |
-
|
| 9 |
-
import os
|
| 10 |
|
| 11 |
class MilvusManager:
|
| 12 |
def __init__(self, milvus_uri, collection_name, create_collection, dim=128):
|
| 13 |
-
|
| 14 |
-
#import environ variables from .env
|
| 15 |
-
import dotenv
|
| 16 |
-
# Load the .env file
|
| 17 |
-
dotenv_file = dotenv.find_dotenv()
|
| 18 |
-
dotenv.load_dotenv(dotenv_file)
|
| 19 |
-
|
| 20 |
-
# Start embedded Milvus Lite server and connect locally
|
| 21 |
-
if default_server is not None:
|
| 22 |
-
try:
|
| 23 |
-
# Optionally set base dir here if desired, e.g. default_server.set_base_dir('volumes/milvus_lite')
|
| 24 |
-
default_server.start()
|
| 25 |
-
except Exception:
|
| 26 |
-
pass
|
| 27 |
-
local_uri = f"http://127.0.0.1:{default_server.listen_port}"
|
| 28 |
-
self.client = MilvusClient(uri=local_uri)
|
| 29 |
-
else:
|
| 30 |
-
# Fallback to standard local server (assumes docker-compose or system service)
|
| 31 |
-
self.client = MilvusClient(uri="http://127.0.0.1:19530")
|
| 32 |
self.collection_name = collection_name
|
|
|
|
|
|
|
| 33 |
self.dim = dim
|
| 34 |
|
| 35 |
-
if
|
| 36 |
-
self.client.load_collection(collection_name=self.collection_name)
|
| 37 |
-
print("Loaded existing collection.")
|
| 38 |
-
elif create_collection:
|
| 39 |
self.create_collection()
|
| 40 |
self.create_index()
|
| 41 |
|
|
|
|
| 42 |
def create_collection(self):
|
| 43 |
if self.client.has_collection(collection_name=self.collection_name):
|
| 44 |
-
|
| 45 |
-
return
|
| 46 |
-
|
| 47 |
schema = self.client.create_schema(
|
| 48 |
auto_id=True,
|
| 49 |
enable_dynamic_fields=True,
|
|
@@ -61,16 +36,19 @@ class MilvusManager:
|
|
| 61 |
)
|
| 62 |
|
| 63 |
def create_index(self):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
index_params = self.client.prepare_index_params()
|
| 65 |
-
|
| 66 |
index_params.add_index(
|
| 67 |
field_name="vector",
|
| 68 |
index_name="vector_index",
|
| 69 |
-
index_type="
|
| 70 |
-
metric_type=
|
| 71 |
params={
|
| 72 |
-
"M":
|
| 73 |
-
"efConstruction":
|
| 74 |
},
|
| 75 |
)
|
| 76 |
|
|
@@ -78,78 +56,33 @@ class MilvusManager:
|
|
| 78 |
collection_name=self.collection_name, index_params=index_params, sync=True
|
| 79 |
)
|
| 80 |
|
| 81 |
-
def
|
| 82 |
-
|
| 83 |
-
collections = self.client.list_collections()
|
| 84 |
-
|
| 85 |
-
# Set search parameters (here, using Inner Product metric).
|
| 86 |
-
search_params = {"metric_type": os.environ["metrictype"], "params": {}} #default metric type is "IP"
|
| 87 |
-
|
| 88 |
-
# Set to store unique (doc_id, collection_name) pairs across all collections.
|
| 89 |
-
doc_collection_pairs = set()
|
| 90 |
-
|
| 91 |
-
# Query each collection individually
|
| 92 |
-
for collection in collections:
|
| 93 |
-
self.client.load_collection(collection_name=collection)
|
| 94 |
-
print("collection loaded:"+ collection)
|
| 95 |
-
results = self.client.search(
|
| 96 |
-
collection,
|
| 97 |
-
data,
|
| 98 |
-
limit=int(os.environ["topk"]), # Adjust limit per collection as needed. (default is 50)
|
| 99 |
-
output_fields=["vector", "seq_id", "doc_id"],
|
| 100 |
-
search_params=search_params,
|
| 101 |
-
)
|
| 102 |
-
# Accumulate document IDs along with their originating collection.
|
| 103 |
-
for r_id in range(len(results)):
|
| 104 |
-
for r in range(len(results[r_id])):
|
| 105 |
-
doc_id = results[r_id][r]["entity"]["doc_id"]
|
| 106 |
-
doc_collection_pairs.add((doc_id, collection))
|
| 107 |
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
collection_name=collection_name,
|
| 114 |
-
filter=f"doc_id in [{doc_id}, {doc_id + 1}]",
|
| 115 |
-
output_fields=["seq_id", "vector", "doc"],
|
| 116 |
-
limit=16380,
|
| 117 |
-
)
|
| 118 |
-
# Stack the vectors for dot product computation.
|
| 119 |
-
doc_vecs = np.vstack(
|
| 120 |
-
[doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))]
|
| 121 |
-
)
|
| 122 |
-
# Compute a similarity score via dot product.
|
| 123 |
-
score = np.dot(data, doc_vecs.T).max(1).sum()
|
| 124 |
-
return (score, doc_id, collection_name)
|
| 125 |
|
| 126 |
-
|
| 127 |
-
with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
|
| 128 |
-
futures = {
|
| 129 |
-
executor.submit(rerank_single_doc, doc_id, data, self.client, collection): (doc_id, collection)
|
| 130 |
-
for doc_id, collection in doc_collection_pairs
|
| 131 |
-
}
|
| 132 |
-
for future in concurrent.futures.as_completed(futures):
|
| 133 |
-
score, doc_id, collection = future.result()
|
| 134 |
-
scores.append((score, doc_id, collection))
|
| 135 |
-
#doc_id is page number!
|
| 136 |
-
|
| 137 |
-
# Sort the reranked results by score in descending order.
|
| 138 |
-
scores.sort(key=lambda x: x[0], reverse=True)
|
| 139 |
-
# Unload the collection after search to free memory.
|
| 140 |
-
self.client.release_collection(collection_name=collection)
|
| 141 |
-
|
| 142 |
-
return scores[:topk] if len(scores) >= topk else scores #topk is the number of scores to return back
|
| 143 |
-
"""
|
| 144 |
search_params = {"metric_type": "IP", "params": {}}
|
| 145 |
results = self.client.search(
|
| 146 |
self.collection_name,
|
| 147 |
data,
|
| 148 |
-
limit=50,
|
| 149 |
output_fields=["vector", "seq_id", "doc_id"],
|
| 150 |
search_params=search_params,
|
| 151 |
)
|
| 152 |
-
doc_ids =
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
scores = []
|
| 155 |
|
|
@@ -161,10 +94,10 @@ class MilvusManager:
|
|
| 161 |
limit=1000,
|
| 162 |
)
|
| 163 |
doc_vecs = np.vstack(
|
| 164 |
-
[
|
| 165 |
)
|
| 166 |
score = np.dot(data, doc_vecs.T).max(1).sum()
|
| 167 |
-
return score, doc_id
|
| 168 |
|
| 169 |
with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
|
| 170 |
futures = {
|
|
@@ -178,13 +111,59 @@ class MilvusManager:
|
|
| 178 |
scores.append((score, doc_id))
|
| 179 |
|
| 180 |
scores.sort(key=lambda x: x[0], reverse=True)
|
| 181 |
-
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
def insert(self, data):
|
| 185 |
-
colbert_vecs = data["colbert_vecs"]
|
| 186 |
seq_length = len(colbert_vecs)
|
| 187 |
-
doc_ids = [data["doc_id"]
|
| 188 |
seq_ids = list(range(seq_length))
|
| 189 |
docs = [""] * seq_length
|
| 190 |
docs[0] = data["filepath"]
|
|
@@ -202,17 +181,38 @@ class MilvusManager:
|
|
| 202 |
],
|
| 203 |
)
|
| 204 |
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
}
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
| 214 |
|
| 215 |
def insert_images_data(self, image_data):
|
| 216 |
data = self.get_images_as_doc(image_data)
|
| 217 |
-
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from pymilvus import MilvusClient, DataType
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import concurrent.futures
|
| 4 |
+
|
|
|
|
| 5 |
|
| 6 |
class MilvusManager:
|
| 7 |
def __init__(self, milvus_uri, collection_name, create_collection, dim=128):
|
| 8 |
+
self.client = MilvusClient(uri=milvus_uri)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
self.collection_name = collection_name
|
| 10 |
+
if self.client.has_collection(collection_name=self.collection_name):
|
| 11 |
+
self.client.load_collection(collection_name)
|
| 12 |
self.dim = dim
|
| 13 |
|
| 14 |
+
if create_collection:
|
|
|
|
|
|
|
|
|
|
| 15 |
self.create_collection()
|
| 16 |
self.create_index()
|
| 17 |
|
| 18 |
+
|
| 19 |
def create_collection(self):
|
| 20 |
if self.client.has_collection(collection_name=self.collection_name):
|
| 21 |
+
self.client.drop_collection(collection_name=self.collection_name)
|
|
|
|
|
|
|
| 22 |
schema = self.client.create_schema(
|
| 23 |
auto_id=True,
|
| 24 |
enable_dynamic_fields=True,
|
|
|
|
| 36 |
)
|
| 37 |
|
| 38 |
def create_index(self):
|
| 39 |
+
self.client.release_collection(collection_name=self.collection_name)
|
| 40 |
+
self.client.drop_index(
|
| 41 |
+
collection_name=self.collection_name, index_name="vector"
|
| 42 |
+
)
|
| 43 |
index_params = self.client.prepare_index_params()
|
|
|
|
| 44 |
index_params.add_index(
|
| 45 |
field_name="vector",
|
| 46 |
index_name="vector_index",
|
| 47 |
+
index_type="FLAT",
|
| 48 |
+
metric_type="IP",
|
| 49 |
params={
|
| 50 |
+
"M": 16,
|
| 51 |
+
"efConstruction": 500,
|
| 52 |
},
|
| 53 |
)
|
| 54 |
|
|
|
|
| 56 |
collection_name=self.collection_name, index_params=index_params, sync=True
|
| 57 |
)
|
| 58 |
|
| 59 |
+
def create_scalar_index(self):
    """Build an INVERTED scalar index on ``doc_id`` for filtered queries.

    The collection must be released from memory before index creation,
    hence the leading release call.
    """
    self.client.release_collection(collection_name=self.collection_name)

    params = self.client.prepare_index_params()
    params.add_index(
        field_name="doc_id",
        index_name="int32_index",
        index_type="INVERTED",
    )
    self.client.create_index(
        collection_name=self.collection_name, index_params=params, sync=True
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
+
def search(self, data, topk):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
search_params = {"metric_type": "IP", "params": {}}
|
| 75 |
results = self.client.search(
|
| 76 |
self.collection_name,
|
| 77 |
data,
|
| 78 |
+
limit=int(50),
|
| 79 |
output_fields=["vector", "seq_id", "doc_id"],
|
| 80 |
search_params=search_params,
|
| 81 |
)
|
| 82 |
+
doc_ids = set()
|
| 83 |
+
for r_id in range(len(results)):
|
| 84 |
+
for r in range(len(results[r_id])):
|
| 85 |
+
doc_ids.add(results[r_id][r]["entity"]["doc_id"])
|
| 86 |
|
| 87 |
scores = []
|
| 88 |
|
|
|
|
| 94 |
limit=1000,
|
| 95 |
)
|
| 96 |
doc_vecs = np.vstack(
|
| 97 |
+
[doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))]
|
| 98 |
)
|
| 99 |
score = np.dot(data, doc_vecs.T).max(1).sum()
|
| 100 |
+
return (score, doc_id)
|
| 101 |
|
| 102 |
with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
|
| 103 |
futures = {
|
|
|
|
| 111 |
scores.append((score, doc_id))
|
| 112 |
|
| 113 |
scores.sort(key=lambda x: x[0], reverse=True)
|
| 114 |
+
|
| 115 |
+
# π DETAILED SCORE LOGGING - Print page numbers with highest scores
|
| 116 |
+
print("\n" + "="*80)
|
| 117 |
+
print("π RETRIEVAL SCORES - PAGE NUMBERS WITH HIGHEST SCORES")
|
| 118 |
+
print("="*80)
|
| 119 |
+
print(f"π Collection: {self.collection_name}")
|
| 120 |
+
print(f"π Total documents found: {len(scores)}")
|
| 121 |
+
print(f"π― Requested top-k: {topk}")
|
| 122 |
+
print("-"*80)
|
| 123 |
+
|
| 124 |
+
# Display top 10 scores with detailed information
|
| 125 |
+
display_count = min(10, len(scores))
|
| 126 |
+
for i, (score, doc_id) in enumerate(scores[:display_count]):
|
| 127 |
+
page_num = doc_id + 1 # Convert doc_id to page number (0-based to 1-based)
|
| 128 |
+
relevance_level = self._get_relevance_level(score)
|
| 129 |
+
print(f"π Page {page_num:2d} (doc_id: {doc_id:2d}) | Score: {score:8.4f} | {relevance_level}")
|
| 130 |
+
|
| 131 |
+
if len(scores) > display_count:
|
| 132 |
+
print(f"... and {len(scores) - display_count} more results")
|
| 133 |
+
|
| 134 |
+
print("-"*80)
|
| 135 |
+
print(f"π HIGHEST SCORING PAGES:")
|
| 136 |
+
top_3 = scores[:3]
|
| 137 |
+
for i, (score, doc_id) in enumerate(top_3, 1):
|
| 138 |
+
page_num = doc_id + 1
|
| 139 |
+
print(f" {i}. Page {page_num} - Score: {score:.4f}")
|
| 140 |
+
|
| 141 |
+
print("="*80 + "\n")
|
| 142 |
+
|
| 143 |
+
if len(scores) >= topk:
|
| 144 |
+
return scores[:topk]
|
| 145 |
+
else:
|
| 146 |
+
return scores
|
| 147 |
+
|
| 148 |
+
def _get_relevance_level(self, score):
|
| 149 |
+
"""Get human-readable relevance level based on score"""
|
| 150 |
+
if score >= 0.90:
|
| 151 |
+
return "π’ EXCELLENT - Highly relevant"
|
| 152 |
+
elif score >= 0.80:
|
| 153 |
+
return "π‘ VERY GOOD - Very relevant"
|
| 154 |
+
elif score >= 0.70:
|
| 155 |
+
return "π GOOD - Relevant"
|
| 156 |
+
elif score >= 0.60:
|
| 157 |
+
return "π΅ MODERATE - Somewhat relevant"
|
| 158 |
+
elif score >= 0.50:
|
| 159 |
+
return "π£ BASIC - Minimally relevant"
|
| 160 |
+
else:
|
| 161 |
+
return "π΄ POOR - Not relevant"
|
| 162 |
|
| 163 |
def insert(self, data):
|
| 164 |
+
colbert_vecs = [vec for vec in data["colbert_vecs"]]
|
| 165 |
seq_length = len(colbert_vecs)
|
| 166 |
+
doc_ids = [data["doc_id"] for i in range(seq_length)]
|
| 167 |
seq_ids = list(range(seq_length))
|
| 168 |
docs = [""] * seq_length
|
| 169 |
docs[0] = data["filepath"]
|
|
|
|
| 181 |
],
|
| 182 |
)
|
| 183 |
|
| 184 |
+
|
| 185 |
+
def get_images_as_doc(self, images_with_vectors: list):
    """Turn embedded page images into insert-ready records.

    Each record carries the page's ColBERT vectors, a sequential doc_id
    (the list index), and the source image filepath.
    """
    return [
        {
            "colbert_vecs": item["colbert_vecs"],
            "doc_id": idx,
            "filepath": item["filepath"],
        }
        for idx, item in enumerate(images_with_vectors)
    ]
|
| 198 |
+
|
| 199 |
|
| 200 |
def insert_images_data(self, image_data):
    """Convert embedded images to records and insert each one."""
    for record in self.get_images_as_doc(image_data):
        self.insert(record)
|
| 205 |
+
|
| 206 |
+
def drop_collection(self):
    """Drop the current collection from Milvus.

    Returns:
        True when the collection existed and was dropped; False when it
        was absent or any client error occurred.
    """
    try:
        # Guard clause: nothing to do when the collection is absent.
        if not self.client.has_collection(collection_name=self.collection_name):
            print(f"β οΈ Collection {self.collection_name} does not exist in Milvus")
            return False
        self.client.drop_collection(collection_name=self.collection_name)
        print(f"ποΈ Dropped Milvus collection: {self.collection_name}")
        return True
    except Exception as e:
        print(f"β Error dropping collection {self.collection_name}: {e}")
        return False
|
pdf_manager.py
CHANGED
|
@@ -4,7 +4,21 @@ import shutil
|
|
| 4 |
|
| 5 |
class PdfManager:
|
| 6 |
def __init__(self):
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def clear_and_recreate_dir(self, output_folder):
|
| 10 |
|
|
@@ -19,7 +33,8 @@ class PdfManager:
|
|
| 19 |
#print("Clearing is unused for now for persistency")
|
| 20 |
|
| 21 |
def save_images(self, id, pdf_path, max_pages, pages: list[int] = None) -> list[str]:
|
| 22 |
-
|
|
|
|
| 23 |
images = convert_from_path(pdf_path)
|
| 24 |
|
| 25 |
print(f"Saving images from {pdf_path} to {output_folder}. Max pages: {max_pages}")
|
|
@@ -35,7 +50,7 @@ class PdfManager:
|
|
| 35 |
if pages and i not in pages:
|
| 36 |
continue
|
| 37 |
|
| 38 |
-
full_save_path = f"
|
| 39 |
|
| 40 |
#print(f"Saving image to {full_save_path}")
|
| 41 |
|
|
@@ -43,4 +58,4 @@ class PdfManager:
|
|
| 43 |
|
| 44 |
num_page_processed += 1
|
| 45 |
|
| 46 |
-
return [f"
|
|
|
|
| 4 |
|
| 5 |
class PdfManager:
|
| 6 |
def __init__(self):
    """Pick a writable base directory for extracted PDF page images.

    Prefers /tmp/pages (the writable location on Hugging Face Spaces);
    falls back to a "pages" directory next to this source file.
    """
    # Directory containing this module, used only for the fallback path.
    app_dir = os.path.dirname(os.path.abspath(__file__))

    if os.path.exists("/tmp") and os.access("/tmp", os.W_OK):
        self.base_output_dir = "/tmp/pages"
        print(f"β Using /tmp directory for Hugging Face Spaces: {self.base_output_dir}")
    else:
        self.base_output_dir = os.path.join(app_dir, "pages")
        print(f"β Using relative path: {self.base_output_dir}")

    # Make sure the base directory exists before any pages are saved.
    os.makedirs(self.base_output_dir, exist_ok=True)
|
| 22 |
|
| 23 |
def clear_and_recreate_dir(self, output_folder):
|
| 24 |
|
|
|
|
| 33 |
#print("Clearing is unused for now for persistency")
|
| 34 |
|
| 35 |
def save_images(self, id, pdf_path, max_pages, pages: list[int] = None) -> list[str]:
|
| 36 |
+
# Use absolute path for Hugging Face Spaces compatibility
|
| 37 |
+
output_folder = os.path.join(self.base_output_dir, id)
|
| 38 |
images = convert_from_path(pdf_path)
|
| 39 |
|
| 40 |
print(f"Saving images from {pdf_path} to {output_folder}. Max pages: {max_pages}")
|
|
|
|
| 50 |
if pages and i not in pages:
|
| 51 |
continue
|
| 52 |
|
| 53 |
+
full_save_path = os.path.join(output_folder, f"page_{i + 1}.png")
|
| 54 |
|
| 55 |
#print(f"Saving image to {full_save_path}")
|
| 56 |
|
|
|
|
| 58 |
|
| 59 |
num_page_processed += 1
|
| 60 |
|
| 61 |
+
return [os.path.join(output_folder, f"page_{i + 1}.png") for i in range(num_page_processed)]
|
rag.py
CHANGED
|
@@ -5,7 +5,7 @@ import re
|
|
| 5 |
from typing import List
|
| 6 |
from utils import encode_image
|
| 7 |
from PIL import Image
|
| 8 |
-
from
|
| 9 |
import torch
|
| 10 |
import subprocess
|
| 11 |
import psutil
|
|
@@ -64,30 +64,28 @@ class Rag:
|
|
| 64 |
|
| 65 |
return response_text
|
| 66 |
|
| 67 |
-
def get_answer_from_gemini(self, query
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
| 69 |
try:
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
pass
|
| 85 |
-
|
| 86 |
-
chat_session = model.start_chat()
|
| 87 |
-
response = chat_session.send_message([*images, query])
|
| 88 |
-
return response.text
|
| 89 |
except Exception as e:
|
| 90 |
-
print(f"
|
| 91 |
return f"Error: {str(e)}"
|
| 92 |
|
| 93 |
#os.environ['OPENAI_API_KEY'] = "for the love of Jesus let this work"
|
|
@@ -100,13 +98,160 @@ class Rag:
|
|
| 100 |
dotenv_file = dotenv.find_dotenv()
|
| 101 |
dotenv.load_dotenv(dotenv_file)
|
| 102 |
|
| 103 |
-
#
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
except Exception as e:
|
| 109 |
-
print(f"
|
| 110 |
return None
|
| 111 |
|
| 112 |
|
|
|
|
| 5 |
from typing import List
|
| 6 |
from utils import encode_image
|
| 7 |
from PIL import Image
|
| 8 |
+
from ollama import chat
|
| 9 |
import torch
|
| 10 |
import subprocess
|
| 11 |
import psutil
|
|
|
|
| 64 |
|
| 65 |
return response_text
|
| 66 |
|
| 67 |
+
def get_answer_from_gemini(self, query, imagePaths):
    """Ask Gemini to answer *query* given the page images at *imagePaths*.

    Args:
        query: the user's question (plain text).
        imagePaths: list of image file paths to attach to the request.

    Returns:
        The model's answer text, or an "Error: ..." string on any failure
        (the original contract: exceptions are caught, never propagated).
    """
    print(f"Querying Gemini for query={query}, imagePaths={imagePaths}")

    try:
        # SECURITY FIX: the API key was hard-coded in source; read it from
        # the environment instead so the secret never lives in the repo.
        api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
        if not api_key:
            raise RuntimeError("Set GEMINI_API_KEY (or GOOGLE_API_KEY) in the environment")
        client = genai.Client(api_key=api_key)

        images = [Image.open(path) for path in imagePaths]

        # BUG FIX: pass a flat list of parts (images then the query);
        # the previous code nested the image list inside `contents`.
        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=[*images, query],
        )

        print(response.text)
        return response.text

    except Exception as e:
        print(f"An error occurred while querying Gemini: {e}")
        return f"Error: {str(e)}"
|
| 90 |
|
| 91 |
#os.environ['OPENAI_API_KEY'] = "for the love of Jesus let this work"
|
|
|
|
| 98 |
dotenv_file = dotenv.find_dotenv()
|
| 99 |
dotenv.load_dotenv(dotenv_file)
|
| 100 |
|
| 101 |
+
#ollama method below
|
| 102 |
+
|
| 103 |
+
torch.cuda.empty_cache() #release cuda so that ollama can use gpu!
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
os.environ['OLLAMA_FLASH_ATTENTION'] = os.environ['flashattn'] #int "1"
|
| 107 |
+
if os.environ['ollama'] == "minicpm-v":
|
| 108 |
+
os.environ['ollama'] = "minicpm-v:8b-2.6-q8_0" #set to quantized version
|
| 109 |
+
elif os.environ['ollama'] == "gemma3":
|
| 110 |
+
os.environ['ollama'] = "gemma3:12b" #set to upscaled version 12b when needed
|
| 111 |
+
# Add specific environment variables for Gemma3 to prevent raw token issues
|
| 112 |
+
os.environ['OLLAMA_KEEP_ALIVE'] = "5m"
|
| 113 |
+
os.environ['OLLAMA_ORIGINS'] = "*"
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# Close model thread (colpali)
|
| 117 |
+
print(f"Querying OpenAI for query={query}, imagesPaths={imagesPaths}")
|
| 118 |
+
|
| 119 |
+
try:
|
| 120 |
+
|
| 121 |
+
# Enhanced prompt for more detailed responses with explicit page usage
|
| 122 |
+
enhanced_query = f"""
|
| 123 |
+
Please provide a comprehensive and detailed answer to the following query.
|
| 124 |
+
Use ALL available information from the provided document images to give a thorough response.
|
| 125 |
+
|
| 126 |
+
Query: {query}
|
| 127 |
+
|
| 128 |
+
CRITICAL INSTRUCTIONS:
|
| 129 |
+
- You have been provided with {len(imagesPaths)} document page(s)
|
| 130 |
+
- You MUST reference information from ALL {len(imagesPaths)} page(s) in your response
|
| 131 |
+
- Do not skip any pages - each page contains relevant information
|
| 132 |
+
- If you mention one page, you must also mention the others
|
| 133 |
+
- Ensure your response reflects the complete information from all pages
|
| 134 |
+
|
| 135 |
+
Instructions for detailed response:
|
| 136 |
+
1. Provide extensive background information and context
|
| 137 |
+
2. Include specific details, examples, and data points from ALL documents
|
| 138 |
+
3. Explain concepts thoroughly with step-by-step breakdowns
|
| 139 |
+
4. Provide comprehensive analysis rather than simple answers when requested
|
| 140 |
+
5. Explicitly reference each page and what information it contributes
|
| 141 |
+
6. Cross-reference information between pages when relevant
|
| 142 |
+
7. Ensure no page is left unmentioned in your analysis
|
| 143 |
+
|
| 144 |
+
SPECIAL INSTRUCTIONS FOR TABULAR DATA:
|
| 145 |
+
- If the query requests a table, list, or structured data, organize your response in a clear, structured format
|
| 146 |
+
- Use numbered lists, bullet points, or clear categories when appropriate
|
| 147 |
+
- Include specific data points or comparisons when available
|
| 148 |
+
- Structure information in a way that can be easily converted to a table format
|
| 149 |
+
|
| 150 |
+
IMPORTANT: Respond with natural, human-readable text only. Do not include any special tokens, codes, or technical identifiers in your response.
|
| 151 |
+
|
| 152 |
+
Make sure to acknowledge and use information from all {len(imagesPaths)} provided pages.
|
| 153 |
+
"""
|
| 154 |
+
|
| 155 |
+
# Try with current model first
|
| 156 |
+
current_model = os.environ['ollama']
|
| 157 |
+
|
| 158 |
+
# Set different options based on the model
|
| 159 |
+
if "gemma3" in current_model.lower():
|
| 160 |
+
# Specific options for Gemma3 to prevent raw token issues
|
| 161 |
+
model_options = {
|
| 162 |
+
"num_predict": 1024, # Shorter responses for Gemma3
|
| 163 |
+
"stop": ["<eos>", "<|endoftext|>", "</s>", "<|im_end|>"], # More stop tokens
|
| 164 |
+
"top_k": 20, # Lower top_k for more focused generation
|
| 165 |
+
"top_p": 0.8, # Lower top_p for more deterministic output
|
| 166 |
+
"repeat_penalty": 1.2, # Higher repeat penalty
|
| 167 |
+
"seed": 42, # Consistent results
|
| 168 |
+
"temperature": 0.7, # Lower temperature for more focused responses
|
| 169 |
+
}
|
| 170 |
+
else:
|
| 171 |
+
# Default options for other models
|
| 172 |
+
model_options = {
|
| 173 |
+
"num_predict": 2048, # Limit response length
|
| 174 |
+
"stop": ["<eos>", "<|endoftext|>", "</s>"], # Stop at end tokens
|
| 175 |
+
"top_k": 40, # Reduce randomness
|
| 176 |
+
"top_p": 0.9, # Nucleus sampling
|
| 177 |
+
"repeat_penalty": 1.1, # Prevent repetition
|
| 178 |
+
"seed": 42, # Consistent results
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
response = chat(
|
| 182 |
+
model=current_model,
|
| 183 |
+
messages=[
|
| 184 |
+
{
|
| 185 |
+
'role': 'user',
|
| 186 |
+
'content': enhanced_query,
|
| 187 |
+
'images': imagesPaths,
|
| 188 |
+
"temperature":float(os.environ['temperature']), #test if temp makes a diff
|
| 189 |
+
}
|
| 190 |
+
],
|
| 191 |
+
options=model_options
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
+
answer = response.message.content
|
| 195 |
+
|
| 196 |
+
# Clean the response to handle raw token issues
|
| 197 |
+
cleaned_answer = self._clean_raw_token_response(answer)
|
| 198 |
+
|
| 199 |
+
# If the cleaned answer is still problematic, try fallback models
|
| 200 |
+
if cleaned_answer and "β **Model Response Error**" in cleaned_answer:
|
| 201 |
+
print(f"β οΈ Primary model {current_model} failed, trying fallback models...")
|
| 202 |
+
|
| 203 |
+
# List of fallback models to try
|
| 204 |
+
fallback_models = [
|
| 205 |
+
"llama3.2-vision:latest",
|
| 206 |
+
"llava:latest",
|
| 207 |
+
"bakllava:latest",
|
| 208 |
+
"llama3.2:latest"
|
| 209 |
+
]
|
| 210 |
+
|
| 211 |
+
for fallback_model in fallback_models:
|
| 212 |
+
try:
|
| 213 |
+
print(f"π Trying fallback model: {fallback_model}")
|
| 214 |
+
response = chat(
|
| 215 |
+
model=fallback_model,
|
| 216 |
+
messages=[
|
| 217 |
+
{
|
| 218 |
+
'role': 'user',
|
| 219 |
+
'content': enhanced_query,
|
| 220 |
+
'images': imagesPaths,
|
| 221 |
+
"temperature":float(os.environ['temperature']),
|
| 222 |
+
}
|
| 223 |
+
],
|
| 224 |
+
options={
|
| 225 |
+
"num_predict": 2048,
|
| 226 |
+
"stop": ["<eos>", "<|endoftext|>", "</s>"],
|
| 227 |
+
"top_k": 40,
|
| 228 |
+
"top_p": 0.9,
|
| 229 |
+
"repeat_penalty": 1.1,
|
| 230 |
+
"seed": 42,
|
| 231 |
+
}
|
| 232 |
+
)
|
| 233 |
+
|
| 234 |
+
fallback_answer = response.message.content
|
| 235 |
+
cleaned_fallback = self._clean_raw_token_response(fallback_answer)
|
| 236 |
+
|
| 237 |
+
if cleaned_fallback and "β **Model Response Error**" not in cleaned_fallback:
|
| 238 |
+
print(f"β
Fallback model {fallback_model} succeeded")
|
| 239 |
+
return cleaned_fallback
|
| 240 |
+
|
| 241 |
+
except Exception as fallback_error:
|
| 242 |
+
print(f"β Fallback model {fallback_model} failed: {fallback_error}")
|
| 243 |
+
continue
|
| 244 |
+
|
| 245 |
+
# If all fallbacks fail, return the original error
|
| 246 |
+
return cleaned_answer
|
| 247 |
+
|
| 248 |
+
print(f"Original response: {answer}")
|
| 249 |
+
print(f"Cleaned response: {cleaned_answer}")
|
| 250 |
+
|
| 251 |
+
return cleaned_answer
|
| 252 |
+
|
| 253 |
except Exception as e:
|
| 254 |
+
print(f"An error occurred while querying OpenAI: {e}")
|
| 255 |
return None
|
| 256 |
|
| 257 |
|