Spaces:
Runtime error
Runtime error
logging
Browse files- app.py +50 -37
- score_utilizer.py +7 -7
app.py
CHANGED
|
@@ -538,8 +538,9 @@ class PDFSearchApp:
|
|
| 538 |
return str(e)
|
| 539 |
|
| 540 |
|
| 541 |
-
def search_documents(self, query
|
| 542 |
print(f"Searching for query: {query}")
|
|
|
|
| 543 |
|
| 544 |
if not query:
|
| 545 |
print("Please enter a search query")
|
|
@@ -594,22 +595,15 @@ class PDFSearchApp:
|
|
| 594 |
|
| 595 |
middleware = Middleware(collection_name, create_collection=False)
|
| 596 |
|
| 597 |
-
#
|
| 598 |
-
# Get more results than
|
| 599 |
-
|
| 600 |
-
search_results = middleware.search([query], topk=max(num_results * 3, 20))[0]
|
| 601 |
|
| 602 |
-
#
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
if optimal_count != num_results:
|
| 608 |
-
print(f"\n🎯 DYNAMIC OPTIMIZATION APPLIED:")
|
| 609 |
-
print(f" Requested pages: {num_results}")
|
| 610 |
-
print(f" Optimal pages: {optimal_count}")
|
| 611 |
-
print(f" Query complexity: {query_complexity}")
|
| 612 |
-
num_results = optimal_count
|
| 613 |
|
| 614 |
# π COMPREHENSIVE SEARCH RESULTS LOGGING
|
| 615 |
print(f"\nπ SEARCH RESULTS SUMMARY")
|
|
@@ -652,12 +646,12 @@ class PDFSearchApp:
|
|
| 652 |
if not search_results:
|
| 653 |
return "No search results found", "--", "No search results found for your query", [], None, None, None, None
|
| 654 |
|
| 655 |
-
#
|
| 656 |
-
selected_results = self.
|
| 657 |
|
| 658 |
# π SELECTION LOGGING - Show which pages were selected
|
| 659 |
print(f"\n🎯 PAGE SELECTION RESULTS")
|
| 660 |
-
print(f"π
|
| 661 |
print(f"π Selected: {len(selected_results)} pages")
|
| 662 |
print(f"π Selection rate: {len(selected_results)/len(search_results)*100:.1f}% of available results")
|
| 663 |
print("-" * 60)
|
|
@@ -814,23 +808,47 @@ class PDFSearchApp:
|
|
| 814 |
# Return exactly 7 outputs to match Gradio expectations
|
| 815 |
return error_msg, "--", error_msg, [], None, None, None, None
|
| 816 |
|
| 817 |
-
def
|
| 818 |
"""
|
| 819 |
-
|
| 820 |
-
|
| 821 |
"""
|
| 822 |
-
if
|
| 823 |
-
return
|
| 824 |
|
| 825 |
-
# Sort by relevance score
|
| 826 |
sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
|
| 827 |
|
| 828 |
-
#
|
| 829 |
-
|
| 830 |
|
| 831 |
-
print(f"
|
|
|
|
|
|
|
| 832 |
|
| 833 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 834 |
|
| 835 |
def _select_highest_scoring_pages(self, sorted_results, query, num_results):
|
| 836 |
"""
|
|
@@ -3436,13 +3454,8 @@ def create_ui():
|
|
| 3436 |
placeholder="Ask about any topic in your documents...",
|
| 3437 |
lines=2
|
| 3438 |
)
|
| 3439 |
-
|
| 3440 |
-
|
| 3441 |
-
maximum=10,
|
| 3442 |
-
value=3,
|
| 3443 |
-
step=1,
|
| 3444 |
-
label="Number of pages to retrieve and cite"
|
| 3445 |
-
)
|
| 3446 |
search_btn = gr.Button("Search Documents", variant="primary")
|
| 3447 |
|
| 3448 |
gr.Markdown("### Results")
|
|
@@ -3502,7 +3515,7 @@ def create_ui():
|
|
| 3502 |
# Query events
|
| 3503 |
search_btn.click(
|
| 3504 |
fn=app.search_documents,
|
| 3505 |
-
inputs=[query_input
|
| 3506 |
outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
|
| 3507 |
)
|
| 3508 |
|
|
|
|
| 538 |
return str(e)
|
| 539 |
|
| 540 |
|
| 541 |
+
def search_documents(self, query):
|
| 542 |
print(f"Searching for query: {query}")
|
| 543 |
+
print(f"🎯 MODE: Returning only TOP 3 highest-scoring pages")
|
| 544 |
|
| 545 |
if not query:
|
| 546 |
print("Please enter a search query")
|
|
|
|
| 595 |
|
| 596 |
middleware = Middleware(collection_name, create_collection=False)
|
| 597 |
|
| 598 |
+
# 🎯 TOP 3 PAGES MODE: Always return only the top 3 highest-scoring pages
|
| 599 |
+
# Get more results than needed to allow for intelligent filtering
|
| 600 |
+
search_results = middleware.search([query], topk=20)[0] # Get 20 results for better selection
|
|
|
|
| 601 |
|
| 602 |
+
# Fixed to always return top 3 pages
|
| 603 |
+
num_results = 3
|
| 604 |
+
print(f"\n🎯 TOP 3 PAGES MODE:")
|
| 605 |
+
print(f" Always returning: {num_results} highest-scoring pages")
|
| 606 |
+
print(f" Selection strategy: Score-based prioritization")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 607 |
|
| 608 |
# π COMPREHENSIVE SEARCH RESULTS LOGGING
|
| 609 |
print(f"\nπ SEARCH RESULTS SUMMARY")
|
|
|
|
| 646 |
if not search_results:
|
| 647 |
return "No search results found", "--", "No search results found for your query", [], None, None, None, None
|
| 648 |
|
| 649 |
+
# 🎯 TOP 3 SELECTION: Always select the top 3 highest-scoring pages (fewer if fewer results are available)
|
| 650 |
+
selected_results = self._select_top_3_pages(search_results, query)
|
| 651 |
|
| 652 |
# π SELECTION LOGGING - Show which pages were selected
|
| 653 |
print(f"\n🎯 PAGE SELECTION RESULTS")
|
| 654 |
+
print(f"π Mode: Top 3 highest-scoring pages")
|
| 655 |
print(f"π Selected: {len(selected_results)} pages")
|
| 656 |
print(f"π Selection rate: {len(selected_results)/len(search_results)*100:.1f}% of available results")
|
| 657 |
print("-" * 60)
|
|
|
|
| 808 |
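# Return one value per Gradio output component (the click handler lists 7 outputs; NOTE: the tuple below carries 8 values - confirm which count is intended)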
|
| 809 |
return error_msg, "--", error_msg, [], None, None, None, None
|
| 810 |
|
| 811 |
+
def _select_top_3_pages(self, search_results, query):
|
| 812 |
"""
|
| 813 |
+
Select exactly the top 3 highest-scoring pages
|
| 814 |
+
Simplified selection focused on the best 3 pages only
|
| 815 |
"""
|
| 816 |
+
if not search_results:
|
| 817 |
+
return []
|
| 818 |
|
| 819 |
+
# Sort by relevance score (highest first)
|
| 820 |
sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
|
| 821 |
|
| 822 |
+
# Always return exactly the top 3 pages
|
| 823 |
+
top_3 = sorted_results[:3]
|
| 824 |
|
| 825 |
+
print(f"\nπ― TOP 3 PAGES SELECTION:")
|
| 826 |
+
print(f"π Total available results: {len(search_results)}")
|
| 827 |
+
print(f"π― Selected: Top 3 highest-scoring pages")
|
| 828 |
|
| 829 |
+
# Log the selected pages with scores
|
| 830 |
+
for i, (score, doc_id) in enumerate(top_3, 1):
|
| 831 |
+
page_num = doc_id + 1
|
| 832 |
+
relevance_level = self._get_relevance_level(score)
|
| 833 |
+
print(f" {i}. Page {page_num:2d} (doc_id: {doc_id:2d}) | Score: {score:8.4f} | {relevance_level}")
|
| 834 |
+
|
| 835 |
+
# Calculate selection quality metrics
|
| 836 |
+
if top_3:
|
| 837 |
+
scores = [result[0] for result in top_3]
|
| 838 |
+
avg_score = sum(scores) / len(scores)
|
| 839 |
+
print(f"\nπ TOP 3 SELECTION QUALITY:")
|
| 840 |
+
print(f" Average score: {avg_score:.4f}")
|
| 841 |
+
print(f" Highest score: {scores[0]:.4f}")
|
| 842 |
+
print(f" Lowest score: {scores[-1]:.4f}")
|
| 843 |
+
print(f" Score range: {scores[0] - scores[-1]:.4f}")
|
| 844 |
+
|
| 845 |
+
return top_3
|
| 846 |
+
|
| 847 |
+
def _select_relevant_pages_new_format(self, search_results, query, num_results):
|
| 848 |
+
"""
|
| 849 |
+
Legacy function - kept for compatibility but now redirects to top 3 selection
|
| 850 |
+
"""
|
| 851 |
+
return self._select_top_3_pages(search_results, query)
|
| 852 |
|
| 853 |
def _select_highest_scoring_pages(self, sorted_results, query, num_results):
|
| 854 |
"""
|
|
|
|
| 3454 |
placeholder="Ask about any topic in your documents...",
|
| 3455 |
lines=2
|
| 3456 |
)
|
| 3457 |
+
# Removed number of pages input - always returns top 3 pages
|
| 3458 |
+
gr.Markdown("🎯 **Top 3 Pages Mode**: System automatically returns the 3 highest-scoring pages")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3459 |
search_btn = gr.Button("Search Documents", variant="primary")
|
| 3460 |
|
| 3461 |
gr.Markdown("### Results")
|
|
|
|
| 3515 |
# Query events
|
| 3516 |
search_btn.click(
|
| 3517 |
fn=app.search_documents,
|
| 3518 |
+
inputs=[query_input],
|
| 3519 |
outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
|
| 3520 |
)
|
| 3521 |
|
score_utilizer.py
CHANGED
|
@@ -154,7 +154,7 @@ class ScoreUtilizer:
|
|
| 154 |
|
| 155 |
return stats
|
| 156 |
|
| 157 |
-
def get_highest_scoring_pages(self, parsed_data: Dict, count: int =
|
| 158 |
"""
|
| 159 |
Get the highest-scoring pages from parsed data
|
| 160 |
|
|
@@ -227,8 +227,8 @@ class ScoreUtilizer:
|
|
| 227 |
report.append("=" * 60)
|
| 228 |
|
| 229 |
# Top pages summary
|
| 230 |
-
top_pages = self.get_highest_scoring_pages(parsed_data,
|
| 231 |
-
report.append(f"\nπ TOP
|
| 232 |
for i, page in enumerate(top_pages, 1):
|
| 233 |
report.append(f" {i}. Page {page['page_number']} - Score: {page['score']:.4f} ({page['relevance_level']})")
|
| 234 |
|
|
@@ -253,10 +253,10 @@ class ScoreUtilizer:
|
|
| 253 |
|
| 254 |
# Usage suggestions
|
| 255 |
report.append(f"\n💡 USAGE SUGGESTIONS:")
|
| 256 |
-
report.append(f" 1.
|
| 257 |
-
report.append(f" 2.
|
| 258 |
-
report.append(f" 3.
|
| 259 |
-
report.append(f" 4.
|
| 260 |
|
| 261 |
report.append("=" * 60)
|
| 262 |
|
|
|
|
| 154 |
|
| 155 |
return stats
|
| 156 |
|
| 157 |
+
def get_highest_scoring_pages(self, parsed_data: Dict, count: int = 3) -> List[Dict]:
|
| 158 |
"""
|
| 159 |
Get the highest-scoring pages from parsed data
|
| 160 |
|
|
|
|
| 227 |
report.append("=" * 60)
|
| 228 |
|
| 229 |
# Top pages summary
|
| 230 |
+
top_pages = self.get_highest_scoring_pages(parsed_data, 3)
|
| 231 |
+
report.append(f"\nπ TOP 3 HIGHEST-SCORING PAGES:")
|
| 232 |
for i, page in enumerate(top_pages, 1):
|
| 233 |
report.append(f" {i}. Page {page['page_number']} - Score: {page['score']:.4f} ({page['relevance_level']})")
|
| 234 |
|
|
|
|
| 253 |
|
| 254 |
# Usage suggestions
|
| 255 |
report.append(f"\n💡 USAGE SUGGESTIONS:")
|
| 256 |
+
report.append(f" 1. System automatically uses top 3 pages for RAG responses")
|
| 257 |
+
report.append(f" 2. Excellent pages provide primary context")
|
| 258 |
+
report.append(f" 3. Very good pages ensure comprehensive coverage")
|
| 259 |
+
report.append(f" 4. Top 3 selection optimizes response quality")
|
| 260 |
|
| 261 |
report.append("=" * 60)
|
| 262 |
|