Spaces:
Running
Running
Commit
·
36af225
1
Parent(s):
9d3a49a
Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -31,6 +31,8 @@ CONFIG = {
|
|
| 31 |
'MAX_AUDIO_DURATION': 120, # 2 minutes
|
| 32 |
'GROQ_API_KEY': os.getenv('GAPI'),
|
| 33 |
'AUDIO_CLIP_DURATION': 10, # First 10 seconds only
|
|
|
|
|
|
|
| 34 |
}
|
| 35 |
|
| 36 |
# Global session storage
|
|
@@ -429,6 +431,190 @@ def reset_session():
|
|
| 429 |
})
|
| 430 |
return "✅ नया सत्र शुरू किया गया!", "", "", gr.update(visible=False), "प्रश्न: 0/5"
|
| 431 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
# Create Gradio interface
|
| 433 |
def create_interface():
|
| 434 |
"""Create the Gradio interface"""
|
|
@@ -479,17 +665,53 @@ def create_interface():
|
|
| 479 |
interactive=False
|
| 480 |
)
|
| 481 |
|
| 482 |
-
# Document upload section
|
| 483 |
-
gr.Markdown("### 📁 Step 1:
|
| 484 |
-
gr.Markdown("**Note:** Please ensure your PDF contains selectable text (not scanned images)")
|
| 485 |
|
| 486 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
pdf_upload = gr.File(
|
| 488 |
label="Upload PDF / PDF अपलोड करें",
|
| 489 |
file_types=[".pdf"],
|
| 490 |
type="filepath"
|
| 491 |
)
|
| 492 |
-
|
| 493 |
|
| 494 |
doc_status = gr.Textbox(label="Processing Status / प्रसंस्करण स्थिति", interactive=False)
|
| 495 |
|
|
@@ -555,11 +777,21 @@ def create_interface():
|
|
| 555 |
outputs=[auth_section, main_section, auth_status]
|
| 556 |
)
|
| 557 |
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 563 |
|
| 564 |
ask_button.click(
|
| 565 |
process_query,
|
|
|
|
| 31 |
'MAX_AUDIO_DURATION': 120, # 2 minutes
|
| 32 |
'GROQ_API_KEY': os.getenv('GAPI'),
|
| 33 |
'AUDIO_CLIP_DURATION': 10, # First 10 seconds only
|
| 34 |
+
'BOOK_THUMBNAILS_DIR': './book_thumbnails',
|
| 35 |
+
'OCR_BOOKS_DIR': './ocr_books',
|
| 36 |
}
|
| 37 |
|
| 38 |
# Global session storage
|
|
|
|
| 431 |
})
|
| 432 |
return "✅ नया सत्र शुरू किया गया!", "", "", gr.update(visible=False), "प्रश्न: 0/5"
|
| 433 |
|
| 434 |
+
# Book management functions
|
| 435 |
+
def get_available_books():
    """Return the library's books, pairing each OCR text file with a thumbnail.

    Every ``.txt`` file in ``CONFIG['OCR_BOOKS_DIR']`` is one book. Its
    thumbnail is the image in ``CONFIG['BOOK_THUMBNAILS_DIR']`` whose base
    name equals the book's base name (case-insensitive); when none matches,
    ``create_text_placeholder`` is asked to generate one.

    Returns:
        A list of dicts with keys ``name`` (file base name), ``display_name``
        (underscores replaced by spaces, title-cased), ``text_file`` (path to
        the ``.txt`` file) and ``thumbnail`` (image path, or whatever
        ``create_text_placeholder`` returned). Books are ordered by file
        name. On any error the problem is printed and ``[]`` is returned.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

    try:
        thumbnail_dir = CONFIG['BOOK_THUMBNAILS_DIR']
        ocr_dir = CONFIG['OCR_BOOKS_DIR']

        # Index thumbnails by lower-cased base name once, instead of
        # rescanning the listing for every book. ``isdir`` (not ``exists``)
        # so a stray regular file at this path is treated as "no thumbnails"
        # rather than making ``os.listdir`` fail and emptying the library.
        thumbnails_by_name = {}
        if os.path.isdir(thumbnail_dir):
            for thumb_file in sorted(os.listdir(thumbnail_dir)):
                if thumb_file.lower().endswith(image_suffixes):
                    key = os.path.splitext(thumb_file)[0].lower()
                    # setdefault keeps the first match when several images
                    # share a base name (e.g. book.png and book.jpg).
                    thumbnails_by_name.setdefault(
                        key, os.path.join(thumbnail_dir, thumb_file)
                    )

        # Sorted so the gallery/dropdown order is stable across runs;
        # os.listdir makes no ordering guarantee.
        text_files = []
        if os.path.isdir(ocr_dir):
            text_files = sorted(
                f for f in os.listdir(ocr_dir) if f.lower().endswith('.txt')
            )

        books = []
        for text_file in text_files:
            book_name = os.path.splitext(text_file)[0]

            thumbnail_path = thumbnails_by_name.get(book_name.lower())
            if not thumbnail_path:
                # No image shipped for this book: fall back to a generated one.
                thumbnail_path = create_text_placeholder(book_name)

            books.append({
                'name': book_name,
                'display_name': book_name.replace('_', ' ').title(),
                'text_file': os.path.join(ocr_dir, text_file),
                'thumbnail': thumbnail_path,
            })

        return books

    except Exception as e:
        # Best-effort: the UI treats an empty list as "no books available".
        print(f"Error getting available books: {str(e)}")
        return []
|
| 487 |
+
|
| 488 |
+
def create_text_placeholder(book_name):
    """Render a simple cover image for a book that has no thumbnail.

    Draws the book name (underscores turned into line breaks) centred on a
    light-blue bordered rectangle and saves it as
    ``<tempdir>/<book_name>_placeholder.png``.

    Args:
        book_name: Base name of the book; used both as the label and in the
            output file name.

    Returns:
        Path of the written PNG, or ``None`` if anything failed (including
        matplotlib being unavailable); the error is printed.
    """
    try:
        # Imported lazily so the module still loads without matplotlib.
        import matplotlib.pyplot as plt
        import matplotlib.patches as patches

        fig, ax = plt.subplots(1, 1, figsize=(3, 4))
        try:
            ax.set_xlim(0, 1)
            ax.set_ylim(0, 1)
            ax.axis('off')

            # Background panel
            rect = patches.Rectangle(
                (0, 0), 1, 1,
                linewidth=2, edgecolor='#2E86AB', facecolor='#E8F4FD',
            )
            ax.add_patch(rect)

            # Book name, one word group per line
            ax.text(
                0.5, 0.5, book_name.replace('_', '\n'),
                ha='center', va='center',
                fontsize=10, weight='bold', color='#2E86AB',
            )

            placeholder_path = os.path.join(
                tempfile.gettempdir(), f"{book_name}_placeholder.png"
            )
            fig.savefig(placeholder_path, dpi=100, bbox_inches='tight')
        finally:
            # Close this specific figure even when drawing/saving fails;
            # pyplot keeps every unclosed figure alive for the whole process.
            plt.close(fig)

        return placeholder_path

    except Exception as e:
        print(f"Error creating placeholder: {str(e)}")
        return None
|
| 518 |
+
|
| 519 |
+
def load_book_text(book_info):
    """Read the full text of a library book.

    Args:
        book_info: Mapping with a ``'text_file'`` entry holding the path of
            the book's text file.

    Returns:
        The file's content on success. On failure a message string is
        returned instead of raising: ``"Error: Empty text file"`` when the
        file holds nothing but whitespace, or
        ``"Error loading book text: <details>"`` for any other problem.
    """
    try:
        # 'utf-8-sig' decodes plain UTF-8 as before but also drops a leading
        # byte-order mark, so it neither ends up in the book text nor makes a
        # BOM-only file look non-empty.
        with open(book_info['text_file'], 'r', encoding='utf-8-sig') as file:
            content = file.read()

        if not content.strip():
            return "Error: Empty text file"

        return content

    except Exception as e:
        return f"Error loading book text: {str(e)}"
|
| 532 |
+
|
| 533 |
+
def process_selected_book(selected_book_name):
    """Load a library book into the session and build its search index.

    Args:
        selected_book_name: Value from the book dropdown; compared against
            the ``name`` of each entry from ``get_available_books()``. A
            falsy value or the literal ``"None"`` means nothing was chosen.

    Returns:
        A 4-tuple ``(status_message, book_title, author_name, update)``.
        On success ``update`` is ``gr.update(visible=True)`` and the session
        (``SESSION_DATA``) holds the book's title, author, chunks, FAISS
        index and a query count reset to 0. On any failure the title and
        author are empty strings and ``update`` is
        ``gr.update(visible=False)``.
    """
    if not selected_book_name or selected_book_name == "None":
        return "कृपया एक पुस्तक चुनें।", "", "", gr.update(visible=False)

    # load_book_text reports failures as strings that begin with one of
    # these prefixes. Match the prefix only: searching for "Error" anywhere
    # in the text would reject every book that merely contains that word.
    load_error_prefixes = ("Error:", "Error loading book text:")

    try:
        selected_book = next(
            (book for book in get_available_books()
             if book['name'] == selected_book_name),
            None,
        )
        if not selected_book:
            return "चुनी गई पुस्तक नहीं मिली।", "", "", gr.update(visible=False)

        text_content = load_book_text(selected_book)
        if not text_content.strip() or text_content.startswith(load_error_prefixes):
            return text_content, "", "", gr.update(visible=False)

        author_name, book_title = extract_metadata(text_content)

        # These two literals are treated as "nothing found" markers
        # (presumably extract_metadata's defaults -- confirm against it);
        # replace them with values derived from the library entry.
        if author_name == "अज्ञात लेखक":
            author_name = "संग्रहित पुस्तक"
        if book_title == "अनाम पुस्तक":
            book_title = selected_book['display_name']

        SESSION_DATA['author_name'] = author_name
        SESSION_DATA['book_title'] = book_title

        chunks = chunk_text(text_content)
        SESSION_DATA['document_chunks'] = chunks

        print("Creating embeddings and search index for selected book...")
        SESSION_DATA['faiss_index'] = create_embeddings(chunks)

        # A newly loaded book starts with a fresh question counter.
        SESSION_DATA['query_count'] = 0

        word_count = len(text_content.split())
        char_count = len(text_content)

        success_msg = "\n".join([
            "✅ पुस्तक सफलतापूर्वक लोड की गई!",
            "",
            f"📖 पुस्तक: {book_title}",
            f"✍️ लेखक: {author_name}",
            f"📄 टेक्स्ट खंड: {len(chunks)}",
            f"📊 शब्द संख्या: {word_count:,}",
            f"📝 अक्षर संख्या: {char_count:,}",
            "",
            "अब आप प्रश्न पूछ सकते हैं।",
        ])

        return success_msg, book_title, author_name, gr.update(visible=True)

    except Exception as e:
        return f"पुस्तक लोड करने में त्रुटि: {str(e)}", "", "", gr.update(visible=False)
|
| 599 |
+
|
| 600 |
+
def create_book_gallery():
    """Build the gallery items and dropdown choices for the book library.

    Returns:
        ``(gallery_data, book_names)`` where ``gallery_data`` is a list of
        ``(thumbnail_path, display_name)`` tuples for books whose thumbnail
        file exists, and ``book_names`` is ``["None"]`` followed by the
        ``name`` of every available book.

        When the library is empty the return value is
        ``([], "कोई पुस्तक उपलब्ध नहीं है।")`` -- the second element is then
        a message string, not a list.
    """
    available_books = get_available_books()

    if not available_books:
        return [], "कोई पुस्तक उपलब्ध नहीं है।"

    gallery_data = [
        (book['thumbnail'], book['display_name'])
        for book in available_books
        if book['thumbnail'] and os.path.exists(book['thumbnail'])
    ]

    # Every book stays selectable even when it has no usable image; only the
    # gallery needs an existing thumbnail file. "None" is the no-selection
    # entry.
    book_names = ["None"] + [book['name'] for book in available_books]

    return gallery_data, book_names
|
| 617 |
+
|
| 618 |
# Create Gradio interface
|
| 619 |
def create_interface():
|
| 620 |
"""Create the Gradio interface"""
|
|
|
|
| 665 |
interactive=False
|
| 666 |
)
|
| 667 |
|
| 668 |
+
# Document selection/upload section
|
| 669 |
+
gr.Markdown("### 📁 Step 1: Choose Your Book / अपनी पुस्तक चुनें")
|
|
|
|
| 670 |
|
| 671 |
+
# Book selection section
|
| 672 |
+
with gr.Tab("📚 Select from Library / पुस्तकालय से चुनें"):
|
| 673 |
+
gr.Markdown("**Choose from available books / उपलब्ध पुस्तकों में से चुनें**")
|
| 674 |
+
|
| 675 |
+
# Initialize book gallery and dropdown
|
| 676 |
+
available_books = get_available_books()
|
| 677 |
+
gallery_data, book_options = create_book_gallery()
|
| 678 |
+
|
| 679 |
+
if available_books:
|
| 680 |
+
book_gallery = gr.Gallery(
|
| 681 |
+
value=gallery_data,
|
| 682 |
+
label="Available Books / उपलब्ध पुस्तकें",
|
| 683 |
+
show_label=True,
|
| 684 |
+
elem_id="book_gallery",
|
| 685 |
+
columns=3,
|
| 686 |
+
rows=2,
|
| 687 |
+
height="auto",
|
| 688 |
+
allow_preview=True
|
| 689 |
+
)
|
| 690 |
+
|
| 691 |
+
book_dropdown = gr.Dropdown(
|
| 692 |
+
choices=book_options,
|
| 693 |
+
label="Select Book / पुस्तक चुनें",
|
| 694 |
+
value="None",
|
| 695 |
+
interactive=True
|
| 696 |
+
)
|
| 697 |
+
|
| 698 |
+
select_book_btn = gr.Button("📖 Load Selected Book / चुनी गई पुस्तक लोड करें", variant="primary")
|
| 699 |
+
else:
|
| 700 |
+
gr.Markdown("⚠️ No books available in library / पुस्तकालय में कोई पुस्तक उपलब्ध नहीं है")
|
| 701 |
+
book_dropdown = gr.Dropdown(choices=["None"], value="None", visible=False)
|
| 702 |
+
select_book_btn = gr.Button("No books available", interactive=False)
|
| 703 |
+
|
| 704 |
+
# PDF upload section
|
| 705 |
+
with gr.Tab("📄 Upload PDF / PDF अपलोड करें"):
|
| 706 |
+
gr.Markdown("**Upload your own PDF / अपनी PDF अपलोड करें**")
|
| 707 |
+
gr.Markdown("**Note:** Please ensure your PDF contains selectable text (not scanned images)")
|
| 708 |
+
|
| 709 |
pdf_upload = gr.File(
|
| 710 |
label="Upload PDF / PDF अपलोड करें",
|
| 711 |
file_types=[".pdf"],
|
| 712 |
type="filepath"
|
| 713 |
)
|
| 714 |
+
process_pdf_btn = gr.Button("📖 Process PDF / PDF प्रसंस्करित करें", variant="primary")
|
| 715 |
|
| 716 |
doc_status = gr.Textbox(label="Processing Status / प्रसंस्करण स्थिति", interactive=False)
|
| 717 |
|
|
|
|
| 777 |
outputs=[auth_section, main_section, auth_status]
|
| 778 |
)
|
| 779 |
|
| 780 |
+
# Book selection event handler
|
| 781 |
+
if 'select_book_btn' in locals():
|
| 782 |
+
select_book_btn.click(
|
| 783 |
+
process_selected_book,
|
| 784 |
+
inputs=[book_dropdown],
|
| 785 |
+
outputs=[doc_status, book_title_display, author_display, query_section]
|
| 786 |
+
)
|
| 787 |
+
|
| 788 |
+
# PDF upload event handler
|
| 789 |
+
if 'process_pdf_btn' in locals():
|
| 790 |
+
process_pdf_btn.click(
|
| 791 |
+
process_document,
|
| 792 |
+
inputs=[pdf_upload],
|
| 793 |
+
outputs=[doc_status, book_title_display, author_display, query_section]
|
| 794 |
+
)
|
| 795 |
|
| 796 |
ask_button.click(
|
| 797 |
process_query,
|