Spaces:
Running
Running
pranavinani
committed on
Commit
·
6cd5233
1
Parent(s):
e2e039a
Fixed PDF upload not working
Browse files
app.py
CHANGED
|
@@ -179,9 +179,11 @@ def extract_text_from_pdf(pdf_path):
|
|
| 179 |
if not text_content.strip():
|
| 180 |
return "Error: No selectable text found in PDF. Please ensure the PDF contains selectable text, not just images."
|
| 181 |
|
|
|
|
| 182 |
return text_content
|
| 183 |
|
| 184 |
except Exception as e:
|
|
|
|
| 185 |
return f"Error extracting text: {str(e)}"
|
| 186 |
|
| 187 |
def extract_metadata(text):
|
|
@@ -320,23 +322,36 @@ def process_document(pdf_file):
|
|
| 320 |
return "कृपया एक PDF फ़ाइल अपलोड करें।", "", "", gr.update(visible=False)
|
| 321 |
|
| 322 |
try:
|
|
|
|
|
|
|
| 323 |
# Check file size
|
| 324 |
file_size = os.path.getsize(pdf_file.name)
|
|
|
|
|
|
|
| 325 |
if file_size > CONFIG['MAX_FILE_SIZE']:
|
| 326 |
return f"फ़ाइल बहुत बड़ी है! अधिकतम आकार: {CONFIG['MAX_FILE_SIZE'] // (1024*1024)}MB", "", "", gr.update(visible=False)
|
| 327 |
|
| 328 |
# Extract text (no OCR - assumes selectable text)
|
|
|
|
| 329 |
text_content = extract_text_from_pdf(pdf_file.name)
|
| 330 |
|
| 331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
return text_content, "", "", gr.update(visible=False)
|
| 333 |
|
|
|
|
|
|
|
| 334 |
# Extract metadata
|
|
|
|
| 335 |
author_name, book_title = extract_metadata(text_content)
|
| 336 |
SESSION_DATA['author_name'] = author_name
|
| 337 |
SESSION_DATA['book_title'] = book_title
|
| 338 |
|
| 339 |
# Create chunks
|
|
|
|
| 340 |
chunks = chunk_text(text_content)
|
| 341 |
SESSION_DATA['document_chunks'] = chunks
|
| 342 |
|
|
@@ -351,6 +366,8 @@ def process_document(pdf_file):
|
|
| 351 |
word_count = len(text_content.split())
|
| 352 |
char_count = len(text_content)
|
| 353 |
|
|
|
|
|
|
|
| 354 |
success_msg = f"""✅ दस्तावेज़ सफलतापूर्वक प्रसंस्करित!
|
| 355 |
|
| 356 |
📖 पुस्तक: {book_title}
|
|
@@ -364,7 +381,9 @@ def process_document(pdf_file):
|
|
| 364 |
return success_msg, book_title, author_name, gr.update(visible=True)
|
| 365 |
|
| 366 |
except Exception as e:
|
| 367 |
-
|
|
|
|
|
|
|
| 368 |
|
| 369 |
# Query processing function
|
| 370 |
def process_query(audio_input, text_input):
|
|
@@ -770,6 +789,7 @@ def create_interface():
|
|
| 770 |
available_books = get_available_books()
|
| 771 |
gallery_data, book_options = create_book_gallery()
|
| 772 |
|
|
|
|
| 773 |
if available_books:
|
| 774 |
book_gallery = gr.Gallery(
|
| 775 |
value=gallery_data,
|
|
@@ -792,6 +812,7 @@ def create_interface():
|
|
| 792 |
select_book_btn = gr.Button("📖 Load Selected Book / चुनी गई पुस्तक लोड करें", variant="primary")
|
| 793 |
else:
|
| 794 |
gr.Markdown("⚠️ No books available in library / पुस्तकालय में कोई पुस्तक उपलब्ध नहीं है")
|
|
|
|
| 795 |
book_dropdown = gr.Dropdown(choices=["None"], value="None", visible=False)
|
| 796 |
select_book_btn = gr.Button("No books available", interactive=False)
|
| 797 |
|
|
@@ -871,28 +892,27 @@ def create_interface():
|
|
| 871 |
outputs=[auth_section, main_section, auth_status]
|
| 872 |
)
|
| 873 |
|
| 874 |
-
#
|
| 875 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 876 |
select_book_btn.click(
|
| 877 |
process_selected_book,
|
| 878 |
inputs=[book_dropdown],
|
| 879 |
outputs=[doc_status, book_title_display, author_display, query_section]
|
| 880 |
)
|
| 881 |
-
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
|
| 885 |
-
|
| 886 |
-
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
# PDF upload event handler
|
| 890 |
-
if 'process_pdf_btn' in locals():
|
| 891 |
-
process_pdf_btn.click(
|
| 892 |
-
process_document,
|
| 893 |
-
inputs=[pdf_upload],
|
| 894 |
-
outputs=[doc_status, book_title_display, author_display, query_section]
|
| 895 |
-
)
|
| 896 |
|
| 897 |
ask_button.click(
|
| 898 |
process_query,
|
|
|
|
| 179 |
if not text_content.strip():
|
| 180 |
return "Error: No selectable text found in PDF. Please ensure the PDF contains selectable text, not just images."
|
| 181 |
|
| 182 |
+
print(f"Successfully extracted {len(text_content)} characters from PDF")
|
| 183 |
return text_content
|
| 184 |
|
| 185 |
except Exception as e:
|
| 186 |
+
print(f"PDF extraction error: {str(e)}")
|
| 187 |
return f"Error extracting text: {str(e)}"
|
| 188 |
|
| 189 |
def extract_metadata(text):
|
|
|
|
| 322 |
return "कृपया एक PDF फ़ाइल अपलोड करें।", "", "", gr.update(visible=False)
|
| 323 |
|
| 324 |
try:
|
| 325 |
+
print(f"Processing uploaded file: {pdf_file.name}")
|
| 326 |
+
|
| 327 |
# Check file size
|
| 328 |
file_size = os.path.getsize(pdf_file.name)
|
| 329 |
+
print(f"File size: {file_size} bytes")
|
| 330 |
+
|
| 331 |
if file_size > CONFIG['MAX_FILE_SIZE']:
|
| 332 |
return f"फ़ाइल बहुत बड़ी है! अधिकतम आकार: {CONFIG['MAX_FILE_SIZE'] // (1024*1024)}MB", "", "", gr.update(visible=False)
|
| 333 |
|
| 334 |
# Extract text (no OCR - assumes selectable text)
|
| 335 |
+
print("Extracting text from PDF...")
|
| 336 |
text_content = extract_text_from_pdf(pdf_file.name)
|
| 337 |
|
| 338 |
+
# Check if extraction failed
|
| 339 |
+
if not text_content.strip():
|
| 340 |
+
return "Error: फ़ाइल से टेक्स्ट निकालने में असफल।", "", "", gr.update(visible=False)
|
| 341 |
+
|
| 342 |
+
if text_content.startswith("Error"):
|
| 343 |
return text_content, "", "", gr.update(visible=False)
|
| 344 |
|
| 345 |
+
print(f"Text extraction successful. Length: {len(text_content)} characters")
|
| 346 |
+
|
| 347 |
# Extract metadata
|
| 348 |
+
print("Extracting metadata...")
|
| 349 |
author_name, book_title = extract_metadata(text_content)
|
| 350 |
SESSION_DATA['author_name'] = author_name
|
| 351 |
SESSION_DATA['book_title'] = book_title
|
| 352 |
|
| 353 |
# Create chunks
|
| 354 |
+
print("Creating text chunks...")
|
| 355 |
chunks = chunk_text(text_content)
|
| 356 |
SESSION_DATA['document_chunks'] = chunks
|
| 357 |
|
|
|
|
| 366 |
word_count = len(text_content.split())
|
| 367 |
char_count = len(text_content)
|
| 368 |
|
| 369 |
+
print(f"Processing complete. Chunks: {len(chunks)}, Words: {word_count}")
|
| 370 |
+
|
| 371 |
success_msg = f"""✅ दस्तावेज़ सफलतापूर्वक प्रसंस्करित!
|
| 372 |
|
| 373 |
📖 पुस्तक: {book_title}
|
|
|
|
| 381 |
return success_msg, book_title, author_name, gr.update(visible=True)
|
| 382 |
|
| 383 |
except Exception as e:
|
| 384 |
+
error_msg = f"दस्तावेज़ प्रसंस्करण में त्रुटि: {str(e)}"
|
| 385 |
+
print(f"Error in process_document: {str(e)}")
|
| 386 |
+
return error_msg, "", "", gr.update(visible=False)
|
| 387 |
|
| 388 |
# Query processing function
|
| 389 |
def process_query(audio_input, text_input):
|
|
|
|
| 789 |
available_books = get_available_books()
|
| 790 |
gallery_data, book_options = create_book_gallery()
|
| 791 |
|
| 792 |
+
# Always create these components, even if no books are available
|
| 793 |
if available_books:
|
| 794 |
book_gallery = gr.Gallery(
|
| 795 |
value=gallery_data,
|
|
|
|
| 812 |
select_book_btn = gr.Button("📖 Load Selected Book / चुनी गई पुस्तक लोड करें", variant="primary")
|
| 813 |
else:
|
| 814 |
gr.Markdown("⚠️ No books available in library / पुस्तकालय में कोई पुस्तक उपलब्ध नहीं है")
|
| 815 |
+
book_gallery = None
|
| 816 |
book_dropdown = gr.Dropdown(choices=["None"], value="None", visible=False)
|
| 817 |
select_book_btn = gr.Button("No books available", interactive=False)
|
| 818 |
|
|
|
|
| 892 |
outputs=[auth_section, main_section, auth_status]
|
| 893 |
)
|
| 894 |
|
| 895 |
+
# PDF upload event handler - Always available
|
| 896 |
+
process_pdf_btn.click(
|
| 897 |
+
process_document,
|
| 898 |
+
inputs=[pdf_upload],
|
| 899 |
+
outputs=[doc_status, book_title_display, author_display, query_section]
|
| 900 |
+
)
|
| 901 |
+
|
| 902 |
+
# Book selection event handler - Only if books are available
|
| 903 |
+
if available_books:
|
| 904 |
select_book_btn.click(
|
| 905 |
process_selected_book,
|
| 906 |
inputs=[book_dropdown],
|
| 907 |
outputs=[doc_status, book_title_display, author_display, query_section]
|
| 908 |
)
|
| 909 |
+
|
| 910 |
+
# Gallery selection event handler - Only if gallery exists
|
| 911 |
+
if book_gallery is not None:
|
| 912 |
+
book_gallery.select(
|
| 913 |
+
handle_gallery_selection,
|
| 914 |
+
outputs=[book_dropdown]
|
| 915 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 916 |
|
| 917 |
ask_button.click(
|
| 918 |
process_query,
|