Spaces:
Running
Running
pranavinani
committed on
Commit
·
3f24fca
1
Parent(s):
6cd5233
added text file
Browse files
app.py
CHANGED
|
@@ -153,6 +153,29 @@ def text_to_speech(text):
|
|
| 153 |
return None
|
| 154 |
|
| 155 |
# Text extraction functions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
def extract_text_from_pdf(pdf_path):
|
| 157 |
"""Extract text from PDF using PyMuPDF (assumes selectable text)"""
|
| 158 |
text_content = ""
|
|
@@ -186,6 +209,17 @@ def extract_text_from_pdf(pdf_path):
|
|
| 186 |
print(f"PDF extraction error: {str(e)}")
|
| 187 |
return f"Error extracting text: {str(e)}"
|
| 188 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
def extract_metadata(text):
|
| 190 |
"""Extract author name and book title from text"""
|
| 191 |
lines = [line.strip() for line in text.split('\n')[:25] if line.strip()]
|
|
@@ -316,24 +350,29 @@ def authenticate(passcode):
|
|
| 316 |
return gr.update(visible=True), gr.update(visible=False), "❌ Invalid passcode / गलत पासकोड"
|
| 317 |
|
| 318 |
# Document processing function
|
| 319 |
-
def process_document(
|
| 320 |
-
"""Process uploaded PDF
|
| 321 |
-
if
|
| 322 |
-
return "कृपया एक PDF फ़ाइल अपलोड करें।", "", "", gr.update(visible=False)
|
| 323 |
|
| 324 |
try:
|
| 325 |
-
print(f"Processing uploaded file: {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
|
| 327 |
# Check file size
|
| 328 |
-
file_size = os.path.getsize(
|
| 329 |
print(f"File size: {file_size} bytes")
|
| 330 |
|
| 331 |
if file_size > CONFIG['MAX_FILE_SIZE']:
|
| 332 |
return f"फ़ाइल बहुत बड़ी है! अधिकतम आकार: {CONFIG['MAX_FILE_SIZE'] // (1024*1024)}MB", "", "", gr.update(visible=False)
|
| 333 |
|
| 334 |
-
# Extract text
|
| 335 |
-
print("Extracting text from
|
| 336 |
-
text_content =
|
| 337 |
|
| 338 |
# Check if extraction failed
|
| 339 |
if not text_content.strip():
|
|
@@ -452,7 +491,7 @@ def reset_session():
|
|
| 452 |
|
| 453 |
# Book management functions
|
| 454 |
def get_available_books():
|
| 455 |
-
"""Get list of available books with their thumbnails and
|
| 456 |
books = []
|
| 457 |
|
| 458 |
try:
|
|
@@ -466,16 +505,17 @@ def get_available_books():
|
|
| 466 |
else:
|
| 467 |
thumbnail_files = []
|
| 468 |
|
| 469 |
-
# Get all
|
| 470 |
if os.path.exists(ocr_dir):
|
| 471 |
-
|
| 472 |
-
|
| 473 |
else:
|
| 474 |
-
|
| 475 |
|
| 476 |
-
# Create book entries for
|
| 477 |
-
for
|
| 478 |
-
book_name = os.path.splitext(
|
|
|
|
| 479 |
|
| 480 |
# Look for matching thumbnail
|
| 481 |
thumbnail_path = None
|
|
@@ -493,8 +533,9 @@ def get_available_books():
|
|
| 493 |
|
| 494 |
books.append({
|
| 495 |
'name': book_name,
|
| 496 |
-
'display_name': book_name.replace('_', ' ').title(),
|
| 497 |
-
'
|
|
|
|
| 498 |
'thumbnail': thumbnail_path
|
| 499 |
})
|
| 500 |
|
|
@@ -535,11 +576,11 @@ def create_text_placeholder(book_name):
|
|
| 535 |
print(f"Error creating placeholder: {str(e)}")
|
| 536 |
return None
|
| 537 |
|
| 538 |
-
def
|
| 539 |
-
"""Load text content from a pre-existing PDF
|
| 540 |
try:
|
| 541 |
-
# Extract text from
|
| 542 |
-
text_content =
|
| 543 |
|
| 544 |
if not text_content.strip() or "Error" in text_content:
|
| 545 |
return text_content
|
|
@@ -547,7 +588,7 @@ def load_book_pdf(book_info):
|
|
| 547 |
return text_content
|
| 548 |
|
| 549 |
except Exception as e:
|
| 550 |
-
return f"Error loading
|
| 551 |
|
| 552 |
def process_selected_book(selected_book_name):
|
| 553 |
"""Process a pre-selected book"""
|
|
@@ -568,8 +609,8 @@ def process_selected_book(selected_book_name):
|
|
| 568 |
if not selected_book:
|
| 569 |
return "चुनी गई पुस्तक नहीं मिली।", "", "", gr.update(visible=False)
|
| 570 |
|
| 571 |
-
# Load
|
| 572 |
-
text_content =
|
| 573 |
|
| 574 |
if not text_content.strip() or "Error" in text_content:
|
| 575 |
return text_content, "", "", gr.update(visible=False)
|
|
@@ -663,26 +704,27 @@ def ensure_lfs_files_downloaded():
|
|
| 663 |
if os.getenv('SPACE_ID') or os.getenv('HUGGINGFACE_HUB_CACHE'):
|
| 664 |
print("🔄 Detected Hugging Face Spaces environment, checking LFS files...")
|
| 665 |
|
| 666 |
-
# Check if
|
| 667 |
ocr_dir = CONFIG['OCR_BOOKS_DIR']
|
| 668 |
if os.path.exists(ocr_dir):
|
| 669 |
-
|
| 670 |
|
| 671 |
-
for
|
| 672 |
-
|
| 673 |
|
| 674 |
# Check if file is an LFS pointer (small text file)
|
| 675 |
-
if os.path.exists(
|
| 676 |
-
file_size = os.path.getsize(
|
| 677 |
|
| 678 |
# LFS pointer files are typically very small (< 200 bytes)
|
| 679 |
-
|
| 680 |
-
|
|
|
|
| 681 |
|
| 682 |
# Try to download using git lfs pull for this specific file
|
| 683 |
try:
|
| 684 |
result = subprocess.run(
|
| 685 |
-
['git', 'lfs', 'pull', '--include', f"ocr_books/{
|
| 686 |
cwd=os.getcwd(),
|
| 687 |
capture_output=True,
|
| 688 |
text=True,
|
|
@@ -690,16 +732,17 @@ def ensure_lfs_files_downloaded():
|
|
| 690 |
)
|
| 691 |
|
| 692 |
if result.returncode == 0:
|
| 693 |
-
print(f"✅ Successfully downloaded {
|
| 694 |
else:
|
| 695 |
-
print(f"⚠️ Could not download {
|
| 696 |
|
| 697 |
except subprocess.TimeoutExpired:
|
| 698 |
-
print(f"⏰ Timeout downloading {
|
| 699 |
except Exception as e:
|
| 700 |
-
print(f"❌ Error downloading {
|
| 701 |
else:
|
| 702 |
-
|
|
|
|
| 703 |
|
| 704 |
# Also check thumbnails
|
| 705 |
thumbnail_dir = CONFIG['BOOK_THUMBNAILS_DIR']
|
|
@@ -816,17 +859,17 @@ def create_interface():
|
|
| 816 |
book_dropdown = gr.Dropdown(choices=["None"], value="None", visible=False)
|
| 817 |
select_book_btn = gr.Button("No books available", interactive=False)
|
| 818 |
|
| 819 |
-
# PDF upload section
|
| 820 |
-
with gr.Tab("📄 Upload
|
| 821 |
-
gr.Markdown("**Upload your own PDF / अपनी PDF अपलोड करें**")
|
| 822 |
-
gr.Markdown("**Note:**
|
| 823 |
|
| 824 |
-
|
| 825 |
-
label="Upload PDF / PDF अपलोड करें",
|
| 826 |
-
file_types=[".pdf"],
|
| 827 |
type="filepath"
|
| 828 |
)
|
| 829 |
-
|
| 830 |
|
| 831 |
doc_status = gr.Textbox(label="Processing Status / प्रसंस्करण स्थिति", interactive=False)
|
| 832 |
|
|
@@ -877,7 +920,7 @@ def create_interface():
|
|
| 877 |
with gr.Column():
|
| 878 |
gr.Markdown("""
|
| 879 |
**Requirements & Limits / आवश्यकताएं और सीमा:**
|
| 880 |
-
- PDF with selectable text (no scanned images)
|
| 881 |
- Max file size: 10MB
|
| 882 |
- Max queries: 5 per session
|
| 883 |
- Audio transcription: First 10 seconds only
|
|
@@ -892,10 +935,10 @@ def create_interface():
|
|
| 892 |
outputs=[auth_section, main_section, auth_status]
|
| 893 |
)
|
| 894 |
|
| 895 |
-
#
|
| 896 |
-
|
| 897 |
process_document,
|
| 898 |
-
inputs=[
|
| 899 |
outputs=[doc_status, book_title_display, author_display, query_section]
|
| 900 |
)
|
| 901 |
|
|
|
|
| 153 |
return None
|
| 154 |
|
| 155 |
# Text extraction functions
|
| 156 |
+
def extract_text_from_txt(txt_path):
    """Extract text from a TXT file, trying several encodings in turn.

    Encodings are attempted from strictest to most permissive:

    * ``utf-8-sig`` - plain UTF-8; a leading byte-order mark is removed
      when present so it does not leak into the first line of the text.
    * ``cp1252`` - Windows Western European; rejects the few byte values
      it leaves undefined.
    * ``latin-1`` - maps every possible byte to a character, so it has to
      come last: anything placed after it could never be reached.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The decoded file content. On failure (unreadable file, empty file)
        a message containing ``"Error"`` is returned instead of raising.
    """
    encodings = ('utf-8-sig', 'cp1252', 'latin-1')

    try:
        for encoding in encodings:
            try:
                with open(txt_path, 'r', encoding=encoding) as file:
                    text_content = file.read()
            except UnicodeDecodeError:
                # Wrong guess; fall through to the next, more permissive codec.
                continue

            # A file that decodes to nothing is empty under every encoding,
            # so report that directly instead of retrying and ending up with
            # a misleading "could not decode" message.
            if not text_content.strip():
                return "Error: TXT file is empty or contains only whitespace"

            print(f"Successfully extracted {len(text_content)} characters from TXT file using {encoding} encoding")
            return text_content

        # Defensive only: latin-1 accepts any byte sequence, so the loop
        # above normally returns before getting here.
        return "Error: Could not decode TXT file with any supported encoding"

    except Exception as e:
        # Errors are reported through the returned string rather than raised,
        # so any failure (missing file, permissions, bad path type) becomes a
        # message.
        print(f"TXT extraction error: {str(e)}")
        return f"Error extracting text: {str(e)}"
|
| 178 |
+
|
| 179 |
def extract_text_from_pdf(pdf_path):
|
| 180 |
"""Extract text from PDF using PyMuPDF (assumes selectable text)"""
|
| 181 |
text_content = ""
|
|
|
|
| 209 |
print(f"PDF extraction error: {str(e)}")
|
| 210 |
return f"Error extracting text: {str(e)}"
|
| 211 |
|
| 212 |
+
def extract_text_from_file(file_path):
    """Return the text of a document, choosing the extractor by extension.

    The extension is compared case-insensitively. ``.pdf`` paths are handed
    to ``extract_text_from_pdf`` and ``.txt`` paths to
    ``extract_text_from_txt``; any other extension produces an error message
    string instead of raising.

    Args:
        file_path: Path of the document to read.

    Returns:
        Whatever the selected extractor returns, or an ``"Error: ..."``
        string naming the unsupported extension.
    """
    root_and_ext = os.path.splitext(file_path)
    file_extension = root_and_ext[1].lower()

    # Reject anything that is not a supported format before dispatching.
    if file_extension not in ('.pdf', '.txt'):
        return f"Error: Unsupported file format {file_extension}. Only PDF and TXT files are supported."

    if file_extension == '.txt':
        return extract_text_from_txt(file_path)
    return extract_text_from_pdf(file_path)
|
| 222 |
+
|
| 223 |
def extract_metadata(text):
|
| 224 |
"""Extract author name and book title from text"""
|
| 225 |
lines = [line.strip() for line in text.split('\n')[:25] if line.strip()]
|
|
|
|
| 350 |
return gr.update(visible=True), gr.update(visible=False), "❌ Invalid passcode / गलत पासकोड"
|
| 351 |
|
| 352 |
# Document processing function
|
| 353 |
+
def process_document(document_file):
|
| 354 |
+
"""Process uploaded document (PDF or TXT)"""
|
| 355 |
+
if document_file is None:
|
| 356 |
+
return "कृपया एक PDF या TXT फ़ाइल अपलोड करें।", "", "", gr.update(visible=False)
|
| 357 |
|
| 358 |
try:
|
| 359 |
+
print(f"Processing uploaded file: {document_file.name}")
|
| 360 |
+
|
| 361 |
+
# Check file extension
|
| 362 |
+
file_extension = os.path.splitext(document_file.name)[1].lower()
|
| 363 |
+
if file_extension not in ['.pdf', '.txt']:
|
| 364 |
+
return "केवल PDF और TXT फ़ाइलें समर्थित हैं।", "", "", gr.update(visible=False)
|
| 365 |
|
| 366 |
# Check file size
|
| 367 |
+
file_size = os.path.getsize(document_file.name)
|
| 368 |
print(f"File size: {file_size} bytes")
|
| 369 |
|
| 370 |
if file_size > CONFIG['MAX_FILE_SIZE']:
|
| 371 |
return f"फ़ाइल बहुत बड़ी है! अधिकतम आकार: {CONFIG['MAX_FILE_SIZE'] // (1024*1024)}MB", "", "", gr.update(visible=False)
|
| 372 |
|
| 373 |
+
# Extract text using unified function
|
| 374 |
+
print(f"Extracting text from {file_extension.upper()} file...")
|
| 375 |
+
text_content = extract_text_from_file(document_file.name)
|
| 376 |
|
| 377 |
# Check if extraction failed
|
| 378 |
if not text_content.strip():
|
|
|
|
| 491 |
|
| 492 |
# Book management functions
|
| 493 |
def get_available_books():
|
| 494 |
+
"""Get list of available books with their thumbnails and document files (PDF/TXT)"""
|
| 495 |
books = []
|
| 496 |
|
| 497 |
try:
|
|
|
|
| 505 |
else:
|
| 506 |
thumbnail_files = []
|
| 507 |
|
| 508 |
+
# Get all supported document files from OCR directory
|
| 509 |
if os.path.exists(ocr_dir):
|
| 510 |
+
document_files = [f for f in os.listdir(ocr_dir)
|
| 511 |
+
if f.lower().endswith(('.pdf', '.txt'))]
|
| 512 |
else:
|
| 513 |
+
document_files = []
|
| 514 |
|
| 515 |
+
# Create book entries for document files
|
| 516 |
+
for doc_file in document_files:
|
| 517 |
+
book_name = os.path.splitext(doc_file)[0]
|
| 518 |
+
file_extension = os.path.splitext(doc_file)[1].lower()
|
| 519 |
|
| 520 |
# Look for matching thumbnail
|
| 521 |
thumbnail_path = None
|
|
|
|
| 533 |
|
| 534 |
books.append({
|
| 535 |
'name': book_name,
|
| 536 |
+
'display_name': f"{book_name.replace('_', ' ').title()} ({file_extension.upper()})",
|
| 537 |
+
'document_file': os.path.join(ocr_dir, doc_file),
|
| 538 |
+
'file_type': file_extension,
|
| 539 |
'thumbnail': thumbnail_path
|
| 540 |
})
|
| 541 |
|
|
|
|
| 576 |
print(f"Error creating placeholder: {str(e)}")
|
| 577 |
return None
|
| 578 |
|
| 579 |
+
def load_book_document(book_info):
|
| 580 |
+
"""Load text content from a pre-existing document (PDF or TXT)"""
|
| 581 |
try:
|
| 582 |
+
# Extract text from document using the unified function
|
| 583 |
+
text_content = extract_text_from_file(book_info['document_file'])
|
| 584 |
|
| 585 |
if not text_content.strip() or "Error" in text_content:
|
| 586 |
return text_content
|
|
|
|
| 588 |
return text_content
|
| 589 |
|
| 590 |
except Exception as e:
|
| 591 |
+
return f"Error loading document: {str(e)}"
|
| 592 |
|
| 593 |
def process_selected_book(selected_book_name):
|
| 594 |
"""Process a pre-selected book"""
|
|
|
|
| 609 |
if not selected_book:
|
| 610 |
return "चुनी गई पुस्तक नहीं मिली।", "", "", gr.update(visible=False)
|
| 611 |
|
| 612 |
+
# Load document content and extract text
|
| 613 |
+
text_content = load_book_document(selected_book)
|
| 614 |
|
| 615 |
if not text_content.strip() or "Error" in text_content:
|
| 616 |
return text_content, "", "", gr.update(visible=False)
|
|
|
|
| 704 |
if os.getenv('SPACE_ID') or os.getenv('HUGGINGFACE_HUB_CACHE'):
|
| 705 |
print("🔄 Detected Hugging Face Spaces environment, checking LFS files...")
|
| 706 |
|
| 707 |
+
# Check if document files exist and are not LFS pointers
|
| 708 |
ocr_dir = CONFIG['OCR_BOOKS_DIR']
|
| 709 |
if os.path.exists(ocr_dir):
|
| 710 |
+
document_files = [f for f in os.listdir(ocr_dir) if f.lower().endswith(('.pdf', '.txt'))]
|
| 711 |
|
| 712 |
+
for doc_file in document_files:
|
| 713 |
+
doc_path = os.path.join(ocr_dir, doc_file)
|
| 714 |
|
| 715 |
# Check if file is an LFS pointer (small text file)
|
| 716 |
+
if os.path.exists(doc_path):
|
| 717 |
+
file_size = os.path.getsize(doc_path)
|
| 718 |
|
| 719 |
# LFS pointer files are typically very small (< 200 bytes)
|
| 720 |
+
# But TXT files might legitimately be small, so only check PDFs for LFS
|
| 721 |
+
if file_size < 200 and doc_file.lower().endswith('.pdf'):
|
| 722 |
+
print(f"📁 {doc_file} appears to be an LFS pointer, attempting download...")
|
| 723 |
|
| 724 |
# Try to download using git lfs pull for this specific file
|
| 725 |
try:
|
| 726 |
result = subprocess.run(
|
| 727 |
+
['git', 'lfs', 'pull', '--include', f"ocr_books/{doc_file}"],
|
| 728 |
cwd=os.getcwd(),
|
| 729 |
capture_output=True,
|
| 730 |
text=True,
|
|
|
|
| 732 |
)
|
| 733 |
|
| 734 |
if result.returncode == 0:
|
| 735 |
+
print(f"✅ Successfully downloaded {doc_file}")
|
| 736 |
else:
|
| 737 |
+
print(f"⚠️ Could not download {doc_file}: {result.stderr}")
|
| 738 |
|
| 739 |
except subprocess.TimeoutExpired:
|
| 740 |
+
print(f"⏰ Timeout downloading {doc_file}")
|
| 741 |
except Exception as e:
|
| 742 |
+
print(f"❌ Error downloading {doc_file}: {str(e)}")
|
| 743 |
else:
|
| 744 |
+
file_type = "PDF" if doc_file.lower().endswith('.pdf') else "TXT"
|
| 745 |
+
print(f"✅ {doc_file} ({file_type}) already available ({file_size:,} bytes)")
|
| 746 |
|
| 747 |
# Also check thumbnails
|
| 748 |
thumbnail_dir = CONFIG['BOOK_THUMBNAILS_DIR']
|
|
|
|
| 859 |
book_dropdown = gr.Dropdown(choices=["None"], value="None", visible=False)
|
| 860 |
select_book_btn = gr.Button("No books available", interactive=False)
|
| 861 |
|
| 862 |
+
# PDF/TXT upload section
|
| 863 |
+
with gr.Tab("📄 Upload Document / दस्तावेज़ अपलोड करें"):
|
| 864 |
+
gr.Markdown("**Upload your own PDF or TXT file / अपनी PDF या TXT फ़ाइल अपलोड करें**")
|
| 865 |
+
gr.Markdown("**Note:** For PDF files, please ensure they contain selectable text (not scanned images)")
|
| 866 |
|
| 867 |
+
document_upload = gr.File(
|
| 868 |
+
label="Upload PDF or TXT / PDF या TXT अपलोड करें",
|
| 869 |
+
file_types=[".pdf", ".txt"],
|
| 870 |
type="filepath"
|
| 871 |
)
|
| 872 |
+
process_document_btn = gr.Button("📖 Process Document / दस्तावेज़ प्रसंस्करित करें", variant="primary")
|
| 873 |
|
| 874 |
doc_status = gr.Textbox(label="Processing Status / प्रसंस्करण स्थिति", interactive=False)
|
| 875 |
|
|
|
|
| 920 |
with gr.Column():
|
| 921 |
gr.Markdown("""
|
| 922 |
**Requirements & Limits / आवश्यकताएं और सीमा:**
|
| 923 |
+
- PDF with selectable text (no scanned images) or TXT files
|
| 924 |
- Max file size: 10MB
|
| 925 |
- Max queries: 5 per session
|
| 926 |
- Audio transcription: First 10 seconds only
|
|
|
|
| 935 |
outputs=[auth_section, main_section, auth_status]
|
| 936 |
)
|
| 937 |
|
| 938 |
+
# Document upload event handler - Always available
|
| 939 |
+
process_document_btn.click(
|
| 940 |
process_document,
|
| 941 |
+
inputs=[document_upload],
|
| 942 |
outputs=[doc_status, book_title_display, author_display, query_section]
|
| 943 |
)
|
| 944 |
|
ocr_books/Bhartiya Gyan Parampara ke Vivid Aayam Book.pdf
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d569c284ba23b1980668089f154898d7e6fc0d3f7f075678fc7370fc8b3a2a02
|
| 3 |
-
size 52006233
|
|
|
|
|
|
|
|
|
|
|
|
ocr_books/Bhartiya Gyan Parampara ke Vivid Aayam Book.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ocr_books/PANINIYA Volume 41.pdf
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:2050a21e70ba883faaf794b04e7051d7754e2e79eead02248b1829230c8cb645
|
| 3 |
-
size 75749563
|
|
|
|
|
|
|
|
|
|
|
|
ocr_books/PANINIYA Volume 41.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|