pranavinani committed on
Commit
3f24fca
·
1 Parent(s): 6cd5233

added text file

Browse files
app.py CHANGED
@@ -153,6 +153,29 @@ def text_to_speech(text):
153
  return None
154
 
155
  # Text extraction functions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  def extract_text_from_pdf(pdf_path):
157
  """Extract text from PDF using PyMuPDF (assumes selectable text)"""
158
  text_content = ""
@@ -186,6 +209,17 @@ def extract_text_from_pdf(pdf_path):
186
  print(f"PDF extraction error: {str(e)}")
187
  return f"Error extracting text: {str(e)}"
188
 
 
 
 
 
 
 
 
 
 
 
 
189
  def extract_metadata(text):
190
  """Extract author name and book title from text"""
191
  lines = [line.strip() for line in text.split('\n')[:25] if line.strip()]
@@ -316,24 +350,29 @@ def authenticate(passcode):
316
  return gr.update(visible=True), gr.update(visible=False), "❌ Invalid passcode / गलत पासकोड"
317
 
318
  # Document processing function
319
- def process_document(pdf_file):
320
- """Process uploaded PDF document"""
321
- if pdf_file is None:
322
- return "कृपया एक PDF फ़ाइल अपलोड करें।", "", "", gr.update(visible=False)
323
 
324
  try:
325
- print(f"Processing uploaded file: {pdf_file.name}")
 
 
 
 
 
326
 
327
  # Check file size
328
- file_size = os.path.getsize(pdf_file.name)
329
  print(f"File size: {file_size} bytes")
330
 
331
  if file_size > CONFIG['MAX_FILE_SIZE']:
332
  return f"फ़ाइल बहुत बड़ी है! अधिकतम आकार: {CONFIG['MAX_FILE_SIZE'] // (1024*1024)}MB", "", "", gr.update(visible=False)
333
 
334
- # Extract text (no OCR - assumes selectable text)
335
- print("Extracting text from PDF...")
336
- text_content = extract_text_from_pdf(pdf_file.name)
337
 
338
  # Check if extraction failed
339
  if not text_content.strip():
@@ -452,7 +491,7 @@ def reset_session():
452
 
453
  # Book management functions
454
  def get_available_books():
455
- """Get list of available books with their thumbnails and PDF files"""
456
  books = []
457
 
458
  try:
@@ -466,16 +505,17 @@ def get_available_books():
466
  else:
467
  thumbnail_files = []
468
 
469
- # Get all PDF files from OCR directory
470
  if os.path.exists(ocr_dir):
471
- pdf_files = [f for f in os.listdir(ocr_dir)
472
- if f.lower().endswith('.pdf')]
473
  else:
474
- pdf_files = []
475
 
476
- # Create book entries for PDF files
477
- for pdf_file in pdf_files:
478
- book_name = os.path.splitext(pdf_file)[0]
 
479
 
480
  # Look for matching thumbnail
481
  thumbnail_path = None
@@ -493,8 +533,9 @@ def get_available_books():
493
 
494
  books.append({
495
  'name': book_name,
496
- 'display_name': book_name.replace('_', ' ').title(),
497
- 'pdf_file': os.path.join(ocr_dir, pdf_file),
 
498
  'thumbnail': thumbnail_path
499
  })
500
 
@@ -535,11 +576,11 @@ def create_text_placeholder(book_name):
535
  print(f"Error creating placeholder: {str(e)}")
536
  return None
537
 
538
- def load_book_pdf(book_info):
539
- """Load text content from a pre-existing PDF book"""
540
  try:
541
- # Extract text from PDF using the existing function
542
- text_content = extract_text_from_pdf(book_info['pdf_file'])
543
 
544
  if not text_content.strip() or "Error" in text_content:
545
  return text_content
@@ -547,7 +588,7 @@ def load_book_pdf(book_info):
547
  return text_content
548
 
549
  except Exception as e:
550
- return f"Error loading PDF book: {str(e)}"
551
 
552
  def process_selected_book(selected_book_name):
553
  """Process a pre-selected book"""
@@ -568,8 +609,8 @@ def process_selected_book(selected_book_name):
568
  if not selected_book:
569
  return "चुनी गई पुस्तक नहीं मिली।", "", "", gr.update(visible=False)
570
 
571
- # Load PDF content and extract text
572
- text_content = load_book_pdf(selected_book)
573
 
574
  if not text_content.strip() or "Error" in text_content:
575
  return text_content, "", "", gr.update(visible=False)
@@ -663,26 +704,27 @@ def ensure_lfs_files_downloaded():
663
  if os.getenv('SPACE_ID') or os.getenv('HUGGINGFACE_HUB_CACHE'):
664
  print("🔄 Detected Hugging Face Spaces environment, checking LFS files...")
665
 
666
- # Check if PDF files exist and are not LFS pointers
667
  ocr_dir = CONFIG['OCR_BOOKS_DIR']
668
  if os.path.exists(ocr_dir):
669
- pdf_files = [f for f in os.listdir(ocr_dir) if f.lower().endswith('.pdf')]
670
 
671
- for pdf_file in pdf_files:
672
- pdf_path = os.path.join(ocr_dir, pdf_file)
673
 
674
  # Check if file is an LFS pointer (small text file)
675
- if os.path.exists(pdf_path):
676
- file_size = os.path.getsize(pdf_path)
677
 
678
  # LFS pointer files are typically very small (< 200 bytes)
679
- if file_size < 200:
680
- print(f"📁 {pdf_file} appears to be an LFS pointer, attempting download...")
 
681
 
682
  # Try to download using git lfs pull for this specific file
683
  try:
684
  result = subprocess.run(
685
- ['git', 'lfs', 'pull', '--include', f"ocr_books/{pdf_file}"],
686
  cwd=os.getcwd(),
687
  capture_output=True,
688
  text=True,
@@ -690,16 +732,17 @@ def ensure_lfs_files_downloaded():
690
  )
691
 
692
  if result.returncode == 0:
693
- print(f"✅ Successfully downloaded {pdf_file}")
694
  else:
695
- print(f"⚠️ Could not download {pdf_file}: {result.stderr}")
696
 
697
  except subprocess.TimeoutExpired:
698
- print(f"⏰ Timeout downloading {pdf_file}")
699
  except Exception as e:
700
- print(f"❌ Error downloading {pdf_file}: {str(e)}")
701
  else:
702
- print(f"✅ {pdf_file} already downloaded ({file_size:,} bytes)")
 
703
 
704
  # Also check thumbnails
705
  thumbnail_dir = CONFIG['BOOK_THUMBNAILS_DIR']
@@ -816,17 +859,17 @@ def create_interface():
816
  book_dropdown = gr.Dropdown(choices=["None"], value="None", visible=False)
817
  select_book_btn = gr.Button("No books available", interactive=False)
818
 
819
- # PDF upload section
820
- with gr.Tab("📄 Upload PDF / PDF अपलोड करें"):
821
- gr.Markdown("**Upload your own PDF / अपनी PDF अपलोड करें**")
822
- gr.Markdown("**Note:** Please ensure your PDF contains selectable text (not scanned images)")
823
 
824
- pdf_upload = gr.File(
825
- label="Upload PDF / PDF अपलोड करें",
826
- file_types=[".pdf"],
827
  type="filepath"
828
  )
829
- process_pdf_btn = gr.Button("📖 Process PDF / PDF प्रसंस्करित करें", variant="primary")
830
 
831
  doc_status = gr.Textbox(label="Processing Status / प्रसंस्करण स्थिति", interactive=False)
832
 
@@ -877,7 +920,7 @@ def create_interface():
877
  with gr.Column():
878
  gr.Markdown("""
879
  **Requirements & Limits / आवश्यकताएं और सीमा:**
880
- - PDF with selectable text (no scanned images)
881
  - Max file size: 10MB
882
  - Max queries: 5 per session
883
  - Audio transcription: First 10 seconds only
@@ -892,10 +935,10 @@ def create_interface():
892
  outputs=[auth_section, main_section, auth_status]
893
  )
894
 
895
- # PDF upload event handler - Always available
896
- process_pdf_btn.click(
897
  process_document,
898
- inputs=[pdf_upload],
899
  outputs=[doc_status, book_title_display, author_display, query_section]
900
  )
901
 
 
153
  return None
154
 
155
  # Text extraction functions
156
def extract_text_from_txt(txt_path):
    """Extract text from a plain-text (TXT) file.

    The file is decoded with a short list of candidate encodings; the first
    one that decodes without error is used.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The decoded file content on success. On any failure (empty file,
        unreadable file, undecodable bytes) a message beginning with
        "Error" is returned instead of raising.
    """
    # Order matters:
    #   * 'utf-8-sig' also decodes plain UTF-8 and strips a leading BOM, so the
    #     returned text never starts with a stray '\ufeff'.
    #   * 'cp1252' must come before 'latin-1': latin-1 maps every byte value
    #     and therefore never fails, which would make any later entry
    #     unreachable and turn Windows-1252 punctuation into control chars.
    encodings = ('utf-8-sig', 'cp1252', 'latin-1')

    try:
        for encoding in encodings:
            try:
                with open(txt_path, 'r', encoding=encoding) as file:
                    text_content = file.read()
            except UnicodeDecodeError:
                continue

            if not text_content.strip():
                # Decoding succeeded; the file simply holds no text. Trying
                # other encodings cannot change that, so report it accurately
                # instead of claiming a decoding failure.
                return "Error: TXT file is empty"

            print(f"Successfully extracted {len(text_content)} characters from TXT file using {encoding} encoding")
            return text_content

        # Defensive fallback; only reachable if the encoding list is changed
        # so that it no longer ends with an encoding that accepts every byte.
        return "Error: Could not decode TXT file with any supported encoding"

    except Exception as e:
        # Boundary handler: this function reports failures as strings (missing
        # file, permission problems, bad path type) rather than raising.
        print(f"TXT extraction error: {str(e)}")
        return f"Error extracting text: {str(e)}"
178
+
179
  def extract_text_from_pdf(pdf_path):
180
  """Extract text from PDF using PyMuPDF (assumes selectable text)"""
181
  text_content = ""
 
209
  print(f"PDF extraction error: {str(e)}")
210
  return f"Error extracting text: {str(e)}"
211
 
212
def extract_text_from_file(file_path):
    """Return the text of a document, choosing the extractor by extension.

    The lower-cased file extension decides which extractor runs:
    ``.pdf`` goes to ``extract_text_from_pdf`` and ``.txt`` goes to
    ``extract_text_from_txt``. Any other extension yields an error message
    string naming the rejected extension.
    """
    _, raw_suffix = os.path.splitext(file_path)
    suffix = raw_suffix.lower()

    # Reject anything that is neither PDF nor TXT before dispatching.
    if suffix not in ('.pdf', '.txt'):
        return f"Error: Unsupported file format {suffix}. Only PDF and TXT files are supported."

    if suffix == '.txt':
        return extract_text_from_txt(file_path)
    return extract_text_from_pdf(file_path)
222
+
223
  def extract_metadata(text):
224
  """Extract author name and book title from text"""
225
  lines = [line.strip() for line in text.split('\n')[:25] if line.strip()]
 
350
  return gr.update(visible=True), gr.update(visible=False), "❌ Invalid passcode / गलत पासकोड"
351
 
352
  # Document processing function
353
+ def process_document(document_file):
354
+ """Process uploaded document (PDF or TXT)"""
355
+ if document_file is None:
356
+ return "कृपया एक PDF या TXT फ़ाइल अपलोड करें।", "", "", gr.update(visible=False)
357
 
358
  try:
359
+ print(f"Processing uploaded file: {document_file.name}")
360
+
361
+ # Check file extension
362
+ file_extension = os.path.splitext(document_file.name)[1].lower()
363
+ if file_extension not in ['.pdf', '.txt']:
364
+ return "केवल PDF और TXT फ़ाइलें समर्थित हैं।", "", "", gr.update(visible=False)
365
 
366
  # Check file size
367
+ file_size = os.path.getsize(document_file.name)
368
  print(f"File size: {file_size} bytes")
369
 
370
  if file_size > CONFIG['MAX_FILE_SIZE']:
371
  return f"फ़ाइल बहुत बड़ी है! अधिकतम आकार: {CONFIG['MAX_FILE_SIZE'] // (1024*1024)}MB", "", "", gr.update(visible=False)
372
 
373
+ # Extract text using unified function
374
+ print(f"Extracting text from {file_extension.upper()} file...")
375
+ text_content = extract_text_from_file(document_file.name)
376
 
377
  # Check if extraction failed
378
  if not text_content.strip():
 
491
 
492
  # Book management functions
493
  def get_available_books():
494
+ """Get list of available books with their thumbnails and document files (PDF/TXT)"""
495
  books = []
496
 
497
  try:
 
505
  else:
506
  thumbnail_files = []
507
 
508
+ # Get all supported document files from OCR directory
509
  if os.path.exists(ocr_dir):
510
+ document_files = [f for f in os.listdir(ocr_dir)
511
+ if f.lower().endswith(('.pdf', '.txt'))]
512
  else:
513
+ document_files = []
514
 
515
+ # Create book entries for document files
516
+ for doc_file in document_files:
517
+ book_name = os.path.splitext(doc_file)[0]
518
+ file_extension = os.path.splitext(doc_file)[1].lower()
519
 
520
  # Look for matching thumbnail
521
  thumbnail_path = None
 
533
 
534
  books.append({
535
  'name': book_name,
536
+ 'display_name': f"{book_name.replace('_', ' ').title()} ({file_extension.upper()})",
537
+ 'document_file': os.path.join(ocr_dir, doc_file),
538
+ 'file_type': file_extension,
539
  'thumbnail': thumbnail_path
540
  })
541
 
 
576
  print(f"Error creating placeholder: {str(e)}")
577
  return None
578
 
579
+ def load_book_document(book_info):
580
+ """Load text content from a pre-existing document (PDF or TXT)"""
581
  try:
582
+ # Extract text from document using the unified function
583
+ text_content = extract_text_from_file(book_info['document_file'])
584
 
585
  if not text_content.strip() or "Error" in text_content:
586
  return text_content
 
588
  return text_content
589
 
590
  except Exception as e:
591
+ return f"Error loading document: {str(e)}"
592
 
593
  def process_selected_book(selected_book_name):
594
  """Process a pre-selected book"""
 
609
  if not selected_book:
610
  return "चुनी गई पुस्तक नहीं मिली।", "", "", gr.update(visible=False)
611
 
612
+ # Load document content and extract text
613
+ text_content = load_book_document(selected_book)
614
 
615
  if not text_content.strip() or "Error" in text_content:
616
  return text_content, "", "", gr.update(visible=False)
 
704
  if os.getenv('SPACE_ID') or os.getenv('HUGGINGFACE_HUB_CACHE'):
705
  print("🔄 Detected Hugging Face Spaces environment, checking LFS files...")
706
 
707
+ # Check if document files exist and are not LFS pointers
708
  ocr_dir = CONFIG['OCR_BOOKS_DIR']
709
  if os.path.exists(ocr_dir):
710
+ document_files = [f for f in os.listdir(ocr_dir) if f.lower().endswith(('.pdf', '.txt'))]
711
 
712
+ for doc_file in document_files:
713
+ doc_path = os.path.join(ocr_dir, doc_file)
714
 
715
  # Check if file is an LFS pointer (small text file)
716
+ if os.path.exists(doc_path):
717
+ file_size = os.path.getsize(doc_path)
718
 
719
  # LFS pointer files are typically very small (< 200 bytes)
720
+ # But TXT files might legitimately be small, so only check PDFs for LFS
721
+ if file_size < 200 and doc_file.lower().endswith('.pdf'):
722
+ print(f"📁 {doc_file} appears to be an LFS pointer, attempting download...")
723
 
724
  # Try to download using git lfs pull for this specific file
725
  try:
726
  result = subprocess.run(
727
+ ['git', 'lfs', 'pull', '--include', f"ocr_books/{doc_file}"],
728
  cwd=os.getcwd(),
729
  capture_output=True,
730
  text=True,
 
732
  )
733
 
734
  if result.returncode == 0:
735
+ print(f"✅ Successfully downloaded {doc_file}")
736
  else:
737
+ print(f"⚠️ Could not download {doc_file}: {result.stderr}")
738
 
739
  except subprocess.TimeoutExpired:
740
+ print(f"⏰ Timeout downloading {doc_file}")
741
  except Exception as e:
742
+ print(f"❌ Error downloading {doc_file}: {str(e)}")
743
  else:
744
+ file_type = "PDF" if doc_file.lower().endswith('.pdf') else "TXT"
745
+ print(f"✅ {doc_file} ({file_type}) already available ({file_size:,} bytes)")
746
 
747
  # Also check thumbnails
748
  thumbnail_dir = CONFIG['BOOK_THUMBNAILS_DIR']
 
859
  book_dropdown = gr.Dropdown(choices=["None"], value="None", visible=False)
860
  select_book_btn = gr.Button("No books available", interactive=False)
861
 
862
+ # PDF/TXT upload section
863
+ with gr.Tab("📄 Upload Document / दस्तावेज़ अपलोड करें"):
864
+ gr.Markdown("**Upload your own PDF or TXT file / अपनी PDF या TXT फ़ाइल अपलोड करें**")
865
+ gr.Markdown("**Note:** For PDF files, please ensure they contain selectable text (not scanned images)")
866
 
867
+ document_upload = gr.File(
868
+ label="Upload PDF or TXT / PDF या TXT अपलोड करें",
869
+ file_types=[".pdf", ".txt"],
870
  type="filepath"
871
  )
872
+ process_document_btn = gr.Button("📖 Process Document / दस्तावेज़ प्रसंस्करित करें", variant="primary")
873
 
874
  doc_status = gr.Textbox(label="Processing Status / प्रसंस्करण स्थिति", interactive=False)
875
 
 
920
  with gr.Column():
921
  gr.Markdown("""
922
  **Requirements & Limits / आवश्यकताएं और सीमा:**
923
+ - PDF with selectable text (no scanned images) or TXT files
924
  - Max file size: 10MB
925
  - Max queries: 5 per session
926
  - Audio transcription: First 10 seconds only
 
935
  outputs=[auth_section, main_section, auth_status]
936
  )
937
 
938
+ # Document upload event handler - Always available
939
+ process_document_btn.click(
940
  process_document,
941
+ inputs=[document_upload],
942
  outputs=[doc_status, book_title_display, author_display, query_section]
943
  )
944
 
ocr_books/Bhartiya Gyan Parampara ke Vivid Aayam Book.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d569c284ba23b1980668089f154898d7e6fc0d3f7f075678fc7370fc8b3a2a02
3
- size 52006233
 
 
 
 
ocr_books/Bhartiya Gyan Parampara ke Vivid Aayam Book.txt ADDED
The diff for this file is too large to render. See raw diff
 
ocr_books/PANINIYA Volume 41.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2050a21e70ba883faaf794b04e7051d7754e2e79eead02248b1829230c8cb645
3
- size 75749563
 
 
 
 
ocr_books/PANINIYA Volume 41.txt ADDED
The diff for this file is too large to render. See raw diff