Hindi-Rag

Sleeping

App Files Files Community

pranavinani committed on Jul 12, 2025

Commit

3f24fca

1 Parent(s): 6cd5233

added text file

Browse files

Files changed (5) hide show

app.py +95 -52
ocr_books/Bhartiya Gyan Parampara ke Vivid Aayam Book.pdf +0 -3
ocr_books/Bhartiya Gyan Parampara ke Vivid Aayam Book.txt +0 -0
ocr_books/PANINIYA Volume 41.pdf +0 -3
ocr_books/PANINIYA Volume 41.txt +0 -0

app.py CHANGED Viewed

@@ -153,6 +153,29 @@ def text_to_speech(text):
         return None
 # Text extraction functions
 def extract_text_from_pdf(pdf_path):
     """Extract text from PDF using PyMuPDF (assumes selectable text)"""
     text_content = ""
@@ -186,6 +209,17 @@ def extract_text_from_pdf(pdf_path):
         print(f"PDF extraction error: {str(e)}")
         return f"Error extracting text: {str(e)}"
 def extract_metadata(text):
     """Extract author name and book title from text"""
     lines = [line.strip() for line in text.split('\n')[:25] if line.strip()]
@@ -316,24 +350,29 @@ def authenticate(passcode):
         return gr.update(visible=True), gr.update(visible=False), "❌ Invalid passcode / गलत पासकोड"
 # Document processing function
-def process_document(pdf_file):
-    """Process uploaded PDF document"""
-    if pdf_file is None:
-        return "कृपया एक PDF फ़ाइल अपलोड करें।", "", "", gr.update(visible=False)
     try:
-        print(f"Processing uploaded file: {pdf_file.name}")
         # Check file size
-        file_size = os.path.getsize(pdf_file.name)
         print(f"File size: {file_size} bytes")
         if file_size > CONFIG['MAX_FILE_SIZE']:
             return f"फ़ाइल बहुत बड़ी है! अधिकतम आकार: {CONFIG['MAX_FILE_SIZE'] // (1024*1024)}MB", "", "", gr.update(visible=False)
-        # Extract text (no OCR - assumes selectable text)
-        print("Extracting text from PDF...")
-        text_content = extract_text_from_pdf(pdf_file.name)
         # Check if extraction failed
         if not text_content.strip():
@@ -452,7 +491,7 @@ def reset_session():
 # Book management functions
 def get_available_books():
-    """Get list of available books with their thumbnails and PDF files"""
     books = []
     try:
@@ -466,16 +505,17 @@ def get_available_books():
         else:
             thumbnail_files = []
-        # Get all PDF files from OCR directory
         if os.path.exists(ocr_dir):
-            pdf_files = [f for f in os.listdir(ocr_dir)
-                         if f.lower().endswith('.pdf')]
         else:
-            pdf_files = []
-        # Create book entries for PDF files
-        for pdf_file in pdf_files:
-            book_name = os.path.splitext(pdf_file)[0]
             # Look for matching thumbnail
             thumbnail_path = None
@@ -493,8 +533,9 @@ def get_available_books():
             books.append({
                 'name': book_name,
-                'display_name': book_name.replace('_', ' ').title(),
-                'pdf_file': os.path.join(ocr_dir, pdf_file),
                 'thumbnail': thumbnail_path
             })
@@ -535,11 +576,11 @@ def create_text_placeholder(book_name):
         print(f"Error creating placeholder: {str(e)}")
         return None
-def load_book_pdf(book_info):
-    """Load text content from a pre-existing PDF book"""
     try:
-        # Extract text from PDF using the existing function
-        text_content = extract_text_from_pdf(book_info['pdf_file'])
         if not text_content.strip() or "Error" in text_content:
             return text_content
@@ -547,7 +588,7 @@ def load_book_pdf(book_info):
         return text_content
     except Exception as e:
-        return f"Error loading PDF book: {str(e)}"
 def process_selected_book(selected_book_name):
     """Process a pre-selected book"""
@@ -568,8 +609,8 @@ def process_selected_book(selected_book_name):
         if not selected_book:
             return "चुनी गई पुस्तक नहीं मिली।", "", "", gr.update(visible=False)
-        # Load PDF content and extract text
-        text_content = load_book_pdf(selected_book)
         if not text_content.strip() or "Error" in text_content:
             return text_content, "", "", gr.update(visible=False)
@@ -663,26 +704,27 @@ def ensure_lfs_files_downloaded():
         if os.getenv('SPACE_ID') or os.getenv('HUGGINGFACE_HUB_CACHE'):
             print("🔄 Detected Hugging Face Spaces environment, checking LFS files...")
-            # Check if PDF files exist and are not LFS pointers
             ocr_dir = CONFIG['OCR_BOOKS_DIR']
             if os.path.exists(ocr_dir):
-                pdf_files = [f for f in os.listdir(ocr_dir) if f.lower().endswith('.pdf')]
-                for pdf_file in pdf_files:
-                    pdf_path = os.path.join(ocr_dir, pdf_file)
                     # Check if file is an LFS pointer (small text file)
-                    if os.path.exists(pdf_path):
-                        file_size = os.path.getsize(pdf_path)
                         # LFS pointer files are typically very small (< 200 bytes)
-                        if file_size < 200:
-                            print(f"📁 {pdf_file} appears to be an LFS pointer, attempting download...")
                             # Try to download using git lfs pull for this specific file
                             try:
                                 result = subprocess.run(
-                                    ['git', 'lfs', 'pull', '--include', f"ocr_books/{pdf_file}"],
                                     cwd=os.getcwd(),
                                     capture_output=True,
                                     text=True,
@@ -690,16 +732,17 @@ def ensure_lfs_files_downloaded():
                                 )
                                 if result.returncode == 0:
-                                    print(f"✅ Successfully downloaded {pdf_file}")
                                 else:
-                                    print(f"⚠️ Could not download {pdf_file}: {result.stderr}")
                             except subprocess.TimeoutExpired:
-                                print(f"⏰ Timeout downloading {pdf_file}")
                             except Exception as e:
-                                print(f"❌ Error downloading {pdf_file}: {str(e)}")
                         else:
-                            print(f"✅ {pdf_file} already downloaded ({file_size:,} bytes)")
             # Also check thumbnails
             thumbnail_dir = CONFIG['BOOK_THUMBNAILS_DIR']
@@ -816,17 +859,17 @@ def create_interface():
                     book_dropdown = gr.Dropdown(choices=["None"], value="None", visible=False)
                     select_book_btn = gr.Button("No books available", interactive=False)
-            # PDF upload section
-            with gr.Tab("📄 Upload PDF / PDF अपलोड करें"):
-                gr.Markdown("**Upload your own PDF / अपनी PDF अपलोड करें**")
-                gr.Markdown("**Note:** Please ensure your PDF contains selectable text (not scanned images)")
-                pdf_upload = gr.File(
-                    label="Upload PDF / PDF अपलोड करें",
-                    file_types=[".pdf"],
                     type="filepath"
                 )
-                process_pdf_btn = gr.Button("📖 Process PDF / PDF प्रसंस्करित करें", variant="primary")
             doc_status = gr.Textbox(label="Processing Status / प्रसंस्करण स्थिति", interactive=False)
@@ -877,7 +920,7 @@ def create_interface():
                 with gr.Column():
                     gr.Markdown("""
                     **Requirements & Limits / आवश्यकताएं और सीमा:**
-                    - PDF with selectable text (no scanned images)
                     - Max file size: 10MB
                     - Max queries: 5 per session
                     - Audio transcription: First 10 seconds only
@@ -892,10 +935,10 @@ def create_interface():
             outputs=[auth_section, main_section, auth_status]
         )
-        # PDF upload event handler - Always available
-        process_pdf_btn.click(
             process_document,
-            inputs=[pdf_upload],
             outputs=[doc_status, book_title_display, author_display, query_section]
         )

         return None
 # Text extraction functions
+def extract_text_from_txt(txt_path):  # returns the decoded text, or an "Error..." message string on failure
+    """Extract text from TXT file"""
+    try:
+        # Try each candidate encoding in list order; the first one that decodes to non-blank text wins
+        encodings = ['utf-8', 'utf-8-sig', 'latin-1', 'cp1252']  # NOTE(review): latin-1 can decode any byte sequence, so cp1252 looks reachable only after a blank latin-1 result - confirm intent
+        for encoding in encodings:
+            try:
+                with open(txt_path, 'r', encoding=encoding) as file:
+                    text_content = file.read()
+                if text_content.strip():  # blank or whitespace-only text falls through to the next encoding
+                    print(f"Successfully extracted {len(text_content)} characters from TXT file using {encoding} encoding")
+                    return text_content
+            except UnicodeDecodeError:
+                continue  # this encoding did not fit the bytes; move on to the next candidate
+        return "Error: Could not decode TXT file with any supported encoding"  # also reached when every decode yields only whitespace (e.g. an empty file)
+    except Exception as e:  # any other failure (e.g. opening the file) is logged and reported as a string
+        print(f"TXT extraction error: {str(e)}")
+        return f"Error extracting text: {str(e)}"
 def extract_text_from_pdf(pdf_path):
     """Extract text from PDF using PyMuPDF (assumes selectable text)"""
     text_content = ""
         print(f"PDF extraction error: {str(e)}")
         return f"Error extracting text: {str(e)}"
+def extract_text_from_file(file_path):  # dispatches to the PDF or TXT extractor based on the file extension
+    """Extract text from file (supports PDF and TXT)"""
+    file_extension = os.path.splitext(file_path)[1].lower()  # lower-cased so ".PDF" / ".TXT" match too
+    if file_extension == '.pdf':
+        return extract_text_from_pdf(file_path)
+    elif file_extension == '.txt':
+        return extract_text_from_txt(file_path)
+    else:
+        return f"Error: Unsupported file format {file_extension}. Only PDF and TXT files are supported."  # returned as a string, not raised; load_book_document checks for "Error" in the result
 def extract_metadata(text):
     """Extract author name and book title from text"""
     lines = [line.strip() for line in text.split('\n')[:25] if line.strip()]
         return gr.update(visible=True), gr.update(visible=False), "❌ Invalid passcode / गलत पासकोड"
 # Document processing function
+def process_document(document_file):
+    """Process uploaded document (PDF or TXT)"""
+    if document_file is None:
+        return "कृपया एक PDF या TXT फ़ाइल अपलोड करें।", "", "", gr.update(visible=False)
     try:
+        print(f"Processing uploaded file: {document_file.name}")
+        # Check file extension
+        file_extension = os.path.splitext(document_file.name)[1].lower()
+        if file_extension not in ['.pdf', '.txt']:
+            return "केवल PDF और TXT फ़ाइलें समर्थित हैं।", "", "", gr.update(visible=False)
         # Check file size
+        file_size = os.path.getsize(document_file.name)
         print(f"File size: {file_size} bytes")
         if file_size > CONFIG['MAX_FILE_SIZE']:
             return f"फ़ाइल बहुत बड़ी है! अधिकतम आकार: {CONFIG['MAX_FILE_SIZE'] // (1024*1024)}MB", "", "", gr.update(visible=False)
+        # Extract text using unified function
+        print(f"Extracting text from {file_extension.upper()} file...")
+        text_content = extract_text_from_file(document_file.name)
         # Check if extraction failed
         if not text_content.strip():
 # Book management functions
 def get_available_books():
+    """Get list of available books with their thumbnails and document files (PDF/TXT)"""
     books = []
     try:
         else:
             thumbnail_files = []
+        # Get all supported document files from OCR directory
         if os.path.exists(ocr_dir):
+            document_files = [f for f in os.listdir(ocr_dir)
+                            if f.lower().endswith(('.pdf', '.txt'))]
         else:
+            document_files = []
+        # Create book entries for document files
+        for doc_file in document_files:
+            book_name = os.path.splitext(doc_file)[0]
+            file_extension = os.path.splitext(doc_file)[1].lower()
             # Look for matching thumbnail
             thumbnail_path = None
             books.append({
                 'name': book_name,
+                'display_name': f"{book_name.replace('_', ' ').title()} ({file_extension.upper()})",
+                'document_file': os.path.join(ocr_dir, doc_file),
+                'file_type': file_extension,
                 'thumbnail': thumbnail_path
             })
         print(f"Error creating placeholder: {str(e)}")
         return None
+def load_book_document(book_info):
+    """Load text content from a pre-existing document (PDF or TXT)"""
     try:
+        # Extract text from document using the unified function
+        text_content = extract_text_from_file(book_info['document_file'])
         if not text_content.strip() or "Error" in text_content:
             return text_content
         return text_content
     except Exception as e:
+        return f"Error loading document: {str(e)}"
 def process_selected_book(selected_book_name):
     """Process a pre-selected book"""
         if not selected_book:
             return "चुनी गई पुस्तक नहीं मिली।", "", "", gr.update(visible=False)
+        # Load document content and extract text
+        text_content = load_book_document(selected_book)
         if not text_content.strip() or "Error" in text_content:
             return text_content, "", "", gr.update(visible=False)
         if os.getenv('SPACE_ID') or os.getenv('HUGGINGFACE_HUB_CACHE'):
             print("🔄 Detected Hugging Face Spaces environment, checking LFS files...")
+            # Check if document files exist and are not LFS pointers
             ocr_dir = CONFIG['OCR_BOOKS_DIR']
             if os.path.exists(ocr_dir):
+                document_files = [f for f in os.listdir(ocr_dir) if f.lower().endswith(('.pdf', '.txt'))]
+                for doc_file in document_files:
+                    doc_path = os.path.join(ocr_dir, doc_file)
                     # Check if file is an LFS pointer (small text file)
+                    if os.path.exists(doc_path):
+                        file_size = os.path.getsize(doc_path)
                         # LFS pointer files are typically very small (< 200 bytes)
+                        # But TXT files might legitimately be small, so only check PDFs for LFS
+                        if file_size < 200 and doc_file.lower().endswith('.pdf'):
+                            print(f"📁 {doc_file} appears to be an LFS pointer, attempting download...")
                             # Try to download using git lfs pull for this specific file
                             try:
                                 result = subprocess.run(
+                                    ['git', 'lfs', 'pull', '--include', f"ocr_books/{doc_file}"],
                                     cwd=os.getcwd(),
                                     capture_output=True,
                                     text=True,
                                 )
                                 if result.returncode == 0:
+                                    print(f"✅ Successfully downloaded {doc_file}")
                                 else:
+                                    print(f"⚠️ Could not download {doc_file}: {result.stderr}")
                             except subprocess.TimeoutExpired:
+                                print(f"⏰ Timeout downloading {doc_file}")
                             except Exception as e:
+                                print(f"❌ Error downloading {doc_file}: {str(e)}")
                         else:
+                            file_type = "PDF" if doc_file.lower().endswith('.pdf') else "TXT"
+                            print(f"✅ {doc_file} ({file_type}) already available ({file_size:,} bytes)")
             # Also check thumbnails
             thumbnail_dir = CONFIG['BOOK_THUMBNAILS_DIR']
                     book_dropdown = gr.Dropdown(choices=["None"], value="None", visible=False)
                     select_book_btn = gr.Button("No books available", interactive=False)
+            # PDF/TXT upload section
+            with gr.Tab("📄 Upload Document / दस्तावेज़ अपलोड करें"):
+                gr.Markdown("**Upload your own PDF or TXT file / अपनी PDF या TXT फ़ाइल अपलोड करें**")
+                gr.Markdown("**Note:** For PDF files, please ensure they contain selectable text (not scanned images)")
+                document_upload = gr.File(
+                    label="Upload PDF or TXT / PDF या TXT अपलोड करें",
+                    file_types=[".pdf", ".txt"],
                     type="filepath"
                 )
+                process_document_btn = gr.Button("📖 Process Document / दस्तावेज़ प्रसंस्करित करें", variant="primary")
             doc_status = gr.Textbox(label="Processing Status / प्रसंस्करण स्थिति", interactive=False)
                 with gr.Column():
                     gr.Markdown("""
                     **Requirements & Limits / आवश्यकताएं और सीमा:**
+                    - PDF with selectable text (no scanned images) or TXT files
                     - Max file size: 10MB
                     - Max queries: 5 per session
                     - Audio transcription: First 10 seconds only
             outputs=[auth_section, main_section, auth_status]
         )
+        # Document upload event handler - Always available
+        process_document_btn.click(
             process_document,
+            inputs=[document_upload],
             outputs=[doc_status, book_title_display, author_display, query_section]
         )

ocr_books/Bhartiya Gyan Parampara ke Vivid Aayam Book.pdf DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d569c284ba23b1980668089f154898d7e6fc0d3f7f075678fc7370fc8b3a2a02
-size 52006233

ocr_books/Bhartiya Gyan Parampara ke Vivid Aayam Book.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

ocr_books/PANINIYA Volume 41.pdf DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2050a21e70ba883faaf794b04e7051d7754e2e79eead02248b1829230c8cb645
-size 75749563

ocr_books/PANINIYA Volume 41.txt ADDED Viewed

The diff for this file is too large to render. See raw diff