Spaces:

Ansemin101
/

Markit

Paused

App Files Files Community

AnseMin committed on Mar 10, 2025

Commit

67baccc

1 Parent(s): f89b538

changes to full force ocr to accept pdf - attempt 1

Browse files

Files changed (3) hide show

app.py +8 -6
fix_tesseract_huggingface.py +0 -144
src/parsers/docling_parser.py +68 -9

app.py CHANGED Viewed

@@ -5,11 +5,16 @@ import shutil
 from pathlib import Path
 import urllib.request
 # Run setup.sh at startup
 try:
-    print("Running setup.sh...")
-    subprocess.run(["bash", "setup.sh"], check=False)
-    print("setup.sh completed")
 except Exception as e:
     print(f"Error running setup.sh: {e}")
@@ -21,9 +26,6 @@ try:
 except ImportError:
     print("python-dotenv not installed, skipping .env file loading")
-# Get the current directory
-current_dir = os.path.dirname(os.path.abspath(__file__))
 # Function to setup Tesseract
 def setup_tesseract():
     """Setup Tesseract OCR environment."""

 from pathlib import Path
 import urllib.request
+# Get the current directory
+current_dir = os.path.dirname(os.path.abspath(__file__))
 # Run setup.sh at startup
 try:
+    setup_script = os.path.join(current_dir, "setup.sh")
+    if os.path.exists(setup_script):
+        print("Running setup.sh...")
+        subprocess.run(["bash", setup_script], check=False)
+        print("setup.sh completed")
 except Exception as e:
     print(f"Error running setup.sh: {e}")
 except ImportError:
     print("python-dotenv not installed, skipping .env file loading")
 # Function to setup Tesseract
 def setup_tesseract():
     """Setup Tesseract OCR environment."""

fix_tesseract_huggingface.py DELETED Viewed

@@ -1,144 +0,0 @@
-#!/usr/bin/env python
-"""
-Script to diagnose and fix Tesseract issues in Hugging Face environments.
-"""
-import os
-import sys
-import shutil
-import subprocess
-import platform
-from pathlib import Path
-import urllib.request
-def diagnose_tesseract():
-    """Diagnose Tesseract installation and configuration issues."""
-    print("=== Tesseract Diagnostics ===")
-    # Check OS
-    print(f"Operating System: {platform.system()} {platform.release()}")
-    # Check if tesseract is in PATH
-    tesseract_path = shutil.which("tesseract")
-    if tesseract_path:
-        print(f"✅ Tesseract found in PATH: {tesseract_path}")
-        try:
-            version = subprocess.check_output(["tesseract", "--version"],
-                                             stderr=subprocess.STDOUT,
-                                             universal_newlines=True)
-            print(f"✅ Tesseract version info:\n{version.splitlines()[0]}")
-        except (subprocess.SubprocessError, FileNotFoundError) as e:
-            print(f"❌ Error running tesseract: {e}")
-    else:
-        print("❌ Tesseract not found in PATH")
-    # Check common installation locations
-    common_locations = [
-        "/usr/bin/tesseract",
-        "/usr/local/bin/tesseract",
-        "/opt/conda/bin/tesseract",
-        "/app/tesseract/tesseract",
-        r"C:\Program Files\Tesseract-OCR\tesseract.exe"
-    ]
-    for location in common_locations:
-        if os.path.isfile(location) and os.access(location, os.X_OK):
-            print(f"✅ Tesseract executable found at: {location}")
-    # Check TESSDATA_PREFIX
-    tessdata_prefix = os.environ.get('TESSDATA_PREFIX')
-    if tessdata_prefix:
-        print(f"✅ TESSDATA_PREFIX is set to: {tessdata_prefix}")
-        if os.path.exists(tessdata_prefix):
-            print(f"✅ TESSDATA_PREFIX directory exists")
-            eng_traineddata = os.path.join(tessdata_prefix, "eng.traineddata")
-            if os.path.exists(eng_traineddata):
-                print(f"✅ eng.traineddata found at: {eng_traineddata}")
-            else:
-                print(f"❌ eng.traineddata not found at: {eng_traineddata}")
-        else:
-            print(f"❌ TESSDATA_PREFIX directory does not exist: {tessdata_prefix}")
-    else:
-        print("❌ TESSDATA_PREFIX environment variable not set")
-    # Check pytesseract
-    try:
-        import pytesseract
-        print(f"✅ pytesseract is installed")
-        print(f"✅ pytesseract.tesseract_cmd = {pytesseract.pytesseract.tesseract_cmd}")
-    except ImportError:
-        print("❌ pytesseract is not installed")
-    # Check tesserocr
-    try:
-        import tesserocr
-        print(f"✅ tesserocr is installed")
-        print(f"✅ tesserocr version: {tesserocr.tesseract_version()}")
-    except ImportError:
-        print("❌ tesserocr is not installed")
-    except Exception as e:
-        print(f"❌ Error importing tesserocr: {e}")
-def fix_tesseract():
-    """Fix common Tesseract issues."""
-    print("\n=== Fixing Tesseract Issues ===")
-    # Create local tessdata directory
-    current_dir = os.path.dirname(os.path.abspath(__file__))
-    tessdata_dir = os.path.join(current_dir, "tessdata")
-    os.makedirs(tessdata_dir, exist_ok=True)
-    print(f"✅ Created local tessdata directory: {tessdata_dir}")
-    # Set TESSDATA_PREFIX to our local directory
-    os.environ['TESSDATA_PREFIX'] = tessdata_dir
-    print(f"✅ Set TESSDATA_PREFIX to: {tessdata_dir}")
-    # Download eng.traineddata
-    eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
-    if not os.path.exists(eng_traineddata):
-        try:
-            print("Downloading eng.traineddata...")
-            url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
-            urllib.request.urlretrieve(url, eng_traineddata)
-            print("✅ Downloaded eng.traineddata")
-        except Exception as e:
-            print(f"❌ Error downloading eng.traineddata: {e}")
-    else:
-        print("✅ eng.traineddata already exists")
-    # Configure pytesseract
-    try:
-        import pytesseract
-        tesseract_path = shutil.which("tesseract")
-        if tesseract_path:
-            pytesseract.pytesseract.tesseract_cmd = tesseract_path
-            print(f"✅ Set pytesseract.tesseract_cmd to {tesseract_path}")
-        else:
-            # Try common locations
-            common_locations = [
-                "/usr/bin/tesseract",
-                "/usr/local/bin/tesseract",
-                "/app/tesseract/tesseract"
-            ]
-            for location in common_locations:
-                if os.path.isfile(location) and os.access(location, os.X_OK):
-                    pytesseract.pytesseract.tesseract_cmd = location
-                    print(f"✅ Set pytesseract.tesseract_cmd to {location}")
-                    break
-    except ImportError:
-        print("❌ pytesseract not installed, please install it with: pip install pytesseract")
-    # Add TESSDATA_PREFIX to .env file for persistence
-    try:
-        with open(".env", "a") as f:
-            f.write(f"\nTESSDATAFIX_PREFIX={tessdata_dir}\n")
-        print("✅ Added TESSDATA_PREFIX to .env file")
-    except Exception as e:
-        print(f"❌ Error adding TESSDATA_PREFIX to .env file: {e}")
-    print("\n=== Tesseract Fix Complete ===")
-    print("Please restart your application for changes to take effect.")
-if __name__ == "__main__":
-    diagnose_tesseract()
-    fix_tesseract()

src/parsers/docling_parser.py CHANGED Viewed

@@ -1,6 +1,8 @@
 from pathlib import Path
 from typing import Dict, List, Optional, Any, Union
 import json
 from src.parsers.parser_interface import DocumentParser
 from src.parsers.parser_registry import ParserRegistry
@@ -124,25 +126,82 @@ class DoclingParser(DocumentParser):
     def _apply_full_force_ocr(self, file_path: Union[str, Path]) -> str:
         """Apply full force OCR to a document."""
         input_doc = Path(file_path)
         pipeline_options = PdfPipelineOptions()
         pipeline_options.do_ocr = True
         pipeline_options.do_table_structure = True
         pipeline_options.table_structure_options.do_cell_matching = True
-        ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
         pipeline_options.ocr_options = ocr_options
-        converter = DocumentConverter(
-            format_options={
-                InputFormat.PDF: PdfFormatOption(
-                    pipeline_options=pipeline_options,
-                )
-            }
         )
-        doc = converter.convert(input_doc).document
-        return doc.export_to_markdown()
 # Register the parser with the registry

 from pathlib import Path
 from typing import Dict, List, Optional, Any, Union
 import json
+import os
+import shutil
 from src.parsers.parser_interface import DocumentParser
 from src.parsers.parser_registry import ParserRegistry
     def _apply_full_force_ocr(self, file_path: Union[str, Path]) -> str:
         """Apply full force OCR to a document."""
         input_doc = Path(file_path)
+        file_extension = input_doc.suffix.lower()
+        # Debug information
+        print(f"Applying full force OCR to file: {input_doc} (type: {file_extension})")
+        # Set up pipeline options
         pipeline_options = PdfPipelineOptions()
         pipeline_options.do_ocr = True
         pipeline_options.do_table_structure = True
         pipeline_options.table_structure_options.do_cell_matching = True
+        # Find tesseract executable
+        tesseract_cmd = None
+        tesseract_paths = [
+            "tesseract",  # Default PATH
+            "/usr/bin/tesseract",  # Common Linux location
+            "/app/tesseract/tesseract",  # Possible custom location in Hugging Face
+            "/opt/conda/bin/tesseract",  # Possible Conda env in Hugging Face
+            r"C:\Program Files\Tesseract-OCR\tesseract.exe"  # Windows location
+        ]
+        for path in tesseract_paths:
+            if shutil.which(path) or (os.path.isfile(path) and os.access(path, os.X_OK)):
+                tesseract_cmd = path
+                print(f"Found tesseract at: {tesseract_cmd}")
+                break
+        if not tesseract_cmd:
+            print("Warning: Tesseract executable not found. Using default configuration.")
+            tesseract_cmd = "tesseract"  # Use default as fallback
+        # Configure OCR options with explicit tesseract path
+        ocr_options = TesseractCliOcrOptions(
+            force_full_page_ocr=True,
+            tesseract_cmd=tesseract_cmd
+        )
         pipeline_options.ocr_options = ocr_options
+        # Set up format options for both PDF and image formats
+        format_options = {}
+        # Always include PDF format option
+        format_options[InputFormat.PDF] = PdfFormatOption(
+            pipeline_options=pipeline_options,
         )
+        # For image files, we need to handle them differently
+        if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
+            # For image files, we'll use the same pipeline options
+            # but we need to specify the input format as IMAGE
+            print(f"Processing as image file: {file_extension}")
+            # Note: InputFormat.IMAGE is used for image files in Docling
+            format_options[InputFormat.IMAGE] = PdfFormatOption(
+                pipeline_options=pipeline_options,
+            )
+        # Create converter with appropriate format options
+        converter = DocumentConverter(format_options=format_options)
+        try:
+            # Convert the document
+            result = converter.convert(input_doc)
+            doc = result.document
+            return doc.export_to_markdown()
+        except Exception as e:
+            # Provide detailed error information
+            print(f"Error during full force OCR: {e}")
+            print(f"File type: {file_extension}, File exists: {input_doc.exists()}")
+            # Try fallback to regular OCR if full force fails
+            try:
+                print("Attempting fallback to regular tesseract_cli OCR...")
+                return self.parse(file_path, ocr_method="tesseract_cli")
+            except Exception as fallback_error:
+                print(f"Fallback OCR also failed: {fallback_error}")
+                return f"OCR failed for {input_doc}. Error: {str(e)}"
 # Register the parser with the registry