AnseMin committed on
Commit
67baccc
·
1 Parent(s): f89b538

changes to full force ocr to accept pdf - attempt 1

Browse files
app.py CHANGED
@@ -5,11 +5,16 @@ import shutil
5
  from pathlib import Path
6
  import urllib.request
7
 
 
 
 
8
  # Run setup.sh at startup
9
  try:
10
- print("Running setup.sh...")
11
- subprocess.run(["bash", "setup.sh"], check=False)
12
- print("setup.sh completed")
 
 
13
  except Exception as e:
14
  print(f"Error running setup.sh: {e}")
15
 
@@ -21,9 +26,6 @@ try:
21
  except ImportError:
22
  print("python-dotenv not installed, skipping .env file loading")
23
 
24
- # Get the current directory
25
- current_dir = os.path.dirname(os.path.abspath(__file__))
26
-
27
  # Function to setup Tesseract
28
  def setup_tesseract():
29
  """Setup Tesseract OCR environment."""
 
5
  from pathlib import Path
6
  import urllib.request
7
 
8
+ # Get the current directory
9
+ current_dir = os.path.dirname(os.path.abspath(__file__))
10
+
11
  # Run setup.sh at startup
12
  try:
13
+ setup_script = os.path.join(current_dir, "setup.sh")
14
+ if os.path.exists(setup_script):
15
+ print("Running setup.sh...")
16
+ subprocess.run(["bash", setup_script], check=False)
17
+ print("setup.sh completed")
18
  except Exception as e:
19
  print(f"Error running setup.sh: {e}")
20
 
 
26
  except ImportError:
27
  print("python-dotenv not installed, skipping .env file loading")
28
 
 
 
 
29
  # Function to setup Tesseract
30
  def setup_tesseract():
31
  """Setup Tesseract OCR environment."""
fix_tesseract_huggingface.py DELETED
@@ -1,144 +0,0 @@
1
- #!/usr/bin/env python
2
- """
3
- Script to diagnose and fix Tesseract issues in Hugging Face environments.
4
- """
5
-
6
- import os
7
- import sys
8
- import shutil
9
- import subprocess
10
- import platform
11
- from pathlib import Path
12
- import urllib.request
13
-
14
- def diagnose_tesseract():
15
- """Diagnose Tesseract installation and configuration issues."""
16
- print("=== Tesseract Diagnostics ===")
17
-
18
- # Check OS
19
- print(f"Operating System: {platform.system()} {platform.release()}")
20
-
21
- # Check if tesseract is in PATH
22
- tesseract_path = shutil.which("tesseract")
23
- if tesseract_path:
24
- print(f"✅ Tesseract found in PATH: {tesseract_path}")
25
- try:
26
- version = subprocess.check_output(["tesseract", "--version"],
27
- stderr=subprocess.STDOUT,
28
- universal_newlines=True)
29
- print(f"✅ Tesseract version info:\n{version.splitlines()[0]}")
30
- except (subprocess.SubprocessError, FileNotFoundError) as e:
31
- print(f"❌ Error running tesseract: {e}")
32
- else:
33
- print("❌ Tesseract not found in PATH")
34
-
35
- # Check common installation locations
36
- common_locations = [
37
- "/usr/bin/tesseract",
38
- "/usr/local/bin/tesseract",
39
- "/opt/conda/bin/tesseract",
40
- "/app/tesseract/tesseract",
41
- r"C:\Program Files\Tesseract-OCR\tesseract.exe"
42
- ]
43
-
44
- for location in common_locations:
45
- if os.path.isfile(location) and os.access(location, os.X_OK):
46
- print(f"✅ Tesseract executable found at: {location}")
47
-
48
- # Check TESSDATA_PREFIX
49
- tessdata_prefix = os.environ.get('TESSDATA_PREFIX')
50
- if tessdata_prefix:
51
- print(f"✅ TESSDATA_PREFIX is set to: {tessdata_prefix}")
52
- if os.path.exists(tessdata_prefix):
53
- print(f"✅ TESSDATA_PREFIX directory exists")
54
- eng_traineddata = os.path.join(tessdata_prefix, "eng.traineddata")
55
- if os.path.exists(eng_traineddata):
56
- print(f"✅ eng.traineddata found at: {eng_traineddata}")
57
- else:
58
- print(f"❌ eng.traineddata not found at: {eng_traineddata}")
59
- else:
60
- print(f"❌ TESSDATA_PREFIX directory does not exist: {tessdata_prefix}")
61
- else:
62
- print("❌ TESSDATA_PREFIX environment variable not set")
63
-
64
- # Check pytesseract
65
- try:
66
- import pytesseract
67
- print(f"✅ pytesseract is installed")
68
- print(f"✅ pytesseract.tesseract_cmd = {pytesseract.pytesseract.tesseract_cmd}")
69
- except ImportError:
70
- print("❌ pytesseract is not installed")
71
-
72
- # Check tesserocr
73
- try:
74
- import tesserocr
75
- print(f"✅ tesserocr is installed")
76
- print(f"✅ tesserocr version: {tesserocr.tesseract_version()}")
77
- except ImportError:
78
- print("❌ tesserocr is not installed")
79
- except Exception as e:
80
- print(f"❌ Error importing tesserocr: {e}")
81
-
82
- def fix_tesseract():
83
- """Fix common Tesseract issues."""
84
- print("\n=== Fixing Tesseract Issues ===")
85
-
86
- # Create local tessdata directory
87
- current_dir = os.path.dirname(os.path.abspath(__file__))
88
- tessdata_dir = os.path.join(current_dir, "tessdata")
89
- os.makedirs(tessdata_dir, exist_ok=True)
90
- print(f"✅ Created local tessdata directory: {tessdata_dir}")
91
-
92
- # Set TESSDATA_PREFIX to our local directory
93
- os.environ['TESSDATA_PREFIX'] = tessdata_dir
94
- print(f"✅ Set TESSDATA_PREFIX to: {tessdata_dir}")
95
-
96
- # Download eng.traineddata
97
- eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
98
- if not os.path.exists(eng_traineddata):
99
- try:
100
- print("Downloading eng.traineddata...")
101
- url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
102
- urllib.request.urlretrieve(url, eng_traineddata)
103
- print("✅ Downloaded eng.traineddata")
104
- except Exception as e:
105
- print(f"❌ Error downloading eng.traineddata: {e}")
106
- else:
107
- print("✅ eng.traineddata already exists")
108
-
109
- # Configure pytesseract
110
- try:
111
- import pytesseract
112
- tesseract_path = shutil.which("tesseract")
113
- if tesseract_path:
114
- pytesseract.pytesseract.tesseract_cmd = tesseract_path
115
- print(f"✅ Set pytesseract.tesseract_cmd to {tesseract_path}")
116
- else:
117
- # Try common locations
118
- common_locations = [
119
- "/usr/bin/tesseract",
120
- "/usr/local/bin/tesseract",
121
- "/app/tesseract/tesseract"
122
- ]
123
- for location in common_locations:
124
- if os.path.isfile(location) and os.access(location, os.X_OK):
125
- pytesseract.pytesseract.tesseract_cmd = location
126
- print(f"✅ Set pytesseract.tesseract_cmd to {location}")
127
- break
128
- except ImportError:
129
- print("❌ pytesseract not installed, please install it with: pip install pytesseract")
130
-
131
- # Add TESSDATA_PREFIX to .env file for persistence
132
- try:
133
- with open(".env", "a") as f:
134
- f.write(f"\nTESSDATAFIX_PREFIX={tessdata_dir}\n")
135
- print("✅ Added TESSDATA_PREFIX to .env file")
136
- except Exception as e:
137
- print(f"❌ Error adding TESSDATA_PREFIX to .env file: {e}")
138
-
139
- print("\n=== Tesseract Fix Complete ===")
140
- print("Please restart your application for changes to take effect.")
141
-
142
- if __name__ == "__main__":
143
- diagnose_tesseract()
144
- fix_tesseract()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/parsers/docling_parser.py CHANGED
@@ -1,6 +1,8 @@
1
  from pathlib import Path
2
  from typing import Dict, List, Optional, Any, Union
3
  import json
 
 
4
 
5
  from src.parsers.parser_interface import DocumentParser
6
  from src.parsers.parser_registry import ParserRegistry
@@ -124,25 +126,82 @@ class DoclingParser(DocumentParser):
124
  def _apply_full_force_ocr(self, file_path: Union[str, Path]) -> str:
125
  """Apply full force OCR to a document."""
126
  input_doc = Path(file_path)
 
127
 
 
 
 
 
128
  pipeline_options = PdfPipelineOptions()
129
  pipeline_options.do_ocr = True
130
  pipeline_options.do_table_structure = True
131
  pipeline_options.table_structure_options.do_cell_matching = True
132
 
133
- ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  pipeline_options.ocr_options = ocr_options
135
 
136
- converter = DocumentConverter(
137
- format_options={
138
- InputFormat.PDF: PdfFormatOption(
139
- pipeline_options=pipeline_options,
140
- )
141
- }
142
  )
143
 
144
- doc = converter.convert(input_doc).document
145
- return doc.export_to_markdown()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
 
148
  # Register the parser with the registry
 
1
  from pathlib import Path
2
  from typing import Dict, List, Optional, Any, Union
3
  import json
4
+ import os
5
+ import shutil
6
 
7
  from src.parsers.parser_interface import DocumentParser
8
  from src.parsers.parser_registry import ParserRegistry
 
126
  def _apply_full_force_ocr(self, file_path: Union[str, Path]) -> str:
127
  """Apply full force OCR to a document."""
128
  input_doc = Path(file_path)
129
+ file_extension = input_doc.suffix.lower()
130
 
131
+ # Debug information
132
+ print(f"Applying full force OCR to file: {input_doc} (type: {file_extension})")
133
+
134
+ # Set up pipeline options
135
  pipeline_options = PdfPipelineOptions()
136
  pipeline_options.do_ocr = True
137
  pipeline_options.do_table_structure = True
138
  pipeline_options.table_structure_options.do_cell_matching = True
139
 
140
+ # Find tesseract executable
141
+ tesseract_cmd = None
142
+ tesseract_paths = [
143
+ "tesseract", # Default PATH
144
+ "/usr/bin/tesseract", # Common Linux location
145
+ "/app/tesseract/tesseract", # Possible custom location in Hugging Face
146
+ "/opt/conda/bin/tesseract", # Possible Conda env in Hugging Face
147
+ r"C:\Program Files\Tesseract-OCR\tesseract.exe" # Windows location
148
+ ]
149
+
150
+ for path in tesseract_paths:
151
+ if shutil.which(path) or (os.path.isfile(path) and os.access(path, os.X_OK)):
152
+ tesseract_cmd = path
153
+ print(f"Found tesseract at: {tesseract_cmd}")
154
+ break
155
+
156
+ if not tesseract_cmd:
157
+ print("Warning: Tesseract executable not found. Using default configuration.")
158
+ tesseract_cmd = "tesseract" # Use default as fallback
159
+
160
+ # Configure OCR options with explicit tesseract path
161
+ ocr_options = TesseractCliOcrOptions(
162
+ force_full_page_ocr=True,
163
+ tesseract_cmd=tesseract_cmd
164
+ )
165
  pipeline_options.ocr_options = ocr_options
166
 
167
+ # Set up format options for both PDF and image formats
168
+ format_options = {}
169
+
170
+ # Always include PDF format option
171
+ format_options[InputFormat.PDF] = PdfFormatOption(
172
+ pipeline_options=pipeline_options,
173
  )
174
 
175
+ # For image files, we need to handle them differently
176
+ if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
177
+ # For image files, we'll use the same pipeline options
178
+ # but we need to specify the input format as IMAGE
179
+ print(f"Processing as image file: {file_extension}")
180
+ # Note: InputFormat.IMAGE is used for image files in Docling
181
+ format_options[InputFormat.IMAGE] = PdfFormatOption(
182
+ pipeline_options=pipeline_options,
183
+ )
184
+
185
+ # Create converter with appropriate format options
186
+ converter = DocumentConverter(format_options=format_options)
187
+
188
+ try:
189
+ # Convert the document
190
+ result = converter.convert(input_doc)
191
+ doc = result.document
192
+ return doc.export_to_markdown()
193
+ except Exception as e:
194
+ # Provide detailed error information
195
+ print(f"Error during full force OCR: {e}")
196
+ print(f"File type: {file_extension}, File exists: {input_doc.exists()}")
197
+
198
+ # Try fallback to regular OCR if full force fails
199
+ try:
200
+ print("Attempting fallback to regular tesseract_cli OCR...")
201
+ return self.parse(file_path, ocr_method="tesseract_cli")
202
+ except Exception as fallback_error:
203
+ print(f"Fallback OCR also failed: {fallback_error}")
204
+ return f"OCR failed for {input_doc}. Error: {str(e)}"
205
 
206
 
207
  # Register the parser with the registry