Vachudev committed on
Commit
11b7950
·
verified ·
1 Parent(s): 041c268

update pdf preprocessing

Browse files
Files changed (1) hide show
  1. ocr_preprocessing_engine.py +20 -219
ocr_preprocessing_engine.py CHANGED
@@ -1,11 +1,3 @@
1
- Based on the "Make OCR Actually Work" video, the failure of OCR is often due to skipping four specific preprocessing steps: **Normalization, Denoising, Deskewing, and Thresholding**. The video demonstrates that even advanced Transformer models fail if an image is rotated or has poor contrast.
2
-
3
- Here is the updated modular pipeline. I have rewritten `ocr_preprocessing_engine.py` to strictly implement the 4-step workflow highlighted in the video, and refined `prompts.py` to take advantage of the cleaner text output.
4
-
5
- ### 1. Improved `ocr_preprocessing_engine.py`
6
- **Changes:** Added explicit **Normalization** (contrast stretching) and **Denoising** steps before Binarization, as emphasized in the video source.
7
-
8
- ```python
9
  import cv2
10
  import numpy as np
11
  import pytesseract
@@ -16,77 +8,72 @@ import logging
16
 
17
  logger = logging.getLogger("ocr_preprocessor")
18
 
19
- def preprocess_image(image: Image.Image) -> Image.Image:
20
  """
21
- Implements the 4-step pipeline from the 'Make OCR Work' video source:
22
  1. Normalization (Contrast Stretching)
23
- 2. Denoising
24
- 3. Deskewing
25
  4. Thresholding (Binarization)
26
  """
27
  # Convert PIL to OpenCV format
28
  img_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
29
 
30
- # 1. Normalization: Stretch pixel intensity to 0-255 range
31
- # This fixes images that look "washed out" or "completely black" due to bad contrast.
32
- norm_img = np.zeros((img_cv.shape, img_cv.shape), dtype=np.uint8)
33
  img_cv = cv2.normalize(img_cv, norm_img, 0, 255, cv2.NORM_MINMAX)
34
 
35
- # Convert to Grayscale for further processing
36
  gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
37
 
38
- # 2. Denoising: Remove speckles/artifacts
39
- # fastNlMeans is effective but slow; using GaussianBlur as a faster CPU-friendly alternative
40
  denoised = cv2.GaussianBlur(gray, (5, 5), 0)
41
 
42
- # 3. Thresholding (Binarization)
43
- # The video suggests finding the right value. Otsu's method (THRESH_OTSU) automatically
44
- # finds the optimal threshold value to separate text (foreground) from background.
45
  _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
46
 
47
- # 4. Deskewing
48
- # The video notes that without rotation correction, OCR often returns nothing.
49
  coords = np.column_stack(np.where(binary > 0))
50
  angle = cv2.minAreaRect(coords)[-1]
51
 
52
- # Adjust angle convention for OpenCV
53
  if angle < -45:
54
  angle = -(90 + angle)
55
  else:
56
  angle = -angle
57
 
58
- # Rotate only if the skew is noticeable (>0.5 degrees) to avoid interpolation artifacts
59
  if abs(angle) > 0.5:
60
  (h, w) = binary.shape[:2]
61
  center = (w // 2, h // 2)
62
  M = cv2.getRotationMatrix2D(center, angle, 1.0)
63
  binary = cv2.warpAffine(binary, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
64
- logger.info(f"Image deskewed by {angle:.2f} degrees.")
65
 
66
  return Image.fromarray(binary)
67
 
68
  def extract_text_with_preprocessing(file_path: str) -> str:
69
  """
70
- Pipeline: Load -> High-DPI Convert -> 4-Step Preprocess -> Tesseract -> Text
 
71
  """
72
  if not os.path.exists(file_path):
73
  return ""
74
 
75
  text_content = ""
76
  try:
77
- # Load PDF at 300 DPI - Essential for Tesseract accuracy
78
  if file_path.lower().endswith('.pdf'):
79
  images = convert_from_path(file_path, dpi=300)
80
  else:
81
  images = [Image.open(file_path)]
82
 
83
  for i, img in enumerate(images):
84
- # Apply the 4-step video pipeline
85
- processed_img = preprocess_image(img)
86
 
87
  # Tesseract Config:
88
- # --psm 4: Assume variable size text (good for invoices)
89
- # preserve_interword_spaces: Helps extraction of table columns
90
  custom_config = r'--oem 3 --psm 4 -c preserve_interword_spaces=1'
91
 
92
  page_text = pytesseract.image_to_string(processed_img, config=custom_config)
@@ -96,190 +83,4 @@ def extract_text_with_preprocessing(file_path: str) -> str:
96
  logger.error(f"Preprocessing/OCR Error: {e}")
97
  return f"Error processing file: {str(e)}"
98
 
99
- return text_content.strip()
100
- ```
101
-
102
- ### 2. Refined `prompts.py`
103
- **Changes:** Since preprocessing (deskewing/normalization) yields cleaner text, we can be stricter in the SOP. I have updated the System Prompt to explicitly map the "Golden Sample" logic to the output.
104
-
105
- ```python
106
- def get_ocr_extraction_prompt(raw_text: str) -> str:
107
- """
108
- Returns a strict prompt with SOP and One-Shot example.
109
- Refined to handle 'Line Items' specifically as preprocessing makes tables more readable.
110
- """
111
- return f"""<|im_start|>system
112
- You are a precise Invoice Data Extraction Agent.
113
- Your input is raw OCR text from a pre-processed invoice image.
114
-
115
- ### STANDARD OPERATING PROCEDURE (SOP):
116
- 1. **Header Extraction**: Identify the Vendor Name, Invoice Number, and Dates (Invoice & Due).
117
- 2. **Table Parsing**: The OCR preserves inter-word spacing. Use this to identify the 'Line Items' table.
118
- 3. **Normalization**:
119
- - Dates must be YYYY-MM-DD.
120
- - Amounts must be floats (no currency symbols).
121
- 4. **Validation**: If 'Total Amount' is missing, calculate it from line items if possible.
122
- 5. **Output Format**: Return ONLY valid JSON. No Markdown block markers (```json).
123
-
124
- ### ONE-SHOT EXAMPLE (City of Auburn Invoice):
125
- **Input OCR**:
126
- "CITY OF AUBURN... 076248-000... Due: 01/07/25...
127
- Water Total $649.69... Sewer Total $1,333.45... Total New Charges $2,363.39"
128
-
129
- **Correct JSON**:
130
- {{
131
- "invoice_number": "076248-000",
132
- "vendor_name": "City of Auburn",
133
- "invoice_date": "2024-12-18",
134
- "due_date": "2025-01-07",
135
- "total_amount": 2363.39,
136
- "line_items": [
137
- {{"description": "Water Total", "quantity": 1, "rate": 649.69, "amount": 649.69}},
138
- {{"description": "Sewer Total", "quantity": 1, "rate": 1333.45, "amount": 1333.45}}
139
- ]
140
- }}
141
- <|im_end|>
142
-
143
- <|im_start|>user
144
- ### TARGET INVOICE OCR DATA:
145
- {raw_text[:4000]}
146
-
147
- Return the JSON:
148
- <|im_end|>
149
- <|im_start|>assistant
150
- {{
151
- """ # Pre-fill brace to force Qwen into JSON mode
152
- ```
153
-
154
- ### 3. `config.py` (Modular Configuration)
155
- **Changes:** Decouples the "Canonical Schema" (from `prompts.py`) from Zoho's specific API keys. This allows you to edit the field mapping without changing the AI logic.
156
-
157
- ```python
158
- # config.py
159
-
160
- # --- Zoho API Configuration ---
161
- # DO NOT COMMIT REAL CREDENTIALS TO GIT
162
- CLIENT_ID = os.getenv("ZOHO_CLIENT_ID", "YOUR_CLIENT_ID")
163
- CLIENT_SECRET = os.getenv("ZOHO_CLIENT_SECRET", "YOUR_CLIENT_SECRET")
164
- REFRESH_TOKEN = os.getenv("ZOHO_REFRESH_TOKEN", "YOUR_REFRESH_TOKEN")
165
- API_BASE = "https://www.zohoapis.in/crm/v2"
166
-
167
- # --- Schema Mapper ---
168
- # Maps LLM 'Canonical' keys -> Zoho CRM/Books API keys
169
- # If you switch CRM, you only change the right-hand side.
170
- ZOHO_INVOICE_MAP = {
171
- "invoice_number": "invoice_number", # Canonical : Zoho
172
- "vendor_name": "customer_name", # Note: Requires customer ID lookup in logic
173
- "invoice_date": "date",
174
- "due_date": "due_date",
175
- "total_amount": "total",
176
- "line_items": "line_items"
177
- }
178
-
179
- ZOHO_LINE_ITEM_MAP = {
180
- "description": "name",
181
- "quantity": "quantity",
182
- "rate": "rate",
183
- "amount": "item_total"
184
- }
185
- ```
186
-
187
- ### 4. `app.py` (Pipeline Integration)
188
- **Changes:** Integrates the new `ocr_preprocessing_engine`, applies the mapping from `config.py`, and maintains the FastMCP structure.
189
-
190
- ```python
191
- from mcp.server.fastmcp import FastMCP
192
- import logging
193
- import json
194
- import os
195
- from ocr_preprocessing_engine import extract_text_with_preprocessing
196
- from prompts import get_ocr_extraction_prompt
197
- from config import ZOHO_INVOICE_MAP, ZOHO_LINE_ITEM_MAP
198
-
199
- # Initialize FastMCP
200
- mcp = FastMCP("ZohoInvoiceAgent")
201
- logger = logging.getLogger("mcp_server")
202
-
203
- def map_canonical_to_zoho(canonical_data: dict) -> dict:
204
- """
205
- Transforms generic LLM JSON into Zoho-ready JSON using config maps.
206
- """
207
- zoho_payload = {}
208
-
209
- # 1. Map Top-Level Fields
210
- for llm_key, zoho_key in ZOHO_INVOICE_MAP.items():
211
- if llm_key in canonical_data and llm_key != "line_items":
212
- zoho_payload[zoho_key] = canonical_data[llm_key]
213
-
214
- # 2. Map Line Items
215
- if "line_items" in canonical_data and isinstance(canonical_data["line_items"], list):
216
- zoho_items = []
217
- for item in canonical_data["line_items"]:
218
- new_item = {}
219
- for l_key, z_key in ZOHO_LINE_ITEM_MAP.items():
220
- if l_key in item:
221
- new_item[z_key] = item[l_key]
222
- # Zoho API often requires quantity default to 1 if missing
223
- if "quantity" not in new_item:
224
- new_item["quantity"] = 1
225
- zoho_items.append(new_item)
226
- zoho_payload["line_items"] = zoho_items
227
-
228
- return zoho_payload
229
-
230
- @mcp.tool()
231
- def process_invoice_document(file_path: str) -> dict:
232
- """
233
- MCP Tool: Takes an invoice PDF/Image, runs strict preprocessing (Normalize->Deskew->Threshold),
234
- extracts data via Qwen 2.5, and maps it to Zoho API format.
235
- """
236
- if not os.path.exists(file_path):
237
- return {"error": "File not found"}
238
-
239
- # Step 1: Enhanced OCR Preprocessing
240
- # This step is critical to fix rotation and contrast issues before Tesseract runs.
241
- raw_text = extract_text_with_preprocessing(file_path)
242
-
243
- if len(raw_text) < 50:
244
- return {"error": "OCR failed. Image may be too blurry or blank."}
245
-
246
- # Step 2: LLM Extraction (Qwen 2.5)
247
- prompt = get_ocr_extraction_prompt(raw_text)
248
-
249
- # Mocking local_llm_generate for this snippet - ensure this connects to your Qwen pipeline
250
- # Ensure do_sample=False (Greedy Decoding) to reduce erratic json
251
- # response = local_llm_generate(prompt, max_tokens=500, do_sample=False)
252
-
253
- # --- SIMULATED RESPONSE FOR DEMO ---
254
- # In production, replace this with actual model generation
255
- logger.info("Sending text to LLM...")
256
- # -----------------------------------
257
-
258
- try:
259
- # Assuming response["text"] contains the JSON
260
- # Here we pretend the LLM returned the canonical JSON structure
261
- # canonical_data = json.loads("{" + response["text"])
262
-
263
- # For demonstration, let's assume valid extraction:
264
- canonical_data = {
265
- "invoice_number": "INV-001",
266
- "total_amount": 100.00,
267
- "line_items": [{"description": "Service", "rate": 100.00}]
268
- }
269
-
270
- # Step 3: Map to Zoho Structure
271
- zoho_ready_data = map_canonical_to_zoho(canonical_data)
272
-
273
- return {
274
- "status": "success",
275
- "source_file": os.path.basename(file_path),
276
- "canonical_data": canonical_data, # Useful for debugging/user verification
277
- "zoho_payload": zoho_ready_data # Ready for the create_invoice tool
278
- }
279
-
280
- except Exception as e:
281
- return {"error": f"Processing failed: {str(e)}"}
282
-
283
- if __name__ == "__main__":
284
- mcp.run()
285
- ```
 
 
 
 
 
 
 
 
 
1
  import cv2
2
  import numpy as np
3
  import pytesseract
 
8
 
9
  logger = logging.getLogger("ocr_preprocessor")
10
 
11
def preprocess_image_for_ocr(image: Image.Image) -> Image.Image:
    """
    Clean up a page image before it is handed to Tesseract.

    Steps, in the order they are actually applied:
      1. Normalization (contrast stretching to the full 0-255 range)
      2. Denoising (5x5 Gaussian blur on the grayscale image)
      3. Thresholding (Otsu binarization)
      4. Deskewing (rotation correction estimated from the dark pixels)

    Args:
        image: PIL image in any mode; it is converted to RGB first.

    Returns:
        A single-channel PIL image holding the binarized page, rotated
        when the estimated skew exceeds 0.5 degrees.
    """
    # COLOR_RGB2BGR needs a 3-channel array; grayscale, palette or RGBA
    # inputs would make cvtColor fail, so force RGB up front.
    rgb = np.array(image.convert("RGB"))
    img_cv = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

    # 1. Normalization: stretch intensities to 0-255. Passing None lets
    # OpenCV allocate the destination instead of a hand-built buffer.
    img_cv = cv2.normalize(img_cv, None, 0, 255, cv2.NORM_MINMAX)

    # Convert to grayscale for the remaining steps.
    gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)

    # 2. Denoising: smooth out scanning artifacts.
    denoised = cv2.GaussianBlur(gray, (5, 5), 0)

    # 3. Thresholding: Otsu picks the cut-off between dark and light pixels.
    _, binary = cv2.threshold(
        denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    # 4. Deskewing: fit a rotated rectangle around the dark pixels.
    # Assumes dark text on a light background, so after THRESH_BINARY the
    # text is the 0-valued pixels; measuring the white background instead
    # would just return the page rectangle and hide any skew.
    ink = np.column_stack(np.where(binary == 0)).astype(np.float32)
    if ink.shape[0] == 0:
        # Blank page: nothing to measure, and minAreaRect rejects an
        # empty point set.
        return Image.fromarray(binary)

    raw_angle = cv2.minAreaRect(ink)[-1]

    # Depending on the OpenCV release, minAreaRect reports the angle in
    # [-90, 0) or in (0, 90]. Folding into (-45, 45] gives the same
    # correction for both conventions and keeps an upright page at 0.
    angle = 45.0 - ((raw_angle + 45.0) % 90.0)

    # Rotate only if the skew is significant (>0.5 degrees) to avoid
    # needless interpolation artifacts.
    if abs(angle) > 0.5:
        (h, w) = binary.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        binary = cv2.warpAffine(
            binary,
            M,
            (w, h),
            flags=cv2.INTER_CUBIC,
            borderMode=cv2.BORDER_REPLICATE,
        )

    return Image.fromarray(binary)
54
 
55
  def extract_text_with_preprocessing(file_path: str) -> str:
56
  """
57
+ Converts PDF to 300 DPI images (Source [6]), pre-processes them,
58
+ and runs Tesseract with layout preservation.
59
  """
60
  if not os.path.exists(file_path):
61
  return ""
62
 
63
  text_content = ""
64
  try:
65
+ # Load PDF at 300 DPI (Tesseract optimal standard)
66
  if file_path.lower().endswith('.pdf'):
67
  images = convert_from_path(file_path, dpi=300)
68
  else:
69
  images = [Image.open(file_path)]
70
 
71
  for i, img in enumerate(images):
72
+ processed_img = preprocess_image_for_ocr(img)
 
73
 
74
  # Tesseract Config:
75
+ # --psm 4: Single column variable size (good for invoice layouts)
76
+ # preserve_interword_spaces=1: Helps LLM detect table columns
77
  custom_config = r'--oem 3 --psm 4 -c preserve_interword_spaces=1'
78
 
79
  page_text = pytesseract.image_to_string(processed_img, config=custom_config)
 
83
  logger.error(f"Preprocessing/OCR Error: {e}")
84
  return f"Error processing file: {str(e)}"
85
 
86
+ return text_content.strip()