sarveshpatel committed on
Commit
946e1b3
·
verified ·
1 Parent(s): f7beca4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -223
app.py CHANGED
@@ -1,129 +1,91 @@
1
  import os
2
  import uuid
3
- import gc
4
  from fastapi import FastAPI, UploadFile, File, HTTPException
5
  from fastapi.responses import JSONResponse
6
- from typing import List, Optional
7
  import fitz
8
  from PIL import Image
9
- import io
10
 
11
  # -------------------------------------------------------------------
12
  # FORCE PADDLEX / PADDLEOCR CACHE DIRECTORIES TO WRITABLE LOCATIONS
13
  # -------------------------------------------------------------------
14
  os.environ["PADDLE_HOME"] = "/app/paddle_home"
15
  os.environ["XDG_CACHE_HOME"] = "/app/xdg_cache"
16
- os.environ["OMP_NUM_THREADS"] = "2" # Match CPU count
17
- os.environ["MKL_NUM_THREADS"] = "2"
18
  os.makedirs("/app/paddle_home", exist_ok=True)
19
  os.makedirs("/app/xdg_cache", exist_ok=True)
20
 
 
21
  from paddleocr import PaddleOCR
22
 
23
  # -------------------------------------------------------------------
24
  # CONFIGURATION
25
  # -------------------------------------------------------------------
26
- MAX_IMAGE_DIMENSION = 1024 # Max width or height
27
- OPTIMAL_DPI = 150 # Lower DPI for faster PDF rendering (was 220)
28
- JPEG_QUALITY = 85 # Good balance of quality and size
29
- UPLOAD_DIR = "/app/uploads"
30
- PDF_IMAGES_DIR = "/app/pdf_images"
31
-
32
- os.makedirs(UPLOAD_DIR, exist_ok=True)
33
- os.makedirs(PDF_IMAGES_DIR, exist_ok=True)
34
-
35
 
36
  # -------------------------------------------------------------------
37
- # IMAGE OPTIMIZATION UTILITIES
38
  # -------------------------------------------------------------------
39
- def optimize_image(image_path: str, output_path: Optional[str] = None) -> str:
40
- """
41
- Resize and optimize image for faster OCR processing.
42
- - Resizes to max dimension of MAX_IMAGE_DIMENSION while maintaining aspect ratio
43
- - Converts to RGB (removes alpha channel if present)
44
- - Saves as optimized JPEG
45
- """
46
- if output_path is None:
47
- output_path = image_path
48
-
49
- with Image.open(image_path) as img:
50
- # Convert to RGB if necessary (handles PNG with alpha, etc.)
51
  if img.mode in ('RGBA', 'LA', 'P'):
52
  img = img.convert('RGB')
53
  elif img.mode != 'RGB':
54
  img = img.convert('RGB')
55
 
56
- # Get current dimensions
57
  width, height = img.size
58
 
59
- # Only resize if larger than max dimension
60
- if width > MAX_IMAGE_DIMENSION or height > MAX_IMAGE_DIMENSION:
61
- # Calculate new dimensions maintaining aspect ratio
62
  if width > height:
63
- new_width = MAX_IMAGE_DIMENSION
64
- new_height = int(height * (MAX_IMAGE_DIMENSION / width))
65
  else:
66
- new_height = MAX_IMAGE_DIMENSION
67
- new_width = int(width * (MAX_IMAGE_DIMENSION / height))
68
 
69
- # Use LANCZOS for high-quality downscaling
70
  img = img.resize((new_width, new_height), Image.LANCZOS)
71
 
72
- # Save optimized image
73
- img.save(output_path, 'JPEG', quality=JPEG_QUALITY, optimize=True)
74
 
75
  return output_path
76
 
77
 
78
- def cleanup_file(file_path: str) -> None:
79
- """Safely remove a file if it exists."""
80
- try:
81
- if file_path and os.path.exists(file_path):
82
- os.remove(file_path)
83
- except Exception:
84
- pass
85
-
86
-
87
- def cleanup_files(file_paths: List[str]) -> None:
88
- """Remove multiple files."""
89
- for fp in file_paths:
90
- cleanup_file(fp)
91
-
92
-
93
  # -------------------------------------------------------------------
94
- # PDF → OPTIMIZED IMAGES
95
  # -------------------------------------------------------------------
96
- def pdf_to_images(pdf_path: str, max_pages: Optional[int] = 3) -> List[str]:
97
- """
98
- Convert PDF pages to optimized images.
99
- Uses lower DPI and resizes for faster OCR.
100
- """
101
  if not os.path.exists(pdf_path):
102
  raise FileNotFoundError(pdf_path)
103
 
104
  doc = fitz.open(pdf_path)
105
  page_count = len(doc)
 
106
  limit = page_count if max_pages is None else min(max_pages, page_count)
107
  output_paths: List[str] = []
108
 
 
 
 
109
  for i in range(limit):
110
  page = doc.load_page(i)
 
111
 
112
- # Use lower DPI for faster rendering
113
- pix = page.get_pixmap(dpi=OPTIMAL_DPI)
114
-
115
- # Generate unique filename
116
  img_name = f"{uuid.uuid4()}.jpg"
117
- img_path = os.path.join(PDF_IMAGES_DIR, img_name)
118
 
119
- # Save initial image
120
- pix.save(img_path)
 
121
 
122
- # Free pixmap memory immediately
123
- pix = None
124
 
125
- # Optimize the saved image (resize if needed)
126
- optimize_image(img_path, img_path)
 
127
 
128
  output_paths.append(img_path)
129
 
@@ -132,184 +94,103 @@ def pdf_to_images(pdf_path: str, max_pages: Optional[int] = 3) -> List[str]:
132
 
133
 
134
  # -------------------------------------------------------------------
135
- # OCR ENGINE - Singleton with optimized settings
136
  # -------------------------------------------------------------------
137
- class OCREngine:
138
- """Singleton OCR engine to avoid re-initialization."""
139
- _instance = None
140
-
141
- @classmethod
142
- def get_instance(cls):
143
- if cls._instance is None:
144
- cls._instance = PaddleOCR(
145
- lang="mr",
146
- text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
147
- use_doc_orientation_classify=False,
148
- use_doc_unwarping=False,
149
- use_textline_orientation=False,
150
- show_log=False, # Reduce logging overhead
151
- )
152
- return cls._instance
153
-
154
-
155
- def extract_text(image_path: str) -> List[dict]:
156
- """
157
- Extract text from an optimized image.
158
- Returns list of {text, confidence} dicts.
159
- """
160
- ocr = OCREngine.get_instance()
161
- result = ocr.predict(input=image_path)
162
-
163
  output = []
164
  for block in result:
165
- texts = block.get("rec_texts", [])
166
- scores = block.get("rec_scores", [])
167
  for t, s in zip(texts, scores):
168
- if t.strip(): # Skip empty text
169
- output.append({
170
- "text": t,
171
- "confidence": round(float(s), 4)
172
- })
173
-
174
  return output
175
 
176
 
177
- def process_single_image(image_path: str, is_temp: bool = False) -> List[dict]:
178
- """
179
- Process a single image: optimize, OCR, cleanup.
180
- """
181
- optimized_path = None
182
-
183
- try:
184
- # Create optimized version
185
- optimized_name = f"opt_{uuid.uuid4()}.jpg"
186
- optimized_path = os.path.join(UPLOAD_DIR, optimized_name)
187
- optimize_image(image_path, optimized_path)
188
-
189
- # Run OCR on optimized image
190
- results = extract_text(optimized_path)
191
-
192
- return results
193
-
194
- finally:
195
- # Cleanup optimized image
196
- cleanup_file(optimized_path)
197
-
198
- # Force garbage collection after each image
199
- gc.collect()
200
-
201
-
202
  # -------------------------------------------------------------------
203
- # FASTAPI APPLICATION
204
  # -------------------------------------------------------------------
205
- app = FastAPI(title="Optimized Marathi OCR API")
 
 
206
 
207
 
208
  @app.post("/ocr")
209
- async def ocr_endpoint(
210
- files: List[UploadFile] = File(...),
211
- max_pages: Optional[int] = 3
212
- ):
213
- """
214
- OCR endpoint supporting PDF and image files.
215
-
216
- - Maximum 15 files per request
217
- - PDFs: processes up to max_pages (default 3)
218
- - Images: jpg, jpeg, png supported
219
- """
220
  if len(files) > 15:
221
  raise HTTPException(status_code=400, detail="Maximum 15 files allowed.")
222
 
223
  structured_output = {"files": []}
224
- temp_files_to_cleanup = []
225
 
226
- try:
227
- for index, file in enumerate(files, start=1):
228
- filename = file.filename.lower() if file.filename else f"unknown_{index}"
229
- ext = filename.rsplit(".", 1)[-1] if "." in filename else ""
230
 
231
- # Save uploaded file
232
- temp_name = f"{uuid.uuid4()}.{ext}"
233
- temp_path = os.path.join(UPLOAD_DIR, temp_name)
234
- temp_files_to_cleanup.append(temp_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
 
236
- content = await file.read()
237
- with open(temp_path, "wb") as f:
238
- f.write(content)
239
-
240
- # Free memory from upload content
241
- del content
242
-
243
- file_record = {
244
- "file_id": f"file_{index}",
245
- "filename": filename,
246
- "pages": []
247
- }
248
-
249
- # -------------------------------
250
- # PDF PROCESSING
251
- # -------------------------------
252
- if filename.endswith(".pdf"):
253
- img_paths = []
254
- try:
255
- img_paths = pdf_to_images(temp_path, max_pages=max_pages)
256
-
257
- for page_idx, img_path in enumerate(img_paths):
258
- ocr_results = extract_text(img_path)
259
-
260
- file_record["pages"].append({
261
- "page_index": page_idx,
262
- "results": ocr_results
263
- })
264
-
265
- # Cleanup each page image immediately after processing
266
- cleanup_file(img_path)
267
- gc.collect()
268
-
269
- finally:
270
- # Ensure all PDF images are cleaned up
271
- cleanup_files(img_paths)
272
-
273
- # -------------------------------
274
- # IMAGE PROCESSING
275
- # -------------------------------
276
- elif filename.endswith((".jpg", ".jpeg", ".png", ".webp", ".bmp")):
277
- ocr_results = process_single_image(temp_path)
278
-
279
  file_record["pages"].append({
280
- "page_index": 0,
281
  "results": ocr_results
282
  })
283
-
284
- else:
285
- raise HTTPException(
286
- status_code=400,
287
- detail=f"Unsupported file type: {filename}. Supported: pdf, jpg, jpeg, png, webp, bmp"
288
- )
289
-
290
- structured_output["files"].append(file_record)
 
 
 
 
291
 
292
- # Cleanup temp file after processing
293
- cleanup_file(temp_path)
294
- gc.collect()
295
 
296
- finally:
297
- # Final cleanup of any remaining temp files
298
- cleanup_files(temp_files_to_cleanup)
299
- gc.collect()
300
-
301
- return JSONResponse(structured_output)
 
 
302
 
 
 
303
 
304
- @app.get("/health")
305
- async def health_check():
306
- """Health check endpoint."""
307
- return {"status": "healthy", "max_dimension": MAX_IMAGE_DIMENSION}
308
 
 
309
 
310
- @app.on_event("startup")
311
- async def startup_event():
312
- """Pre-initialize OCR engine on startup."""
313
- # Warm up the OCR engine
314
- OCREngine.get_instance()
315
- gc.collect()
 
1
  import os
2
  import uuid
 
3
  from fastapi import FastAPI, UploadFile, File, HTTPException
4
  from fastapi.responses import JSONResponse
5
+ from typing import List
6
  import fitz
7
  from PIL import Image
 
8
 
9
  # -------------------------------------------------------------------
10
  # FORCE PADDLEX / PADDLEOCR CACHE DIRECTORIES TO WRITABLE LOCATIONS
11
  # -------------------------------------------------------------------
12
  os.environ["PADDLE_HOME"] = "/app/paddle_home"
13
  os.environ["XDG_CACHE_HOME"] = "/app/xdg_cache"
 
 
14
  os.makedirs("/app/paddle_home", exist_ok=True)
15
  os.makedirs("/app/xdg_cache", exist_ok=True)
16
 
17
+ # now safe to import paddlex/paddleocr
18
  from paddleocr import PaddleOCR
19
 
20
  # -------------------------------------------------------------------
21
  # CONFIGURATION
22
  # -------------------------------------------------------------------
23
+ MAX_DIMENSION = 1024 # Max width or height for OCR processing
24
+ PDF_DPI = 150 # Lower DPI = faster (was 220)
 
 
 
 
 
 
 
25
 
26
  # -------------------------------------------------------------------
27
+ # IMAGE OPTIMIZATION
28
  # -------------------------------------------------------------------
29
def optimize_image_for_ocr(input_path: str, output_path: str) -> str:
    """Re-encode an image as an RGB JPEG that fits within MAX_DIMENSION.

    The image is only ever downscaled; when either side exceeds
    MAX_DIMENSION the longer side is set to MAX_DIMENSION and the other
    side is scaled by the same factor.

    Args:
        input_path: Path of the image to read.
        output_path: Path the JPEG (quality 85) is written to.

    Returns:
        ``output_path``.
    """
    with Image.open(input_path) as img:
        # The file is saved as JPEG below, so every non-RGB mode
        # (RGBA, LA, P, L, CMYK, ...) is normalised to RGB first.
        if img.mode != "RGB":
            img = img.convert("RGB")

        width, height = img.size

        # Only resize if larger than MAX_DIMENSION.
        if width > MAX_DIMENSION or height > MAX_DIMENSION:
            # Clamp the scaled side to 1 px: for extreme aspect ratios
            # (e.g. 5000x2) the truncated value would otherwise be 0,
            # which is not a valid target size for resize().
            if width > height:
                new_width = MAX_DIMENSION
                new_height = max(1, int(height * (MAX_DIMENSION / width)))
            else:
                new_height = MAX_DIMENSION
                new_width = max(1, int(width * (MAX_DIMENSION / height)))

            img = img.resize((new_width, new_height), Image.LANCZOS)

        img.save(output_path, "JPEG", quality=85)

    return output_path
54
 
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  # -------------------------------------------------------------------
57
+ # PDF → IMAGE (optimized)
58
  # -------------------------------------------------------------------
59
+ def pdf_to_images(pdf_path: str, max_pages: int | None = 3) -> List[str]:
 
 
 
 
60
  if not os.path.exists(pdf_path):
61
  raise FileNotFoundError(pdf_path)
62
 
63
  doc = fitz.open(pdf_path)
64
  page_count = len(doc)
65
+
66
  limit = page_count if max_pages is None else min(max_pages, page_count)
67
  output_paths: List[str] = []
68
 
69
+ out_dir = "/app/pdf_images"
70
+ os.makedirs(out_dir, exist_ok=True)
71
+
72
  for i in range(limit):
73
  page = doc.load_page(i)
74
+ pix = page.get_pixmap(dpi=PDF_DPI) # Lower DPI for speed
75
 
 
 
 
 
76
  img_name = f"{uuid.uuid4()}.jpg"
77
+ img_path = os.path.join(out_dir, img_name)
78
 
79
+ # Save initial
80
+ temp_path = img_path + ".tmp.jpg"
81
+ pix.save(temp_path)
82
 
83
+ # Optimize (resize if needed)
84
+ optimize_image_for_ocr(temp_path, img_path)
85
 
86
+ # Cleanup temp
87
+ if os.path.exists(temp_path):
88
+ os.remove(temp_path)
89
 
90
  output_paths.append(img_path)
91
 
 
94
 
95
 
96
  # -------------------------------------------------------------------
97
+ # OCR ENGINE
98
  # -------------------------------------------------------------------
99
+ ocr_engine = PaddleOCR(
100
+ lang="mr",
101
+ text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
102
+ use_doc_orientation_classify=False,
103
+ use_doc_unwarping=False,
104
+ use_textline_orientation=False
105
+ )
106
+
107
+
108
def extract_text(image_path: str):
    """Run the module-level OCR engine on one image and flatten its output.

    Returns a list of ``{"text": ..., "confidence": ...}`` dicts, one per
    recognised text/score pair, in the order the engine yields them.
    """
    blocks = ocr_engine.predict(input=image_path)
    return [
        {"text": text, "confidence": float(score)}
        for block in blocks
        for text, score in zip(block["rec_texts"], block["rec_scores"])
    ]
117
 
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  # -------------------------------------------------------------------
120
+ # FASTAPI
121
  # -------------------------------------------------------------------
122
+ app = FastAPI()
123
+ UPLOAD_DIR = "/app/uploads"
124
+ os.makedirs(UPLOAD_DIR, exist_ok=True)
125
 
126
 
127
  @app.post("/ocr")
128
async def ocr_endpoint(
    files: List[UploadFile] = File(...),
    max_pages: int | None = 3,
):
    """OCR every uploaded PDF or image and return the recognised text.

    Args:
        files: Uploaded files (at most 15). PDFs and jpg/jpeg/png images
            are accepted, matched on the lower-cased file name.
        max_pages: Maximum number of pages rendered per PDF; ``None``
            renders every page.

    Returns:
        A JSONResponse shaped as
        ``{"files": [{"file_id", "filename", "pages": [{"page_index", "results"}]}]}``.

    Raises:
        HTTPException: 400 when more than 15 files are sent or a file has
            an unsupported (or missing) extension.
    """
    if len(files) > 15:
        raise HTTPException(status_code=400, detail="Maximum 15 files allowed.")

    structured_output = {"files": []}

    for index, file in enumerate(files, start=1):
        # An upload part may arrive without a file name; treat it as an
        # unsupported type instead of crashing on ``None.lower()``.
        filename = (file.filename or "").lower()
        is_pdf = filename.endswith(".pdf")
        is_image = filename.endswith((".jpg", ".jpeg", ".png"))

        # Reject unsupported uploads before anything is written to disk.
        if not (is_pdf or is_image):
            raise HTTPException(status_code=400, detail=f"Unsupported type: {filename}")

        # The suffix is now known to be one of the accepted extensions, so
        # no client-controlled path characters reach the temp file name.
        ext = filename.rsplit(".", 1)[-1]
        temp_path = os.path.join(UPLOAD_DIR, f"{uuid.uuid4()}.{ext}")

        # Every path created for this upload. They are removed in the
        # ``finally`` below so a failure while rendering or running OCR
        # does not leave files behind.
        scratch_paths = [temp_path]

        file_record = {
            "file_id": f"file_{index}",
            "filename": filename,
            "pages": []
        }

        try:
            with open(temp_path, "wb") as f:
                f.write(await file.read())

            if is_pdf:
                img_paths = pdf_to_images(temp_path, max_pages=max_pages)
                scratch_paths.extend(img_paths)

                for page_idx, img_path in enumerate(img_paths):
                    ocr_results = extract_text(img_path)

                    file_record["pages"].append({
                        "page_index": page_idx,
                        "results": ocr_results
                    })

                    # Remove each page image as soon as it has been read
                    # to keep disk usage low while later pages run.
                    if os.path.exists(img_path):
                        os.remove(img_path)
            else:
                # Optimize image before OCR.
                optimized_path = os.path.join(UPLOAD_DIR, f"opt_{uuid.uuid4()}.jpg")
                scratch_paths.append(optimized_path)
                optimize_image_for_ocr(temp_path, optimized_path)

                ocr_results = extract_text(optimized_path)

                file_record["pages"].append({
                    "page_index": 0,
                    "results": ocr_results
                })
        finally:
            for path in scratch_paths:
                if os.path.exists(path):
                    os.remove(path)

        structured_output["files"].append(file_record)

    return JSONResponse(structured_output)