Soumik Bose committed on
Commit
690d90f
·
1 Parent(s): 6536e0d
Files changed (3) hide show
  1. Dockerfile +20 -5
  2. main.py +183 -95
  3. requirements.txt +3 -0
Dockerfile CHANGED
@@ -2,10 +2,24 @@ FROM python:3.11-slim
2
 
3
  WORKDIR /app
4
 
5
- # Install system dependencies for RapidOCR and PDF processing
6
  RUN apt-get update && apt-get install -y \
7
  curl \
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  poppler-utils \
 
9
  libgl1 \
10
  libglib2.0-0 \
11
  libgomp1 \
@@ -13,18 +27,19 @@ RUN apt-get update && apt-get install -y \
13
  g++ \
14
  && rm -rf /var/lib/apt/lists/*
15
 
16
- # Fix: Ensure logs appear immediately in the console
17
  ENV PYTHONUNBUFFERED=1
18
  ENV PYTHONIOENCODING=UTF-8
19
  ENV HF_HOME=/tmp/cache
20
  ENV PORT=7860
 
21
 
22
- # Copy requirements and install dependencies
23
  COPY requirements.txt .
24
  RUN pip install --upgrade pip setuptools wheel \
25
  && pip install --default-timeout=100 --retries=10 --no-cache-dir -r requirements.txt
26
 
27
- # Copy application files
28
  COPY . .
29
 
30
  # Create non-root user
@@ -36,7 +51,7 @@ RUN mkdir -p ${HF_HOME} && chmod 777 ${HF_HOME}
36
 
37
  EXPOSE $PORT
38
 
39
- # Start the application
40
  CMD bash -c "\
41
  (while true; do curl -s https://xce009-ocr-api.hf.space >/dev/null; sleep 300; done) & \
42
  uvicorn main:app --host 0.0.0.0 --port ${PORT} --workers 4"
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install system dependencies for BOTH Tesseract and RapidOCR
6
  RUN apt-get update && apt-get install -y \
7
  curl \
8
+ # Tesseract with language packs
9
+ tesseract-ocr \
10
+ tesseract-ocr-eng \
11
+ tesseract-ocr-deu \
12
+ tesseract-ocr-fra \
13
+ tesseract-ocr-spa \
14
+ tesseract-ocr-por \
15
+ tesseract-ocr-ita \
16
+ tesseract-ocr-rus \
17
+ tesseract-ocr-chi-sim \
18
+ tesseract-ocr-jpn \
19
+ tesseract-ocr-kor \
20
+ # PDF processing
21
  poppler-utils \
22
+ # RapidOCR dependencies
23
  libgl1 \
24
  libglib2.0-0 \
25
  libgomp1 \
 
27
  g++ \
28
  && rm -rf /var/lib/apt/lists/*
29
 
30
+ # Environment variables
31
  ENV PYTHONUNBUFFERED=1
32
  ENV PYTHONIOENCODING=UTF-8
33
  ENV HF_HOME=/tmp/cache
34
  ENV PORT=7860
35
+ ENV DEFAULT_OCR_ENGINE=tesseract
36
 
37
+ # Install Python dependencies
38
  COPY requirements.txt .
39
  RUN pip install --upgrade pip setuptools wheel \
40
  && pip install --default-timeout=100 --retries=10 --no-cache-dir -r requirements.txt
41
 
42
+ # Copy application
43
  COPY . .
44
 
45
  # Create non-root user
 
51
 
52
  EXPOSE $PORT
53
 
54
+ # Start application
55
  CMD bash -c "\
56
  (while true; do curl -s https://xce009-ocr-api.hf.space >/dev/null; sleep 300; done) & \
57
  uvicorn main:app --host 0.0.0.0 --port ${PORT} --workers 4"
main.py CHANGED
@@ -13,10 +13,11 @@ from contextvars import ContextVar
13
  import uvicorn
14
  import cv2
15
  import numpy as np
 
16
  from rapidocr_onnxruntime import RapidOCR
17
  from fastapi import (
18
  FastAPI, File, UploadFile, Depends,
19
- HTTPException, Request, status
20
  )
21
  from fastapi.middleware.cors import CORSMiddleware
22
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
@@ -32,15 +33,15 @@ from pdf2image import convert_from_path
32
  # ==========================================
33
  load_dotenv()
34
 
35
- # ContextVar for thread-safe Request ID tracking
36
  request_id_ctx: ContextVar[str] = ContextVar("request_id", default="system")
37
 
38
  class Config:
39
- APP_NAME = os.getenv("APP_NAME", "OCR API")
40
  API_TOKEN = os.getenv("API_BEARER_TOKEN")
41
- MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800)) # 50MB
42
  ALLOWED_ORIGINS = [o.strip() for o in os.getenv("ALLOWED_ORIGINS", "").split(",") if o.strip()]
43
  ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
 
44
 
45
  class RequestIdFilter(logging.Filter):
46
  def filter(self, record):
@@ -63,6 +64,11 @@ class StatusEnum(str, Enum):
63
  SUCCESS = "success"
64
  ERROR = "error"
65
 
 
 
 
 
 
66
  class BaseResponse(BaseModel):
67
  request_id: str
68
  process_time_ms: float
@@ -75,6 +81,7 @@ class PageResult(BaseModel):
75
  text: str
76
  confidence: Optional[float] = None
77
  lines_detected: Optional[int] = None
 
78
 
79
  class OCRResult(BaseModel):
80
  filename: str
@@ -83,6 +90,7 @@ class OCRResult(BaseModel):
83
  total_pages: int
84
  pages_content: List[PageResult]
85
  average_confidence: Optional[float] = None
 
86
 
87
  class APIResponse(BaseResponse):
88
  data: Optional[OCRResult] = None
@@ -120,62 +128,78 @@ class FileValidator:
120
  raise HTTPException(413, "File too large")
121
  return tmp_path
122
 
123
- class OCRProcessor:
124
- """RapidOCR-based OCR processor with enhanced accuracy"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  def __init__(self):
127
- """Initialize RapidOCR engine"""
128
  self.engine = RapidOCR()
129
- logger.info("RapidOCR engine initialized successfully")
130
 
131
- def _extract_text_from_image(self, image_path: str) -> dict:
132
- """
133
- Extract text from a single image using RapidOCR
134
-
135
- Args:
136
- image_path: Path to image file
137
-
138
- Returns:
139
- dict: Contains text, confidence, and line count
140
- """
141
  try:
142
- # Perform OCR - RapidOCR returns (result_object, elapse_list)
143
  ocr_result, elapse = self.engine(image_path)
144
 
145
- # Handle result object
146
  if hasattr(ocr_result, '__iter__') and not isinstance(ocr_result, str):
147
  result = list(ocr_result)
148
  else:
149
  result = ocr_result
150
 
151
  if result is None or len(result) == 0:
152
- logger.warning(f"No text detected in image: {image_path}")
153
  return {
154
  'text': '',
155
  'confidence': 0.0,
156
- 'lines_detected': 0
 
157
  }
158
 
159
- # Parse results
160
  texts = []
161
  confidences = []
162
 
163
- for idx, line in enumerate(result):
164
  try:
165
  if isinstance(line, (list, tuple)):
166
  if len(line) == 2:
167
- # Format: [box, text] or [text, confidence]
168
  if isinstance(line[0], (list, tuple)):
169
  box, text = line
170
  confidence = 1.0
171
  else:
172
  text, confidence = line
173
- box = []
174
  elif len(line) == 3:
175
- # Format: [box, text, confidence]
176
  box, text, confidence = line
177
  elif len(line) >= 4:
178
- # Format: [box, text, confidence, something_else]
179
  box, text, confidence = line[0], line[1], line[2]
180
  else:
181
  continue
@@ -184,50 +208,88 @@ class OCRProcessor:
184
 
185
  texts.append(str(text))
186
  confidences.append(float(confidence) if confidence is not None else 1.0)
187
-
188
- except Exception as e:
189
- logger.debug(f"Skipping malformed line {idx}: {e}")
190
  continue
191
 
192
- if not texts:
193
- return {
194
- 'text': '',
195
- 'confidence': 0.0,
196
- 'lines_detected': 0
197
- }
198
-
199
  combined_text = '\n'.join(texts)
200
  avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
201
 
202
- logger.debug(f"Extracted {len(texts)} lines with avg confidence: {avg_confidence:.2%}")
203
-
204
  return {
205
  'text': combined_text,
206
  'confidence': avg_confidence,
207
- 'lines_detected': len(texts)
 
208
  }
209
-
210
  except Exception as e:
211
- logger.error(f"Image OCR extraction failed: {str(e)}")
212
- raise ValueError(f"OCR extraction error: {str(e)}")
 
 
 
 
 
 
 
213
 
214
- def process_file(self, file_path: str, content_type: str) -> dict:
215
  """
216
- Process PDF or image file and extract text
217
 
218
  Args:
219
- file_path: Path to the file
220
- content_type: MIME type of the file
221
-
222
- Returns:
223
- dict: Processing results with pages content
224
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  start = time.perf_counter()
226
  pages_content = []
227
  all_confidences = []
 
228
 
229
  try:
230
- logger.info(f"Processing File: {file_path}")
231
 
232
  if content_type == "application/pdf":
233
  logger.info("Converting PDF to Images...")
@@ -238,43 +300,41 @@ class OCRProcessor:
238
  page_num = idx + 1
239
  logger.info(f"Scanning Page {page_num}/{total}")
240
 
241
- # Save PIL Image to temp file for RapidOCR
242
  with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_img:
243
  img.save(tmp_img.name, 'PNG')
244
  temp_img_path = tmp_img.name
245
 
246
  try:
247
- # Extract text from temp image
248
- ocr_result = self._extract_text_from_image(temp_img_path)
249
 
250
  pages_content.append({
251
  "index": idx,
252
  "page_number": page_num,
253
  "text": ocr_result["text"],
254
  "confidence": ocr_result["confidence"],
255
- "lines_detected": ocr_result["lines_detected"]
 
256
  })
257
 
258
  if ocr_result["confidence"] > 0:
259
  all_confidences.append(ocr_result["confidence"])
260
  finally:
261
- # Clean up temp image
262
  try:
263
  os.remove(temp_img_path)
264
  except:
265
  pass
266
  else:
267
  logger.info("Scanning Single Image...")
268
-
269
- # Extract text from image
270
- ocr_result = self._extract_text_from_image(file_path)
271
 
272
  pages_content.append({
273
  "index": 0,
274
  "page_number": 1,
275
  "text": ocr_result["text"],
276
  "confidence": ocr_result["confidence"],
277
- "lines_detected": ocr_result["lines_detected"]
 
278
  })
279
 
280
  if ocr_result["confidence"] > 0:
@@ -288,7 +348,8 @@ class OCRProcessor:
288
  return {
289
  "total_pages": len(pages_content),
290
  "pages_content": pages_content,
291
- "average_confidence": avg_confidence
 
292
  }
293
 
294
  except Exception as e:
@@ -309,9 +370,7 @@ app.add_middleware(
309
 
310
  @app.middleware("http")
311
  async def request_context_middleware(request: Request, call_next):
312
- # 1. Generate ID
313
  req_id = str(uuid.uuid4())
314
- # 2. Set Context (Crucial for thread logging)
315
  token = request_id_ctx.set(req_id)
316
  request.state.request_id = req_id
317
 
@@ -335,7 +394,6 @@ async def request_context_middleware(request: Request, call_next):
335
  }
336
  )
337
  finally:
338
- # 3. Clean up Context
339
  request_id_ctx.reset(token)
340
 
341
  # ==========================================
@@ -348,53 +406,84 @@ async def root(request: Request):
348
  "request_id": request.state.request_id,
349
  "process_time_ms": 0,
350
  "status": StatusEnum.SUCCESS,
351
- "message": "RapidOCR API Active",
352
- "engine": "RapidOCR",
353
- "version": "1.0.0"
 
354
  }
355
 
356
  @app.get("/health")
357
  async def health_check(request: Request):
358
  """Health check endpoint"""
359
- try:
360
- return {
361
- "request_id": request.state.request_id,
362
- "status": StatusEnum.SUCCESS,
363
- "message": "Service healthy",
364
- "ocr_engine": "RapidOCR"
 
365
  }
366
- except Exception as e:
367
- return JSONResponse(
368
- status_code=503,
369
- content={
370
- "request_id": request.state.request_id,
371
- "status": StatusEnum.ERROR,
372
- "message": "Service unhealthy",
373
- "error": str(e)
374
- }
375
- )
376
 
377
  @app.post("/api/v1/get_data", response_model=APIResponse)
378
  async def extract_data(
379
  request: Request,
380
  file: UploadFile = File(...),
 
381
  token: str = Depends(SecurityService.validate_token)
382
  ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
  start_ts = time.perf_counter()
384
  tmp_path = None
385
  req_id = request.state.request_id
386
 
 
 
 
 
 
 
 
 
 
 
 
 
387
  try:
388
  FileValidator.validate(file)
389
  tmp_path = FileValidator.check_size_and_save(file)
390
 
391
- # Initialize OCR processor and run in thread pool
392
- # ContextVars are automatically copied to the thread
 
393
  processor = OCRProcessor()
394
  result = await run_in_threadpool(
395
  processor.process_file,
396
  tmp_path,
397
- file.content_type
 
398
  )
399
 
400
  return {
@@ -408,7 +497,8 @@ async def extract_data(
408
  "saved_file_path": tmp_path,
409
  "total_pages": result["total_pages"],
410
  "pages_content": result["pages_content"],
411
- "average_confidence": result.get("average_confidence", 0.0)
 
412
  }
413
  }
414
 
@@ -426,7 +516,6 @@ async def extract_data(
426
  )
427
  finally:
428
  if tmp_path:
429
- logger.info(f"File preserved at: {tmp_path}")
430
  try:
431
  os.remove(tmp_path)
432
  logger.info(f"Temporary file deleted: {tmp_path}")
@@ -439,14 +528,13 @@ async def extract_data(
439
 
440
  @app.on_event("startup")
441
  async def startup_event():
442
- """Initialize OCR engine on startup"""
443
- logger.info("Starting RapidOCR API...")
444
  try:
445
- # Test initialize the engine
446
- test_processor = OCRProcessor()
447
- logger.info("RapidOCR engine ready for processing")
448
  except Exception as e:
449
- logger.error(f"Failed to initialize OCR engine: {str(e)}")
450
  raise
451
 
452
  if __name__ == "__main__":
 
13
  import uvicorn
14
  import cv2
15
  import numpy as np
16
+ import pytesseract
17
  from rapidocr_onnxruntime import RapidOCR
18
  from fastapi import (
19
  FastAPI, File, UploadFile, Depends,
20
+ HTTPException, Request, Query, Form
21
  )
22
  from fastapi.middleware.cors import CORSMiddleware
23
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 
33
  # ==========================================
34
  load_dotenv()
35
 
 
36
  request_id_ctx: ContextVar[str] = ContextVar("request_id", default="system")
37
 
38
  class Config:
39
+ APP_NAME = os.getenv("APP_NAME", "Hybrid OCR API")
40
  API_TOKEN = os.getenv("API_BEARER_TOKEN")
41
+ MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800))
42
  ALLOWED_ORIGINS = [o.strip() for o in os.getenv("ALLOWED_ORIGINS", "").split(",") if o.strip()]
43
  ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
44
+ DEFAULT_ENGINE = os.getenv("DEFAULT_OCR_ENGINE", "tesseract") # or "rapidocr" or "hybrid"
45
 
46
  class RequestIdFilter(logging.Filter):
47
  def filter(self, record):
 
64
  SUCCESS = "success"
65
  ERROR = "error"
66
 
67
class OCREngine(str, Enum):
    """Names of the OCR backends a caller may select.

    Members are also ``str`` instances, so each compares equal to its plain
    string value (for example ``OCREngine.HYBRID == "hybrid"``).
    """

    TESSERACT = "tesseract"
    RAPIDOCR = "rapidocr"
    # Use both engines and pick the best result.
    HYBRID = "hybrid"
71
+
72
  class BaseResponse(BaseModel):
73
  request_id: str
74
  process_time_ms: float
 
81
  text: str
82
  confidence: Optional[float] = None
83
  lines_detected: Optional[int] = None
84
+ engine_used: Optional[str] = None
85
 
86
  class OCRResult(BaseModel):
87
  filename: str
 
90
  total_pages: int
91
  pages_content: List[PageResult]
92
  average_confidence: Optional[float] = None
93
+ engine: str
94
 
95
  class APIResponse(BaseResponse):
96
  data: Optional[OCRResult] = None
 
128
  raise HTTPException(413, "File too large")
129
  return tmp_path
130
 
131
class TesseractEngine:
    """Tesseract OCR Engine - Best for English/European languages"""

    @staticmethod
    def extract_text(image_path: str) -> dict:
        """Extract text from an image file using Tesseract.

        Args:
            image_path: Path to the image to read.

        Returns:
            dict with keys ``text`` (recognised words joined by single
            spaces), ``confidence`` (mean word confidence scaled to 0..1,
            0.0 when no word qualifies), ``lines_detected`` (number of
            distinct text lines that contributed at least one word) and
            ``engine`` (always ``'tesseract'``).

        Raises:
            ValueError: If the image cannot be opened or Tesseract fails.
        """
        try:
            # The context manager releases the file handle that Image.open
            # keeps until the image is closed.
            with Image.open(image_path) as img:
                data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

            texts = []
            confidences = []
            line_keys = set()
            for i, text in enumerate(data['text']):
                # float() also accepts fractional confidence strings such as
                # "96.5", which int() would reject.
                conf = float(data['conf'][i])
                # Skip empty cells and rows without a positive confidence.
                if not text.strip() or conf <= 0:
                    continue
                texts.append(text)
                confidences.append(conf / 100.0)
                # A physical line is identified by its page/block/paragraph/line
                # numbers; counting words here would overstate lines_detected.
                line_keys.add((
                    data['page_num'][i],
                    data['block_num'][i],
                    data['par_num'][i],
                    data['line_num'][i],
                ))

            combined_text = ' '.join(texts)
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0

            return {
                'text': combined_text,
                'confidence': avg_confidence,
                'lines_detected': len(line_keys),
                'engine': 'tesseract'
            }
        except Exception as e:
            logger.error(f"Tesseract extraction failed: {str(e)}")
            # Chain the cause so the original traceback is not lost.
            raise ValueError(f"Tesseract error: {str(e)}") from e
163
+
164
+ class RapidOCREngine:
165
+ """RapidOCR Engine - Fast and lightweight"""
166
 
167
  def __init__(self):
 
168
  self.engine = RapidOCR()
 
169
 
170
+ def extract_text(self, image_path: str) -> dict:
171
+ """Extract text using RapidOCR"""
 
 
 
 
 
 
 
 
172
  try:
 
173
  ocr_result, elapse = self.engine(image_path)
174
 
 
175
  if hasattr(ocr_result, '__iter__') and not isinstance(ocr_result, str):
176
  result = list(ocr_result)
177
  else:
178
  result = ocr_result
179
 
180
  if result is None or len(result) == 0:
 
181
  return {
182
  'text': '',
183
  'confidence': 0.0,
184
+ 'lines_detected': 0,
185
+ 'engine': 'rapidocr'
186
  }
187
 
 
188
  texts = []
189
  confidences = []
190
 
191
+ for line in result:
192
  try:
193
  if isinstance(line, (list, tuple)):
194
  if len(line) == 2:
 
195
  if isinstance(line[0], (list, tuple)):
196
  box, text = line
197
  confidence = 1.0
198
  else:
199
  text, confidence = line
 
200
  elif len(line) == 3:
 
201
  box, text, confidence = line
202
  elif len(line) >= 4:
 
203
  box, text, confidence = line[0], line[1], line[2]
204
  else:
205
  continue
 
208
 
209
  texts.append(str(text))
210
  confidences.append(float(confidence) if confidence is not None else 1.0)
211
+ except:
 
 
212
  continue
213
 
 
 
 
 
 
 
 
214
  combined_text = '\n'.join(texts)
215
  avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
216
 
 
 
217
  return {
218
  'text': combined_text,
219
  'confidence': avg_confidence,
220
+ 'lines_detected': len(texts),
221
+ 'engine': 'rapidocr'
222
  }
 
223
  except Exception as e:
224
+ logger.error(f"RapidOCR extraction failed: {str(e)}")
225
+ raise ValueError(f"RapidOCR error: {str(e)}")
226
+
227
class HybridOCRProcessor:
    """Hybrid processor that uses both engines and picks the best result"""

    def __init__(self):
        self.rapidocr = RapidOCREngine()
        self.tesseract = TesseractEngine()

    @staticmethod
    def _run_safely(label, backend, image_path):
        """Run one backend; return ``(result, None)`` or ``(None, error)`` if it raised."""
        try:
            return backend.extract_text(image_path), None
        except Exception as e:
            logger.warning(f"{label} failed in hybrid mode: {e}")
            return None, e

    def extract_text(self, image_path: str, engine: str = "tesseract") -> dict:
        """
        Extract text using specified engine or both

        Args:
            image_path: Path to image
            engine: 'tesseract', 'rapidocr', or 'hybrid'

        Returns:
            The result dict of the selected engine. In hybrid mode the result
            with the higher confidence is returned (Tesseract wins ties) and
            its 'engine' key is set to 'tesseract (hybrid)' or
            'rapidocr (hybrid)'.

        Raises:
            ValueError: If ``engine`` is not a known engine name, if the
                selected single engine fails, or if both engines fail in
                hybrid mode.
        """
        if engine == OCREngine.TESSERACT:
            return self.tesseract.extract_text(image_path)

        if engine == OCREngine.RAPIDOCR:
            return self.rapidocr.extract_text(image_path)

        if engine != OCREngine.HYBRID:
            raise ValueError(f"Unknown engine: {engine}")

        # Run both engines; a failure of one must not prevent the other.
        logger.info("Running hybrid OCR (Tesseract + RapidOCR)")
        tess_result, tess_error = self._run_safely("Tesseract", self.tesseract, image_path)
        rapid_result, rapid_error = self._run_safely("RapidOCR", self.rapidocr, image_path)

        # Single-engine modes raise on failure; returning an empty "success"
        # here would hide the fact that nothing was processed at all.
        if tess_result is None and rapid_result is None:
            raise ValueError(
                f"Both OCR engines failed (tesseract: {tess_error}; rapidocr: {rapid_error})"
            )

        # A failed engine is reported with 0.0 confidence in the log line.
        tess_conf = tess_result['confidence'] if tess_result is not None else 0.0
        rapid_conf = rapid_result['confidence'] if rapid_result is not None else 0.0

        # Pick the one with higher confidence; an engine that failed can never
        # win, even on a 0.0 tie.
        use_tesseract = rapid_result is None or (
            tess_result is not None and tess_conf >= rapid_conf
        )

        if use_tesseract:
            logger.info(f"Using Tesseract (conf: {tess_conf:.2%} vs {rapid_conf:.2%})")
            tess_result['engine'] = 'tesseract (hybrid)'
            return tess_result

        logger.info(f"Using RapidOCR (conf: {rapid_conf:.2%} vs {tess_conf:.2%})")
        rapid_result['engine'] = 'rapidocr (hybrid)'
        return rapid_result
276
+
277
+ class OCRProcessor:
278
+ """Main OCR processor supporting multiple engines"""
279
+
280
+ def __init__(self, engine: str = None):
281
+ self.engine_type = engine or Config.DEFAULT_ENGINE
282
+ self.processor = HybridOCRProcessor()
283
+
284
+ def process_file(self, file_path: str, content_type: str, engine: str = None) -> dict:
285
+ """Process PDF or image file"""
286
  start = time.perf_counter()
287
  pages_content = []
288
  all_confidences = []
289
+ engine_to_use = engine or self.engine_type
290
 
291
  try:
292
+ logger.info(f"Processing File: {file_path} with engine: {engine_to_use}")
293
 
294
  if content_type == "application/pdf":
295
  logger.info("Converting PDF to Images...")
 
300
  page_num = idx + 1
301
  logger.info(f"Scanning Page {page_num}/{total}")
302
 
303
+ # Save PIL Image to temp file
304
  with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_img:
305
  img.save(tmp_img.name, 'PNG')
306
  temp_img_path = tmp_img.name
307
 
308
  try:
309
+ ocr_result = self.processor.extract_text(temp_img_path, engine_to_use)
 
310
 
311
  pages_content.append({
312
  "index": idx,
313
  "page_number": page_num,
314
  "text": ocr_result["text"],
315
  "confidence": ocr_result["confidence"],
316
+ "lines_detected": ocr_result["lines_detected"],
317
+ "engine_used": ocr_result.get("engine", engine_to_use)
318
  })
319
 
320
  if ocr_result["confidence"] > 0:
321
  all_confidences.append(ocr_result["confidence"])
322
  finally:
 
323
  try:
324
  os.remove(temp_img_path)
325
  except:
326
  pass
327
  else:
328
  logger.info("Scanning Single Image...")
329
+ ocr_result = self.processor.extract_text(file_path, engine_to_use)
 
 
330
 
331
  pages_content.append({
332
  "index": 0,
333
  "page_number": 1,
334
  "text": ocr_result["text"],
335
  "confidence": ocr_result["confidence"],
336
+ "lines_detected": ocr_result["lines_detected"],
337
+ "engine_used": ocr_result.get("engine", engine_to_use)
338
  })
339
 
340
  if ocr_result["confidence"] > 0:
 
348
  return {
349
  "total_pages": len(pages_content),
350
  "pages_content": pages_content,
351
+ "average_confidence": avg_confidence,
352
+ "engine": engine_to_use
353
  }
354
 
355
  except Exception as e:
 
370
 
371
  @app.middleware("http")
372
  async def request_context_middleware(request: Request, call_next):
 
373
  req_id = str(uuid.uuid4())
 
374
  token = request_id_ctx.set(req_id)
375
  request.state.request_id = req_id
376
 
 
394
  }
395
  )
396
  finally:
 
397
  request_id_ctx.reset(token)
398
 
399
  # ==========================================
 
406
  "request_id": request.state.request_id,
407
  "process_time_ms": 0,
408
  "status": StatusEnum.SUCCESS,
409
+ "message": "Hybrid OCR API Active",
410
+ "engines": ["tesseract", "rapidocr", "hybrid"],
411
+ "default_engine": Config.DEFAULT_ENGINE,
412
+ "version": "2.0.0"
413
  }
414
 
415
  @app.get("/health")
416
  async def health_check(request: Request):
417
  """Health check endpoint"""
418
+ return {
419
+ "request_id": request.state.request_id,
420
+ "status": StatusEnum.SUCCESS,
421
+ "message": "Service healthy",
422
+ "engines": {
423
+ "tesseract": "ready",
424
+ "rapidocr": "ready"
425
  }
426
+ }
 
 
 
 
 
 
 
 
 
427
 
428
  @app.post("/api/v1/get_data", response_model=APIResponse)
429
  async def extract_data(
430
  request: Request,
431
  file: UploadFile = File(...),
432
+ engine: Optional[str] = Form(default=None, description="OCR engine: tesseract, rapidocr, or hybrid"),
433
  token: str = Depends(SecurityService.validate_token)
434
  ):
435
+ """
436
+ Extract text from image or PDF
437
+
438
+ - **file**: Image or PDF file to process
439
+ - **engine**: Choose OCR engine (optional, can be sent as form data or query param)
440
+ - `tesseract`: Best for English/European languages, highest accuracy (DEFAULT)
441
+ - `rapidocr`: Faster, good for Asian languages
442
+ - `hybrid`: Use both and pick best result (slower but most accurate)
443
+
444
+ Example curl:
445
+ ```bash
446
+ # Using query parameter
447
+ curl -X POST "http://localhost:7860/api/v1/get_data?engine=tesseract" \
448
+ -H "Authorization: Bearer your-token" \
449
+ -F "file=@document.pdf"
450
+
451
+ # Using form data (payload)
452
+ curl -X POST "http://localhost:7860/api/v1/get_data" \
453
+ -H "Authorization: Bearer your-token" \
454
+ -F "file=@document.pdf" \
455
+ -F "engine=hybrid"
456
+ ```
457
+ """
458
  start_ts = time.perf_counter()
459
  tmp_path = None
460
  req_id = request.state.request_id
461
 
462
+ # Validate engine parameter
463
+ engine_to_use = engine
464
+ if engine_to_use and engine_to_use not in [e.value for e in OCREngine]:
465
+ return JSONResponse(
466
+ status_code=400,
467
+ content={
468
+ "request_id": req_id,
469
+ "status": StatusEnum.ERROR,
470
+ "error_message": f"Invalid engine '{engine_to_use}'. Must be one of: tesseract, rapidocr, hybrid"
471
+ }
472
+ )
473
+
474
  try:
475
  FileValidator.validate(file)
476
  tmp_path = FileValidator.check_size_and_save(file)
477
 
478
+ logger.info(f"Processing with engine: {engine_to_use or Config.DEFAULT_ENGINE}")
479
+
480
+ # Initialize processor with selected engine
481
  processor = OCRProcessor()
482
  result = await run_in_threadpool(
483
  processor.process_file,
484
  tmp_path,
485
+ file.content_type,
486
+ engine_to_use
487
  )
488
 
489
  return {
 
497
  "saved_file_path": tmp_path,
498
  "total_pages": result["total_pages"],
499
  "pages_content": result["pages_content"],
500
+ "average_confidence": result.get("average_confidence", 0.0),
501
+ "engine": result["engine"]
502
  }
503
  }
504
 
 
516
  )
517
  finally:
518
  if tmp_path:
 
519
  try:
520
  os.remove(tmp_path)
521
  logger.info(f"Temporary file deleted: {tmp_path}")
 
528
 
529
  @app.on_event("startup")
530
  async def startup_event():
531
+ """Initialize OCR engines on startup"""
532
+ logger.info("Starting Hybrid OCR API...")
533
  try:
534
+ test_processor = HybridOCRProcessor()
535
+ logger.info("All OCR engines ready for processing")
 
536
  except Exception as e:
537
+ logger.error(f"Failed to initialize OCR engines: {str(e)}")
538
  raise
539
 
540
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -9,5 +9,8 @@ opencv-python-headless==4.12.0.88
9
  numpy<2.3.0
10
  pdf2image==1.17.0
11
  Pillow==11.2.1
 
 
 
12
  rapidocr-onnxruntime>=1.3.0
13
  onnxruntime>=1.16.0
 
9
  numpy<2.3.0
10
  pdf2image==1.17.0
11
  Pillow==11.2.1
12
+ # Tesseract OCR
13
+ pytesseract==0.3.13
14
+ # RapidOCR
15
  rapidocr-onnxruntime>=1.3.0
16
  onnxruntime>=1.16.0