xce009 committed on
Commit
29ed039
·
verified ·
1 Parent(s): e62fae2

Delete main.py

Browse files
Files changed (1) hide show
  1. main.py +0 -546
main.py DELETED
@@ -1,546 +0,0 @@
1
- import os
2
- import uuid
3
- import time
4
- import logging
5
- import shutil
6
- import tempfile
7
- from typing import Optional, List
8
- from enum import Enum
9
- from pathlib import Path
10
- from contextvars import ContextVar
11
-
12
- # Third-party imports
13
- import uvicorn
14
- import cv2
15
- import numpy as np
16
- import pytesseract
17
- from rapidocr_onnxruntime import RapidOCR
18
- from fastapi import (
19
- FastAPI, File, UploadFile, Depends,
20
- HTTPException, Request, Query, Form
21
- )
22
- from fastapi.middleware.cors import CORSMiddleware
23
- from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
24
- from fastapi.responses import JSONResponse
25
- from fastapi.concurrency import run_in_threadpool
26
- from pydantic import BaseModel
27
- from dotenv import load_dotenv
28
- from PIL import Image
29
- from pdf2image import convert_from_path
30
-
31
- # ==========================================
32
- # 1. CONFIGURATION & LOGGING SETUP
33
- # ==========================================
34
- load_dotenv()
35
-
36
- request_id_ctx: ContextVar[str] = ContextVar("request_id", default="system")
37
-
38
class Config:
    """Static application settings, read from the environment at import time.

    Attributes are plain class-level constants; nothing here is re-read after
    the module has been imported.
    """

    # Human-readable service name, used as the FastAPI title.
    APP_NAME = os.getenv("APP_NAME", "Hybrid OCR API")
    # Expected bearer token; None when API_BEARER_TOKEN is not set.
    API_TOKEN = os.getenv("API_BEARER_TOKEN")
    # Maximum accepted upload size in bytes (default 50 MiB).
    MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 50 * 1024 * 1024))
    # Comma-separated CORS origins; blank entries are dropped.
    ALLOWED_ORIGINS = [o.strip() for o in os.getenv("ALLOWED_ORIGINS", "").split(",") if o.strip()]
    # MIME types accepted for upload.
    ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
    # "tesseract", "rapidocr" or "hybrid". Normalised so that values such as
    # " Hybrid " in the environment do not surface later as "Unknown engine".
    DEFAULT_ENGINE = os.getenv("DEFAULT_OCR_ENGINE", "tesseract").strip().lower() or "tesseract"
45
-
46
class RequestIdFilter(logging.Filter):
    """Logging filter that stamps each record with the current request ID."""

    def filter(self, record: logging.LogRecord) -> bool:
        """Attach ``request_id`` from the context variable; never drops a record."""
        record.request_id = request_id_ctx.get()
        return True


logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | ReqID:%(request_id)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    force=True
)

# The format string above requires `request_id` on EVERY record that reaches
# the root handlers, including records emitted by third-party loggers that
# propagate to root. Logger-level filters only see records logged directly on
# that logger, so the filter must live on the handlers; otherwise those
# foreign records fail to format.
for _root_handler in logging.getLogger().handlers:
    _root_handler.addFilter(RequestIdFilter())

logger = logging.getLogger("ocr_api")
# Kept so records from this logger carry the ID even for handlers attached
# directly to it.
logger.addFilter(RequestIdFilter())
59
-
60
- # ==========================================
61
- # 2. MODELS
62
- # ==========================================
63
class StatusEnum(str, Enum):
    """Outcome marker placed in the ``status`` field of API responses."""

    # Members are also plain strings, so they compare equal to their values.
    SUCCESS = "success"
    ERROR = "error"
66
-
67
class OCREngine(str, Enum):
    """Identifiers of the OCR back-ends a caller may select."""

    TESSERACT = "tesseract"
    RAPIDOCR = "rapidocr"
    # Run both back-ends and keep the result with the higher confidence.
    HYBRID = "hybrid"
71
-
72
class BaseResponse(BaseModel):
    """Envelope fields shared by the API's response models."""

    # Identifier of the request this response belongs to.
    request_id: str
    # Processing time in milliseconds.
    process_time_ms: float
    # Overall outcome of the request.
    status: StatusEnum
    # Optional human-readable note.
    message: Optional[str] = None
77
-
78
class PageResult(BaseModel):
    """OCR output for a single page (or for a single image)."""

    # Zero-based position of the page.
    index: int
    # One-based page number.
    page_number: int
    # Recognised text.
    text: str
    # Mean confidence reported by the engine, when available.
    confidence: Optional[float] = None
    # Number of text items the engine reported.
    lines_detected: Optional[int] = None
    # Label of the engine that produced this page's text.
    engine_used: Optional[str] = None
85
-
86
class OCRResult(BaseModel):
    """Aggregated OCR output for one uploaded file."""

    # Name of the uploaded file as sent by the client.
    filename: str
    # MIME type of the uploaded file.
    content_type: str
    # Path of the server-side temporary copy of the upload.
    saved_file_path: str
    # Number of entries in ``pages_content``.
    total_pages: int
    # Per-page results, in page order.
    pages_content: List[PageResult]
    # Mean confidence across pages, when available.
    average_confidence: Optional[float] = None
    # Engine that was requested for this file.
    engine: str
94
-
95
class APIResponse(BaseResponse):
    """Response model of the OCR endpoint: the shared envelope plus a payload."""

    # OCR payload; absent on failure.
    data: Optional[OCRResult] = None
    # Failure description; absent on success.
    error_message: Optional[str] = None
98
-
99
- # ==========================================
100
- # 3. SERVICES
101
- # ==========================================
102
-
103
class SecurityService:
    """Bearer-token authentication used as a FastAPI dependency."""

    security_scheme = HTTPBearer()

    @staticmethod
    async def validate_token(credentials: HTTPAuthorizationCredentials = Depends(security_scheme)):
        """Check the presented bearer token against ``Config.API_TOKEN``.

        Args:
            credentials: Parsed ``Authorization`` header supplied by ``HTTPBearer``.

        Returns:
            The presented token string when it matches.

        Raises:
            HTTPException: 401 when the token does not match, or when no
                token is configured on the server (fail closed).
        """
        import hmac  # local import: keeps this block self-contained

        expected = Config.API_TOKEN
        presented = credentials.credentials

        # An unset/empty server token must never authenticate anyone, even
        # against an empty credential.
        # compare_digest avoids leaking the match position through timing;
        # bytes are used because the str form rejects non-ASCII input.
        authorized = bool(expected) and hmac.compare_digest(
            presented.encode("utf-8"), expected.encode("utf-8")
        )
        if not authorized:
            logger.warning("Auth Failed: Invalid Token")
            raise HTTPException(status_code=401, detail="Invalid Bearer Token")
        return presented
112
-
113
class FileValidator:
    """Validation and temporary storage of uploaded files."""

    # Bytes read from the upload per iteration while spooling to disk.
    _CHUNK_SIZE = 1024 * 1024

    @staticmethod
    def validate(file: UploadFile):
        """Reject uploads whose declared MIME type is not allowed.

        Raises:
            HTTPException: 400 when ``file.content_type`` is not in
                ``Config.ALLOWED_TYPES``.
        """
        if file.content_type not in Config.ALLOWED_TYPES:
            raise HTTPException(400, f"Invalid file type: {file.content_type}")

    @staticmethod
    def check_size_and_save(file: UploadFile) -> str:
        """Copy the upload to a temporary file, enforcing ``Config.MAX_SIZE``.

        The size limit is enforced while copying, so an oversized upload is
        abandoned as soon as the limit is crossed instead of being written to
        disk in full first.

        Args:
            file: The incoming upload; its ``file`` attribute is read to EOF
                or until the limit is exceeded.

        Returns:
            Absolute path of the temporary file. The caller owns it and is
            responsible for deleting it.

        Raises:
            HTTPException: 413 when the upload is larger than ``Config.MAX_SIZE``.
        """
        # filename may be missing; Path(None) would raise TypeError.
        suffix = Path(file.filename or "").suffix
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as buffer:
                tmp_path = os.path.abspath(buffer.name)
                written = 0
                for chunk in iter(lambda: file.file.read(FileValidator._CHUNK_SIZE), b""):
                    written += len(chunk)
                    if written > Config.MAX_SIZE:
                        raise HTTPException(413, "File too large")
                    buffer.write(chunk)
        except Exception:
            # The temp file is closed here (the `with` has exited); remove the
            # partial copy so failed uploads do not accumulate on disk.
            if tmp_path is not None:
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
            raise
        return tmp_path
130
-
131
class TesseractEngine:
    """Tesseract OCR Engine - Best for English/European languages"""

    @staticmethod
    def extract_text(image_path: str) -> dict:
        """Run Tesseract on an image file and summarise the recognised words.

        Args:
            image_path: Path of the image to read.

        Returns:
            A dict with keys ``text`` (words joined by single spaces),
            ``confidence`` (mean word confidence scaled to 0..1, 0.0 when no
            word qualifies), ``lines_detected`` (number of words kept) and
            ``engine`` (``'tesseract'``).

        Raises:
            ValueError: When opening the image or running Tesseract fails.
        """
        try:
            # Context manager closes the underlying file handle, so the
            # caller can delete the image afterwards.
            with Image.open(image_path) as img:
                data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

            words = []
            confidences = []
            for word, raw_conf in zip(data['text'], data['conf']):
                if not word.strip():
                    continue
                # Go through float() so a decimal string such as "96.4" does
                # not break int().
                conf = int(float(raw_conf))
                if conf > 0:
                    words.append(word)
                    confidences.append(conf / 100.0)

            return {
                'text': ' '.join(words),
                'confidence': sum(confidences) / len(confidences) if confidences else 0.0,
                'lines_detected': len(words),
                'engine': 'tesseract'
            }
        except Exception as e:
            logger.error(f"Tesseract extraction failed: {str(e)}")
            raise ValueError(f"Tesseract error: {str(e)}") from e
163
-
164
class RapidOCREngine:
    """RapidOCR Engine - Fast and lightweight"""

    def __init__(self):
        self.engine = RapidOCR()

    @staticmethod
    def _parse_line(line):
        """Return ``(text, confidence)`` for one result entry, or None to skip it.

        Accepted shapes: ``(box, text)``, ``(text, confidence)``,
        ``(box, text, confidence)`` and longer sequences whose first three
        items are ``box, text, confidence``. A missing confidence counts as 1.0.
        """
        if not isinstance(line, (list, tuple)) or len(line) < 2:
            return None

        if len(line) == 2:
            if isinstance(line[0], (list, tuple)):
                text, confidence = line[1], 1.0
            else:
                text, confidence = line
        else:
            text, confidence = line[1], line[2]

        try:
            score = float(confidence) if confidence is not None else 1.0
        except (TypeError, ValueError):
            # Unparseable confidence: drop the whole entry so texts and
            # confidences stay aligned.
            return None
        return str(text), score

    def extract_text(self, image_path: str) -> dict:
        """Run RapidOCR on an image file and summarise the recognised lines.

        Args:
            image_path: Path of the image to read.

        Returns:
            A dict with keys ``text`` (lines joined by newlines),
            ``confidence`` (mean line confidence, 0.0 when nothing was
            recognised), ``lines_detected`` and ``engine`` (``'rapidocr'``).

        Raises:
            ValueError: When the engine call or result handling fails.
        """
        try:
            ocr_result, _elapse = self.engine(image_path)

            parsed = []
            if ocr_result is not None:
                for line in ocr_result:
                    entry = self._parse_line(line)
                    if entry is not None:
                        parsed.append(entry)

            texts = [text for text, _ in parsed]
            confidences = [score for _, score in parsed]

            return {
                'text': '\n'.join(texts),
                'confidence': sum(confidences) / len(confidences) if confidences else 0.0,
                'lines_detected': len(texts),
                'engine': 'rapidocr'
            }
        except Exception as e:
            logger.error(f"RapidOCR extraction failed: {str(e)}")
            raise ValueError(f"RapidOCR error: {str(e)}") from e
226
-
227
class HybridOCRProcessor:
    """Hybrid processor that uses both engines and picks the best result"""

    def __init__(self):
        self.rapidocr = RapidOCREngine()
        self.tesseract = TesseractEngine()

    def extract_text(self, image_path: str, engine: str = "tesseract") -> dict:
        """Extract text with the requested engine, or with both.

        Args:
            image_path: Path to image
            engine: 'tesseract', 'rapidocr', or 'hybrid'

        Returns:
            The selected engine's result dict (``text``, ``confidence``,
            ``lines_detected``, ``engine``). In hybrid mode ``engine`` is
            suffixed with ``' (hybrid)'``.

        Raises:
            ValueError: For an unknown engine name, when the selected engine
                fails, or when both engines fail in hybrid mode.
        """
        if engine == OCREngine.TESSERACT:
            return self.tesseract.extract_text(image_path)
        if engine == OCREngine.RAPIDOCR:
            return self.rapidocr.extract_text(image_path)
        if engine == OCREngine.HYBRID:
            return self._extract_hybrid(image_path)
        raise ValueError(f"Unknown engine: {engine}")

    def _extract_hybrid(self, image_path: str) -> dict:
        """Run both engines and return the higher-confidence result (ties go to Tesseract)."""
        logger.info("Running hybrid OCR (Tesseract + RapidOCR)")

        tess_error = rapid_error = None

        try:
            tess_result = self.tesseract.extract_text(image_path)
        except Exception as e:
            logger.warning(f"Tesseract failed in hybrid mode: {e}")
            tess_error = e
            tess_result = {'text': '', 'confidence': 0.0, 'lines_detected': 0}

        try:
            rapid_result = self.rapidocr.extract_text(image_path)
        except Exception as e:
            logger.warning(f"RapidOCR failed in hybrid mode: {e}")
            rapid_error = e
            rapid_result = {'text': '', 'confidence': 0.0, 'lines_detected': 0}

        # One engine failing is tolerated (the other one wins). Both failing
        # is a real error and must not be reported as an empty success.
        if tess_error is not None and rapid_error is not None:
            raise ValueError(
                f"Hybrid OCR failed: tesseract: {tess_error}; rapidocr: {rapid_error}"
            )

        if tess_result['confidence'] >= rapid_result['confidence']:
            logger.info(f"Using Tesseract (conf: {tess_result['confidence']:.2%} vs {rapid_result['confidence']:.2%})")
            tess_result['engine'] = 'tesseract (hybrid)'
            return tess_result

        logger.info(f"Using RapidOCR (conf: {rapid_result['confidence']:.2%} vs {tess_result['confidence']:.2%})")
        rapid_result['engine'] = 'rapidocr (hybrid)'
        return rapid_result
276
-
277
class OCRProcessor:
    """Main OCR processor supporting multiple engines"""

    def __init__(self, engine: Optional[str] = None):
        """Remember the default engine and build the underlying hybrid processor.

        Args:
            engine: Engine used when ``process_file`` is called without one;
                falls back to ``Config.DEFAULT_ENGINE``.
        """
        self.engine_type = engine or Config.DEFAULT_ENGINE
        self.processor = HybridOCRProcessor()

    @staticmethod
    def _build_page(index: int, ocr_result: dict, engine_to_use: str) -> dict:
        """Convert one engine result into the per-page dict returned to callers."""
        return {
            "index": index,
            "page_number": index + 1,
            "text": ocr_result["text"],
            "confidence": ocr_result["confidence"],
            "lines_detected": ocr_result["lines_detected"],
            "engine_used": ocr_result.get("engine", engine_to_use)
        }

    def _process_pdf(self, file_path: str, engine_to_use: str) -> list:
        """Rasterise every PDF page to a temporary PNG and OCR each one."""
        logger.info("Converting PDF to Images...")
        images = convert_from_path(file_path)
        total = len(images)
        pages = []

        for idx, img in enumerate(images):
            logger.info(f"Scanning Page {idx + 1}/{total}")

            # Only reserve the path here; the handle is closed before the
            # image is written and read back by the OCR engine.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_img:
                temp_img_path = tmp_img.name

            try:
                # Saving inside the try guarantees the temp file is removed
                # even when writing the PNG fails.
                img.save(temp_img_path, 'PNG')
                ocr_result = self.processor.extract_text(temp_img_path, engine_to_use)
                pages.append(self._build_page(idx, ocr_result, engine_to_use))
            finally:
                try:
                    os.remove(temp_img_path)
                except OSError as cleanup_error:
                    # Best effort: a leftover temp file must not fail the request.
                    logger.warning(f"Failed to delete temp page image: {cleanup_error}")

        return pages

    def process_file(self, file_path: str, content_type: str, engine: Optional[str] = None) -> dict:
        """Process PDF or image file.

        Args:
            file_path: Path of the file to OCR.
            content_type: MIME type; ``application/pdf`` is split into pages,
                anything else is treated as a single image.
            engine: Engine override for this call; defaults to the instance's engine.

        Returns:
            A dict with ``total_pages``, ``pages_content`` (list of per-page
            dicts), ``average_confidence`` (mean over pages whose confidence
            is above zero, else 0.0) and ``engine``.

        Raises:
            ValueError: Wraps any failure during conversion or OCR.
        """
        start = time.perf_counter()
        engine_to_use = engine or self.engine_type

        try:
            logger.info(f"Processing File: {file_path} with engine: {engine_to_use}")

            if content_type == "application/pdf":
                pages_content = self._process_pdf(file_path, engine_to_use)
            else:
                logger.info("Scanning Single Image...")
                ocr_result = self.processor.extract_text(file_path, engine_to_use)
                pages_content = [self._build_page(0, ocr_result, engine_to_use)]

            # Pages with zero confidence (nothing recognised) are excluded
            # from the average.
            confidences = [page["confidence"] for page in pages_content if page["confidence"] > 0]
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
            processing_time = (time.perf_counter() - start) * 1000

            logger.info(f"OCR Complete in {processing_time:.2f}ms | Avg Confidence: {avg_confidence:.2%}")

            return {
                "total_pages": len(pages_content),
                "pages_content": pages_content,
                "average_confidence": avg_confidence,
                "engine": engine_to_use
            }

        except Exception as e:
            logger.error(f"OCR Logic Failure: {str(e)}")
            raise ValueError(str(e)) from e
358
-
359
- # ==========================================
360
- # 4. APP & MIDDLEWARE
361
- # ==========================================
362
app = FastAPI(title=Config.APP_NAME)

# With no origins configured, every origin is allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=Config.ALLOWED_ORIGINS or ["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
370
-
371
@app.middleware("http")
async def request_context_middleware(request: Request, call_next):
    """Assign a request ID, log start/finish, and turn crashes into a JSON 500.

    The ID is stored in ``request_id_ctx`` (for log records) and on
    ``request.state`` (for handlers), and is echoed in the ``X-Request-ID``
    response header on both success and failure.
    """
    req_id = str(uuid.uuid4())
    token = request_id_ctx.set(req_id)
    request.state.request_id = req_id

    start_time = time.perf_counter()
    logger.info(f"Start: {request.method} {request.url.path}")

    try:
        response = await call_next(request)
        duration = (time.perf_counter() - start_time) * 1000
        response.headers["X-Request-ID"] = req_id
        logger.info(f"Finish: {response.status_code} in {duration:.2f}ms")
        return response
    except Exception:
        logger.exception("Middleware caught crash")
        return JSONResponse(
            status_code=500,
            content={
                "status": "error",
                "message": "Internal Server Error",
                "request_id": req_id
            },
            # Same correlation header as successful responses.
            headers={"X-Request-ID": req_id},
        )
    finally:
        # Restore the previous context value so the ID does not leak past
        # this request.
        request_id_ctx.reset(token)
398
-
399
- # ==========================================
400
- # 5. ENDPOINTS
401
- # ==========================================
402
-
403
@app.get("/")
async def root(request: Request):
    """Return a static banner describing the service and its engines."""
    engine_names = [member.value for member in OCREngine]
    banner = {
        "request_id": request.state.request_id,
        "process_time_ms": 0,
        "status": StatusEnum.SUCCESS,
        "message": "Hybrid OCR API Active",
    }
    banner["engines"] = engine_names
    banner["default_engine"] = Config.DEFAULT_ENGINE
    banner["version"] = "2.0.0"
    return banner
414
-
415
@app.get("/health")
async def health_check(request: Request):
    """Health check endpoint.

    Returns a fixed payload; the engines are reported as "ready" without
    being probed.
    """
    engine_states = {name: "ready" for name in ("tesseract", "rapidocr")}
    return {
        "request_id": request.state.request_id,
        "status": StatusEnum.SUCCESS,
        "message": "Service healthy",
        "engines": engine_states,
    }
427
-
428
@app.post("/api/v1/get_data", response_model=APIResponse)
async def extract_data(
    request: Request,
    file: UploadFile = File(...),
    engine: Optional[str] = Form(default=None, description="OCR engine: tesseract, rapidocr, or hybrid"),
    token: str = Depends(SecurityService.validate_token)
):
    """
    Extract text from image or PDF

    - **file**: Image or PDF file to process
    - **engine**: Choose OCR engine (optional, can be sent as form data or query param;
      the form field wins when both are present)
        - `tesseract`: Best for English/European languages, highest accuracy (DEFAULT)
        - `rapidocr`: Faster, good for Asian languages
        - `hybrid`: Use both and pick best result (slower but most accurate)

    Example curl:
    ```bash
    # Using query parameter
    curl -X POST "http://localhost:7860/api/v1/get_data?engine=tesseract" -H "Authorization: Bearer your-token" -F "file=@document.pdf"

    # Using form data (payload)
    curl -X POST "http://localhost:7860/api/v1/get_data" -H "Authorization: Bearer your-token" -F "file=@document.pdf" -F "engine=hybrid"
    ```
    """
    start_ts = time.perf_counter()
    tmp_path = None
    req_id = request.state.request_id

    # The `engine` parameter is bound to the form body only, so the
    # documented `?engine=` form has to be read from the query string here.
    engine_to_use = engine or request.query_params.get("engine")

    # Validate engine parameter
    if engine_to_use and engine_to_use not in [e.value for e in OCREngine]:
        return JSONResponse(
            status_code=400,
            content={
                "request_id": req_id,
                "process_time_ms": (time.perf_counter() - start_ts) * 1000,
                "status": StatusEnum.ERROR,
                "error_message": f"Invalid engine '{engine_to_use}'. Must be one of: tesseract, rapidocr, hybrid"
            }
        )

    try:
        FileValidator.validate(file)
        tmp_path = FileValidator.check_size_and_save(file)

        logger.info(f"Processing with engine: {engine_to_use or Config.DEFAULT_ENGINE}")

        # OCR is blocking work; run it off the event loop.
        processor = OCRProcessor()
        result = await run_in_threadpool(
            processor.process_file,
            tmp_path,
            file.content_type,
            engine_to_use
        )

        return {
            "request_id": req_id,
            "process_time_ms": (time.perf_counter() - start_ts) * 1000,
            "status": StatusEnum.SUCCESS,
            "message": "OCR Extraction Successful",
            "data": {
                "filename": file.filename,
                "content_type": file.content_type,
                "saved_file_path": tmp_path,
                "total_pages": result["total_pages"],
                "pages_content": result["pages_content"],
                "average_confidence": result.get("average_confidence", 0.0),
                "engine": result["engine"]
            }
        }

    except Exception as e:
        logger.error(f"Request failed: {str(e)}")
        # HTTPException carries status_code/detail; anything else becomes a 500.
        status_code = getattr(e, "status_code", 500)
        return JSONResponse(
            status_code=status_code,
            content={
                "request_id": req_id,
                "process_time_ms": (time.perf_counter() - start_ts) * 1000,
                "status": StatusEnum.ERROR,
                "error_message": getattr(e, "detail", str(e))
            }
        )
    finally:
        # The temporary copy is removed whether OCR succeeded or not.
        if tmp_path:
            try:
                os.remove(tmp_path)
                logger.info(f"Temporary file deleted: {tmp_path}")
            except Exception as e:
                logger.warning(f"Failed to delete temp file: {str(e)}")
524
-
525
- # ==========================================
526
- # 6. STARTUP
527
- # ==========================================
528
-
529
@app.on_event("startup")
async def startup_event():
    """Initialize OCR engines on startup.

    A processor is constructed once and discarded, so that a failure to
    build the engines aborts startup.
    """
    logger.info("Starting Hybrid OCR API...")
    try:
        HybridOCRProcessor()
    except Exception as e:
        logger.error(f"Failed to initialize OCR engines: {str(e)}")
        raise
    else:
        logger.info("All OCR engines ready for processing")
539
-
540
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces; the port comes
    # from $PORT (default 7860).
    listen_port = int(os.getenv("PORT", 7860))
    uvicorn.run("main:app", host="0.0.0.0", port=listen_port, workers=4)