Spaces:

xce009
/

ocr-api

Running

App Files Files Community

xce009 committed on Jan 25

Commit

29ed039

verified ·

1 Parent(s): e62fae2

Delete main.py

Browse files

Files changed (1) hide show

main.py +0 -546

main.py DELETED Viewed

@@ -1,546 +0,0 @@
-import os
-import uuid
-import time
-import logging
-import shutil
-import tempfile
-from typing import Optional, List
-from enum import Enum
-from pathlib import Path
-from contextvars import ContextVar
-# Third-party imports
-import uvicorn
-import cv2
-import numpy as np
-import pytesseract
-from rapidocr_onnxruntime import RapidOCR
-from fastapi import (
-    FastAPI, File, UploadFile, Depends,
-    HTTPException, Request, Query, Form
-)
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
-from fastapi.responses import JSONResponse
-from fastapi.concurrency import run_in_threadpool
-from pydantic import BaseModel
-from dotenv import load_dotenv
-from PIL import Image
-from pdf2image import convert_from_path
-# ==========================================
-# 1. CONFIGURATION & LOGGING SETUP
-# ==========================================
-# Load environment variables via python-dotenv before Config (below) reads them with os.getenv.
-load_dotenv()
-# Context variable holding the current request's ID; yields "system" when nothing has set it.
-request_id_ctx: ContextVar[str] = ContextVar("request_id", default="system")
-class Config:
-    """Application settings, resolved from environment variables when the class body runs."""
-    # Title handed to the FastAPI application.
-    APP_NAME = os.getenv("APP_NAME", "Hybrid OCR API")
-    # Expected bearer token; None when API_BEARER_TOKEN is not set.
-    API_TOKEN = os.getenv("API_BEARER_TOKEN")
-    # Upload size limit in bytes (defaults to 50 MiB).
-    MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 50 * 1024 * 1024))
-    # Comma-separated origin list, trimmed, with empty entries dropped.
-    ALLOWED_ORIGINS = [
-        origin
-        for origin in map(str.strip, os.getenv("ALLOWED_ORIGINS", "").split(","))
-        if origin
-    ]
-    # Content types accepted for upload.
-    ALLOWED_TYPES = [
-        "image/jpeg",
-        "image/png",
-        "image/bmp",
-        "image/webp",
-        "application/pdf",
-    ]
-    DEFAULT_ENGINE = os.getenv("DEFAULT_OCR_ENGINE", "tesseract")  # or "rapidocr" or "hybrid"
-class RequestIdFilter(logging.Filter):
-    """Logging filter that copies the current request ID onto each record."""
-    def filter(self, record):
-        # Always returns True, so no record is dropped; the filter only attaches
-        # the `request_id` attribute that the log format string refers to.
-        setattr(record, "request_id", request_id_ctx.get())
-        return True
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s | %(levelname)s | ReqID:%(request_id)s | %(message)s',
-    datefmt='%Y-%m-%d %H:%M:%S',
-    force=True
-)
-# The root format above needs `request_id` on EVERY record that reaches the root
-# handlers. A filter attached to a logger only runs for records created through
-# that logger, so records from other loggers would arrive without the attribute
-# and fail to format. Attaching the filter to the handlers covers all of them.
-_request_id_filter = RequestIdFilter()
-for _root_handler in logging.getLogger().handlers:
-    _root_handler.addFilter(_request_id_filter)
-logger = logging.getLogger("ocr_api")
-# Kept on the application logger too, so handlers added to it later also see request_id.
-logger.addFilter(_request_id_filter)
-# ==========================================
-# 2. MODELS
-# ==========================================
-# Outcome marker placed in the "status" field of API responses.
-class StatusEnum(str, Enum):
-    SUCCESS = "success"
-    ERROR = "error"
-# Engine identifiers accepted by the API; being a str subclass, members compare equal to their plain string values.
-class OCREngine(str, Enum):
-    TESSERACT = "tesseract"
-    RAPIDOCR = "rapidocr"
-    HYBRID = "hybrid"  # Use both and pick best result
-# Fields shared by every API response envelope.
-class BaseResponse(BaseModel):
-    # ID generated per request by the HTTP middleware.
-    request_id: str
-    # Handler wall-clock time in milliseconds.
-    process_time_ms: float
-    status: StatusEnum
-    message: Optional[str] = None
-# OCR output for a single page (or for the single image of a non-PDF upload).
-class PageResult(BaseModel):
-    # Zero-based position of the page.
-    index: int
-    # One-based page number (index + 1 in OCRProcessor.process_file).
-    page_number: int
-    text: str
-    # Average confidence reported by the engine for this page.
-    confidence: Optional[float] = None
-    # Number of text items the engine returned (words for Tesseract, rows for RapidOCR).
-    lines_detected: Optional[int] = None
-    # Engine label; hybrid mode appends " (hybrid)" to the winning engine's name.
-    engine_used: Optional[str] = None
-# Payload returned in the "data" field of a successful extraction.
-class OCRResult(BaseModel):
-    filename: str
-    content_type: str
-    # Temp path the upload was written to. NOTE(review): the endpoint deletes this
-    # file before the response is sent, so the path is informational only.
-    saved_file_path: str
-    total_pages: int
-    pages_content: List[PageResult]
-    # Mean of the per-page confidences that are greater than zero.
-    average_confidence: Optional[float] = None
-    # Engine that was requested (or the configured default).
-    engine: str
-# Full response envelope for the extraction endpoint: base fields plus either data or an error message.
-class APIResponse(BaseResponse):
-    data: Optional[OCRResult] = None
-    error_message: Optional[str] = None
-# ==========================================
-# 3. SERVICES
-# ==========================================
-class SecurityService:
-    """Bearer-token authentication used as a FastAPI dependency."""
-    security_scheme = HTTPBearer()
-    @staticmethod
-    async def validate_token(credentials: HTTPAuthorizationCredentials = Depends(security_scheme)):
-        """Check the presented bearer token against Config.API_TOKEN.
-        Returns:
-            The presented token string when it matches.
-        Raises:
-            HTTPException: 401 when the token does not match or no token is configured.
-        """
-        import secrets
-        expected = Config.API_TOKEN
-        presented = credentials.credentials
-        # Constant-time comparison avoids leaking how many leading characters matched.
-        # Bytes are compared because compare_digest rejects non-ASCII str arguments.
-        # A missing/empty configured token never authenticates anyone.
-        token_ok = bool(expected) and secrets.compare_digest(
-            presented.encode("utf-8"), expected.encode("utf-8")
-        )
-        if not token_ok:
-            logger.warning("Auth Failed: Invalid Token")
-            raise HTTPException(status_code=401, detail="Invalid Bearer Token")
-        return presented
-class FileValidator:
-    """Validates uploads and spools them to a temporary file on disk."""
-    # Read granularity used while copying the upload to disk.
-    _CHUNK_SIZE = 1024 * 1024
-    @staticmethod
-    def validate(file: UploadFile):
-        """Reject uploads whose declared content type is not in Config.ALLOWED_TYPES.
-        Raises:
-            HTTPException: 400 when the content type is not allowed.
-        """
-        if file.content_type not in Config.ALLOWED_TYPES:
-            raise HTTPException(400, f"Invalid file type: {file.content_type}")
-    @staticmethod
-    def check_size_and_save(file: UploadFile) -> str:
-        """Copy the upload to a temp file, enforcing Config.MAX_SIZE while copying.
-        Returns:
-            Absolute path of the temp file; the caller is responsible for deleting it.
-        Raises:
-            HTTPException: 413 when the upload exceeds Config.MAX_SIZE.
-        """
-        # filename can be missing; fall back to "no suffix" instead of crashing in Path().
-        suffix = Path(file.filename or "").suffix
-        tmp_path = None
-        written = 0
-        try:
-            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as buffer:
-                tmp_path = os.path.abspath(buffer.name)
-                while True:
-                    chunk = file.file.read(FileValidator._CHUNK_SIZE)
-                    if not chunk:
-                        break
-                    written += len(chunk)
-                    # Abort as soon as the limit is crossed rather than writing the
-                    # whole (possibly huge) upload to disk before checking its size.
-                    if written > Config.MAX_SIZE:
-                        raise HTTPException(413, "File too large")
-                    buffer.write(chunk)
-        except BaseException:
-            # Never leave a partial temp file behind, whatever interrupted the copy.
-            if tmp_path is not None and os.path.exists(tmp_path):
-                os.remove(tmp_path)
-            raise
-        return tmp_path
-class TesseractEngine:
-    """Tesseract OCR Engine - Best for English/European languages"""
-    @staticmethod
-    def extract_text(image_path: str) -> dict:
-        """Extract text from an image file using Tesseract.
-        Args:
-            image_path: Path to the image to scan.
-        Returns:
-            Dict with 'text' (space-joined items), 'confidence' (mean of the kept
-            items, scaled to 0-1), 'lines_detected' (number of kept items) and 'engine'.
-        Raises:
-            ValueError: wrapping any failure while opening or scanning the image.
-        """
-        try:
-            # The context manager closes the image's file handle once OCR has run.
-            with Image.open(image_path) as img:
-                data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
-            # Keep only non-empty items with a positive confidence
-            texts = []
-            confidences = []
-            for text, raw_conf in zip(data['text'], data['conf']):
-                if not text.strip():
-                    continue
-                # float() first: the value may be an int, a float, or a numeric
-                # string such as '96.5', which a bare int() would reject.
-                conf = int(float(raw_conf))
-                if conf > 0:
-                    texts.append(text)
-                    confidences.append(conf / 100.0)
-            combined_text = ' '.join(texts)
-            avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
-            return {
-                'text': combined_text,
-                'confidence': avg_confidence,
-                'lines_detected': len(texts),
-                'engine': 'tesseract'
-            }
-        except Exception as e:
-            logger.error(f"Tesseract extraction failed: {str(e)}")
-            raise ValueError(f"Tesseract error: {str(e)}") from e
-class RapidOCREngine:
-    """RapidOCR Engine - Fast and lightweight"""
-    def __init__(self):
-        self.engine = RapidOCR()
-    @staticmethod
-    def _unpack_line(line):
-        """Return (text, confidence) for one raw result row, or None if its shape is unrecognised."""
-        if not isinstance(line, (list, tuple)) or len(line) < 2:
-            return None
-        if len(line) == 2:
-            if isinstance(line[0], (list, tuple)):
-                # (box, text): no score supplied, treated as full confidence
-                return line[1], 1.0
-            # (text, confidence)
-            return line[0], line[1]
-        # (box, text, confidence, ...): any trailing fields are ignored
-        return line[1], line[2]
-    def extract_text(self, image_path: str) -> dict:
-        """Extract text from an image file using RapidOCR.
-        Args:
-            image_path: Path to the image to scan.
-        Returns:
-            Dict with 'text' (newline-joined rows), 'confidence' (mean of the parsed
-            scores), 'lines_detected' (number of rows kept) and 'engine'.
-        Raises:
-            ValueError: wrapping any failure raised while running the engine.
-        """
-        try:
-            ocr_result, _elapse = self.engine(image_path)
-            if hasattr(ocr_result, '__iter__') and not isinstance(ocr_result, str):
-                result = list(ocr_result)
-            else:
-                result = ocr_result
-            if not result:
-                return {
-                    'text': '',
-                    'confidence': 0.0,
-                    'lines_detected': 0,
-                    'engine': 'rapidocr'
-                }
-            texts = []
-            confidences = []
-            for line in result:
-                parsed = self._unpack_line(line)
-                if parsed is None:
-                    continue
-                text, confidence = parsed
-                texts.append(str(text))
-                try:
-                    confidences.append(float(confidence) if confidence is not None else 1.0)
-                except (TypeError, ValueError):
-                    # Keep the recognised text; a non-numeric score simply does not
-                    # contribute to the average.
-                    continue
-            combined_text = '\n'.join(texts)
-            avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
-            return {
-                'text': combined_text,
-                'confidence': avg_confidence,
-                'lines_detected': len(texts),
-                'engine': 'rapidocr'
-            }
-        except Exception as e:
-            logger.error(f"RapidOCR extraction failed: {str(e)}")
-            raise ValueError(f"RapidOCR error: {str(e)}") from e
-class HybridOCRProcessor:
-    """Routes OCR to Tesseract, RapidOCR, or both (keeping the more confident result)."""
-    def __init__(self):
-        self.rapidocr = RapidOCREngine()
-        self.tesseract = TesseractEngine()
-    def extract_text(self, image_path: str, engine: str = "tesseract") -> dict:
-        """
-        Run OCR on one image with the requested engine
-        Args:
-            image_path: Path to image
-            engine: 'tesseract', 'rapidocr', or 'hybrid'
-        Returns:
-            The engine's result dict; in hybrid mode the result with the higher
-            confidence (Tesseract on a tie), with 'engine' marked as "(hybrid)".
-        Raises:
-            ValueError: if the engine name is not recognised.
-        """
-        # Single-engine modes delegate directly.
-        if engine == OCREngine.TESSERACT:
-            return self.tesseract.extract_text(image_path)
-        if engine == OCREngine.RAPIDOCR:
-            return self.rapidocr.extract_text(image_path)
-        if engine != OCREngine.HYBRID:
-            raise ValueError(f"Unknown engine: {engine}")
-        logger.info("Running hybrid OCR (Tesseract + RapidOCR)")
-        # An engine that fails contributes an empty, zero-confidence result.
-        try:
-            tess = self.tesseract.extract_text(image_path)
-        except Exception as exc:
-            logger.warning(f"Tesseract failed in hybrid mode: {exc}")
-            tess = dict(text='', confidence=0.0, lines_detected=0)
-        try:
-            rapid = self.rapidocr.extract_text(image_path)
-        except Exception as exc:
-            logger.warning(f"RapidOCR failed in hybrid mode: {exc}")
-            rapid = dict(text='', confidence=0.0, lines_detected=0)
-        tess_conf = tess['confidence']
-        rapid_conf = rapid['confidence']
-        if tess_conf >= rapid_conf:
-            logger.info(f"Using Tesseract (conf: {tess_conf:.2%} vs {rapid_conf:.2%})")
-            tess['engine'] = 'tesseract (hybrid)'
-            return tess
-        logger.info(f"Using RapidOCR (conf: {rapid_conf:.2%} vs {tess_conf:.2%})")
-        rapid['engine'] = 'rapidocr (hybrid)'
-        return rapid
-class OCRProcessor:
-    """Main OCR processor supporting multiple engines"""
-    def __init__(self, engine: Optional[str] = None):
-        self.engine_type = engine or Config.DEFAULT_ENGINE
-        self.processor = HybridOCRProcessor()
-    def _scan_page(self, image_path: str, index: int, engine_to_use: str) -> dict:
-        """Run OCR on one image and return its page entry (zero-based index, one-based page number)."""
-        ocr_result = self.processor.extract_text(image_path, engine_to_use)
-        return {
-            "index": index,
-            "page_number": index + 1,
-            "text": ocr_result["text"],
-            "confidence": ocr_result["confidence"],
-            "lines_detected": ocr_result["lines_detected"],
-            "engine_used": ocr_result.get("engine", engine_to_use)
-        }
-    def process_file(self, file_path: str, content_type: str, engine: Optional[str] = None) -> dict:
-        """Process PDF or image file
-        Args:
-            file_path: Path of the uploaded file on disk.
-            content_type: "application/pdf" is rendered page by page; anything else is scanned as one image.
-            engine: Engine name overriding the instance default when given.
-        Returns:
-            Dict with 'total_pages', 'pages_content', 'average_confidence' (mean of
-            the page confidences greater than zero) and 'engine'.
-        Raises:
-            ValueError: wrapping any failure during conversion or OCR.
-        """
-        start = time.perf_counter()
-        pages_content = []
-        engine_to_use = engine or self.engine_type
-        try:
-            logger.info(f"Processing File: {file_path} with engine: {engine_to_use}")
-            if content_type == "application/pdf":
-                logger.info("Converting PDF to Images...")
-                images = convert_from_path(file_path)
-                total = len(images)
-                for idx, img in enumerate(images):
-                    logger.info(f"Scanning Page {idx + 1}/{total}")
-                    # Reserve a temp path and close the handle before writing: saving by
-                    # name while the handle is still open fails on some platforms.
-                    with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_img:
-                        temp_img_path = tmp_img.name
-                    try:
-                        # Saving inside the try ensures the temp file is removed even if save() fails.
-                        img.save(temp_img_path, 'PNG')
-                        pages_content.append(self._scan_page(temp_img_path, idx, engine_to_use))
-                    finally:
-                        try:
-                            os.remove(temp_img_path)
-                        except OSError as cleanup_err:
-                            # Best-effort cleanup: report it, but do not fail the page.
-                            logger.warning(f"Failed to delete temp page image: {cleanup_err}")
-            else:
-                logger.info("Scanning Single Image...")
-                pages_content.append(self._scan_page(file_path, 0, engine_to_use))
-            # Pages with zero confidence are left out of the average.
-            all_confidences = [p["confidence"] for p in pages_content if p["confidence"] > 0]
-            avg_confidence = sum(all_confidences) / len(all_confidences) if all_confidences else 0.0
-            processing_time = (time.perf_counter() - start) * 1000
-            logger.info(f"OCR Complete in {processing_time:.2f}ms | Avg Confidence: {avg_confidence:.2%}")
-            return {
-                "total_pages": len(pages_content),
-                "pages_content": pages_content,
-                "average_confidence": avg_confidence,
-                "engine": engine_to_use
-            }
-        except Exception as e:
-            logger.error(f"OCR Logic Failure: {str(e)}")
-            raise ValueError(str(e)) from e
-# ==========================================
-# 4. APP & MIDDLEWARE
-# ==========================================
-app = FastAPI(title=Config.APP_NAME)
-# CORS: use the configured origins, or allow any origin when none are configured.
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=Config.ALLOWED_ORIGINS or ["*"],
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-@app.middleware("http")
-async def request_context_middleware(request: Request, call_next):
-    """Assign a request ID, log start/finish, and turn unhandled errors into a JSON 500.
-    The ID is stored in request_id_ctx (for logging) and request.state.request_id
-    (for handlers), and returned in the X-Request-ID header on every response.
-    """
-    req_id = str(uuid.uuid4())
-    token = request_id_ctx.set(req_id)
-    request.state.request_id = req_id
-    start_time = time.perf_counter()
-    logger.info(f"Start: {request.method} {request.url.path}")
-    try:
-        response = await call_next(request)
-    except Exception:
-        logger.exception("Middleware caught crash")
-        return JSONResponse(
-            status_code=500,
-            content={
-                "status": "error",
-                "message": "Internal Server Error",
-                "request_id": req_id
-            },
-            # Error responses carry the same correlation header as successful ones.
-            headers={"X-Request-ID": req_id}
-        )
-    else:
-        duration = (time.perf_counter() - start_time) * 1000
-        response.headers["X-Request-ID"] = req_id
-        logger.info(f"Finish: {response.status_code} in {duration:.2f}ms")
-        return response
-    finally:
-        # Restore the previous context value whichever branch returned.
-        request_id_ctx.reset(token)
-# ==========================================
-# 5. ENDPOINTS
-# ==========================================
-@app.get("/")
-async def root(request: Request):
-    # Service banner: static metadata plus the current request's ID.
-    supported_engines = ["tesseract", "rapidocr", "hybrid"]
-    banner = dict(
-        request_id=request.state.request_id,
-        process_time_ms=0,
-        status=StatusEnum.SUCCESS,
-        message="Hybrid OCR API Active",
-        engines=supported_engines,
-        default_engine=Config.DEFAULT_ENGINE,
-        version="2.0.0",
-    )
-    return banner
-@app.get("/health")
-async def health_check(request: Request):
-    """Health check endpoint"""
-    # The engine states are fixed strings; no engine is probed here.
-    engine_states = dict(tesseract="ready", rapidocr="ready")
-    report = dict(
-        request_id=request.state.request_id,
-        status=StatusEnum.SUCCESS,
-        message="Service healthy",
-        engines=engine_states,
-    )
-    return report
-@app.post("/api/v1/get_data", response_model=APIResponse)
-async def extract_data(
-    request: Request,
-    file: UploadFile = File(...),
-    engine: Optional[str] = Form(default=None, description="OCR engine: tesseract, rapidocr, or hybrid"),
-    token: str = Depends(SecurityService.validate_token),
-    engine_query: Optional[str] = Query(default=None, alias="engine", description="OCR engine: tesseract, rapidocr, or hybrid"),
-):
-    r"""
-    Extract text from image or PDF
-    - **file**: Image or PDF file to process
-    - **engine**: Choose OCR engine (optional, can be sent as form data or query param;
-      the form value wins when both are supplied)
-      - `tesseract`: Best for English/European languages, highest accuracy (DEFAULT)
-      - `rapidocr`: Faster, good for Asian languages
-      - `hybrid`: Use both and pick best result (slower but most accurate)
-    Example curl:
-    ```bash
-    # Using query parameter
-    curl -X POST "http://localhost:7860/api/v1/get_data?engine=tesseract" \
-      -H "Authorization: Bearer your-token" \
-      -F "file=@document.pdf"
-    # Using form data (payload)
-    curl -X POST "http://localhost:7860/api/v1/get_data" \
-      -H "Authorization: Bearer your-token" \
-      -F "file=@document.pdf" \
-      -F "engine=hybrid"
-    ```
-    """
-    # `token` is unused in the body; declaring it runs the bearer-token check.
-    start_ts = time.perf_counter()
-    tmp_path = None
-    req_id = request.state.request_id
-    # Accept the engine from the form body or, as the docs promise, the query string.
-    engine_to_use = engine or engine_query
-    # Validate engine parameter
-    if engine_to_use and engine_to_use not in [e.value for e in OCREngine]:
-        return JSONResponse(
-            status_code=400,
-            content={
-                "request_id": req_id,
-                "process_time_ms": (time.perf_counter() - start_ts) * 1000,
-                "status": StatusEnum.ERROR,
-                "error_message": f"Invalid engine '{engine_to_use}'. Must be one of: tesseract, rapidocr, hybrid"
-            }
-        )
-    try:
-        FileValidator.validate(file)
-        tmp_path = FileValidator.check_size_and_save(file)
-        logger.info(f"Processing with engine: {engine_to_use or Config.DEFAULT_ENGINE}")
-        # NOTE(review): a new OCRProcessor (and with it a new RapidOCR instance) is
-        # built per request; sharing one would be cheaper but needs a check that
-        # the engines tolerate concurrent use.
-        processor = OCRProcessor()
-        # OCR is blocking work, so it runs in the threadpool rather than on the event loop.
-        result = await run_in_threadpool(
-            processor.process_file,
-            tmp_path,
-            file.content_type,
-            engine_to_use
-        )
-        return {
-            "request_id": req_id,
-            "process_time_ms": (time.perf_counter() - start_ts) * 1000,
-            "status": StatusEnum.SUCCESS,
-            "message": "OCR Extraction Successful",
-            "data": {
-                "filename": file.filename,
-                "content_type": file.content_type,
-                "saved_file_path": tmp_path,
-                "total_pages": result["total_pages"],
-                "pages_content": result["pages_content"],
-                "average_confidence": result.get("average_confidence", 0.0),
-                "engine": result["engine"]
-            }
-        }
-    except Exception as e:
-        logger.error(f"Request failed: {str(e)}")
-        # HTTPException carries status_code/detail; anything else becomes a 500.
-        status_code = getattr(e, "status_code", 500)
-        return JSONResponse(
-            status_code=status_code,
-            content={
-                "request_id": req_id,
-                "process_time_ms": (time.perf_counter() - start_ts) * 1000,
-                "status": StatusEnum.ERROR,
-                "error_message": getattr(e, "detail", str(e))
-            }
-        )
-    finally:
-        # The uploaded temp file is always removed, on success and on failure.
-        if tmp_path:
-            try:
-                os.remove(tmp_path)
-                logger.info(f"Temporary file deleted: {tmp_path}")
-            except Exception as e:
-                logger.warning(f"Failed to delete temp file: {str(e)}")
-# ==========================================
-# 6. STARTUP
-# ==========================================
-@app.on_event("startup")
-async def startup_event():
-    """Initialize OCR engines on startup"""
-    logger.info("Starting Hybrid OCR API...")
-    try:
-        # Construct the engines once so a broken installation fails at startup;
-        # the instance itself is not kept.
-        HybridOCRProcessor()
-        logger.info("All OCR engines ready for processing")
-    except Exception as e:
-        logger.error(f"Failed to initialize OCR engines: {str(e)}")
-        raise
-if __name__ == "__main__":
-    # The PORT environment variable overrides the default port 7860.
-    listen_port = int(os.getenv("PORT", 7860))
-    uvicorn.run("main:app", host="0.0.0.0", port=listen_port, workers=4)