Spaces:

xce009
/

ocr-api

Running

App Files Files Community

Soumik Bose committed on Jan 25

Commit

3ddb265

1 Parent(s): 8be8190

go

Browse files

Files changed (4) hide show

.gitignore +126 -0
Dockerfile +9 -12
main.py +262 -22
requirements.txt +3 -2

.gitignore ADDED Viewed

	@@ -0,0 +1,126 @@

+# Environment Variables
+.env
+.env.local
+.env.*.local
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# Virtual Environments
+venv/
+env/
+ENV/
+env.bak/
+venv.bak/
+.venv
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+.DS_Store
+# Jupyter Notebook
+.ipynb_checkpoints
+# PyCharm
+.idea/
+# Testing
+.pytest_cache/
+.coverage
+.coverage.*
+htmlcov/
+.tox/
+.nox/
+coverage.xml
+*.cover
+.hypothesis/
+# Logs
+*.log
+logs/
+*.log.*
+# Temporary files
+*.tmp
+*.temp
+temp/
+tmp/
+*.bak
+# OCR Processing files
+uploads/
+processed/
+output/
+samples/
+*.pdf
+*.jpg
+*.jpeg
+*.png
+*.bmp
+*.webp
+*.tiff
+# Docker
+*.env.docker
+docker-compose.override.yml
+# OS
+Thumbs.db
+.DS_Store
+*.swp
+# Database
+*.db
+*.sqlite
+*.sqlite3
+# Cache
+.cache/
+*.cache
+__pycache__/
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Model files (if downloading OCR models)
+*.onnx
+models/
+weights/
+# Hugging Face cache
+.huggingface/
+# Node modules (if using any JS tooling)
+node_modules/
+# Secrets and certificates
+*.pem
+*.key
+*.crt
+secrets/
+credentials/

Dockerfile CHANGED Viewed

@@ -2,23 +2,15 @@ FROM python:3.11-slim
 WORKDIR /app
-# Install system dependencies
 RUN apt-get update && apt-get install -y \
     curl \
-    tesseract-ocr \
-    tesseract-ocr-eng \
-    tesseract-ocr-deu \
-    tesseract-ocr-fra \
-    tesseract-ocr-spa \
-    tesseract-ocr-por \
-    tesseract-ocr-ita \
-    tesseract-ocr-rus \
-    tesseract-ocr-chi-sim \
-    tesseract-ocr-jpn \
-    tesseract-ocr-kor \
     poppler-utils \
     libgl1 \
     libglib2.0-0 \
     && rm -rf /var/lib/apt/lists/*
 # Fix: Ensure logs appear immediately in the console
@@ -27,19 +19,24 @@ ENV PYTHONIOENCODING=UTF-8
 ENV HF_HOME=/tmp/cache
 ENV PORT=7860
 COPY requirements.txt .
 RUN pip install --upgrade pip setuptools wheel \
     && pip install --default-timeout=100 --retries=10 --no-cache-dir -r requirements.txt
 COPY . .
 RUN useradd -m appuser && chown -R appuser /app
 USER appuser
 RUN mkdir -p ${HF_HOME} && chmod 777 ${HF_HOME}
 EXPOSE $PORT
 CMD bash -c "\
 (while true; do curl -s https://xce009-ocr-api.hf.space >/dev/null; sleep 300; done) & \
 uvicorn main:app --host 0.0.0.0 --port ${PORT} --workers 4"

 WORKDIR /app
+# Install system dependencies for RapidOCR and PDF processing
 RUN apt-get update && apt-get install -y \
     curl \
     poppler-utils \
     libgl1 \
     libglib2.0-0 \
+    libgomp1 \
+    gcc \
+    g++ \
     && rm -rf /var/lib/apt/lists/*
 # Fix: Ensure logs appear immediately in the console
 ENV HF_HOME=/tmp/cache
 ENV PORT=7860
+# Copy requirements and install dependencies
 COPY requirements.txt .
 RUN pip install --upgrade pip setuptools wheel \
     && pip install --default-timeout=100 --retries=10 --no-cache-dir -r requirements.txt
+# Copy application files
 COPY . .
+# Create non-root user
 RUN useradd -m appuser && chown -R appuser /app
 USER appuser
+# Create cache directory
 RUN mkdir -p ${HF_HOME} && chmod 777 ${HF_HOME}
 EXPOSE $PORT
+# Start the application
 CMD bash -c "\
 (while true; do curl -s https://xce009-ocr-api.hf.space >/dev/null; sleep 300; done) & \
 uvicorn main:app --host 0.0.0.0 --port ${PORT} --workers 4"

main.py CHANGED Viewed

@@ -11,7 +11,9 @@ from contextvars import ContextVar
 # Third-party imports
 import uvicorn
-import pytesseract
 from fastapi import (
     FastAPI, File, UploadFile, Depends,
     HTTPException, Request, status
@@ -39,10 +41,16 @@ class Config:
     MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800)) # 50MB
     ALLOWED_ORIGINS = [o.strip() for o in os.getenv("ALLOWED_ORIGINS", "").split(",") if o.strip()]
     ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
 class RequestIdFilter(logging.Filter):
     def filter(self, record):
-        # Automatically pull request_id from the context variable
         record.request_id = request_id_ctx.get()
         return True
@@ -50,7 +58,7 @@ logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s | %(levelname)s | ReqID:%(request_id)s | %(message)s',
     datefmt='%Y-%m-%d %H:%M:%S',
-    force=True # Ensures our config is applied
 )
 logger = logging.getLogger("ocr_api")
 logger.addFilter(RequestIdFilter())
@@ -72,6 +80,8 @@ class PageResult(BaseModel):
     index: int
     page_number: int
     text: str
 class OCRResult(BaseModel):
     filename: str
@@ -79,6 +89,7 @@ class OCRResult(BaseModel):
     saved_file_path: str
     total_pages: int
     pages_content: List[PageResult]
 class APIResponse(BaseResponse):
     data: Optional[OCRResult] = None
@@ -116,12 +127,146 @@ class FileValidator:
             raise HTTPException(413, "File too large")
         return tmp_path
 class OCRProcessor:
-    @classmethod
-    def process_file(cls, file_path: str, content_type: str) -> dict:
-        """Note: No longer passing request_id; logger picks it up from contextvars automatically."""
         start = time.perf_counter()
         pages_content = []
         try:
             logger.info(f"Processing File: {file_path}")
@@ -134,16 +279,54 @@ class OCRProcessor:
                 for idx, img in enumerate(images):
                     page_num = idx + 1
                     logger.info(f"Scanning Page {page_num}/{total}")
-                    text = pytesseract.image_to_string(img).strip()
-                    pages_content.append({"index": idx, "page_number": page_num, "text": text})
             else:
                 logger.info("Scanning Single Image...")
-                img = Image.open(file_path)
-                text = pytesseract.image_to_string(img).strip()
-                pages_content.append({"index": 0, "page_number": 1, "text": text})
-            logger.info(f"OCR Complete in {(time.perf_counter()-start)*1000:.2f}ms")
-            return {"total_pages": len(pages_content), "pages_content": pages_content}
         except Exception as e:
             logger.error(f"OCR Logic Failure: {str(e)}")
@@ -180,7 +363,14 @@ async def request_context_middleware(request: Request, call_next):
         return response
     except Exception as e:
         logger.exception("Middleware caught crash")
-        return JSONResponse(status_code=500, content={"status":"error","message":"Internal Server Error","request_id":req_id})
     finally:
         # 3. Clean up Context
         request_id_ctx.reset(token)
@@ -195,9 +385,34 @@ async def root(request: Request):
         "request_id": request.state.request_id,
         "process_time_ms": 0,
         "status": StatusEnum.SUCCESS,
-        "message": "OCR API Active"
     }
 @app.post("/api/v1/get_data", response_model=APIResponse)
 async def extract_data(
     request: Request,
@@ -212,10 +427,11 @@ async def extract_data(
         FileValidator.validate(file)
         tmp_path = FileValidator.check_size_and_save(file)
-        # CPU heavy task run in thread pool.
-        # ContextVars are automatically copied to the thread.
         result = await run_in_threadpool(
-            OCRProcessor.process_file,
             tmp_path,
             file.content_type
         )
@@ -230,7 +446,8 @@ async def extract_data(
                 "content_type": file.content_type,
                 "saved_file_path": tmp_path,
                 "total_pages": result["total_pages"],
-                "pages_content": result["pages_content"]
             }
         }
@@ -250,7 +467,30 @@ async def extract_data(
         if tmp_path:
             logger.info(f"File preserved at: {tmp_path}")
             try:
-              os.remove(tmp_path)
-              logger.info(f"Temporary file deleted: {tmp_path}")
             except Exception as e:
-              logger.warning(f"Failed to delete temp file: {str(e)}")

 # Third-party imports
 import uvicorn
+import cv2
+import numpy as np
+from rapidocr_onnxruntime import RapidOCR
 from fastapi import (
     FastAPI, File, UploadFile, Depends,
     HTTPException, Request, status
     MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800)) # 50MB
     ALLOWED_ORIGINS = [o.strip() for o in os.getenv("ALLOWED_ORIGINS", "").split(",") if o.strip()]
     ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
+    # RapidOCR Settings
+    USE_ANGLE_CLS = os.getenv("OCR_USE_ANGLE_CLS", "true").lower() == "true"
+    USE_TEXT_SCORE = os.getenv("OCR_USE_TEXT_SCORE", "true").lower() == "true"
+    MIN_HEIGHT = int(os.getenv("OCR_MIN_HEIGHT", "30"))
+    TEXT_SCORE_THRESHOLD = float(os.getenv("OCR_TEXT_SCORE", "0.5"))
+    ENABLE_PREPROCESSING = os.getenv("OCR_PREPROCESSING", "true").lower() == "true"
 class RequestIdFilter(logging.Filter):
     """Logging filter that attaches the current request id to every record."""
     def filter(self, record):
         """Set ``record.request_id`` from ``request_id_ctx`` and keep the record."""
         current_id = request_id_ctx.get()
         record.request_id = current_id
         return True
     level=logging.INFO,
     format='%(asctime)s | %(levelname)s | ReqID:%(request_id)s | %(message)s',
     datefmt='%Y-%m-%d %H:%M:%S',
+    force=True
 )
 logger = logging.getLogger("ocr_api")
 logger.addFilter(RequestIdFilter())
     index: int
     page_number: int
     text: str
+    confidence: Optional[float] = None
+    lines_detected: Optional[int] = None
 class OCRResult(BaseModel):
     filename: str
     saved_file_path: str
     total_pages: int
     pages_content: List[PageResult]
+    average_confidence: Optional[float] = None
 class APIResponse(BaseResponse):
     data: Optional[OCRResult] = None
             raise HTTPException(413, "File too large")
         return tmp_path
+class RapidOCREngine:
+    """Singleton wrapper that builds one RapidOCR engine and hands it out for reuse.
+
+    ``RapidOCREngine()`` always returns the same instance once the engine has
+    been built successfully. If building the engine raises, nothing is cached,
+    so the next call retries instead of returning an instance whose engine is
+    ``None``. No lock guards construction.
+    """
+    _instance = None
+    _engine = None
+    def __new__(cls):
+        """Return the shared instance, creating and initializing it on first use.
+
+        Raises:
+            Exception: whatever ``_initialize_engine`` raises on failure.
+        """
+        if cls._instance is None:
+            candidate = super().__new__(cls)
+            # Publish the instance only after the engine was built; otherwise a
+            # failed start would be cached as a singleton with ``_engine`` unset.
+            candidate._initialize_engine()
+            cls._instance = candidate
+        return cls._instance
+    def _initialize_engine(self):
+        """Build the CPU-only RapidOCR engine from the ``Config`` OCR settings.
+
+        Raises:
+            Exception: re-raised unchanged after logging if construction fails.
+        """
+        try:
+            self._engine = RapidOCR(
+                det_use_cuda=False,
+                cls_use_cuda=False,
+                rec_use_cuda=False,
+                use_angle_cls=Config.USE_ANGLE_CLS,
+                use_text_score=Config.USE_TEXT_SCORE,
+                print_verbose=False,
+                min_height=Config.MIN_HEIGHT,
+                text_score=Config.TEXT_SCORE_THRESHOLD
+            )
+            logger.info("RapidOCR engine initialized successfully")
+        except Exception as e:
+            logger.error(f"Failed to initialize RapidOCR: {str(e)}")
+            raise
+    def get_engine(self):
+        """Return the underlying RapidOCR callable held by this singleton."""
+        return self._engine
+    @staticmethod
+    def preprocess_image(img_array):
+        """Clean up an image before OCR.
+
+        Pipeline: grayscale -> non-local-means denoise -> CLAHE contrast ->
+        3x3 sharpen -> Gaussian adaptive threshold.
+
+        Args:
+            img_array: image as a numpy array; 3-dimensional input is converted
+                with ``COLOR_BGR2GRAY``, anything else is used as-is.
+
+        Returns:
+            The thresholded image, or ``img_array`` unchanged when
+            ``Config.ENABLE_PREPROCESSING`` is false or any step fails.
+        """
+        if not Config.ENABLE_PREPROCESSING:
+            return img_array
+        try:
+            # Convert to grayscale if needed
+            if len(img_array.shape) == 3:
+                gray = cv2.cvtColor(img_array, cv2.COLOR_BGR2GRAY)
+            else:
+                gray = img_array
+            # Denoise
+            denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
+            # Enhance contrast using CLAHE
+            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+            contrast = clahe.apply(denoised)
+            # Sharpen
+            kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
+            sharpened = cv2.filter2D(contrast, -1, kernel)
+            # Adaptive threshold
+            processed = cv2.adaptiveThreshold(
+                sharpened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                cv2.THRESH_BINARY, 11, 2
+            )
+            return processed
+        except Exception as e:
+            # Best effort: OCR on the raw image beats failing the request.
+            logger.warning(f"Preprocessing failed, using original image: {str(e)}")
+            return img_array
 class OCRProcessor:
+    def __init__(self):
+        """Bind this processor to the shared RapidOCR engine."""
+        shared = RapidOCREngine()
+        self.ocr_engine = shared.get_engine()
+    @staticmethod
+    def _parse_ocr_line(line):
+        """Turn one OCR result row into ``(text, confidence)``, or ``None`` if unusable.
+
+        Accepted row shapes: ``[box, text]``, ``[text, confidence]``,
+        ``[box, text, confidence]`` and longer rows whose second and third
+        items are text and confidence. A missing confidence counts as 1.0.
+
+        Raises:
+            TypeError, ValueError: if the confidence cannot be converted to float.
+        """
+        if not isinstance(line, (list, tuple)) or len(line) < 2:
+            return None
+        if len(line) == 2:
+            if isinstance(line[0], (list, tuple)):
+                # [box, text]: no score available
+                text, confidence = line[1], None
+            else:
+                text, confidence = line
+        else:
+            text, confidence = line[1], line[2]
+        return str(text), (1.0 if confidence is None else float(confidence))
+    def _extract_from_image(self, img_array) -> dict:
+        """Extract text from a single image using RapidOCR.
+
+        Args:
+            img_array: image as a numpy array; it is preprocessed before OCR.
+
+        Returns:
+            Dict with ``text`` (recognized lines joined by newlines),
+            ``confidence`` (mean per-line confidence, 0.0 when nothing was
+            read) and ``lines_detected`` (number of lines kept).
+
+        Raises:
+            ValueError: if preprocessing or the OCR call fails; the original
+                exception is attached as the cause.
+        """
+        try:
+            processed_img = RapidOCREngine.preprocess_image(img_array)
+            result, _elapse = self.ocr_engine(processed_img)
+            texts = []
+            confidences = []
+            for line in (result if result is not None else ()):
+                try:
+                    parsed = self._parse_ocr_line(line)
+                except Exception as e:
+                    logger.debug(f"Skipping malformed line: {e}")
+                    continue
+                if parsed is None:
+                    continue
+                # Append both values together so a row with a bad confidence
+                # is dropped entirely and the two lists stay aligned.
+                text, confidence = parsed
+                texts.append(text)
+                confidences.append(confidence)
+            if not texts:
+                return {
+                    "text": "",
+                    "confidence": 0.0,
+                    "lines_detected": 0
+                }
+            return {
+                "text": '\n'.join(texts),
+                "confidence": sum(confidences) / len(confidences),
+                "lines_detected": len(texts)
+            }
+        except Exception as e:
+            logger.error(f"Image OCR extraction failed: {str(e)}")
+            raise ValueError(f"OCR extraction error: {str(e)}") from e
+    def process_file(self, file_path: str, content_type: str) -> dict:
+        """Process PDF or image file and extract text"""
         start = time.perf_counter()
         pages_content = []
+        all_confidences = []
         try:
             logger.info(f"Processing File: {file_path}")
                 for idx, img in enumerate(images):
                     page_num = idx + 1
                     logger.info(f"Scanning Page {page_num}/{total}")
+                    # Convert PIL Image to numpy array for OpenCV
+                    img_array = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+                    # Extract text
+                    ocr_result = self._extract_from_image(img_array)
+                    pages_content.append({
+                        "index": idx,
+                        "page_number": page_num,
+                        "text": ocr_result["text"],
+                        "confidence": ocr_result["confidence"],
+                        "lines_detected": ocr_result["lines_detected"]
+                    })
+                    if ocr_result["confidence"] > 0:
+                        all_confidences.append(ocr_result["confidence"])
             else:
                 logger.info("Scanning Single Image...")
+                # Read image with OpenCV
+                img_array = cv2.imread(file_path)
+                if img_array is None:
+                    raise ValueError("Failed to load image file")
+                # Extract text
+                ocr_result = self._extract_from_image(img_array)
+                pages_content.append({
+                    "index": 0,
+                    "page_number": 1,
+                    "text": ocr_result["text"],
+                    "confidence": ocr_result["confidence"],
+                    "lines_detected": ocr_result["lines_detected"]
+                })
+                if ocr_result["confidence"] > 0:
+                    all_confidences.append(ocr_result["confidence"])
+            avg_confidence = sum(all_confidences) / len(all_confidences) if all_confidences else 0.0
+            logger.info(f"OCR Complete in {(time.perf_counter()-start)*1000:.2f}ms | Avg Confidence: {avg_confidence:.2%}")
+            return {
+                "total_pages": len(pages_content),
+                "pages_content": pages_content,
+                "average_confidence": avg_confidence
+            }
         except Exception as e:
             logger.error(f"OCR Logic Failure: {str(e)}")
         return response
     except Exception as e:
         logger.exception("Middleware caught crash")
+        return JSONResponse(
+            status_code=500,
+            content={
+                "status": "error",
+                "message": "Internal Server Error",
+                "request_id": req_id
+            }
+        )
     finally:
         # 3. Clean up Context
         request_id_ctx.reset(token)
         "request_id": request.state.request_id,
         "process_time_ms": 0,
         "status": StatusEnum.SUCCESS,
+        "message": "RapidOCR API Active",
+        "engine": "RapidOCR",
+        "version": "1.0.0"
     }
+@app.get("/health")
+async def health_check(request: Request):
+    """Health check endpoint.
+
+    Returns a success payload when the shared OCR engine is available, and a
+    503 JSON response carrying the error text when it is missing or when
+    obtaining it raises.
+    """
+    try:
+        # Obtaining the singleton is not enough: an engine of None means the
+        # service cannot run OCR, so report it as unhealthy.
+        if RapidOCREngine().get_engine() is None:
+            raise RuntimeError("OCR engine is not initialized")
+        return {
+            "request_id": request.state.request_id,
+            "status": StatusEnum.SUCCESS,
+            "message": "Service healthy",
+            "ocr_engine": "ready"
+        }
+    except Exception as e:
+        return JSONResponse(
+            status_code=503,
+            content={
+                "request_id": request.state.request_id,
+                "status": StatusEnum.ERROR,
+                "message": "Service unhealthy",
+                "error": str(e)
+            }
+        )
 @app.post("/api/v1/get_data", response_model=APIResponse)
 async def extract_data(
     request: Request,
         FileValidator.validate(file)
         tmp_path = FileValidator.check_size_and_save(file)
+        # CPU heavy task run in thread pool
+        # ContextVars are automatically copied to the thread
+        processor = OCRProcessor()
         result = await run_in_threadpool(
+            processor.process_file,
             tmp_path,
             file.content_type
         )
                 "content_type": file.content_type,
                 "saved_file_path": tmp_path,
                 "total_pages": result["total_pages"],
+                "pages_content": result["pages_content"],
+                "average_confidence": result.get("average_confidence", 0.0)
             }
         }
         if tmp_path:
             logger.info(f"File preserved at: {tmp_path}")
             try:
+                os.remove(tmp_path)
+                logger.info(f"Temporary file deleted: {tmp_path}")
             except Exception as e:
+                logger.warning(f"Failed to delete temp file: {str(e)}")
+# ==========================================
+# 6. STARTUP
+# ==========================================
+@app.on_event("startup")
+async def startup_event():
+    """Build the shared OCR engine when the application starts.
+
+    Any initialization error is logged and re-raised.
+    """
+    logger.info("Starting OCR API with RapidOCR engine...")
+    try:
+        RapidOCREngine()  # first call constructs the singleton
+    except Exception as exc:
+        logger.error(f"Failed to initialize OCR engine: {str(exc)}")
+        raise
+    else:
+        logger.info("RapidOCR engine ready")
+# Script entry point: serve "main:app" with uvicorn on all interfaces, on the
+# port given by the PORT environment variable (default 7860), with 4 workers.
+if __name__ == "__main__":
+    uvicorn.run(
+        "main:app",
+        host="0.0.0.0",
+        port=int(os.getenv("PORT", 7860)),
+        workers=4
+    )

requirements.txt CHANGED Viewed

@@ -5,8 +5,9 @@ python-dotenv>=1.0
 aiohttp==3.11.13
 requests==2.32.3
 pypdf==5.1.0
-pytesseract==0.3.13
 opencv-python-headless==4.12.0.88
 numpy<2.3.0
 pdf2image==1.17.0
-Pillow==11.2.1

 aiohttp==3.11.13
 requests==2.32.3
 pypdf==5.1.0
 opencv-python-headless==4.12.0.88
 numpy<2.3.0
 pdf2image==1.17.0
+Pillow==11.2.1
+rapidocr-onnxruntime>=1.3.0
+onnxruntime>=1.16.0