Spaces:

Um34ER
/

bazaar-bridge-ocr

Running

App Files Community

Um34ER committed 28 days ago

Commit

a97fa9d

verified ·

1 Parent(s): 46b529d

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -43

app.py CHANGED Viewed

@@ -887,6 +887,11 @@ ocr_engine = SmartOCR()
 semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)
 result_cache: Dict[str, dict] = {}  # hash → {result, timestamp}
 # ── FastAPI App ───────────────────────────────────────────────────────────────
 from contextlib import asynccontextmanager
@@ -940,7 +945,6 @@ def _cache_get(h: str) -> dict | None:
 def _cache_put(h: str, result: dict):
-    # Evict oldest if over limit
     if len(result_cache) >= CACHE_SIZE:
         oldest_key = min(result_cache, key=lambda k: result_cache[k]["ts"])
         del result_cache[oldest_key]
@@ -950,19 +954,18 @@ def _cache_put(h: str, result: dict):
 # ── Image Loading ─────────────────────────────────────────────────────────────
 def load_image(raw_bytes: bytes) -> np.ndarray:
-    """Load image bytes → RGB numpy array, with size validation."""
     size_mb = len(raw_bytes) / (1024 * 1024)
     if size_mb > MAX_IMAGE_SIZE_MB:
         raise ValueError(f"Image too large: {size_mb:.1f} MB (max {MAX_IMAGE_SIZE_MB})")
     pil = Image.open(io.BytesIO(raw_bytes)).convert("RGB")
     return np.array(pil)
-# ── Core Processing ──────────────────────────────────────────────────────────
 def process_image(rgb: np.ndarray) -> Dict[str, Any]:
-    """Full pipeline: preprocess → VLM → brain → structured JSON."""
     t0 = time.time()
     # Step 1: Image quality analysis
@@ -978,95 +981,155 @@ def process_image(rgb: np.ndarray) -> Dict[str, Any]:
     logger.info("VLM raw output (%d chars)", len(raw_text))
     if not raw_text.strip():
-        # Last resort: try with fully enhanced (binarized) image
         logger.info("Retrying with binarized image...")
         enhanced_rgb = enhance(rgb)
         pil_enhanced = Image.fromarray(enhanced_rgb)
         raw_text = ocr_engine.extract_text(pil_enhanced)
-    # Step 4: Brain — try JSON parse first, then regex
     result = try_parse_json_response(raw_text)
     if not result:
         result = process_raw_text(raw_text)
     # Step 5: Enrich with metadata
     result["processing_time_ms"] = int((time.time() - t0) * 1000)
-    result["raw_text"] = raw_text[:500]  # truncated for debug
     result["image_quality"] = quality
     result["engine"] = ocr_engine.health_check()
     return result
 # ── Endpoints ─────────────────────────────────────────────────────────────────
 @app.post("/process-parchi")
 async def process_parchi(image: UploadFile = File(...)):
-    """Process a parchi image and return structured JSON."""
-    request_id = str(uuid.uuid4())[:8]
-    logger.info("[%s] Processing: %s (%s)", request_id, image.filename, image.content_type)
     try:
         raw_bytes = await image.read()
     except Exception as e:
         raise HTTPException(400, f"Failed to read file: {e}")
-    # Cache check
     img_hash = _image_hash(raw_bytes)
     cached = _cache_get(img_hash)
     if cached:
-        logger.info("[%s] Cache hit: %s", request_id, img_hash)
-        cached["request_id"] = request_id
         cached["cached"] = True
         return JSONResponse(cached)
-    # Concurrency gate
-    async with semaphore:
-        try:
-            rgb = load_image(raw_bytes)
-        except ValueError as e:
-            raise HTTPException(400, str(e))
-        except Exception as e:
-            raise HTTPException(400, f"Invalid image: {e}")
-        # Run in thread pool (VLM inference is blocking)
-        loop = asyncio.get_event_loop()
-        try:
-            result = await loop.run_in_executor(None, process_image, rgb)
-        except Exception as e:
-            logger.exception("[%s] Processing failed", request_id)
-            raise HTTPException(500, f"Processing error: {e}")
-        finally:
-            gc.collect()
-    result["request_id"] = request_id
-    result["success"] = bool(result.get("items"))
-    result["cached"] = False
-    # Cache result
-    _cache_put(img_hash, result)
-    return JSONResponse(result)
 @app.get("/health")
 async def health():
-    """Health check endpoint with engine status."""
     return {
         "status": "healthy",
-        "version": "7.0.0",
-        "architecture": "Local Hybrid (Qaari + GOT-OCR)",
         "engine": ocr_engine.health_check(),
         "cache_entries": len(result_cache),
     }
 @app.get("/")
 async def root():
-    """Root endpoint — redirect info."""
     return {
-        "service": "Smart Parchi OCR v7",
         "docs": "/docs",
         "health": "/health",
-        "endpoint": "POST /process-parchi",
     }

 semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)
 result_cache: Dict[str, dict] = {}  # hash → {result, timestamp}
+# ── Async Job Store (bypasses HF platform HTTP timeout) ──────────────────────────
+# Jobs older than JOB_TTL seconds are pruned whenever a new job is submitted via POST /process-parchi (there is no background sweeper)
+JOB_TTL = 3600  # 1 hour
+job_store: Dict[str, dict] = {}  # job_id → {status, result, ts, error}
 # ── FastAPI App ───────────────────────────────────────────────────────────────
 from contextlib import asynccontextmanager
 def _cache_put(h: str, result: dict):
     if len(result_cache) >= CACHE_SIZE:
         oldest_key = min(result_cache, key=lambda k: result_cache[k]["ts"])
         del result_cache[oldest_key]
 # ── Image Loading ─────────────────────────────────────────────────────────────
 def load_image(raw_bytes: bytes) -> np.ndarray:
+    """Load image bytes -> RGB numpy array, with size validation."""
     size_mb = len(raw_bytes) / (1024 * 1024)
     if size_mb > MAX_IMAGE_SIZE_MB:
         raise ValueError(f"Image too large: {size_mb:.1f} MB (max {MAX_IMAGE_SIZE_MB})")
     pil = Image.open(io.BytesIO(raw_bytes)).convert("RGB")
     return np.array(pil)
+# ── Core Processing ───────────────────────────────────────────────────────────
 def process_image(rgb: np.ndarray) -> Dict[str, Any]:
+    """Full pipeline: preprocess -> VLM -> brain -> structured JSON."""
     t0 = time.time()
     # Step 1: Image quality analysis
     logger.info("VLM raw output (%d chars)", len(raw_text))
     if not raw_text.strip():
         logger.info("Retrying with binarized image...")
         enhanced_rgb = enhance(rgb)
         pil_enhanced = Image.fromarray(enhanced_rgb)
         raw_text = ocr_engine.extract_text(pil_enhanced)
+    # Step 4: Brain -- try JSON parse first, then regex
     result = try_parse_json_response(raw_text)
     if not result:
         result = process_raw_text(raw_text)
     # Step 5: Enrich with metadata
     result["processing_time_ms"] = int((time.time() - t0) * 1000)
+    result["raw_text"] = raw_text[:500]
     result["image_quality"] = quality
     result["engine"] = ocr_engine.health_check()
     return result
+# ── Background OCR Worker (Async Job Queue) ───────────────────────────────────
+def _run_ocr_job(job_id: str, raw_bytes: bytes, img_hash: str):
+    """Blocking OCR function executed in a thread-pool worker."""
+    try:
+        job_store[job_id]["status"] = "processing"
+        rgb = load_image(raw_bytes)
+        result = process_image(rgb)
+        result["job_id"] = job_id
+        result["success"] = bool(result.get("items"))
+        result["cached"] = False
+        _cache_put(img_hash, result)
+        job_store[job_id].update({"status": "done", "result": result})
+        elapsed = time.time() - job_store[job_id]["ts"]
+        logger.info("[%s] Job completed in %.1fs", job_id, elapsed)
+    except Exception as e:
+        logger.exception("[%s] Job failed", job_id)
+        job_store[job_id].update({"status": "error", "error": str(e)})
+    finally:
+        gc.collect()
 # ── Endpoints ─────────────────────────────────────────────────────────────────
 @app.post("/process-parchi")
 async def process_parchi(image: UploadFile = File(...)):
+    """
+    Submit a parchi image for OCR processing.
+    Returns immediately with a job_id (typically <1s).
+    Poll GET /result/{job_id} every 10s until status == 'done'.
+    This async pattern is required because CPU inference takes 2-4 minutes,
+    which exceeds the HF platform HTTP timeout (~60s).
+    """
+    job_id = str(uuid.uuid4())[:12]
+    logger.info("[%s] Received: %s (%s)", job_id, image.filename, image.content_type)
     try:
         raw_bytes = await image.read()
     except Exception as e:
         raise HTTPException(400, f"Failed to read file: {e}")
+    # Cache hit -- return result immediately without spawning a job
     img_hash = _image_hash(raw_bytes)
     cached = _cache_get(img_hash)
     if cached:
+        logger.info("[%s] Cache hit -- returning immediately", job_id)
+        cached["job_id"] = job_id
         cached["cached"] = True
+        cached["status"] = "done"
         return JSONResponse(cached)
+    # Validate image before queuing
+    try:
+        load_image(raw_bytes)
+    except ValueError as e:
+        raise HTTPException(400, str(e))
+    except Exception as e:
+        raise HTTPException(400, f"Invalid image: {e}")
+    # Register job and prune stale ones
+    job_store[job_id] = {"status": "queued", "ts": time.time(), "result": None, "error": None}
+    now = time.time()
+    stale = [k for k, v in job_store.items() if now - v["ts"] > JOB_TTL]
+    for k in stale:
+        del job_store[k]
+    # Submit to thread pool (non-blocking -- returns immediately)
+    loop = asyncio.get_event_loop()
+    loop.run_in_executor(None, _run_ocr_job, job_id, raw_bytes, img_hash)
+    logger.info("[%s] Job queued -- returning job_id immediately", job_id)
+    return JSONResponse({
+        "job_id": job_id,
+        "status": "queued",
+        "poll_url": f"/result/{job_id}",
+        "message": "Image accepted. Poll /result/{job_id} every 10s until status=done.",
+    })
+@app.get("/result/{job_id}")
+async def get_result(job_id: str):
+    """
+    Poll for OCR job result.
+    Returns:
+      status=queued|processing : not ready yet, poll again in 10s
+      status=done              : result field contains the structured parchi JSON
+      status=error             : error field contains the failure message
+    """
+    job = job_store.get(job_id)
+    if not job:
+        raise HTTPException(404, f"Job '{job_id}' not found. It may have expired (TTL=1h).")
+    response: Dict[str, Any] = {"job_id": job_id, "status": job["status"]}
+    if job["status"] == "done":
+        response.update(job["result"] or {})
+    elif job["status"] == "error":
+        response["error"] = job["error"]
+    else:
+        response["elapsed_seconds"] = int(time.time() - job["ts"])
+        response["message"] = "Job is processing. Poll again in 10 seconds."
+    return JSONResponse(response)
 @app.get("/health")
 async def health():
+    """Health check with engine and queue status."""
+    active = sum(1 for j in job_store.values() if j["status"] in ("queued", "processing"))
     return {
         "status": "healthy",
+        "version": "7.1.0",
+        "architecture": "Local Hybrid (Qaari + GOT-OCR) -- Async Job Queue",
         "engine": ocr_engine.health_check(),
         "cache_entries": len(result_cache),
+        "active_jobs": active,
+        "total_jobs": len(job_store),
     }
 @app.get("/")
 async def root():
+    """Root endpoint."""
     return {
+        "service": "Smart Parchi OCR v7.1",
         "docs": "/docs",
         "health": "/health",
+        "submit": "POST /process-parchi  -> {job_id, status: queued}",
+        "poll":   "GET  /result/{job_id} -> {status, result (when done)}",
     }