ClientInfoOCR / app.py
BlackSpire's picture
Update app.py
4d6a332 verified
"""
Visiting Card & Letterhead OCR API
===================================
Two-step pipeline: nemoretriever-ocr-v1 → nvidia-nemotron-nano-9b-v2
Deploy on Hugging Face Spaces (Docker or Python SDK):
- Set secret NVIDIA_API_KEY in Space settings → Variables and secrets
- The app serves the HTML frontend at / and the API at /extract-card
- HF Spaces exposes port 7860 by default (set via HF_PORT env var)
Local usage:
pip install fastapi uvicorn requests python-multipart
NVIDIA_API_KEY=nvapi-xxx python app.py
Open http://localhost:7860
"""
import os
import re
import json
import base64
import requests
from pathlib import Path
from typing import List
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
# ── App ──────────────────────────────────────────────────────────────────────
# The ASGI application object; route decorators further down register on it.
app = FastAPI(
    title="Visiting Card & Letterhead OCR API",
    description="Two-step RAG pipeline: nemoretriever-ocr-v1 → nvidia-nemotron-nano-9b-v2",
)

# ── CORS — allow all origins (needed for HF Spaces iframe / custom domains) ──
# Credentials stay disabled: a wildcard origin combined with credentialed
# requests would let any website call this API with the visitor's cookies,
# and nothing in this module uses cookies or sessions.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# ── Configuration ────────────────────────────────────────────────────────────
# The API key is read from the environment only (HF Space secret or a local
# env var). No fallback key is embedded in the source: a credential committed
# to the file is readable by everyone who can see the file. Surrounding
# whitespace is stripped so a pasted trailing newline cannot end up in the
# Authorization header.
NVIDIA_API_KEY = os.environ.get("NVIDIA_API_KEY", "").strip()

# Endpoints and model identifier used by the two pipeline steps.
OCR_URL = "https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-ocr-v1"
LLM_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
LLM_MODEL = "nvidia/nvidia-nemotron-nano-9b-v2"

# Request headers are built once, at import time, from the key above.
OCR_HEADERS = {"Authorization": f"Bearer {NVIDIA_API_KEY}", "Accept": "application/json"}
LLM_HEADERS = {"Authorization": f"Bearer {NVIDIA_API_KEY}", "Content-Type": "application/json"}
# ── System prompt ────────────────────────────────────────────────────────────
# System message sent with every LLM request. It fixes the output contract:
# a single JSON object with exactly the keys listed in the schema below, and
# "" for anything not present in the OCR text.
CARD_SYSTEM_PROMPT = """You are a business card and letterhead data extraction assistant.
You will receive raw OCR text extracted from a visiting card, business card, or the header/footer of a business letter.
Parse it carefully and return ONLY a valid JSON object.
No markdown fences, no explanation, no preamble — just the raw JSON object.
JSON schema (return exactly this structure):
{
"company_name": "full name of the company or firm (string)",
"contact_person": "name of the individual on the card or letter (string)",
"designation": "job title or designation of the contact person (string)",
"mobile": "mobile number(s) as a string; if multiple separate with comma (string)",
"phone": "landline / office phone number(s); if multiple separate with comma (string)",
"email": "email address(es); if multiple separate with comma (string)",
"address": "full postal address as printed, preserving line breaks with a pipe | separator (string)",
"pin": "PIN code / ZIP code / postal code as a string of digits (string)",
"city": "city name (string)",
"state": "state or province name (string)",
"country": "country name (string)",
"gst_number": "GST / GSTIN number; typically 15 alphanumeric characters (string)",
"website": "website URL if present (string)",
"fax": "fax number if present (string)"
}
Rules:
- company_name: usually the largest text or the text near a logo
- contact_person: individual's personal name distinct from company name
- designation: title like CEO, Manager, Director, Proprietor, Sales Executive, etc.
- mobile: numbers prefixed with M:, Mob:, Cell:, +91, or 10-digit numbers
- phone: numbers prefixed with Ph:, Tel:, T:, O:, or STD codes like (022), (080)
- email: look for @ symbol; may be prefixed with E:, Email:, Mail:
- address: collect all address lines; separate each line with ' | '
- pin: extract 6-digit Indian PIN code or 5/9-digit ZIP; digits only
- city: extract city name from address
- state: extract state name from address
- country: default to India if address looks Indian and country not stated
- gst_number: 15-character alphanumeric GSTIN
- website: any URL starting with www., http://, or https://
- fax: number prefixed with Fax:, F:, or similar
- If a field is not found return "" (empty string)
- Do NOT invent or hallucinate any information not present in the OCR text
- If multiple phone or mobile numbers are present, join them with ', '"""
# ── Helpers ──────────────────────────────────────────────────────────────────
async def run_ocr(file: UploadFile) -> str:
    """Send an uploaded image to the NVIDIA OCR endpoint and return its text.

    Args:
        file: The uploaded image. Its whole content is read into memory.

    Returns:
        The non-empty text fragments from the OCR response, each stripped,
        joined with newlines in the order the API returned them. An empty
        string when the response contains no usable detections.

    Raises:
        HTTPException: 400 when the upload is empty; 413 when the
            base64-encoded image is 180,000 characters or longer; 502 when
            the OCR request fails or its body is not valid JSON.
    """
    import asyncio

    content = await file.read()
    if not content:
        # Nothing to recognise; fail fast instead of spending an API call.
        raise HTTPException(400, "Uploaded file is empty.")

    image_b64 = base64.b64encode(content).decode()
    # Matches reference: base64 must be < 180,000 characters
    if len(image_b64) >= 180_000:
        raise HTTPException(413, "Image too large (base64 must be < 180,000 chars). Resize and retry.")

    # Payload structure matches the official reference exactly.
    # NOTE(review): the data URL always declares image/png, even for JPEG or
    # WebP uploads — confirm the OCR service ignores the declared type.
    payload = {
        "input": [
            {
                "type": "image_url",
                "url": f"data:image/png;base64,{image_b64}",
            }
        ]
    }

    def _post():
        return requests.post(OCR_URL, headers=OCR_HEADERS, json=payload, timeout=30)

    try:
        # requests is synchronous; running it in the default executor keeps
        # the event loop free to serve other requests while we wait.
        r = await asyncio.get_running_loop().run_in_executor(None, _post)
        r.raise_for_status()
        ocr_json = r.json()
    except requests.exceptions.RequestException as e:
        raise HTTPException(502, f"NVIDIA OCR API error: {e}") from e
    except ValueError as e:
        # Older requests versions raise a plain ValueError for a non-JSON body.
        raise HTTPException(502, f"NVIDIA OCR API returned invalid JSON: {e}") from e

    return "\n".join(_collect_ocr_lines(ocr_json))


def _collect_ocr_lines(ocr_json) -> List[str]:
    """Return the stripped, non-empty text fragments found in an OCR response body.

    Detections are taken from the top-level "text_detections" list or, when
    that is missing or empty, from "text_detections" of the first "data"
    entry. Each detection's text is read from its "text_prediction" object
    when that key is present, otherwise from the detection itself. Anything
    with an unexpected shape is skipped rather than raising.
    """
    if not isinstance(ocr_json, dict):
        return []

    detections = ocr_json.get("text_detections") or []
    if not detections:
        data = ocr_json.get("data", [])
        if isinstance(data, list) and data and isinstance(data[0], dict):
            detections = data[0].get("text_detections") or []
    if not isinstance(detections, list):
        return []

    lines = []
    for det in detections:
        if not isinstance(det, dict):
            continue
        holder = det["text_prediction"] if "text_prediction" in det else det
        text = holder.get("text") if isinstance(holder, dict) else None
        if isinstance(text, str):
            text = text.strip()
            if text:
                lines.append(text)
    return lines
def call_llm(ocr_text: str) -> dict:
    """Ask the LLM to turn raw OCR text into a structured JSON object.

    This function performs a blocking HTTP request.

    Args:
        ocr_text: Text produced by the OCR step.

    Returns:
        The JSON object parsed from the model's reply.

    Raises:
        HTTPException: 502 when the request fails, the reply is empty or
            malformed, or no JSON object can be parsed from it.
    """
    payload = {
        "model": LLM_MODEL,
        "max_tokens": 2048,
        "temperature": 0.1,
        "top_p": 0.9,
        "messages": [
            {"role": "system", "content": CARD_SYSTEM_PROMPT},
            {"role": "user", "content": (
                f"Here is the OCR text extracted from the business card or letterhead:\n\n"
                f"{ocr_text}\n\nExtract the required data and return ONLY the JSON object."
            )},
        ],
    }
    try:
        r = requests.post(LLM_URL, headers=LLM_HEADERS, json=payload, timeout=120)
        r.raise_for_status()
        llm_json = r.json()
    except requests.exceptions.RequestException as e:
        raise HTTPException(502, f"NVIDIA LLM API error: {e}") from e
    except ValueError as e:
        # Older requests versions raise a plain ValueError for a non-JSON body.
        raise HTTPException(502, f"NVIDIA LLM API returned invalid JSON: {e}") from e

    raw = _first_choice_content(llm_json)
    if not raw:
        raise HTTPException(502, "LLM returned empty response")

    # The prompt forbids markdown fences, but strip them in case the model
    # adds them anyway.
    cleaned = re.sub(r"```json\s*", "", raw, flags=re.IGNORECASE)
    cleaned = re.sub(r"```\s*", "", cleaned).strip()
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        # Fallback: take everything from the first "{" to the last "}" so
        # text before or after the object does not defeat parsing.
        m = re.search(r"\{[\s\S]*\}", cleaned)
        if not m:
            raise HTTPException(502, f"LLM did not return valid JSON. Preview: {raw[:400]}") from None
        try:
            parsed = json.loads(m.group(0))
        except json.JSONDecodeError as e:
            raise HTTPException(502, f"JSON parse error: {e}") from e

    if not isinstance(parsed, dict):
        raise HTTPException(502, f"LLM response not a JSON object. Got: {type(parsed).__name__}")
    return parsed


def _first_choice_content(llm_json) -> str:
    """Return choices[0].message.content from a chat response, or "" if absent.

    Every level is type-checked so an empty "choices" list, a null message,
    or null content yields "" instead of raising.
    """
    if not isinstance(llm_json, dict):
        return ""
    choices = llm_json.get("choices")
    if not isinstance(choices, list) or not choices or not isinstance(choices[0], dict):
        return ""
    message = choices[0].get("message")
    content = message.get("content") if isinstance(message, dict) else None
    return content if isinstance(content, str) else ""
# ── Pydantic models ──────────────────────────────────────────────────────────
class CardData(BaseModel):
    """Response schema for one extracted card; every field is a required string."""

    # Organisation and contact identity
    company_name: str
    contact_person: str
    designation: str

    # Contact channels
    mobile: str
    phone: str
    email: str

    # Postal address and its components
    address: str
    pin: str
    city: str
    state: str
    country: str

    # Other identifiers
    gst_number: str
    website: str
    fax: str
def build_card(parsed: dict) -> CardData:
    """Build a CardData from the LLM's parsed JSON, normalising every field.

    Each field is converted to a stripped string and truncated to a
    per-field maximum length. Missing keys and JSON null become "" (rather
    than the literal text "None"), and list values are joined with ", "
    (rather than rendered as a Python list repr).

    Args:
        parsed: The JSON object returned by the LLM.

    Returns:
        A CardData with all fields populated as strings.
    """
    def field(key, limit=300):
        value = parsed.get(key, "")
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            parts = (str(item).strip() for item in value if item is not None)
            value = ", ".join(part for part in parts if part)
        return str(value).strip()[:limit]

    return CardData(
        company_name=field("company_name", 200),
        contact_person=field("contact_person", 100),
        designation=field("designation", 100),
        mobile=field("mobile", 100),
        phone=field("phone", 100),
        email=field("email", 200),
        address=field("address", 500),
        pin=field("pin", 10),
        city=field("city", 100),
        state=field("state", 100),
        country=field("country", 100),
        gst_number=field("gst_number", 20),
        website=field("website", 200),
        fax=field("fax", 50),
    )
# ── API endpoints ────────────────────────────────────────────────────────────
@app.post("/extract-card", response_model=CardData)
async def extract_card(file: UploadFile = File(...)):
    """Extract structured contact data from a single uploaded card image.

    Args:
        file: The uploaded image. When a content type is supplied it must be
            JPEG, PNG or WebP; an upload without a content type is accepted.

    Returns:
        The CardData built from the LLM's reading of the OCR text.

    Raises:
        HTTPException: 415 for an unsupported content type; 422 when OCR
            yields no text; plus anything raised by run_ocr or call_llm.
    """
    import asyncio

    allowed = {"image/jpeg", "image/jpg", "image/png", "image/webp"}
    if file.content_type and file.content_type not in allowed:
        raise HTTPException(415, f"Unsupported type: {file.content_type}.")

    ocr_text = await run_ocr(file)
    if not ocr_text.strip():
        raise HTTPException(422, "OCR produced no text. Check image quality.")

    # call_llm makes a synchronous HTTP request; run it in the default
    # executor so the event loop can keep serving other requests meanwhile.
    # Exceptions raised inside it propagate through the await unchanged.
    parsed = await asyncio.get_running_loop().run_in_executor(None, call_llm, ocr_text)
    return build_card(parsed)
@app.post("/extract-card/batch", response_model=List[CardData])
async def extract_card_batch(files: List[UploadFile] = File(...)):
    """Extract structured contact data from up to 10 uploaded card images.

    Files are processed one after another, and the result list is in the
    same order as the uploads. A file whose OCR yields no text contributes a
    card with every field empty instead of failing the whole batch.

    Args:
        files: The uploaded images. When a content type is supplied it must
            be JPEG, PNG or WebP.

    Returns:
        One CardData per uploaded file.

    Raises:
        HTTPException: 400 when more than 10 files are sent; 415 when any
            file has an unsupported content type; plus anything raised by
            run_ocr or call_llm.
    """
    import asyncio

    if len(files) > 10:
        raise HTTPException(400, "Maximum 10 files per batch request.")

    # Validate every file up front so a bad file late in the batch does not
    # waste OCR/LLM calls on the files before it.
    allowed = {"image/jpeg", "image/jpg", "image/png", "image/webp"}
    for idx, file in enumerate(files):
        if file.content_type and file.content_type not in allowed:
            raise HTTPException(415, f"File {idx+1}: unsupported type.")

    loop = asyncio.get_running_loop()
    results = []
    for file in files:
        ocr_text = await run_ocr(file)
        if ocr_text.strip():
            # call_llm makes a synchronous HTTP request; keep it off the
            # event loop.
            parsed = await loop.run_in_executor(None, call_llm, ocr_text)
            results.append(build_card(parsed))
        else:
            # An empty mapping makes build_card fill every field with "".
            results.append(build_card({}))
    return results
@app.get("/health")
async def health():
    """Report a fixed healthy status together with the configured LLM model name."""
    report = {"status": "healthy"}
    report["model"] = LLM_MODEL
    return report
# ── Serve index.html at root (must be placed alongside this script) ──────────
# The frontend page is expected in the same directory as this script.
HTML_PATH = Path(__file__).parent / "index.html"


@app.get("/", response_class=HTMLResponse)
async def serve_ui():
    """Serve the HTML frontend, or a 500 page explaining where to put it.

    Returns:
        The content of index.html (read as UTF-8) when the file exists;
        otherwise an HTML error message with status 500.
    """
    try:
        # Read directly instead of checking exists() first, so the file
        # cannot disappear between the check and the read.
        html = HTML_PATH.read_text(encoding="utf-8")
    except FileNotFoundError:
        # Name the script from __file__ so the hint stays correct if the
        # file is renamed.
        return HTMLResponse(
            "<h2 style='font-family:sans-serif;padding:40px'>"
            f"index.html not found — place it next to {Path(__file__).name}</h2>",
            status_code=500,
        )
    return HTMLResponse(html)
# ── Entry point ──────────────────────────────────────────────────────────────
if __name__ == "__main__":
    import uvicorn

    # HF_PORT overrides the listening port; 7860 is the fallback.
    port = int(os.environ.get("HF_PORT", 7860))
    # Pass the app object rather than an "app:app" import string so the
    # server starts whatever this file is named. An import string is only
    # needed for reload/workers, which are not used here.
    uvicorn.run(app, host="0.0.0.0", port=port, reload=False)