Spaces:

BlackSpire
/

ClientInfoOCR

Running

App Files Files Community

ClientInfoOCR / app.py

BlackSpire

Update app.py

4d6a332 verified 13 minutes ago

raw

history blame contribute delete

11.6 kB

	"""
	Visiting Card & Letterhead OCR API
	===================================
	Two-step pipeline: nemoretriever-ocr-v1 → nvidia-nemotron-nano-9b-v2
	Deploy on Hugging Face Spaces (Docker or Python SDK):
	- Set secret NVIDIA_API_KEY in Space settings → Variables and secrets
	- The app serves the HTML frontend at / and the API at /extract-card
	- The server listens on port 7860 (the HF Spaces default); set the HF_PORT env var to override it
	Local usage:
	pip install fastapi uvicorn requests python-multipart
	NVIDIA_API_KEY=nvapi-xxx python app.py
	Open http://localhost:7860
	"""

	import os
	import re
	import json
	import base64
	import requests
	from pathlib import Path
	from typing import List

	from fastapi import FastAPI, File, UploadFile, HTTPException
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import HTMLResponse
	from pydantic import BaseModel

	# ── App ────────────────────────────────────────────────────────────────────────
	app = FastAPI(
	    title="Visiting Card & Letterhead OCR API",
	    description="Two-step RAG pipeline: nemoretriever-ocr-v1 → nvidia-nemotron-nano-9b-v2",
	)

	# ── CORS — allow all origins (needed for HF Spaces iframe / custom domains) ───
	# Credentials stay disabled: no endpoint in this file reads cookies or session
	# state, and a wildcard origin must not be combined with credentialed requests
	# (any website could otherwise make authenticated calls on a visitor's behalf).
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],
	    allow_credentials=False,
	    allow_methods=["*"],
	    allow_headers=["*"],
	)

	# ── Configuration ─────────────────────────────────────────────────────────────
	# The key is read from the environment only (on HF Spaces: Settings →
	# Variables and secrets). Never add a literal fallback here: a key committed
	# to source is readable by anyone with access to the repository and must be
	# treated as compromised (revoke it and issue a new one).
	# With no key set, upstream calls are rejected and surface as 502 responses.
	NVIDIA_API_KEY = os.environ.get("NVIDIA_API_KEY", "")

	OCR_URL = "https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-ocr-v1"
	LLM_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
	LLM_MODEL = "nvidia/nvidia-nemotron-nano-9b-v2"

	# Both header sets are built once, at import time, from the key above.
	OCR_HEADERS = {"Authorization": f"Bearer {NVIDIA_API_KEY}", "Accept": "application/json"}
	LLM_HEADERS = {"Authorization": f"Bearer {NVIDIA_API_KEY}", "Content-Type": "application/json"}

	# ── System prompt ──────────────────────────────────────────────────────────────
	# System prompt sent with every LLM request. The address separator is a plain
	# pipe: a backslash-escaped pipe is an invalid escape sequence in a Python
	# string literal and, if echoed by the model, an invalid escape inside JSON.
	CARD_SYSTEM_PROMPT = """You are a business card and letterhead data extraction assistant.
	You will receive raw OCR text extracted from a visiting card, business card, or the header/footer of a business letter.
	Parse it carefully and return ONLY a valid JSON object.
	No markdown fences, no explanation, no preamble — just the raw JSON object.
	JSON schema (return exactly this structure):
	{
	"company_name": "full name of the company or firm (string)",
	"contact_person": "name of the individual on the card or letter (string)",
	"designation": "job title or designation of the contact person (string)",
	"mobile": "mobile number(s) as a string; if multiple separate with comma (string)",
	"phone": "landline / office phone number(s); if multiple separate with comma (string)",
	"email": "email address(es); if multiple separate with comma (string)",
	"address": "full postal address as printed, preserving line breaks with a pipe | separator (string)",
	"pin": "PIN code / ZIP code / postal code as a string of digits (string)",
	"city": "city name (string)",
	"state": "state or province name (string)",
	"country": "country name (string)",
	"gst_number": "GST / GSTIN number; typically 15 alphanumeric characters (string)",
	"website": "website URL if present (string)",
	"fax": "fax number if present (string)"
	}
	Rules:
	- company_name: usually the largest text or the text near a logo
	- contact_person: individual's personal name distinct from company name
	- designation: title like CEO, Manager, Director, Proprietor, Sales Executive, etc.
	- mobile: numbers prefixed with M:, Mob:, Cell:, +91, or 10-digit numbers
	- phone: numbers prefixed with Ph:, Tel:, T:, O:, or STD codes like (022), (080)
	- email: look for @ symbol; may be prefixed with E:, Email:, Mail:
	- address: collect all address lines; separate each line with ' | '
	- pin: extract 6-digit Indian PIN code or 5/9-digit ZIP; digits only
	- city: extract city name from address
	- state: extract state name from address
	- country: default to India if address looks Indian and country not stated
	- gst_number: 15-character alphanumeric GSTIN
	- website: any URL starting with www., http://, or https://
	- fax: number prefixed with Fax:, F:, or similar
	- If a field is not found return "" (empty string)
	- Do NOT invent or hallucinate any information not present in the OCR text
	- If multiple phone or mobile numbers are present, join them with ', '"""


	# ── Helpers ────────────────────────────────────────────────────────────────────

	async def run_ocr(file: UploadFile) -> str:
	    """Send an uploaded image to the NVIDIA OCR endpoint and return its text.

	    Args:
	        file: The uploaded image. Callers are expected to have checked its
	            content type already.

	    Returns:
	        The detected text fragments joined with newlines; an empty string
	        when the response contains no usable detections.

	    Raises:
	        HTTPException: 400 for an empty upload, 413 when the base64 payload
	            reaches 180,000 characters, 502 when the OCR request fails or
	            its response is not valid JSON.
	    """
	    from fastapi.concurrency import run_in_threadpool

	    max_b64_chars = 180_000

	    # Base64 turns every 3 raw bytes into 4 characters, so 135,000 bytes
	    # already encode to exactly 180,000 characters and fail the check below.
	    # Reading no more than that keeps oversized uploads out of memory.
	    content = await file.read(max_b64_chars // 4 * 3)
	    if not content:
	        raise HTTPException(400, "Uploaded file is empty.")

	    image_b64 = base64.b64encode(content).decode()

	    # Matches reference: base64 must be < 180,000 characters
	    if len(image_b64) >= max_b64_chars:
	        raise HTTPException(413, "Image too large (base64 must be < 180,000 chars). Resize and retry.")

	    # Label the data URL with the real image type instead of always claiming
	    # PNG. "image/jpg" is not a registered type, so it is normalised.
	    # NOTE(review): confirm the OCR endpoint accepts every type the endpoints
	    # allow (e.g. image/webp).
	    mime = (file.content_type or "").lower()
	    if mime == "image/jpg":
	        mime = "image/jpeg"
	    elif not mime.startswith("image/"):
	        mime = "image/png"

	    payload = {
	        "input": [
	            {
	                "type": "image_url",
	                "url": f"data:{mime};base64,{image_b64}",
	            }
	        ]
	    }

	    try:
	        # requests is synchronous; run it in a worker thread so the event
	        # loop keeps serving other requests while the upstream call is open.
	        r = await run_in_threadpool(
	            requests.post, OCR_URL, headers=OCR_HEADERS, json=payload, timeout=30
	        )
	        r.raise_for_status()
	        ocr_json = r.json()
	    except (requests.exceptions.RequestException, ValueError) as e:
	        # ValueError covers a non-JSON body on requests versions whose JSON
	        # decode error is not a RequestException.
	        raise HTTPException(502, f"NVIDIA OCR API error: {e}") from e

	    # Detections are looked up at the top level first, then under data[0].
	    detections = None
	    if isinstance(ocr_json, dict):
	        detections = ocr_json.get("text_detections")
	        if not detections:
	            data = ocr_json.get("data")
	            if isinstance(data, list) and data and isinstance(data[0], dict):
	                detections = data[0].get("text_detections")
	    if not isinstance(detections, list):
	        detections = []

	    lines = []
	    for det in detections:
	        if not isinstance(det, dict):
	            continue
	        # The text sits either under "text_prediction" or on the detection
	        # itself; anything that is not a non-blank string is skipped.
	        prediction = det.get("text_prediction")
	        holder = prediction if isinstance(prediction, dict) else det
	        text = holder.get("text")
	        if isinstance(text, str) and text.strip():
	            lines.append(text.strip())
	    return "\n".join(lines)


	def call_llm(ocr_text: str) -> dict:
	    """Ask the LLM to turn raw OCR text into a JSON object of card fields.

	    This function is synchronous and may block for up to the 120-second
	    request timeout.

	    Args:
	        ocr_text: Text produced by the OCR step.

	    Returns:
	        The JSON object parsed from the model's reply.

	    Raises:
	        HTTPException: 502 when the request fails, the reply is empty or
	            malformed, or the reply does not contain a JSON object.
	    """
	    payload = {
	        "model": LLM_MODEL,
	        "max_tokens": 2048,
	        "temperature": 0.1,
	        "top_p": 0.9,
	        "messages": [
	            {"role": "system", "content": CARD_SYSTEM_PROMPT},
	            {"role": "user", "content": (
	                f"Here is the OCR text extracted from the business card or letterhead:\n\n"
	                f"{ocr_text}\n\nExtract the required data and return ONLY the JSON object."
	            )},
	        ],
	    }

	    try:
	        r = requests.post(LLM_URL, headers=LLM_HEADERS, json=payload, timeout=120)
	        r.raise_for_status()
	        llm_json = r.json()
	    except (requests.exceptions.RequestException, ValueError) as e:
	        # ValueError covers a non-JSON body on requests versions whose JSON
	        # decode error is not a RequestException.
	        raise HTTPException(502, f"NVIDIA LLM API error: {e}") from e

	    # Any missing, empty or wrongly-typed level of the reply is treated as
	    # an empty response rather than surfacing as an unhandled 500.
	    try:
	        raw = llm_json["choices"][0]["message"]["content"]
	    except (KeyError, IndexError, TypeError):
	        raw = ""
	    if not raw or not isinstance(raw, str):
	        raise HTTPException(502, "LLM returned empty response")

	    # Strip markdown code fences the model may add despite the instructions.
	    cleaned = re.sub(r"```json\s*", "", raw, flags=re.IGNORECASE)
	    cleaned = re.sub(r"```\s*", "", cleaned).strip()

	    try:
	        parsed = json.loads(cleaned)
	    except json.JSONDecodeError:
	        # Fall back to the outermost {...} span when extra text surrounds it.
	        m = re.search(r"\{[\s\S]*\}", cleaned)
	        if not m:
	            raise HTTPException(502, f"LLM did not return valid JSON. Preview: {raw[:400]}")
	        try:
	            parsed = json.loads(m.group(0))
	        except json.JSONDecodeError as e:
	            raise HTTPException(502, f"JSON parse error: {e}") from e

	    if not isinstance(parsed, dict):
	        raise HTTPException(502, f"LLM response not a JSON object. Got: {type(parsed).__name__}")
	    return parsed


	# ── Pydantic models ────────────────────────────────────────────────────────────

	class CardData(BaseModel):
	    """Response schema for one extracted visiting card or letterhead.

	    Every field is a required string. The field names mirror the JSON
	    schema described in CARD_SYSTEM_PROMPT.
	    """

	    # Organisation and contact identity
	    company_name: str
	    contact_person: str
	    designation: str
	    # Contact channels; multiple values are expected comma-separated
	    mobile: str
	    phone: str
	    email: str
	    # Postal address and its components
	    address: str
	    pin: str
	    city: str
	    state: str
	    country: str
	    # Remaining identifiers
	    gst_number: str
	    website: str
	    fax: str


	def build_card(parsed: dict) -> CardData:
	    """Build a CardData from the LLM's parsed JSON object.

	    Each value is converted to a stripped string and cut to a per-field
	    maximum length. Missing or null values become "" (rather than the text
	    "None"), and list values are joined with ", " (rather than rendered as
	    a Python list repr).

	    Args:
	        parsed: JSON object returned by the LLM.

	    Returns:
	        A CardData with every field populated.
	    """
	    # Maximum number of characters kept for each field.
	    limits = {
	        "company_name": 200,
	        "contact_person": 100,
	        "designation": 100,
	        "mobile": 100,
	        "phone": 100,
	        "email": 200,
	        "address": 500,
	        "pin": 10,
	        "city": 100,
	        "state": 100,
	        "country": 100,
	        "gst_number": 20,
	        "website": 200,
	        "fax": 50,
	    }

	    def text_of(key, limit):
	        value = parsed.get(key)
	        if value is None:
	            return ""
	        if isinstance(value, (list, tuple)):
	            parts = (str(item).strip() for item in value if item is not None)
	            value = ", ".join(part for part in parts if part)
	        return str(value).strip()[:limit]

	    return CardData(**{key: text_of(key, limit) for key, limit in limits.items()})


	# ── API endpoints ──────────────────────────────────────────────────────────────

	@app.post("/extract-card", response_model=CardData)
	async def extract_card(file: UploadFile = File(...)):
	    """Extract structured contact data from one uploaded card image.

	    Args:
	        file: The uploaded image (JPEG, PNG or WebP). An upload that
	            declares no content type is accepted.

	    Returns:
	        The extracted CardData.

	    Raises:
	        HTTPException: 415 for an unsupported content type, 422 when OCR
	            yields no text, plus any error raised by the OCR or LLM step.
	    """
	    from fastapi.concurrency import run_in_threadpool

	    allowed = {"image/jpeg", "image/jpg", "image/png", "image/webp"}
	    if file.content_type and file.content_type not in allowed:
	        raise HTTPException(415, f"Unsupported type: {file.content_type}.")

	    ocr_text = await run_ocr(file)
	    if not ocr_text.strip():
	        raise HTTPException(422, "OCR produced no text. Check image quality.")

	    # call_llm is synchronous and can block for up to two minutes; running
	    # it in a worker thread keeps the event loop free for other requests.
	    parsed = await run_in_threadpool(call_llm, ocr_text)
	    return build_card(parsed)


	@app.post("/extract-card/batch", response_model=List[CardData])
	async def extract_card_batch(files: List[UploadFile] = File(...)):
	    """Extract structured contact data from up to 10 uploaded card images.

	    Files are processed one after another, and results are returned in
	    upload order. A file whose OCR yields no text produces a CardData with
	    every field set to "".

	    Args:
	        files: The uploaded images (JPEG, PNG or WebP).

	    Returns:
	        One CardData per uploaded file.

	    Raises:
	        HTTPException: 400 for more than 10 files, 415 when any file has an
	            unsupported content type, plus any error raised by the OCR or
	            LLM step.
	    """
	    from fastapi.concurrency import run_in_threadpool

	    if len(files) > 10:
	        raise HTTPException(400, "Maximum 10 files per batch request.")

	    # Validate every file before doing any work, so a bad file late in the
	    # batch does not waste upstream OCR/LLM calls on the files before it.
	    allowed = {"image/jpeg", "image/jpg", "image/png", "image/webp"}
	    for idx, file in enumerate(files):
	        if file.content_type and file.content_type not in allowed:
	            raise HTTPException(415, f"File {idx+1}: unsupported type.")

	    # Pydantic v2 exposes the field map as model_fields and deprecates
	    # __fields__; fall back to __fields__ on v1.
	    field_names = getattr(CardData, "model_fields", None) or CardData.__fields__
	    empty = CardData(**{name: "" for name in field_names})

	    results = []
	    for file in files:
	        ocr_text = await run_ocr(file)
	        if not ocr_text.strip():
	            results.append(empty)
	            continue
	        # call_llm is synchronous; keep it off the event loop.
	        parsed = await run_in_threadpool(call_llm, ocr_text)
	        results.append(build_card(parsed))
	    return results


	@app.get("/health")
	async def health():
	    """Return a static health report naming the configured LLM model."""
	    report = dict(status="healthy", model=LLM_MODEL)
	    return report


	# ── Serve index.html at root (must be placed alongside this script) ────────────
	HTML_PATH = Path(__file__).parent / "index.html"

	@app.get("/", response_class=HTMLResponse)
	async def serve_ui():
	    """Serve the HTML frontend stored next to this script.

	    Returns:
	        The contents of index.html, or a 500 page explaining where the file
	        is expected when it cannot be read.
	    """
	    # Read first and handle failure, rather than checking exists() and then
	    # reading: the file can disappear or be unreadable between the two steps.
	    try:
	        page = HTML_PATH.read_text(encoding="utf-8")
	    except OSError:
	        # Name the script actually running instead of a hard-coded filename.
	        return HTMLResponse(
	            "<h2 style='font-family:sans-serif;padding:40px'>"
	            f"index.html not found or unreadable — place it next to {Path(__file__).name}</h2>",
	            status_code=500,
	        )
	    return HTMLResponse(page)


	# ── Entry point ────────────────────────────────────────────────────────────────
	if __name__ == "__main__":
	    import uvicorn

	    # HF_PORT overrides the default port of 7860.
	    port = int(os.environ.get("HF_PORT", 7860))
	    # Pass the app object rather than the "app:app" import string: the
	    # string only resolves when this file is named app.py, and it is not
	    # needed here because reload is off.
	    uvicorn.run(app, host="0.0.0.0", port=port, reload=False)