Spaces:
Running
Running
| """ | |
| Visiting Card & Letterhead OCR API | |
| =================================== | |
| Two-step pipeline: nemoretriever-ocr-v1 → nvidia-nemotron-nano-9b-v2 | |
| Deploy on Hugging Face Spaces (Docker or Python SDK): | |
| - Set secret NVIDIA_API_KEY in Space settings → Variables and secrets | |
| - The app serves the HTML frontend at / and the API at /extract-card | |
| - HF Spaces exposes port 7860 by default (set via HF_PORT env var) | |
| Local usage: | |
| pip install fastapi uvicorn requests python-multipart | |
| NVIDIA_API_KEY=nvapi-xxx python visiting_card_api.py | |
| Open http://localhost:7860 | |
| """ | |
| import os | |
| import re | |
| import json | |
| import base64 | |
| import requests | |
| from pathlib import Path | |
| from typing import List | |
| from fastapi import FastAPI, File, UploadFile, HTTPException | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from fastapi.responses import HTMLResponse | |
| from pydantic import BaseModel | |
# ── App ───────────────────────────────────────────────────────────────────────
# The FastAPI application object; the title/description appear in the
# auto-generated OpenAPI docs.
app = FastAPI(
    title="Visiting Card & Letterhead OCR API",
    description="Two-step RAG pipeline: nemoretriever-ocr-v1 → nvidia-nemotron-nano-9b-v2",
)

# ── CORS: allow all origins (needed for HF Spaces iframe / custom domains) ────
# Credentials are deliberately disabled: a wildcard origin combined with
# allow_credentials=True is the combination the FastAPI CORS documentation
# warns against, and it would let any site make credentialed requests. None of
# the handlers visible in this file read cookies or Authorization headers, so
# nothing here depends on credentialed cross-origin calls.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# ── Configuration ─────────────────────────────────────────────────────────────
# The API key is read from the environment only. It must never be committed to
# source: on Hugging Face Spaces set it under "Variables and secrets".
# NOTE(review): a literal key used to be the fallback value here; treat that
# key as leaked and rotate it.
# With no key configured the value is an empty string, so upstream calls are
# sent with an empty bearer token.
NVIDIA_API_KEY = os.environ.get("NVIDIA_API_KEY", "")

# Endpoint for the OCR step and for the chat-completion (LLM) step.
OCR_URL = "https://ai.api.nvidia.com/v1/cv/nvidia/nemotron-ocr-v1"
LLM_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
LLM_MODEL = "nvidia/nvidia-nemotron-nano-9b-v2"

# Pre-built request headers; both carry the same bearer token.
OCR_HEADERS = {"Authorization": f"Bearer {NVIDIA_API_KEY}", "Accept": "application/json"}
LLM_HEADERS = {"Authorization": f"Bearer {NVIDIA_API_KEY}", "Content-Type": "application/json"}
# ── System prompt ─────────────────────────────────────────────────────────────
# Instruction text sent as the "system" message of every LLM request. It pins
# the JSON schema whose keys build_card() later reads, so the key names here
# and the CardData fields must stay in sync.
CARD_SYSTEM_PROMPT = """You are a business card and letterhead data extraction assistant.
You will receive raw OCR text extracted from a visiting card, business card, or the header/footer of a business letter.
Parse it carefully and return ONLY a valid JSON object.
No markdown fences, no explanation, no preamble — just the raw JSON object.
JSON schema (return exactly this structure):
{
"company_name": "full name of the company or firm (string)",
"contact_person": "name of the individual on the card or letter (string)",
"designation": "job title or designation of the contact person (string)",
"mobile": "mobile number(s) as a string; if multiple separate with comma (string)",
"phone": "landline / office phone number(s); if multiple separate with comma (string)",
"email": "email address(es); if multiple separate with comma (string)",
"address": "full postal address as printed, preserving line breaks with a pipe | separator (string)",
"pin": "PIN code / ZIP code / postal code as a string of digits (string)",
"city": "city name (string)",
"state": "state or province name (string)",
"country": "country name (string)",
"gst_number": "GST / GSTIN number; typically 15 alphanumeric characters (string)",
"website": "website URL if present (string)",
"fax": "fax number if present (string)"
}
Rules:
- company_name: usually the largest text or the text near a logo
- contact_person: individual's personal name distinct from company name
- designation: title like CEO, Manager, Director, Proprietor, Sales Executive, etc.
- mobile: numbers prefixed with M:, Mob:, Cell:, +91, or 10-digit numbers
- phone: numbers prefixed with Ph:, Tel:, T:, O:, or STD codes like (022), (080)
- email: look for @ symbol; may be prefixed with E:, Email:, Mail:
- address: collect all address lines; separate each line with ' | '
- pin: extract 6-digit Indian PIN code or 5/9-digit ZIP; digits only
- city: extract city name from address
- state: extract state name from address
- country: default to India if address looks Indian and country not stated
- gst_number: 15-character alphanumeric GSTIN
- website: any URL starting with www., http://, or https://
- fax: number prefixed with Fax:, F:, or similar
- If a field is not found return "" (empty string)
- Do NOT invent or hallucinate any information not present in the OCR text
- If multiple phone or mobile numbers are present, join them with ', '"""
# ── Helpers ───────────────────────────────────────────────────────────────────
def _collect_ocr_lines(ocr_json) -> list:
    """Return the non-empty recognised text strings found in an OCR response body."""
    if not isinstance(ocr_json, dict):
        return []

    detections = ocr_json.get("text_detections") or []
    if not detections:
        # Alternate layout: detections nested under the first entry of "data".
        data = ocr_json.get("data")
        if isinstance(data, list) and data and isinstance(data[0], dict):
            detections = data[0].get("text_detections") or []
    if not isinstance(detections, list):
        return []

    lines = []
    for det in detections:
        if not isinstance(det, dict):
            continue
        # A detection carries its text either under "text_prediction" or
        # directly on the detection itself.
        holder = det["text_prediction"] if "text_prediction" in det else det
        text = holder.get("text") if isinstance(holder, dict) else None
        if isinstance(text, str) and text.strip():
            lines.append(text.strip())
    return lines


async def run_ocr(file: UploadFile) -> str:
    """Send an uploaded image to the NVIDIA OCR endpoint and return its text.

    Args:
        file: The uploaded image; its full content is read into memory.

    Returns:
        The recognised text fragments joined with newlines ("" when the
        response contains no detections).

    Raises:
        HTTPException: 400 for an empty upload, 413 when the base64 payload is
            180,000 characters or longer, 502 when the OCR request fails or
            the response body is not JSON.
    """
    import asyncio  # local import: only needed to off-load the blocking HTTP call

    content = await file.read()
    if not content:
        raise HTTPException(400, "Uploaded file is empty.")

    image_b64 = base64.b64encode(content).decode()
    # Matches reference: base64 must be < 180,000 characters
    if len(image_b64) >= 180_000:
        raise HTTPException(413, "Image too large (base64 must be < 180,000 chars). Resize and retry.")

    # Payload structure follows the reference request.
    # NOTE(review): the data URL is always labelled image/png even for JPEG/WebP
    # uploads — confirm the OCR service ignores the declared MIME type.
    payload = {
        "input": [
            {
                "type": "image_url",
                "url": f"data:image/png;base64,{image_b64}",
            }
        ]
    }

    def _post():
        return requests.post(OCR_URL, headers=OCR_HEADERS, json=payload, timeout=30)

    try:
        # requests is synchronous; run it in the default executor so the event
        # loop can keep serving other requests while waiting on the network.
        r = await asyncio.get_running_loop().run_in_executor(None, _post)
        r.raise_for_status()
        ocr_json = r.json()
    except requests.exceptions.RequestException as e:
        raise HTTPException(502, f"NVIDIA OCR API error: {e}") from e
    except ValueError as e:
        # Older requests releases raise a plain ValueError on a non-JSON body.
        raise HTTPException(502, f"NVIDIA OCR API error: {e}") from e

    return "\n".join(_collect_ocr_lines(ocr_json))
def _parse_llm_json(raw: str) -> dict:
    """Turn the model's reply text into a dict, tolerating fences and stray prose."""
    # Strip markdown code fences the model may add despite the instructions.
    cleaned = re.sub(r"```json\s*", "", raw, flags=re.IGNORECASE)
    cleaned = re.sub(r"```\s*", "", cleaned).strip()
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span in case prose surrounds the JSON.
        m = re.search(r"\{[\s\S]*\}", cleaned)
        if not m:
            raise HTTPException(502, f"LLM did not return valid JSON. Preview: {raw[:400]}") from None
        try:
            parsed = json.loads(m.group(0))
        except json.JSONDecodeError as e:
            raise HTTPException(502, f"JSON parse error: {e}") from e
    if not isinstance(parsed, dict):
        raise HTTPException(502, f"LLM response not a JSON object. Got: {type(parsed).__name__}")
    return parsed


def call_llm(ocr_text: str) -> dict:
    """Ask the LLM to structure raw OCR text and return the parsed JSON object.

    This function is synchronous and blocks for up to the 120-second request
    timeout.

    Args:
        ocr_text: Text produced by the OCR step.

    Returns:
        The JSON object returned by the model, as a dict.

    Raises:
        HTTPException: 502 when the request fails, the reply is empty or
            malformed, or the reply is not a JSON object.
    """
    payload = {
        "model": LLM_MODEL,
        "max_tokens": 2048,
        "temperature": 0.1,
        "top_p": 0.9,
        "messages": [
            {"role": "system", "content": CARD_SYSTEM_PROMPT},
            {"role": "user", "content": (
                f"Here is the OCR text extracted from the business card or letterhead:\n\n"
                f"{ocr_text}\n\nExtract the required data and return ONLY the JSON object."
            )},
        ],
    }
    try:
        r = requests.post(LLM_URL, headers=LLM_HEADERS, json=payload, timeout=120)
        r.raise_for_status()
        llm_json = r.json()
    except requests.exceptions.RequestException as e:
        raise HTTPException(502, f"NVIDIA LLM API error: {e}") from e
    except ValueError as e:
        # Older requests releases raise a plain ValueError on a non-JSON body.
        raise HTTPException(502, f"NVIDIA LLM API error: {e}") from e

    # An empty "choices" list or an unexpected shape must not surface as a 500.
    try:
        raw = llm_json["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        raw = ""
    if not isinstance(raw, str) or not raw.strip():
        raise HTTPException(502, "LLM returned empty response")

    return _parse_llm_json(raw)
# ── Pydantic models ───────────────────────────────────────────────────────────
class CardData(BaseModel):
    """Structured contact details extracted from one card or letterhead.

    Field names mirror the keys of the JSON schema in CARD_SYSTEM_PROMPT.
    Every field is a required string; build_card() supplies "" for anything
    the model did not return.
    """

    company_name: str
    contact_person: str
    designation: str
    mobile: str
    phone: str
    email: str
    address: str
    pin: str
    city: str
    state: str
    country: str
    gst_number: str
    website: str
    fax: str
def build_card(parsed: dict) -> CardData:
    """Convert the LLM's JSON object into a CardData with trimmed, capped strings.

    Args:
        parsed: Dict decoded from the model reply; missing keys are allowed.

    Returns:
        A CardData whose every field is a stripped string truncated to that
        field's maximum length ("" when the key is missing or null).
    """

    def s(k, n=300):
        value = parsed.get(k)
        # JSON null must become "", not the literal text "None".
        if value is None:
            return ""
        # The model sometimes returns several numbers/emails as a JSON array;
        # join them the way the prompt asks for multi-valued fields.
        if isinstance(value, (list, tuple)):
            parts = (str(item).strip() for item in value if item is not None)
            value = ", ".join(part for part in parts if part)
        return str(value).strip()[:n]

    return CardData(
        company_name=s("company_name", 200),
        contact_person=s("contact_person", 100),
        designation=s("designation", 100),
        mobile=s("mobile", 100),
        phone=s("phone", 100),
        email=s("email", 200),
        address=s("address", 500),
        pin=s("pin", 10),
        city=s("city", 100),
        state=s("state", 100),
        country=s("country", 100),
        gst_number=s("gst_number", 20),
        website=s("website", 200),
        fax=s("fax", 50),
    )
# ── API endpoints ─────────────────────────────────────────────────────────────
# The module docstring promises the API at /extract-card, so the handler is
# registered on that path.
@app.post("/extract-card", response_model=CardData)
async def extract_card(file: UploadFile = File(...)):
    """Run OCR and LLM extraction on one uploaded image.

    Args:
        file: The uploaded card/letterhead image (JPEG, PNG or WebP).

    Returns:
        The extracted CardData.

    Raises:
        HTTPException: 415 for an unsupported content type, 422 when OCR finds
            no text, plus whatever run_ocr() and call_llm() raise.
    """
    import asyncio  # local import: only needed to off-load the blocking LLM call

    allowed = {"image/jpeg", "image/jpg", "image/png", "image/webp"}
    # Uploads that declare no content type are let through.
    if file.content_type and file.content_type not in allowed:
        raise HTTPException(415, f"Unsupported type: {file.content_type}.")

    ocr_text = await run_ocr(file)
    if not ocr_text.strip():
        raise HTTPException(422, "OCR produced no text. Check image quality.")

    # call_llm() is synchronous (up to 120 s); run it in the default executor
    # so the event loop is not blocked for other requests.
    parsed = await asyncio.get_running_loop().run_in_executor(None, call_llm, ocr_text)
    return build_card(parsed)
# NOTE(review): no route decorator for this handler is visible in the file;
# the path below is chosen by analogy with /extract-card — confirm it against
# the frontend before relying on it.
@app.post("/extract-card-batch", response_model=List[CardData])
async def extract_card_batch(files: List[UploadFile] = File(...)):
    """Run OCR and LLM extraction on up to 10 uploaded images, in order.

    Args:
        files: The uploaded images (JPEG, PNG or WebP).

    Returns:
        One CardData per input file, in upload order. A file whose OCR yields
        no text produces a CardData with every field set to "".

    Raises:
        HTTPException: 400 when more than 10 files are sent, 415 when any file
            has an unsupported content type, plus whatever run_ocr() and
            call_llm() raise.
    """
    import asyncio  # local import: only needed to off-load the blocking LLM call

    if len(files) > 10:
        raise HTTPException(400, "Maximum 10 files per batch request.")

    # Validate every file before any upstream call, so a bad file late in the
    # batch cannot waste the OCR/LLM work already done for earlier files.
    allowed = {"image/jpeg", "image/jpg", "image/png", "image/webp"}
    for idx, file in enumerate(files):
        if file.content_type and file.content_type not in allowed:
            raise HTTPException(415, f"File {idx+1}: unsupported type.")

    # Pydantic v2 exposes `model_fields`; `__fields__` is the v1 spelling and
    # is deprecated in v2.
    field_names = getattr(CardData, "model_fields", None) or CardData.__fields__
    empty = CardData(**{name: "" for name in field_names})

    loop = asyncio.get_running_loop()
    results = []
    for file in files:
        ocr_text = await run_ocr(file)
        if not ocr_text.strip():
            results.append(empty)
            continue
        # call_llm() is synchronous; keep it off the event loop.
        parsed = await loop.run_in_executor(None, call_llm, ocr_text)
        results.append(build_card(parsed))
    return results
# NOTE(review): no route decorator for this handler is visible in the file;
# /health is the conventional path — confirm it against whatever probes the
# service.
@app.get("/health")
async def health():
    """Report that the service is up and which LLM model it is configured for."""
    return {"status": "healthy", "model": LLM_MODEL}
# ── Serve index.html at root (must be placed alongside this script) ───────────
HTML_PATH = Path(__file__).parent / "index.html"


# The module docstring promises the HTML frontend at /, so the handler is
# registered on that path.
@app.get("/", response_class=HTMLResponse)
async def serve_ui():
    """Return the frontend page, or a 500 HTML notice when index.html is missing."""
    try:
        # Read directly instead of checking exists() first: avoids a race
        # between the check and the read.
        html = HTML_PATH.read_text(encoding="utf-8")
    except FileNotFoundError:
        return HTMLResponse(
            "<h2 style='font-family:sans-serif;padding:40px'>"
            "index.html not found — place it next to visiting_card_api.py</h2>", 500
        )
    return HTMLResponse(html)
# ── Entry point ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    import uvicorn

    # Port comes from HF_PORT when set, otherwise 7860.
    port = int(os.environ.get("HF_PORT", 7860))
    # Pass the app object rather than the "app:app" import string: the string
    # form only resolves when this file is named app.py, while the module
    # docstring documents running it as visiting_card_api.py. Reload is off,
    # so an import string is not needed.
    uvicorn.run(app, host="0.0.0.0", port=port, reload=False)