"""Aadhaar Card OCR API.

Two-step extraction pipeline:
  1. NVIDIA nemoretriever-ocr-v1 turns an uploaded card image into raw text.
  2. NVIDIA nemotron-nano-9b-v2 parses that text into a structured JSON object.
"""

from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from pydantic import BaseModel
import requests
import base64
import json
import re
import os

app = FastAPI(
    title="Aadhaar Card OCR API",
    description="Two-step RAG pipeline: nemoretriever-ocr-v1 → nvidia-nemotron-nano-9b-v2 for Aadhaar card extraction",
)

# ── CORS ──────────────────────────────────────────────────────────────────────
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ── Configuration ─────────────────────────────────────────────────────────────
# Set NVIDIA_API_KEY as a Secret in your HuggingFace Space settings.
# SECURITY FIX: a previous revision shipped a live API key as the fallback
# default. Credentials must never be committed; the key now comes ONLY from
# the environment. Requests will fail with 401 from NVIDIA if it is unset.
NVIDIA_API_KEY = os.environ.get("NVIDIA_API_KEY", "")

OCR_URL = "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1"
LLM_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
LLM_MODEL = "nvidia/nvidia-nemotron-nano-9b-v2"

OCR_HEADERS = {
    "Authorization": f"Bearer {NVIDIA_API_KEY}",
    "Accept": "application/json",
}
LLM_HEADERS = {
    "Authorization": f"Bearer {NVIDIA_API_KEY}",
    "Content-Type": "application/json",
}

# ── System prompts ────────────────────────────────────────────────────────────
FRONT_SYSTEM_PROMPT = """You are an Aadhaar card front-side data extraction assistant.
You will receive raw OCR text extracted from the FRONT side of an Indian Aadhaar card.
Parse it carefully and return ONLY a valid JSON object. No markdown fences, no explanation, no preamble — just the raw JSON object.

JSON schema (return exactly this structure):
{
  "name": "full name of the card holder (string)",
  "dob": "date of birth in DD/MM/YYYY format (string)",
  "gender": "gender Male, Female, or Other (string)",
  "aadhaar_no": "12-digit Aadhaar number as a string, digits only, no spaces"
}

Rules:
- name: the primary card holder's full name (usually in bold, after "Name:" or just prominently placed)
- dob: look for "DOB", "Date of Birth", "जन्म तिथि" — output in DD/MM/YYYY format; if already in that format keep it
- gender: look for "Male", "Female", "Other", or Hindi equivalents "पुरुष", "महिला", "अन्य"
- aadhaar_no: the 12-digit number, usually printed in groups like "XXXX XXXX XXXX" — remove all spaces and return only digits
- If a field is not found, use "" for strings
- Do NOT include address details or any other fields not in the schema"""

BACK_SYSTEM_PROMPT = """You are an Aadhaar card back-side data extraction assistant.
You will receive raw OCR text extracted from the BACK side of an Indian Aadhaar card.
Parse it carefully and return ONLY a valid JSON object. No markdown fences, no explanation, no preamble — just the raw JSON object.

JSON schema (return exactly this structure):
{
  "address": "door/flat number and street/locality/road name (string)",
  "village_city": "village name or city/town name (string)",
  "state": "state name (string)",
  "pincode": "6-digit PIN code as a string"
}

Rules:
- address: the first line(s) of the address — house/flat number, building name, street or locality; exclude city, district, state, and PIN
- village_city: look for village name, town, or city; may also appear under "District" — prefer the more specific locality name over the district
- state: the full state name (e.g. "Maharashtra", "Tamil Nadu"); look near the end of the address block
- pincode: the 6-digit postal code; look for "PIN", "PIN Code", or a standalone 6-digit number at the end of the address
- If a field is not found, use "" for strings
- Do NOT include the card holder's name or Aadhaar number — focus only on address fields"""


# ── Helper ────────────────────────────────────────────────────────────────────
async def run_ocr(file: UploadFile) -> str:
    """Send an uploaded image to the NVIDIA OCR endpoint and return plain text.

    The image is base64-encoded inline in the request body. Detected text
    fragments are concatenated with newlines, in the order the API returns
    them.

    Raises:
        HTTPException(413): if the base64 payload is too large for the API.
        HTTPException(502): if the OCR request fails at the network/HTTP level.
    """
    content = await file.read()
    image_b64 = base64.b64encode(content).decode()

    # Inline base64 payloads have a hard size cap on the NVIDIA endpoint;
    # reject early with a clear message instead of a cryptic upstream error.
    if len(image_b64) >= 500_000:
        raise HTTPException(
            status_code=413,
            detail="Image too large (base64 payload exceeds 500,000 characters, ~365 KB of image data). Resize the image and try again.",
        )

    payload = {
        "input": [
            {
                "type": "image_url",
                "url": f"data:image/png;base64,{image_b64}",
            }
        ]
    }

    try:
        response = requests.post(OCR_URL, headers=OCR_HEADERS, json=payload, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=502, detail=f"NVIDIA OCR API error: {str(e)}")

    ocr_json = response.json()
    print("OCR Response:", ocr_json)

    # The API has returned detections either at the top level or nested under
    # "data[0]"; support both shapes.
    detections = ocr_json.get("text_detections", [])
    if not detections:
        data = ocr_json.get("data", [])
        if isinstance(data, list) and len(data) > 0:
            detections = data[0].get("text_detections", [])

    lines = []
    for det in detections:
        text = ""
        if isinstance(det, dict):
            # Newer responses wrap the string in "text_prediction"; older ones
            # expose "text" directly.
            if "text_prediction" in det:
                text = det["text_prediction"].get("text", "").strip()
            else:
                text = det.get("text", "").strip()
        if text:
            lines.append(text)

    return "\n".join(lines)


def call_llm(ocr_text: str, system_prompt: str) -> dict:
    """Send OCR text to the LLM with the given system prompt; return parsed JSON.

    The model is instructed to emit a bare JSON object, but in practice may
    wrap it in markdown fences or surrounding prose — both are stripped before
    parsing, and as a last resort the first {...} span is extracted.

    Raises:
        HTTPException(502): on network failure, empty response, non-JSON
            output, or a JSON value that is not an object.
    """
    llm_payload = {
        "model": LLM_MODEL,
        "max_tokens": 1024,
        "temperature": 0.2,
        "top_p": 0.9,
        "messages": [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": (
                    f"Here is the OCR text extracted from the Aadhaar card:\n\n"
                    f"{ocr_text}\n\n"
                    f"Extract the required data and return ONLY the JSON object."
                ),
            },
        ],
    }

    try:
        llm_response = requests.post(LLM_URL, headers=LLM_HEADERS, json=llm_payload, timeout=200)
        llm_response.raise_for_status()
        llm_json = llm_response.json()
        print("LLM Response JSON:", llm_json)
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=502, detail=f"NVIDIA LLM API error: {str(e)}")

    raw_text: str = llm_json.get("choices", [{}])[0].get("message", {}).get("content", "")
    print("LLM Raw Text:", raw_text)

    if not raw_text:
        raise HTTPException(status_code=502, detail="LLM returned an empty response")

    # Strip markdown code fences the model sometimes adds despite instructions.
    cleaned = re.sub(r"```json\s*", "", raw_text, flags=re.IGNORECASE)
    cleaned = re.sub(r"```\s*", "", cleaned).strip()

    parsed = None
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        # Fall back to the first brace-delimited span in the output.
        match = re.search(r"\{[\s\S]*\}", cleaned)
        if not match:
            raise HTTPException(
                status_code=502,
                detail=f"LLM did not return valid JSON. Preview: {raw_text[:400]}",
            )
        try:
            parsed = json.loads(match.group(0))
        except json.JSONDecodeError as e:
            raise HTTPException(status_code=502, detail=f"JSON parse error: {str(e)}")

    if not isinstance(parsed, dict):
        raise HTTPException(
            status_code=502,
            detail=f"LLM response is not a JSON object. Got: {type(parsed).__name__}",
        )

    print("LLM Parsed Data:", parsed)
    return parsed


# ── Request / Response models ─────────────────────────────────────────────────
class AadhaarFrontData(BaseModel):
    name: str
    dob: str
    gender: str
    aadhaar_no: str


class AadhaarBackData(BaseModel):
    address: str
    village_city: str
    state: str
    pincode: str


# ── Endpoints ─────────────────────────────────────────────────────────────────
@app.post("/extract-front", response_model=AadhaarFrontData)
async def extract_front(file: UploadFile = File(...)):
    """
    Upload the FRONT side of an Aadhaar card image.

    Pipeline:
        1. nemoretriever-ocr-v1 → raw OCR text
        2. nvidia-nemotron-nano-9b-v2 → structured JSON

    Returns: name, dob, gender, aadhaar_no
    """
    ocr_text = await run_ocr(file)
    if not ocr_text.strip():
        raise HTTPException(status_code=422, detail="OCR produced no text. Check the image quality.")

    parsed = call_llm(ocr_text, FRONT_SYSTEM_PROMPT)

    # Normalize the Aadhaar number to digits only, regardless of how the LLM
    # formatted it (spaces, dashes, etc.).
    raw_aadhaar = str(parsed.get("aadhaar_no", ""))
    aadhaar_digits = re.sub(r"\D", "", raw_aadhaar)

    # Length caps guard against runaway LLM output leaking into the response.
    return AadhaarFrontData(
        name=str(parsed.get("name", "")).strip()[:100],
        dob=str(parsed.get("dob", "")).strip()[:12],
        gender=str(parsed.get("gender", "")).strip()[:20],
        aadhaar_no=aadhaar_digits[:12],
    )


@app.post("/extract-back", response_model=AadhaarBackData)
async def extract_back(file: UploadFile = File(...)):
    """
    Upload the BACK side of an Aadhaar card image.

    Pipeline:
        1. nemoretriever-ocr-v1 → raw OCR text
        2. nvidia-nemotron-nano-9b-v2 → structured JSON

    Returns: address, village_city, state, pincode
    """
    ocr_text = await run_ocr(file)
    if not ocr_text.strip():
        raise HTTPException(status_code=422, detail="OCR produced no text. Check the image quality.")

    parsed = call_llm(ocr_text, BACK_SYSTEM_PROMPT)

    # PIN codes are exactly 6 digits; strip any non-digit characters.
    raw_pin = str(parsed.get("pincode", ""))
    pin_digits = re.sub(r"\D", "", raw_pin)[:6]

    return AadhaarBackData(
        address=str(parsed.get("address", "")).strip()[:200],
        village_city=str(parsed.get("village_city", "")).strip()[:100],
        state=str(parsed.get("state", "")).strip()[:60],
        pincode=pin_digits,
    )


@app.get("/health")
async def health_check():
    """Liveness probe; also reports which LLM model is configured."""
    return {"status": "healthy", "model": LLM_MODEL}


@app.get("/")
async def root():
    """Serve the bundled single-page frontend."""
    return FileResponse("index.html")


if __name__ == "__main__":
    import uvicorn

    # Port 7860 is the default port for HuggingFace Spaces.
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)