Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| import pytesseract | |
| import camelot | |
| import re | |
| import shutil | |
| from fastapi import FastAPI, File, UploadFile, HTTPException | |
| from fastapi.responses import JSONResponse | |
| from pdf2image import convert_from_path | |
| import os | |
| import uuid | |
| import tempfile | |
| from typing import List | |
| from loguru import logger | |
| import logging | |
# Logging setup: add a file sink for records at INFO level and above.
logger.add(
    "app.log",
    level="INFO",
    rotation="500 MB",
)

# Application object referenced by the server entry point ("app:app").
app = FastAPI(
    version="1.0.0",
    title="Policy Verification API",
)
| # ========== Core Utilities ========== | |
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extracts text from a PDF using PyMuPDF; falls back to OCR if text is insufficient.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: The text of all pages concatenated, or the OCR result when the
            stripped text is shorter than 50 characters.

    Raises:
        RuntimeError: If PyMuPDF cannot read the file, or if the OCR fallback fails.
    """
    logger.info(f"Extracting text from PDF: {pdf_path}")
    try:
        # The context manager closes the document even when a page raises
        # during extraction; a trailing close() call would be skipped then.
        with fitz.open(pdf_path) as doc:
            text = "".join(page.get_text() for page in doc)
    except Exception as e:
        logger.error(f"Failed to extract text from PDF: {e}")
        raise RuntimeError(f"Error reading PDF with PyMuPDF: {e}") from e

    if len(text.strip()) < 50:
        logger.warning("Insufficient text extracted, falling back to OCR")
        # Kept outside the try block on purpose: ocr_pdf raises its own
        # RuntimeError, which should not be relabelled as a PyMuPDF error.
        text = ocr_pdf(pdf_path)
    return text
def ocr_pdf(pdf_path: str) -> str:
    """
    Performs OCR on a PDF using Tesseract and pdf2image.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: OCR-extracted text of every page image, concatenated in the
            order returned by pdf2image.

    Raises:
        RuntimeError: If page rendering or the OCR process fails; the
            underlying exception is attached as the cause.
    """
    logger.info(f"Performing OCR on PDF: {pdf_path}")
    try:
        # Only the two calls that can fail are guarded.
        images = convert_from_path(pdf_path)
        text = "".join(pytesseract.image_to_string(img) for img in images)
    except Exception as e:
        logger.error(f"OCR failed: {e}")
        # Explicit chaining keeps the original traceback as __cause__.
        raise RuntimeError(f"OCR failed: {e}") from e
    logger.info("OCR completed successfully")
    return text
# Compiled once at import time. The capture group is the UIN that follows a
# "UIN" label, optionally separated by ":" or "-" and whitespace.
_POLICY_UIN_PATTERN = re.compile(
    r"UIN[:\-]?\s*(IRDAN\d+\s*[A-Z0-9]+V\d{2,})", flags=re.IGNORECASE
)


def find_uins(text: str) -> List[str]:
    """
    Finds UINs matching IRDA format in the provided text.

    Matching is case-insensitive, so each hit is normalized by removing all
    whitespace and upper-casing it. Without the case normalization the same
    UIN written in two casings would be reported twice.

    Args:
        text (str): Text to search for UINs.

    Returns:
        List[str]: Unique, normalized (whitespace-free, upper-case) UINs in
            order of first appearance.
    """
    normalized = (
        re.sub(r"\s+", "", raw).upper()
        for raw in _POLICY_UIN_PATTERN.findall(text)
    )
    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_uins = list(dict.fromkeys(normalized))
    logger.info(f"Found {len(unique_uins)} unique UINs")
    return unique_uins
# Compiled once at import time instead of being rebuilt for every table cell.
_DB_UIN_PATTERN = re.compile(r"IRDAN\d+\s*[A-Z0-9]+V\d{2,}", flags=re.IGNORECASE)


def extract_uin_list_from_pdf_db(pdf_db_path: str) -> List[str]:
    """
    Extracts UINs from a database-like PDF using Camelot.

    Every cell of every table Camelot finds is scanned for UIN-shaped
    strings. Matching is case-insensitive, so hits are normalized by removing
    whitespace and upper-casing them before de-duplication.

    Args:
        pdf_db_path (str): Path to the database PDF file.

    Returns:
        List[str]: Unique, normalized (whitespace-free, upper-case) UINs in
            order of first appearance.

    Raises:
        RuntimeError: If UIN extraction fails; the underlying exception is
            attached as the cause.
    """
    logger.info(f"Extracting UINs from DB PDF: {pdf_db_path}")
    try:
        tables = camelot.read_pdf(pdf_db_path, pages='all', flavor='stream')
        all_uins = []
        for table in tables:
            for row in table.df.values:
                for cell in row:
                    # str() guards against non-string cell values.
                    all_uins.extend(
                        re.sub(r"\s+", "", match).upper()
                        for match in _DB_UIN_PATTERN.findall(str(cell))
                    )
    except Exception as e:
        logger.error(f"Error extracting UINs from DB PDF: {e}")
        raise RuntimeError(f"Error extracting UINs from DB PDF: {e}") from e

    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_uins = list(dict.fromkeys(all_uins))
    logger.info(f"Extracted {len(unique_uins)} unique UINs from DB")
    return unique_uins
def match_found(policy_uins: List[str], db_uins: List[str]) -> bool:
    """
    Reports whether at least one policy UIN is present among the database UINs.

    Both sides are compared in a canonical form: spaces and hyphens removed,
    lower-cased.

    Args:
        policy_uins (List[str]): UINs found in the policy document.
        db_uins (List[str]): UINs known to the database.

    Returns:
        bool: True as soon as one policy UIN matches, False if none do.
    """
    def _canonical(uin: str) -> str:
        return uin.replace(" ", "").replace("-", "").lower()

    known = set(map(_canonical, db_uins))
    # First policy UIN whose canonical form is known; None when there is none.
    hit = next(
        (candidate for candidate in map(_canonical, policy_uins) if candidate in known),
        None,
    )
    if hit is None:
        logger.info("No UIN matches found")
        return False
    logger.info(f"UIN match found: {hit}")
    return True
| # ========== FastAPI Endpoint ========== | |
# NOTE(review): no route decorator (e.g. @app.post(...)) is visible on this
# handler in this file. Confirm it is registered with `app` somewhere,
# otherwise the endpoint is unreachable.
async def verify_policy(
    policy_file: UploadFile = File(...)
):
    """
    Verifies if a policy PDF's UIN exists in a database PDF.

    The upload is saved to a temporary directory, its text is extracted
    (with OCR fallback), UINs are pulled out of that text and compared with
    the UINs listed in the server-side database PDF.

    Args:
        policy_file (UploadFile): The uploaded policy document; must have a
            ".pdf" file name (case-insensitive).

    Returns:
        JSONResponse: Status 200 with "match" (bool) and "policy-text" (the
            extracted policy text).

    Raises:
        HTTPException: 400 if the upload is not a PDF by file name; 500 if
            the database PDF is missing or any processing step fails.
    """
    # Imported here so this handler does not need a new file-level import.
    from fastapi.concurrency import run_in_threadpool

    logger.info("Received policy verification request")
    db_file_path = "/app/list-of-products.pdf"

    # Validate uploaded policy file. The file name can be absent (None), and
    # extensions such as ".PDF" are just as valid as ".pdf".
    filename = policy_file.filename or ""
    if not filename.lower().endswith(".pdf"):
        logger.error("Invalid file type for policy")
        raise HTTPException(status_code=400, detail="Policy file must be a PDF.")

    if not os.path.exists(db_file_path):
        logger.error(f"Database PDF file not found at {db_file_path}")
        raise HTTPException(status_code=500, detail="Database file missing on server.")

    try:
        with tempfile.TemporaryDirectory() as tmpdirname:
            # Save uploaded policy file under a unique name.
            policy_path = os.path.join(tmpdirname, f"{uuid.uuid4()}_policy.pdf")
            logger.info(f"Saving policy file to: {policy_path}")
            with open(policy_path, "wb") as f:
                shutil.copyfileobj(policy_file.file, f)

            # Use static DB file path directly.
            logger.info(f"Using DB file from: {db_file_path}")

            # PDF parsing, OCR and table extraction are blocking and
            # CPU-heavy; running them in the thread pool keeps the event
            # loop free to serve other requests.
            policy_text = await run_in_threadpool(extract_text_from_pdf, policy_path)
            policy_uins = find_uins(policy_text)
            db_uins = await run_in_threadpool(
                extract_uin_list_from_pdf_db, db_file_path
            )
            is_matched = match_found(policy_uins, db_uins)

            return JSONResponse(
                status_code=200,
                content={
                    "match": is_matched,
                    "policy-text": policy_text,
                },
            )
    except Exception as e:
        logger.error(f"Verification failed: {e}")
        raise HTTPException(status_code=500, detail=f"Processing error: {str(e)}") from e
| # ========== Run Application ========== | |
if __name__ == "__main__":
    # Script entry point: start uvicorn when this file is executed directly.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 7860, "reload": True}
    logger.info("Starting FastAPI server")
    uvicorn.run("app:app", **server_options)