Spaces:

agentsay
/

Agrosure_OCR_verifier

Sleeping

App Files Files Community

Agrosure_OCR_verifier / app.py

agentsay

Update app.py

2295e56 verified 7 months ago

raw

history blame contribute delete

6.42 kB

	import fitz # PyMuPDF
	import pytesseract
	import camelot
	import re
	import shutil
	from fastapi import FastAPI, File, UploadFile, HTTPException
	from fastapi.responses import JSONResponse
	from pdf2image import convert_from_path
	import os
	import uuid
	import tempfile
	from typing import List
	from loguru import logger
	import logging

	# Configure logging: register "app.log" as a loguru file sink that records
	# messages at INFO level and above; rotation="500 MB" asks loguru to roll the
	# file over once it reaches that size.
	logger.add("app.log", rotation="500 MB", level="INFO")

	# ASGI application object. The route decorator further down registers on it,
	# and the __main__ block serves it through the "app:app" import string.
	app = FastAPI(title="Policy Verification API", version="1.0.0")

	# ========== Core Utilities ==========

	def extract_text_from_pdf(pdf_path: str) -> str:
	    """
	    Extracts text from a PDF using PyMuPDF; falls back to OCR if text is insufficient.

	    The text of every page is concatenated in page order. When the result,
	    stripped of surrounding whitespace, is shorter than 50 characters, the
	    PDF is handed to ``ocr_pdf`` and the OCR output is returned instead.

	    Args:
	        pdf_path (str): Path to the PDF file.

	    Returns:
	        str: Extracted text.

	    Raises:
	        RuntimeError: If text extraction or OCR fails.
	    """
	    logger.info(f"Extracting text from PDF: {pdf_path}")
	    try:
	        # The context manager closes the document even when reading a page
	        # raises, so the file handle is never leaked.
	        with fitz.open(pdf_path) as doc:
	            text = "".join(page.get_text() for page in doc)
	    except Exception as e:
	        logger.error(f"Failed to extract text from PDF: {e}")
	        raise RuntimeError(f"Error reading PDF with PyMuPDF: {e}") from e

	    if len(text.strip()) < 50:
	        logger.warning("Insufficient text extracted, falling back to OCR")
	        # Kept outside the try block on purpose: ocr_pdf logs and raises its
	        # own RuntimeError, which should not be relabelled as a PyMuPDF error.
	        text = ocr_pdf(pdf_path)
	    return text

	def ocr_pdf(pdf_path: str) -> str:
	    """
	    Runs Tesseract OCR over a PDF rendered to images by pdf2image.

	    The text recognised for each rendered image is concatenated in order,
	    with no separator inserted between images.

	    Args:
	        pdf_path (str): Path to the PDF file.

	    Returns:
	        str: OCR-extracted text.

	    Raises:
	        RuntimeError: If rendering or recognition fails.
	    """
	    try:
	        logger.info(f"Performing OCR on PDF: {pdf_path}")
	        rendered_pages = convert_from_path(pdf_path)
	        recognised = "".join(map(pytesseract.image_to_string, rendered_pages))
	        logger.info("OCR completed successfully")
	        return recognised
	    except Exception as err:
	        logger.error(f"OCR failed: {err}")
	        raise RuntimeError(f"OCR failed: {err}")

	def find_uins(text: str) -> List[str]:
	    """
	    Finds UINs matching IRDA format in the provided text.

	    A UIN is recognised when it follows the label ``UIN`` (optionally
	    suffixed with ``:`` or ``-``) and has the shape
	    ``IRDAN<digits><alphanumerics>V<two or more digits>``. Matching is
	    case-insensitive. Whitespace after the label and inside the identifier
	    is optional, so both ``UIN: IRDAN125RP0001V01201819`` and
	    ``UIN:IRDAN125 RP0001V01201819`` are found.

	    Args:
	        text (str): Text to search for UINs.

	    Returns:
	        List[str]: Unique UINs in order of first appearance, with all
	        internal whitespace removed. Letter case is left as found.
	    """
	    # \s* (rather than a single mandatory \s) keeps this in line with the
	    # pattern used for the database PDF and lets contiguous UINs match.
	    pattern = r"UIN[:\-]?\s*(IRDAN\d+\s*[A-Z0-9]+V\d{2,})"
	    matches = re.findall(pattern, text, flags=re.IGNORECASE)
	    normalized = [re.sub(r"\s+", "", m) for m in matches]
	    # dict.fromkeys drops duplicates while preserving first-seen order.
	    unique_uins = list(dict.fromkeys(normalized))
	    logger.info(f"Found {len(unique_uins)} unique UINs")
	    return unique_uins

	def extract_uin_list_from_pdf_db(pdf_db_path: str) -> List[str]:
	    """
	    Collects UINs from the tables Camelot finds in a database-like PDF.

	    Every cell of every table is converted to a string and scanned
	    case-insensitively for UIN-shaped values; whitespace inside each hit
	    is removed before de-duplication.

	    Args:
	        pdf_db_path (str): Path to the database PDF file.

	    Returns:
	        List[str]: Unique, whitespace-free UINs in order of first appearance.

	    Raises:
	        RuntimeError: If UIN extraction fails.
	    """
	    try:
	        logger.info(f"Extracting UINs from DB PDF: {pdf_db_path}")
	        uin_regex = re.compile(r"IRDAN\d+\s*[A-Z0-9]+V\d{2,}", flags=re.IGNORECASE)
	        parsed_tables = camelot.read_pdf(pdf_db_path, pages='all', flavor='stream')

	        # Flatten tables -> rows -> cells into one lazy stream of cell values.
	        cell_values = (
	            cell
	            for parsed in parsed_tables
	            for record in parsed.df.values
	            for cell in record
	        )

	        collected = []
	        for cell in cell_values:
	            for hit in uin_regex.findall(str(cell)):
	                collected.append(re.sub(r"\s+", "", hit.strip()))

	        deduplicated = list(dict.fromkeys(collected))
	        logger.info(f"Extracted {len(deduplicated)} unique UINs from DB")
	        return deduplicated
	    except Exception as err:
	        logger.error(f"Error extracting UINs from DB PDF: {err}")
	        raise RuntimeError(f"Error extracting UINs from DB PDF: {err}")

	def match_found(policy_uins: List[str], db_uins: List[str]) -> bool:
	    """
	    Reports whether at least one policy UIN is present among the database UINs.

	    Both sides are compared after removing spaces and hyphens and
	    lower-casing, so formatting differences do not prevent a match.

	    Args:
	        policy_uins (List[str]): List of policy UINs.
	        db_uins (List[str]): List of database UINs.

	    Returns:
	        bool: True if a match is found, False otherwise.
	    """
	    def canonical(uin: str) -> str:
	        # Comparison form: no spaces, no hyphens, lower case.
	        return uin.replace(" ", "").replace("-", "").lower()

	    known = set(map(canonical, db_uins))
	    # Lazily scan the policy UINs and stop at the first one that is known.
	    first_hit = next(
	        (candidate for candidate in map(canonical, policy_uins) if candidate in known),
	        None,
	    )
	    if first_hit is None:
	        logger.info("No UIN matches found")
	        return False
	    logger.info(f"UIN match found: {first_hit}")
	    return True

	# ========== FastAPI Endpoint ==========

	@app.post("/verify-policy")
	async def verify_policy(
	    policy_file: UploadFile = File(...)
	):
	    """
	    Verifies if a policy PDF's UIN exists in a database PDF.

	    The upload is saved to a temporary directory, its text is extracted
	    (with OCR fallback), UINs are pulled from that text and compared with
	    the UINs found in the server-side database PDF.

	    Args:
	        policy_file (UploadFile): Uploaded policy document; its filename
	            must end in ".pdf" (case-insensitive).

	    Returns:
	        JSONResponse: Status 200 with ``match`` (bool) and ``policy-text``
	        (the extracted policy text).

	    Raises:
	        HTTPException: 400 if the upload is not named as a PDF; 500 if the
	            database PDF is missing or any processing step fails.
	    """
	    logger.info("Received policy verification request")
	    db_file_path = "/app/list-of-products.pdf"

	    # Validate uploaded policy file. The filename may be absent, and the
	    # extension is compared case-insensitively so "POLICY.PDF" is accepted.
	    uploaded_name = policy_file.filename or ""
	    if not uploaded_name.lower().endswith(".pdf"):
	        logger.error("Invalid file type for policy")
	        raise HTTPException(status_code=400, detail="Policy file must be a PDF.")

	    if not os.path.exists(db_file_path):
	        logger.error(f"Database PDF file not found at {db_file_path}")
	        raise HTTPException(status_code=500, detail="Database file missing on server.")

	    # NOTE(review): the extraction helpers below are synchronous and run
	    # directly inside this async handler.
	    try:
	        with tempfile.TemporaryDirectory() as tmpdirname:
	            # Save uploaded policy file under a unique name; the directory and
	            # its contents are removed when the with-block exits.
	            policy_path = os.path.join(tmpdirname, f"{uuid.uuid4()}_policy.pdf")
	            logger.info(f"Saving policy file to: {policy_path}")
	            with open(policy_path, "wb") as f:
	                shutil.copyfileobj(policy_file.file, f)

	            # Use static DB file path directly
	            logger.info(f"Using DB file from: {db_file_path}")
	            policy_text = extract_text_from_pdf(policy_path)
	            policy_uins = find_uins(policy_text)
	            db_uins = extract_uin_list_from_pdf_db(db_file_path)
	            is_matched = match_found(policy_uins, db_uins)

	            return JSONResponse(
	                status_code=200,
	                content={
	                    "match": is_matched,
	                    "policy-text": policy_text,
	                },
	            )
	    except Exception as e:
	        logger.error(f"Verification failed: {e}")
	        raise HTTPException(status_code=500, detail=f"Processing error: {str(e)}") from e

	# ========== Run Application ==========

	if __name__ == "__main__":
	    # uvicorn is imported here so it is only required when this file is run
	    # as a script.
	    import uvicorn

	    logger.info("Starting FastAPI server")
	    # "app:app" names the module and the application attribute to serve.
	    server_options = {"host": "0.0.0.0", "port": 7860, "reload": True}
	    uvicorn.run("app:app", **server_options)