import io

from fastapi import HTTPException, status
from pypdf import PdfReader


def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract the text of a PDF supplied as raw bytes.

    Text is extracted page by page and the pages are joined with a newline,
    so the last word of one page does not fuse with the first word of the
    next. Leading and trailing whitespace is stripped from the result.

    Args:
        file_bytes: The complete contents of the PDF file.

    Returns:
        The extracted text, stripped of surrounding whitespace.

    Raises:
        HTTPException: 400 if the bytes cannot be read as a PDF, or if
            reading succeeds but no text could be extracted from any page.
    """
    try:
        reader = PdfReader(io.BytesIO(file_bytes))
        # extract_text() can come back empty for a page; normalise to "" so
        # the join below never sees a non-string.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception as exc:
        # Any failure while reading or extracting is reported to the client
        # as a bad upload; the original error is chained so it remains
        # visible in server-side tracebacks.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid or corrupted PDF file",
        ) from exc

    text = "\n".join(page_texts).strip()
    if not text:
        # The file was readable but yielded only whitespace (or nothing).
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Could not extract text from PDF. The file may be scanned/image-based.",
        )
    return text