Spaces:
Sleeping
Sleeping
| from pypdf import PdfReader | |
| from fastapi import HTTPException, status | |
| import io | |
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract the text of every page of an in-memory PDF.

    Args:
        file_bytes: Raw contents of the PDF file.

    Returns:
        The text of all pages concatenated in page order, with leading and
        trailing whitespace stripped.

    Raises:
        HTTPException: 400 Bad Request if the bytes cannot be read as a PDF,
            or if no text could be extracted from any page.
    """
    # Only the parsing calls live inside the try block, so the "no text"
    # error raised further down is never caught and re-wrapped here.
    try:
        reader = PdfReader(io.BytesIO(file_bytes))
        # A page may produce no text; count it as an empty string.
        text = "".join(page.extract_text() or "" for page in reader.pages).strip()
    except Exception as exc:
        # Any parser failure is reported to the client as a bad upload; the
        # original error is chained so it remains visible in tracebacks.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid or corrupted PDF file"
        ) from exc

    if not text:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Could not extract text from PDF. The file may be scanned/image-based."
        )
    return text