| from io import BytesIO |
| from fastapi import UploadFile, HTTPException |
| import PyPDF2 |
| import docx |
|
|
async def extract_text_from_file(file: UploadFile) -> str:
    """Extract the text content of an uploaded PDF, DOCX, or plain-text file.

    The extractor is chosen from the upload's declared ``content_type``;
    the bytes themselves are not inspected to confirm the type.

    Args:
        file: The uploaded file. It is read fully into memory.

    Returns:
        The extracted text.

    Raises:
        HTTPException: 415 when the content type is missing or unsupported;
            400 when a ``text/plain`` upload is not valid UTF-8.
    """
    content = await file.read()

    # The declared type may be absent, carry parameters such as
    # "text/plain; charset=utf-8", or differ in case, so compare only the
    # normalized bare media type.
    media_type = (file.content_type or "").split(";", 1)[0].strip().lower()

    if media_type == "application/pdf":
        return extract_text_from_pdf(BytesIO(content))

    if media_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return extract_text_from_docx(BytesIO(content))

    if media_type == "text/plain":
        try:
            return content.decode("utf-8")
        except UnicodeDecodeError as exc:
            # Report bad input as a client error instead of letting the
            # decode failure surface as an unhandled server error.
            raise HTTPException(
                status_code=400,
                detail="Text file is not valid UTF-8.",
            ) from exc

    raise HTTPException(
        status_code=415,
        detail="Unsupported file type. Please upload a .pdf, .docx, or .txt file."
    )
|
|
def extract_text_from_pdf(file_stream: BytesIO) -> str:
    """Extract the text of every page of a PDF and concatenate it.

    Args:
        file_stream: In-memory stream holding the PDF bytes.

    Returns:
        The text of all pages joined in page order with no separator added
        between pages. A page whose ``extract_text()`` result is falsy
        (``None`` or empty) contributes an empty string.
    """
    reader = PyPDF2.PdfReader(file_stream)
    # Join once rather than growing a string with += on every page.
    return "".join(page.extract_text() or "" for page in reader.pages)
|
|
def extract_text_from_docx(file_stream: BytesIO) -> str:
    """Extract the paragraph text of a DOCX document.

    Args:
        file_stream: In-memory stream holding the DOCX bytes.

    Returns:
        The text of each entry in ``doc.paragraphs``, in order, with a
        newline appended after every paragraph (including the last). An
        empty string when the document has no paragraphs.
    """
    doc = docx.Document(file_stream)
    # Join once rather than growing a string with += on every paragraph.
    return "".join(f"{para.text}\n" for para in doc.paragraphs)
|
|