File size: 1,259 Bytes
29fbb51 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | from io import BytesIO
from fastapi import UploadFile, HTTPException
import PyPDF2
import docx
async def extract_text_from_file(file: UploadFile) -> str:
    """Extract plain text from an uploaded PDF, DOCX, or plain-text file.

    The handler is selected from the upload's declared content type. Media
    type parameters (for example ``; charset=utf-8``) and letter case are
    ignored when matching, so ``text/plain; charset=utf-8`` is handled as
    ``text/plain``.

    Args:
        file: The uploaded file. Its entire content is read into memory.

    Returns:
        The text extracted from the upload.

    Raises:
        HTTPException: With status 415 when the content type is missing or
            not one of the supported types, or with status 400 when a
            plain-text upload is not valid UTF-8.
    """
    content = await file.read()

    # Clients may append parameters or vary the case of the header value, so
    # compare only the bare, lower-cased media type. A missing content type
    # becomes "" and falls through to the 415 response below.
    media_type = (file.content_type or "").split(";", 1)[0].strip().lower()

    if media_type == "application/pdf":
        return extract_text_from_pdf(BytesIO(content))

    if media_type == (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        return extract_text_from_docx(BytesIO(content))

    if media_type == "text/plain":
        try:
            return content.decode("utf-8")
        except UnicodeDecodeError as err:
            # A badly encoded upload is a client problem, not a server fault.
            raise HTTPException(
                status_code=400,
                detail="Could not decode the text file. Please upload a UTF-8 encoded .txt file.",
            ) from err

    raise HTTPException(
        status_code=415,
        detail="Unsupported file type. Please upload a .pdf, .docx, or .txt file.",
    )
def extract_text_from_pdf(file_stream: BytesIO) -> str:
    """Extract the text of every page of a PDF and concatenate it.

    Args:
        file_stream: In-memory stream containing the PDF bytes.

    Returns:
        The text of all pages joined in page order with no separator. A page
        whose extraction yields a falsy value (``None`` or ``""``)
        contributes an empty string.
    """
    reader = PyPDF2.PdfReader(file_stream)
    # Build the result with a single join instead of repeated ``+=``.
    return "".join(page.extract_text() or "" for page in reader.pages)
def extract_text_from_docx(file_stream: BytesIO) -> str:
    """Extract the paragraph text of a DOCX document.

    Args:
        file_stream: In-memory stream containing the DOCX bytes.

    Returns:
        The text of each entry in ``doc.paragraphs``, in order, with every
        paragraph followed by a newline (including the last one). Returns an
        empty string when there are no paragraphs.
    """
    doc = docx.Document(file_stream)
    # Build the result with a single join instead of repeated ``+=``.
    return "".join(para.text + "\n" for para in doc.paragraphs)
|