from io import BytesIO

from fastapi import UploadFile, HTTPException
import PyPDF2
import docx

# MIME types recognised by extract_text_from_file.
_PDF_MIME = "application/pdf"
_DOCX_MIME = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)
_TEXT_MIME = "text/plain"


async def extract_text_from_file(file: UploadFile) -> str:
    """Extract the text content of an uploaded PDF, DOCX, or plain-text file.

    The file type is chosen from ``file.content_type``. Any media-type
    parameters (e.g. ``; charset=utf-8``) are ignored and the comparison is
    case-insensitive, so ``"text/plain; charset=utf-8"`` is treated as
    ``"text/plain"``.

    Args:
        file: The uploaded file to read.

    Returns:
        The extracted text.

    Raises:
        HTTPException: With status 415 when the content type is missing or is
            not one of the supported types; with status 400 when a plain-text
            upload is not valid UTF-8.
    """
    # content_type may be None or carry parameters; reduce it to the bare,
    # lower-cased media type before comparing.
    media_type = (file.content_type or "").split(";", 1)[0].strip().lower()

    # Reject unsupported uploads before reading them into memory.
    if media_type not in (_PDF_MIME, _DOCX_MIME, _TEXT_MIME):
        raise HTTPException(
            status_code=415,
            detail="Unsupported file type. Please upload a .pdf, .docx, or .txt file.",
        )

    content = await file.read()

    if media_type == _PDF_MIME:
        return extract_text_from_pdf(BytesIO(content))
    if media_type == _DOCX_MIME:
        return extract_text_from_docx(BytesIO(content))

    # Plain text: surface a bad encoding as a client error rather than letting
    # UnicodeDecodeError escape as an unhandled exception.
    try:
        return content.decode("utf-8")
    except UnicodeDecodeError as err:
        raise HTTPException(
            status_code=400,
            detail="Could not decode the file as UTF-8 text.",
        ) from err


def extract_text_from_pdf(file_stream: BytesIO) -> str:
    """Extract text from a PDF file.

    Args:
        file_stream: In-memory stream holding the PDF bytes.

    Returns:
        The text of every page concatenated in page order, with no separator
        added between pages. A page whose ``extract_text()`` result is falsy
        (``None`` or empty) contributes nothing.
    """
    reader = PyPDF2.PdfReader(file_stream)
    return "".join(page.extract_text() or "" for page in reader.pages)


def extract_text_from_docx(file_stream: BytesIO) -> str:
    """Extract text from a DOCX file.

    Args:
        file_stream: In-memory stream holding the DOCX bytes.

    Returns:
        The text of each paragraph in ``doc.paragraphs``, each followed by a
        newline (so a non-empty result always ends with ``"\\n"``).
    """
    doc = docx.Document(file_stream)
    return "".join(f"{para.text}\n" for para in doc.paragraphs)