File size: 1,229 Bytes
959439e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | import fitz
import docx
from io import BytesIO
async def extract_text(file):
    """Extract plain text from an uploaded PDF, DOCX, or TXT file.

    The parser is selected from the filename extension, compared
    case-insensitively. The whole upload is read into memory first.

    Args:
        file: Upload object exposing a ``filename`` attribute (may be
            ``None`` or empty) and an awaitable ``read()`` that returns the
            raw bytes. NOTE(review): presumably a FastAPI/Starlette
            ``UploadFile`` -- confirm against the caller.

    Returns:
        The extracted text. PDF text is the concatenation of every page's
        text; DOCX text is each paragraph followed by a newline; TXT text is
        the UTF-8 decoded content (a leading byte-order mark is dropped).

    Raises:
        ValueError: If the upload is empty, the extension is not ``.pdf``,
            ``.docx`` or ``.txt``, a TXT file is not valid UTF-8, or no
            non-whitespace text could be extracted.
    """
    filename = (file.filename or "").lower()
    contents = await file.read()
    if not contents:
        raise ValueError("Uploaded file is empty.")

    if filename.endswith(".pdf"):
        # Open as a context manager so the document is closed even if a
        # page fails to render; the original never closed it.
        # NOTE(review): errors raised by the PDF parser itself (e.g. for a
        # corrupt file) propagate unchanged, as before.
        with fitz.open(stream=contents, filetype="pdf") as pdf:
            text = "".join(page.get_text() for page in pdf)
        if not text.strip():
            raise ValueError("Could not extract text from PDF.")
        return text

    if filename.endswith(".docx"):
        doc = docx.Document(BytesIO(contents))
        # Only body paragraphs are read, each terminated by a newline.
        text = "".join(f"{para.text}\n" for para in doc.paragraphs)
        if not text.strip():
            raise ValueError("Could not extract text from DOCX.")
        return text

    if filename.endswith(".txt"):
        try:
            # "utf-8-sig" decodes plain UTF-8 identically but also strips a
            # leading BOM, which would otherwise leak into the text as
            # "\ufeff" and defeat the emptiness check below.
            text = contents.decode("utf-8-sig")
        except UnicodeDecodeError as e:
            raise ValueError("TXT file must be UTF-8 encoded.") from e
        if not text.strip():
            raise ValueError("Uploaded TXT file is empty.")
        return text

    raise ValueError("Unsupported file type. Please upload PDF, DOCX, or TXT.")