Spaces:

Uamir
/

studysnap

Sleeping

deploy backend

959439e 18 days ago

1.23 kB

	import fitz
	import docx
	from io import BytesIO

	async def extract_text(file) -> str:
	    """Extract plain text from an uploaded PDF, DOCX, or TXT file.

	    The format is chosen from the (case-insensitive) filename extension;
	    the file content itself is not sniffed.

	    Args:
	        file: Upload object exposing a ``filename`` attribute (may be
	            ``None``) and an awaitable ``read()`` returning ``bytes``.
	            NOTE(review): presumably a FastAPI/Starlette ``UploadFile`` --
	            confirm against the caller.

	    Returns:
	        The extracted text. PDF pages are concatenated in order; DOCX
	        paragraphs are each followed by a newline (only ``doc.paragraphs``
	        is read); TXT content is returned as decoded, minus any UTF-8 BOM.

	    Raises:
	        ValueError: If the upload is empty, the extension is not
	            ``.pdf``/``.docx``/``.txt``, a TXT file is not valid UTF-8, or
	            no non-whitespace text could be extracted.
	    """
	    filename = (file.filename or "").lower()
	    contents = await file.read()
	    if not contents:
	        raise ValueError("Uploaded file is empty.")

	    if filename.endswith(".pdf"):
	        # The context manager closes the document even if a page fails to
	        # parse, so the native PyMuPDF handle is not leaked per request.
	        with fitz.open(stream=contents, filetype="pdf") as pdf:
	            text = "".join(page.get_text() for page in pdf)
	        if not text.strip():
	            raise ValueError("Could not extract text from PDF.")
	        return text

	    if filename.endswith(".docx"):
	        doc = docx.Document(BytesIO(contents))
	        text = "".join(para.text + "\n" for para in doc.paragraphs)
	        if not text.strip():
	            raise ValueError("Could not extract text from DOCX.")
	        return text

	    if filename.endswith(".txt"):
	        try:
	            # "utf-8-sig" accepts plain UTF-8 and also drops a leading BOM,
	            # which would otherwise end up in the text and defeat the
	            # emptiness check below (U+FEFF is not stripped as whitespace).
	            text = contents.decode("utf-8-sig")
	        except UnicodeDecodeError as e:
	            raise ValueError("TXT file must be UTF-8 encoded.") from e
	        if not text.strip():
	            raise ValueError("Uploaded TXT file is empty.")
	        return text

	    raise ValueError("Unsupported file type. Please upload PDF, DOCX, or TXT.")