Spaces:

ByteRiot
/

CandidateExplorer

Sleeping

clean init

478dec6 17 days ago

933 Bytes

	# import fitz # PyMuPDF
	# from io import BytesIO

	# async def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
	# """
	# Extract the text of a PDF supplied as raw bytes.
	# Returns the plain text.
	# """
	# text_chunks = []

	# with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	# for page in doc:
	# page_text = page.get_text()
	# if page_text:
	# text_chunks.append(page_text)

	# return "\n".join(text_chunks).strip()

	from io import BytesIO
	from pypdf import PdfReader

	async def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
	    """Return the text of every page of an in-memory PDF, joined by newlines.

	    The payload is wrapped in a ``BytesIO`` and handed to ``PdfReader``.
	    Each page's ``extract_text()`` result is collected in page order; pages
	    that yield a falsy result (e.g. an empty string) are skipped.

	    NOTE: the body contains no ``await``, so all parsing runs synchronously
	    once the coroutine is awaited.

	    Args:
	        pdf_bytes: Raw PDF content as ``bytes`` (``bytearray`` is accepted
	            as well).

	    Returns:
	        The non-empty page texts joined with ``"\\n"``; an empty string when
	        no page produced any text.

	    Raises:
	        TypeError: If ``pdf_bytes`` is neither ``bytes`` nor ``bytearray``.
	    """
	    accepted_types = (bytes, bytearray)
	    if not isinstance(pdf_bytes, accepted_types):
	        raise TypeError("pdf_bytes must be bytes")

	    document = PdfReader(BytesIO(pdf_bytes))

	    # Lazily extract page by page, in document order; the join below keeps
	    # only the pages that actually produced text.
	    extracted = (page.extract_text() for page in document.pages)
	    return "\n".join(chunk for chunk in extracted if chunk)