Spaces:

Noo88ear
/

Job-Application-Assistant

Runtime error

App Files Files Community

Job-Application-Assistant / utils /file_ingest.py

Noo88ear

🚀 Initial deployment of Multi-Agent Job Application Assistant

7498f2c 6 months ago

raw

history blame contribute delete

2.94 kB

	from __future__ import annotations
	from typing import Optional
	import io
	import logging

	logger = logging.getLogger(__name__)

	# Optional dependency: python-docx. When the import fails for any reason the
	# module still loads; DOCX_AVAILABLE tells callers that .docx is unsupported.
	try:
	    from docx import Document  # type: ignore
	except Exception:  # pragma: no cover
	    Document = None  # type: ignore
	    DOCX_AVAILABLE = False
	    logger.info("python-docx not available - .docx support disabled")
	else:
	    DOCX_AVAILABLE = True

	# Optional dependency: PyPDF2. Same pattern as above, guarded by PDF_AVAILABLE.
	try:
	    import PyPDF2  # type: ignore
	except Exception:
	    PyPDF2 = None  # type: ignore
	    PDF_AVAILABLE = False
	    logger.info("PyPDF2 not available - .pdf support disabled")
	else:
	    PDF_AVAILABLE = True


	def read_uploaded_text(file) -> Optional[str]:
	    """Extract plain text from an uploaded .txt, .docx, or .pdf file.

	    Args:
	        file: An uploaded-file object (e.g. a Streamlit ``UploadedFile``)
	            exposing a ``name`` attribute and a ``getvalue()`` method that
	            returns the raw bytes. ``None`` is accepted.

	    Returns:
	        The extracted text, or ``None`` when ``file`` is ``None``, the file
	        extension is not supported, the optional library needed for the
	        format is not installed, or an exception occurs while reading or
	        parsing (the exception is logged with its traceback, not re-raised).
	    """
	    if file is None:
	        return None

	    # The extension check is case-insensitive; the original name is kept for logs.
	    name = file.name.lower()
	    logger.info(f"Attempting to read file: {file.name}")

	    try:
	        if name.endswith(".txt"):
	            # "utf-8-sig" decodes ordinary UTF-8 as well, but additionally
	            # strips a leading byte-order mark so it does not leak into the text.
	            text = file.getvalue().decode("utf-8-sig", errors="ignore")
	            logger.info(f"Successfully read .txt file: {len(text)} characters")
	            return text

	        if name.endswith(".docx"):
	            if not DOCX_AVAILABLE:
	                logger.warning("python-docx not installed. Cannot read .docx files.")
	                logger.info("Install with: pip install python-docx")
	                return None

	            doc = Document(io.BytesIO(file.getvalue()))  # type: ignore
	            # Keep only paragraphs with visible content, one per line.
	            text = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
	            logger.info(f"Successfully read .docx file: {len(text)} characters")
	            return text

	        if name.endswith(".pdf"):
	            if not PDF_AVAILABLE:
	                logger.warning("PyPDF2 not installed. Cannot read .pdf files.")
	                logger.info("Install with: pip install PyPDF2")
	                return None

	            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file.getvalue()))  # type: ignore
	            # A page without extractable text (e.g. a scanned image) may give
	            # None; coalesce to "" so one such page does not make join() raise
	            # and discard the text of every other page.
	            text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
	            logger.info(f"Successfully read .pdf file: {len(text)} characters")
	            return text

	        logger.warning(f"Unsupported file type: {name}")
	        return None

	    except Exception as e:
	        # Best-effort boundary: a corrupt or unexpected upload must not crash
	        # the caller, so log the full traceback and report "no text".
	        logger.error(f"Error reading file {file.name}: {str(e)}", exc_info=True)
	        return None