Spaces:

pranav8tripathi
/

ResumeIQ

Sleeping

ResumeIQ / utils /document_processor.py

pranav8tripathi@gmail.com

updated

4ede186 3 months ago

3.8 kB

	import PyPDF2
	from docx import Document
	import docx2txt
	import io
	from typing import Union

	class DocumentProcessor:
	    """Extract plain text from documents in PDF, DOCX or legacy DOC format.

	    All methods are static, so the class only serves as a namespace.
	    ``extract_text`` is the main entry point; it works out the document
	    type and hands the file to the matching ``extract_text_from_*`` method.
	    The extractors report failures on stdout and return a string instead
	    of raising.
	    """

	    # MIME types mapped to the short type names used for dispatch.
	    _MIME_TYPES = {
	        'application/pdf': 'pdf',
	        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
	        'application/msword': 'doc',
	    }

	    @staticmethod
	    def _rewind(file) -> None:
	        """Move *file* back to its start if it exposes ``seek``."""
	        if hasattr(file, 'seek'):
	            file.seek(0)

	    @staticmethod
	    def _detect_file_type(file) -> str:
	        """Guess the document type of *file* from its metadata.

	        The extension of ``file.name`` wins when there is one; otherwise
	        the MIME string in ``file.type`` is looked up; otherwise (or when
	        the MIME type is unknown) the type defaults to ``'pdf'``.
	        """
	        name = getattr(file, 'name', None)
	        # ``name`` is not guaranteed to be a string, so check before parsing.
	        if isinstance(name, str):
	            _, dot, ext = name.rpartition('.')
	            # A "suffix" containing a path separator means the dot belonged
	            # to a directory name, not to the file itself.
	            if dot and ext and '/' not in ext and '\\' not in ext:
	                return ext
	        mime = getattr(file, 'type', None)
	        return DocumentProcessor._MIME_TYPES.get(mime, 'pdf')  # Default to PDF

	    @staticmethod
	    def extract_text_from_pdf(file) -> str:
	        """Extract text from a PDF file.

	        Args:
	            file: Anything ``PyPDF2.PdfReader`` accepts; seekable objects
	                are rewound first.

	        Returns:
	            The text of all pages joined by newlines and stripped, or an
	            empty string if reading fails (the error and its traceback are
	            printed).
	        """
	        try:
	            # Ensure we're at the beginning of the file
	            DocumentProcessor._rewind(file)

	            reader = PyPDF2.PdfReader(file)
	            # Pages that yield no text (falsy result) are skipped.
	            page_texts = [page.extract_text() for page in reader.pages]
	            result = "\n".join(chunk for chunk in page_texts if chunk).strip()
	            print(f"[DEBUG] Extracted {len(result)} characters from PDF")
	            return result
	        except Exception as e:
	            print(f"Error extracting text from PDF: {e}")
	            import traceback
	            traceback.print_exc()
	            return ""

	    @staticmethod
	    def extract_text_from_docx(file) -> str:
	        """Extract text from a DOCX file.

	        python-docx is tried first; if it fails or finds no paragraph text,
	        docx2txt is used as a fallback.

	        Args:
	            file: File-like object or whatever ``Document`` /
	                ``docx2txt.process`` accept; seekable objects are rewound
	                before each attempt.

	        Returns:
	            The stripped text, or an empty string if both attempts fail.
	        """
	        try:
	            DocumentProcessor._rewind(file)

	            # Try using python-docx first
	            try:
	                doc = Document(file)
	                text = "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()
	                if text:
	                    return text
	            except Exception as e:
	                # Best effort: report why the first attempt failed, then
	                # fall through to docx2txt instead of giving up.
	                print(f"python-docx could not read DOCX, falling back to docx2txt: {e}")

	            # Fallback to docx2txt; only rewind objects that can seek so
	            # non-stream inputs still reach the fallback.
	            DocumentProcessor._rewind(file)
	            return docx2txt.process(file).strip()
	        except Exception as e:
	            print(f"Error extracting text from DOCX: {e}")
	            return ""

	    @staticmethod
	    def extract_text_from_doc(file) -> str:
	        """Extract text from a DOC file (legacy Word format).

	        NOTE(review): this relies on docx2txt, a DOCX extractor; confirm
	        whether it can really read binary .doc files.

	        Args:
	            file: File-like object or whatever ``docx2txt.process``
	                accepts; seekable objects are rewound first.

	        Returns:
	            The stripped text, or a fixed advisory note (not an empty
	            string) when extraction fails.
	        """
	        try:
	            DocumentProcessor._rewind(file)
	            return docx2txt.process(file).strip()
	        except Exception as e:
	            print(f"Error extracting text from DOC: {e}")
	            # If docx2txt fails, return a message
	            return "Note: Legacy .doc format may require conversion to .docx for better text extraction."

	    @staticmethod
	    def extract_text(file, file_type: Union[str, None] = None) -> str:
	        """Extract text from any supported document format.

	        Args:
	            file: File object or file-like object.
	            file_type: File extension such as ``'.pdf'``, ``'docx'`` or
	                ``'.DOC'``; the leading dot, surrounding whitespace and
	                letter case are ignored. When omitted, the type is taken
	                from ``file.name``, then ``file.type``, and finally
	                defaults to PDF.

	        Returns:
	            The extracted text, or ``"Unsupported file type: <type>"`` when
	            no extractor exists for the type.
	        """
	        # Determine file type if not provided
	        if file_type is None:
	            file_type = DocumentProcessor._detect_file_type(file)

	        # Normalise so '.PDF', 'pdf' and ' Pdf ' all select the same extractor.
	        file_type = file_type.strip().lstrip('.').lower()

	        # Reset file pointer to beginning
	        DocumentProcessor._rewind(file)

	        # Extract text based on file type
	        extractors = {
	            'pdf': DocumentProcessor.extract_text_from_pdf,
	            'docx': DocumentProcessor.extract_text_from_docx,
	            'doc': DocumentProcessor.extract_text_from_doc,
	        }
	        extractor = extractors.get(file_type)
	        if extractor is None:
	            return f"Unsupported file type: {file_type}"
	        return extractor(file)