Spaces:

amirmadjour
/

amir

Sleeping

amir / document_processor.py

translation working pdf

b584aa5 9 months ago

1.73 kB

	from pathlib import Path
	import PyPDF2
	from docx import Document
	import pptx
	import pandas as pd
	import tempfile

	def supported_formats() -> list:
	    """Return the file extensions this module can extract text from.

	    Returns:
	        A new list of lowercase extensions, each including the leading
	        dot. A fresh list is built on every call, so callers may mutate
	        the result freely.
	    """
	    return ['.pdf', '.docx', '.pptx', '.txt', '.xlsx']

	def process_document(file_path: str) -> str:
	    """Extract text from various document formats.

	    The format is chosen from the file extension, compared
	    case-insensitively.

	    Args:
	        file_path: Path of the document to read.

	    Returns:
	        The extracted text.

	    Raises:
	        ValueError: If the extension is not supported, or if anything
	            goes wrong while reading/parsing the file. The underlying
	            exception is available as ``__cause__``.
	    """
	    file_ext = Path(file_path).suffix.lower()

	    try:
	        if file_ext == '.pdf':
	            return _extract_pdf_text(file_path)
	        if file_ext == '.docx':
	            return _extract_docx_text(file_path)
	        if file_ext == '.pptx':
	            return _extract_pptx_text(file_path)
	        if file_ext == '.txt':
	            with open(file_path, 'r', encoding='utf-8') as f:
	                return f.read()
	        if file_ext == '.xlsx':
	            return _extract_excel_text(file_path)
	        # Raised inside the try on purpose: callers have always received
	        # the "Error processing document: ..." wrapper for this case too.
	        raise ValueError(f"Unsupported file format: {file_ext}")
	    except Exception as e:
	        # Chain the original exception so its traceback is not lost.
	        raise ValueError(f"Error processing document: {e}") from e

	def _extract_pdf_text(file_path: str) -> str:
	    """Extract the text of every page of a PDF file.

	    Args:
	        file_path: Path of the PDF file to read.

	    Returns:
	        The text of each page in order, each followed by a newline.
	    """
	    with open(file_path, 'rb') as f:
	        reader = PyPDF2.PdfReader(f)
	        # Pages must be read while the file is still open. `or ""` keeps
	        # a page with no extractable text from raising TypeError should
	        # the extractor hand back None instead of an empty string.
	        page_texts = [(page.extract_text() or "") + "\n" for page in reader.pages]
	    return "".join(page_texts)

	def _extract_docx_text(file_path: str) -> str:
	    """Extract the paragraph text of a .docx file.

	    Only ``doc.paragraphs`` is read.

	    Args:
	        file_path: Path of the .docx file to read.

	    Returns:
	        The text of each paragraph, joined with newlines.
	    """
	    doc = Document(file_path)
	    return "\n".join(para.text for para in doc.paragraphs)

	def _extract_pptx_text(file_path: str) -> str:
	    """Extract the text of every shape on every slide of a .pptx file.

	    Args:
	        file_path: Path of the .pptx file to read.

	    Returns:
	        The text of each shape that exposes a ``text`` attribute, in
	        slide order then shape order, joined with newlines.
	    """
	    prs = pptx.Presentation(file_path)
	    text = []
	    for slide in prs.slides:
	        # Shapes without a `text` attribute are skipped.
	        text.extend(
	            shape.text for shape in slide.shapes if hasattr(shape, "text")
	        )
	    return "\n".join(text)

	def _extract_excel_text(file_path: str) -> str:
	    """Extract the contents of every sheet of an Excel workbook as text.

	    Args:
	        file_path: Path of the .xlsx file to read.

	    Returns:
	        The ``DataFrame.to_string()`` rendering of each sheet in workbook
	        order, separated by a blank line. For a single-sheet workbook this
	        is exactly that sheet's rendering.
	    """
	    # sheet_name=None makes pandas return {sheet name: DataFrame} for all
	    # sheets; the default reads only the first sheet and drops the rest.
	    sheets = pd.read_excel(file_path, sheet_name=None)
	    return "\n\n".join(df.to_string() for df in sheets.values())