amir / document_processor.py
amirmadjour's picture
translation working pdf
b584aa5
from pathlib import Path
import PyPDF2
from docx import Document
import pptx
import pandas as pd
import tempfile
def supported_formats():
    """Return the file extensions this module knows how to extract text from.

    Returns:
        A new list of lowercase extensions, each including the leading dot.
    """
    extensions = ['.pdf', '.docx', '.pptx', '.txt', '.xlsx']
    return extensions
def process_document(file_path: str) -> str:
    """Extract text from a document, choosing the extractor by file extension.

    The extension is taken from ``file_path`` and compared case-insensitively.

    Args:
        file_path: Path to a ``.pdf``, ``.docx``, ``.pptx``, ``.txt`` or
            ``.xlsx`` file.

    Returns:
        The text extracted from the document.

    Raises:
        ValueError: If the extension is not supported, or if reading or
            extraction fails for any reason. The message is always prefixed
            with ``"Error processing document: "`` and the underlying
            exception is attached as ``__cause__``.
    """
    file_ext = Path(file_path).suffix.lower()
    try:
        if file_ext == '.pdf':
            return _extract_pdf_text(file_path)
        if file_ext == '.docx':
            return _extract_docx_text(file_path)
        if file_ext == '.pptx':
            return _extract_pptx_text(file_path)
        if file_ext == '.txt':
            # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading
            # byte-order mark, so "\ufeff" never leaks into the text.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                return f.read()
        if file_ext == '.xlsx':
            return _extract_excel_text(file_path)
        raise ValueError(f"Unsupported file format: {file_ext}")
    except Exception as e:
        # Callers only ever see ValueError; chain explicitly so the real
        # failure (and its traceback) stays reachable via __cause__.
        raise ValueError(f"Error processing document: {str(e)}") from e
def _extract_pdf_text(file_path: str) -> str:
    """Return the text of every page of a PDF, each page followed by a newline.

    Args:
        file_path: Path to the PDF file; it is opened in binary mode.

    Returns:
        The concatenation of ``page.extract_text()`` for each page in
        ``reader.pages``, with ``"\\n"`` appended after every page. An empty
        string when the reader yields no pages.
    """
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # Build the result with one join instead of repeated `+=`.
        # Defensive: a missing (None) extraction result is treated as empty
        # text rather than raising TypeError on concatenation.
        return "".join(
            f"{page.extract_text() or ''}\n" for page in reader.pages
        )
def _extract_docx_text(file_path: str) -> str:
    """Return the paragraph text of a .docx file, one paragraph per line.

    Args:
        file_path: Path to the .docx file, passed to ``Document``.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``, joined
        with newlines.
    """
    # NOTE(review): only `paragraphs` is read here; text held elsewhere in the
    # document (e.g. tables) is presumably not included - confirm if needed.
    document = Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def _extract_pptx_text(file_path: str) -> str:
    """Return the text of every text-bearing shape in a .pptx presentation.

    Slides are visited in order, and within each slide its shapes in order.
    Shapes without a ``text`` attribute are skipped.

    Args:
        file_path: Path to the .pptx file, passed to ``pptx.Presentation``.

    Returns:
        The collected shape texts joined with newlines.
    """
    presentation = pptx.Presentation(file_path)
    return "\n".join(
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )
def _extract_excel_text(file_path: str) -> str:
    """Return an Excel workbook's data rendered as a plain-text table.

    Args:
        file_path: Path to the workbook, passed to ``pd.read_excel`` with its
            default arguments.

    Returns:
        The string produced by ``DataFrame.to_string()`` for the loaded frame.
    """
    return pd.read_excel(file_path).to_string()