"""Plain-text extraction helpers for PDF, Word, PowerPoint, text and Excel files."""

from pathlib import Path
import tempfile

import PyPDF2
from docx import Document
import pandas as pd
import pptx


def supported_formats():
    """Return the list of file extensions that ``process_document`` handles.

    Extensions are lower-case and include the leading dot.
    """
    return ['.pdf', '.docx', '.pptx', '.txt', '.xlsx']


def process_document(file_path: str) -> str:
    """Extract text from various document formats.

    The format is chosen from the lower-cased file extension of
    ``file_path``.

    Args:
        file_path: Path to the document to read.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the extension is not supported, or if reading or
            parsing the file fails for any reason. The original exception
            is attached as ``__cause__``.
    """
    file_ext = Path(file_path).suffix.lower()

    try:
        if file_ext == '.pdf':
            return _extract_pdf_text(file_path)
        if file_ext == '.docx':
            return _extract_docx_text(file_path)
        if file_ext == '.pptx':
            return _extract_pptx_text(file_path)
        if file_ext == '.txt':
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        if file_ext == '.xlsx':
            return _extract_excel_text(file_path)
        # Raised inside the try on purpose: the handler below wraps it too,
        # so callers see the same prefixed message for every failure.
        raise ValueError(f"Unsupported file format: {file_ext}")
    except Exception as e:
        # Chain the cause so the underlying traceback is not lost.
        raise ValueError(f"Error processing document: {str(e)}") from e


def _extract_pdf_text(file_path: str) -> str:
    """Return the text of every page in a PDF, each followed by a newline."""
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # Defensive: treat a None/empty extraction result as "no text" so a
        # single text-less page cannot abort the whole document.
        # Joining once avoids repeated string concatenation.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )


def _extract_docx_text(file_path: str) -> str:
    """Return the text of a .docx file's paragraphs, joined by newlines.

    Only ``doc.paragraphs`` is read.
    """
    doc = Document(file_path)
    return "\n".join(para.text for para in doc.paragraphs)


def _extract_pptx_text(file_path: str) -> str:
    """Return the text of every shape exposing ``text``, joined by newlines.

    Shapes are visited slide by slide, in the order the presentation
    reports them.
    """
    prs = pptx.Presentation(file_path)
    return "\n".join(
        shape.text
        for slide in prs.slides
        for shape in slide.shapes
        if hasattr(shape, "text")  # not every shape type carries text
    )


def _extract_excel_text(file_path: str) -> str:
    """Return a workbook sheet rendered as text via ``DataFrame.to_string``.

    ``pd.read_excel`` is called with its defaults, so only the first sheet
    is read.
    """
    df = pd.read_excel(file_path)
    return df.to_string()