Spaces:
Sleeping
Sleeping
| from pathlib import Path | |
| import PyPDF2 | |
| from docx import Document | |
| import pptx | |
| import pandas as pd | |
| import tempfile | |
def supported_formats():
    """Return the supported file extensions as a new list.

    Each entry is lowercase and includes the leading dot.
    """
    extensions = ('.pdf', '.docx', '.pptx', '.txt', '.xlsx')
    # Hand back a fresh list so callers can mutate it freely.
    return list(extensions)
def process_document(file_path: str) -> str:
    """Extract text from various document formats.

    The extractor is chosen from the file's suffix, compared
    case-insensitively ('.pdf', '.docx', '.pptx', '.txt', '.xlsx').

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the suffix is not supported, or if reading or parsing
            fails for any reason. The message is always prefixed with
            "Error processing document: " and the original exception is
            attached as ``__cause__``.
    """
    file_ext = Path(file_path).suffix.lower()
    try:
        if file_ext == '.pdf':
            return _extract_pdf_text(file_path)
        if file_ext == '.docx':
            return _extract_docx_text(file_path)
        if file_ext == '.pptx':
            return _extract_pptx_text(file_path)
        if file_ext == '.txt':
            # 'utf-8-sig' reads plain UTF-8 unchanged but strips a leading
            # byte-order mark, so BOM-prefixed files do not leak '\ufeff'
            # into the extracted text.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                return f.read()
        if file_ext == '.xlsx':
            return _extract_excel_text(file_path)
        raise ValueError(f"Unsupported file format: {file_ext}")
    except Exception as e:
        # Callers see a single exception type; chain explicitly so the
        # underlying failure stays visible in tracebacks.
        raise ValueError(f"Error processing document: {e}") from e
def _extract_pdf_text(file_path: str) -> str:
    """Return the text of every page in a PDF, each followed by a newline.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts; an empty string if the PDF has no pages.
    """
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # The join must run while the file is still open, because page text
        # is extracted on demand. `or ""` keeps a page that yields no text
        # (None) from raising a TypeError on concatenation.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )
def _extract_docx_text(file_path: str) -> str:
    """Return the text of a .docx file's paragraphs, one per line.

    Only the entries of the document's ``paragraphs`` collection are read.
    """
    document = Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def _extract_pptx_text(file_path: str) -> str:
    """Return the text of every text-bearing shape in a .pptx file.

    Slides and shapes are visited in order; shapes without a ``text``
    attribute are skipped. The collected texts are joined with newlines.
    """
    presentation = pptx.Presentation(file_path)
    return "\n".join(
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )
def _extract_excel_text(file_path: str) -> str:
    """Return the contents of an Excel workbook rendered as plain text.

    Every sheet is read, so data outside the first sheet is not silently
    dropped. A single-sheet workbook is rendered as that sheet's
    ``DataFrame.to_string()`` output alone. For a multi-sheet workbook each
    sheet is rendered under a ``Sheet: <name>`` header and the sections are
    separated by a blank line.

    Args:
        file_path: Path of the workbook to read.

    Returns:
        The textual rendering of the workbook.
    """
    # sheet_name=None makes pandas return a dict of {sheet name: DataFrame}
    # covering all sheets instead of only the first one.
    sheets = pd.read_excel(file_path, sheet_name=None)
    if len(sheets) == 1:
        (only_sheet,) = sheets.values()
        return only_sheet.to_string()
    return "\n\n".join(
        f"Sheet: {name}\n{frame.to_string()}" for name, frame in sheets.items()
    )