Spaces:
Sleeping
Sleeping
File size: 1,734 Bytes
b584aa5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
from pathlib import Path
import PyPDF2
from docx import Document
import pptx
import pandas as pd
import tempfile
def supported_formats():
    """Return the file extensions this module can extract text from.

    Returns:
        A new list of lower-case extensions, each including the leading dot.
    """
    extensions = ('.pdf', '.docx', '.pptx', '.txt', '.xlsx')
    # Hand back a fresh list on every call so callers may mutate it freely.
    return list(extensions)
def process_document(file_path: str) -> str:
    """Extract text from a document, choosing the extractor by extension.

    The extension is compared case-insensitively. ``.txt`` files are read
    directly as UTF-8; every other supported format is delegated to the
    matching ``_extract_*`` helper.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the extension is not supported, or if anything goes
            wrong while reading or parsing the file. The original exception
            is attached as ``__cause__``.
    """
    file_ext = Path(file_path).suffix.lower()
    try:
        if file_ext == '.pdf':
            return _extract_pdf_text(file_path)
        elif file_ext == '.docx':
            return _extract_docx_text(file_path)
        elif file_ext == '.pptx':
            return _extract_pptx_text(file_path)
        elif file_ext == '.txt':
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        elif file_ext == '.xlsx':
            return _extract_excel_text(file_path)
        else:
            # Raised inside the ``try`` on purpose: the handler below wraps
            # it, so the message callers see keeps its established prefix.
            raise ValueError(f"Unsupported file format: {file_ext}")
    except Exception as e:
        # Chain explicitly so the underlying failure (missing file, decode
        # error, parser error, ...) stays reachable via ``__cause__``.
        raise ValueError(f"Error processing document: {str(e)}") from e
def _extract_pdf_text(file_path: str) -> str:
    """Return the text of every page in a PDF, each followed by a newline.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The per-page text concatenated in page order, with a ``"\\n"`` after
        each page. An empty string if the reader reports no pages.
    """
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # Build the result with a single join instead of growing a string
        # with ``+=`` per page. ``or ""`` substitutes an empty string should
        # a page yield None, so the concatenation cannot raise TypeError.
        # The join completes before the ``with`` block closes the file.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )
def _extract_docx_text(file_path: str) -> str:
    """Return the paragraph text of a .docx file, one paragraph per line.

    Args:
        file_path: Path of the .docx file to load.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``, joined
        with newlines.
    """
    document = Document(file_path)
    lines = []
    for paragraph in document.paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)
def _extract_pptx_text(file_path: str) -> str:
    """Return the text of all text-bearing shapes in a .pptx file.

    Args:
        file_path: Path of the .pptx file to load.

    Returns:
        The ``text`` of every shape that exposes a ``text`` attribute, in
        slide order then shape order, joined with newlines.
    """
    presentation = pptx.Presentation(file_path)
    fragments = [
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        # Shapes without a ``text`` attribute are skipped.
        if hasattr(shape, "text")
    ]
    return "\n".join(fragments)
def _extract_excel_text(file_path: str) -> str:
    """Return an Excel workbook's contents rendered as plain text.

    Args:
        file_path: Path of the .xlsx file to read.

    Returns:
        The string rendering (``DataFrame.to_string()``) of whatever
        ``pd.read_excel`` returns for the file.
    """
    # No ``sheet_name`` is passed, so pandas' default sheet selection applies
    # (the first sheet, per the pandas documentation).
    return pd.read_excel(file_path).to_string()