File size: 1,734 Bytes
b584aa5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from pathlib import Path
import PyPDF2
from docx import Document
import pptx
import pandas as pd
import tempfile

def supported_formats():
    """Return the file extensions this module can extract text from.

    Each entry is lower-case and includes the leading dot. A new list is
    built on every call, so callers may modify the result freely.
    """
    extensions = [".pdf", ".docx", ".pptx", ".txt", ".xlsx"]
    return extensions

def process_document(file_path: str) -> str:
    """Extract text from a document, choosing the extractor by extension.

    The extension is compared case-insensitively. ``.pdf``, ``.docx``,
    ``.pptx`` and ``.xlsx`` files are handed to the matching ``_extract_*``
    helper; ``.txt`` files are read directly as UTF-8.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the extension is not supported, or if opening,
            decoding or parsing the file fails for any reason. The message
            is ``"Error processing document: <original message>"`` and the
            original exception is attached as ``__cause__``.
    """
    file_ext = Path(file_path).suffix.lower()

    try:
        if file_ext == '.pdf':
            return _extract_pdf_text(file_path)
        if file_ext == '.docx':
            return _extract_docx_text(file_path)
        if file_ext == '.pptx':
            return _extract_pptx_text(file_path)
        if file_ext == '.txt':
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        if file_ext == '.xlsx':
            return _extract_excel_text(file_path)
        # Raised inside the try on purpose: the handler below wraps it like
        # any other failure, which keeps the historical message format.
        raise ValueError(f"Unsupported file format: {file_ext}")
    except Exception as e:
        # Callers only ever see ValueError; chaining with ``from e`` keeps the
        # root failure (missing file, decode error, parser error, ...)
        # available as the explicit cause instead of implicit context only.
        raise ValueError(f"Error processing document: {str(e)}") from e

def _extract_pdf_text(file_path: str) -> str:
    """Return the text of every page of a PDF, in page order.

    Each page's text is followed by a newline, so the result ends with a
    newline whenever the PDF has at least one page; a PDF with no pages
    yields an empty string.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The concatenated page texts.
    """
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # Extract while the file is still open, as the reader was built on
        # this handle. ``or ""`` keeps a page whose extraction yields None
        # (or an empty string) from breaking the concatenation below.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # One join instead of repeated ``+=`` avoids quadratic string building
    # on documents with many pages.
    return "".join(f"{page_text}\n" for page_text in page_texts)

def _extract_docx_text(file_path: str) -> str:
    """Return the text of a .docx file, one paragraph per line.

    Only the paragraphs exposed by the document's ``paragraphs`` attribute
    are read; their texts are joined with newlines.
    """
    document = Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def _extract_pptx_text(file_path: str) -> str:
    """Return the text of a .pptx presentation, one shape per entry.

    Slides are visited in order, and within each slide the shapes are taken
    in the order ``slide.shapes`` yields them. Shapes without a ``text``
    attribute are skipped. The collected texts are joined with newlines.
    """
    presentation = pptx.Presentation(file_path)
    shape_texts = [
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    ]
    return "\n".join(shape_texts)

def _extract_excel_text(file_path: str) -> str:
    """Render an Excel workbook as plain text.

    The file is loaded with ``pandas.read_excel`` using its default
    arguments, and the resulting frame is formatted with
    ``DataFrame.to_string``.
    """
    return pd.read_excel(file_path).to_string()