File size: 1,013 Bytes
86a63d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import os
import pdfplumber
import pandas as pd
from docx import Document

def extract_content(file_path: str, filename: str) -> str:
    """Return the textual content of a file, chosen by the extension of *filename*.

    The file is read from *file_path*; *filename* is used only to determine
    the type (its extension is compared case-insensitively).

    Supported types:

    * ``.txt``, ``.md``, ``.py``, ``.json``, ``.html``, ``.csv`` -- read as
      UTF-8 text; undecodable bytes are dropped and a leading byte-order
      mark is stripped.
    * ``.pdf`` -- text of every page, pages separated by a newline.
    * ``.docx`` -- text of every paragraph, each followed by a newline.
    * ``.xlsx`` -- the sheet returned by ``pd.read_excel`` with default
      arguments (the first sheet), rendered as a Markdown table.

    Args:
        file_path: Location of the file on disk.
        filename: Name whose extension selects the extraction strategy.

    Returns:
        The extracted text. This function never raises for read or parse
        failures: an unsupported extension yields
        ``"Unsupported file type or binary content."`` and any exception
        yields ``"Error reading file: <message>"``.
    """
    ext = os.path.splitext(filename)[1].lower()

    try:
        if ext in (".txt", ".md", ".py", ".json", ".html", ".csv"):
            # "utf-8-sig" decodes exactly like "utf-8" but drops a leading
            # BOM, which would otherwise surface as "\ufeff" in the content.
            with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as f:
                return f.read()

        if ext == ".pdf":
            with pdfplumber.open(file_path) as pdf:
                # extract_text() may return None for a page without text.
                # Join with a newline so words on adjacent pages do not fuse.
                return "\n".join(page.extract_text() or "" for page in pdf.pages)

        if ext == ".docx":
            doc = Document(file_path)
            return "".join(f"{para.text}\n" for para in doc.paragraphs)

        if ext == ".xlsx":
            # NOTE(review): to_markdown() relies on the optional "tabulate"
            # package; if it is missing, the error path below is taken.
            return pd.read_excel(file_path).to_markdown()

        return "Unsupported file type or binary content."

    except Exception as e:
        # Deliberate best-effort boundary: callers receive a message instead
        # of an exception, so the broad catch is part of the contract.
        return f"Error reading file: {str(e)}"