Spaces:

aquab
/

file-operation

Sleeping

File size: 1,013 Bytes

86a63d9

import os
import pdfplumber
import pandas as pd
from docx import Document

# Extensions whose contents are returned verbatim as decoded text.
_PLAIN_TEXT_EXTENSIONS = frozenset(
    {".txt", ".md", ".py", ".json", ".html", ".csv"}
)


def extract_content(file_path: str, filename: str) -> str:
    """Return the textual content of a file, chosen by its extension.

    The extension is taken from *filename* (case-insensitively), while the
    data is read from *file_path*; the two may differ, e.g. when an upload
    is stored under a temporary name.

    Args:
        file_path: Location of the file on disk.
        filename: Name whose extension selects the extraction strategy.

    Returns:
        The extracted text. For an unrecognised extension a fixed
        "Unsupported file type or binary content." message is returned.
        If anything raises while reading, the exception is not propagated;
        a string of the form "Error reading file: <message>" is returned
        instead.
    """
    ext = os.path.splitext(filename)[1].lower()

    # Deliberate best-effort boundary: any failure is reported to the
    # caller as text rather than raised.
    try:
        if ext in _PLAIN_TEXT_EXTENSIONS:
            # Undecodable bytes are dropped rather than failing the read.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()

        if ext == ".pdf":
            with pdfplumber.open(file_path) as pdf:
                # Extract while the document is still open.
                page_texts = [page.extract_text() or "" for page in pdf.pages]
            # Separate pages with a newline so the last word of one page
            # does not fuse with the first word of the next; pages with no
            # extractable text are skipped.
            return "\n".join(text for text in page_texts if text)

        if ext == ".docx":
            doc = Document(file_path)
            # One line per paragraph, each terminated by a newline.
            return "".join(f"{para.text}\n" for para in doc.paragraphs)

        if ext == ".xlsx":
            # NOTE(review): with pandas' defaults this reads only the first
            # sheet, and to_markdown() needs the optional `tabulate`
            # package -- confirm both are acceptable for callers.
            return pd.read_excel(file_path).to_markdown()

        return "Unsupported file type or binary content."

    except Exception as e:
        return f"Error reading file: {e}"