Spaces:
Sleeping
Sleeping
File size: 1,013 Bytes
86a63d9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 | import os
import pdfplumber
import pandas as pd
from docx import Document
def extract_content(file_path: str, filename: str) -> str:
    """Return the textual content of a file, chosen by the extension of *filename*.

    The file is read from ``file_path``; ``filename`` is used only to pick the
    extraction strategy (its extension is compared case-insensitively).

    Supported extensions:

    * ``.txt``, ``.md``, ``.py``, ``.json``, ``.html``, ``.csv`` -- read as
      UTF-8 text, silently dropping bytes that cannot be decoded.
    * ``.pdf`` -- text of every page via ``pdfplumber``, one page per line
      group (pages are separated by a newline).
    * ``.docx`` -- text of every paragraph, each followed by a newline.
    * ``.xlsx`` -- the DataFrame returned by ``pandas.read_excel`` rendered
      with ``DataFrame.to_markdown``.

    Args:
        file_path: Location of the file on disk.
        filename: Name whose extension selects the extraction strategy.

    Returns:
        The extracted text. For any other extension the fixed message
        ``"Unsupported file type or binary content."`` is returned. If reading
        or parsing raises, the exception is not propagated; instead a string
        of the form ``"Error reading file: <exception message>"`` is returned.
    """
    ext = os.path.splitext(filename)[1].lower()
    try:
        if ext in {".txt", ".md", ".py", ".json", ".html", ".csv"}:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()

        if ext == ".pdf":
            with pdfplumber.open(file_path) as pdf:
                # Separate pages with a newline so the last word of one page
                # does not fuse with the first word of the next. A page with
                # no extractable text contributes an empty string.
                return "\n".join(
                    page.extract_text() or "" for page in pdf.pages
                )

        if ext == ".docx":
            doc = Document(file_path)
            return "".join(para.text + "\n" for para in doc.paragraphs)

        if ext == ".xlsx":
            # NOTE(review): to_markdown is documented as needing the optional
            # ``tabulate`` package; if it is missing, the resulting error is
            # reported through the "Error reading file" string below.
            return pd.read_excel(file_path).to_markdown()

        return "Unsupported file type or binary content."
    except Exception as e:
        # Deliberate best-effort boundary: callers always get a string back,
        # never an exception.
        return f"Error reading file: {e}"
|