Spaces:
Sleeping
Sleeping
| import os | |
| import pdfplumber | |
| import pandas as pd | |
| from docx import Document | |
def extract_content(file_path: str, filename: str) -> str:
    """Extract the textual content of a file, chosen by its extension.

    The extension is taken from ``filename`` (case-insensitively) while the
    data itself is read from ``file_path``; the two may differ, e.g. when an
    upload is stored under a temporary name.

    Supported types:
      * plain text (``.txt``, ``.md``, ``.py``, ``.json``, ``.html``, ``.csv``)
        -- read as UTF-8, undecodable bytes are dropped;
      * ``.pdf`` -- text of every page, pages separated by a newline;
      * ``.docx`` -- text of every paragraph, each followed by a newline;
      * ``.xlsx`` -- the sheet loaded by ``pd.read_excel`` rendered as a
        Markdown table.

    Args:
        file_path: Location on disk of the file to read.
        filename: Name used only to determine the file extension.

    Returns:
        The extracted text. This function never raises for read/parse
        failures: an unsupported extension yields the fixed message
        ``"Unsupported file type or binary content."`` and any exception
        raised while reading yields ``"Error reading file: <details>"``.
    """
    ext = os.path.splitext(filename)[1].lower()

    try:
        if ext in {".txt", ".md", ".py", ".json", ".html", ".csv"}:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()

        if ext == ".pdf":
            with pdfplumber.open(file_path) as pdf:
                # extract_text() may return None for pages without text.
                # Join with a newline so that the last word of one page is
                # not fused with the first word of the next.
                return "\n".join(
                    page.extract_text() or "" for page in pdf.pages
                )

        if ext == ".docx":
            doc = Document(file_path)
            return "".join(f"{para.text}\n" for para in doc.paragraphs)

        if ext == ".xlsx":
            # NOTE: read_excel with default arguments loads only the first
            # sheet, and DataFrame.to_markdown needs the optional
            # ``tabulate`` package; a missing dependency surfaces through
            # the error message below.
            df = pd.read_excel(file_path)
            return df.to_markdown()

        return "Unsupported file type or binary content."
    except Exception as e:
        # Deliberate best-effort boundary: callers receive a readable
        # message instead of an exception.
        return f"Error reading file: {str(e)}"