import os

# The format-specific parsers are optional at import time: plain-text
# extraction does not need them, so a missing package must not prevent this
# module from loading.  A missing parser is reported by extract_content()
# only when the corresponding file type is actually requested.
try:
    import pdfplumber
except ImportError:
    pdfplumber = None

try:
    import pandas as pd
except ImportError:
    pd = None

try:
    from docx import Document
except ImportError:
    Document = None

# Extensions whose files are returned verbatim as decoded UTF-8 text.
_TEXT_EXTENSIONS = frozenset({".txt", ".md", ".py", ".json", ".html", ".csv"})


def _require(dependency, package: str, ext: str):
    """Return *dependency*, raising ImportError if its import failed."""
    if dependency is None:
        raise ImportError(f"{package} is required to read {ext} files")
    return dependency


def extract_content(file_path: str, filename: str) -> str:
    """Return the textual content of a file, chosen by its extension.

    The file type is decided from the extension of *filename*
    (case-insensitive), while the bytes are read from *file_path*; the two
    may differ, e.g. when *file_path* is a temporary upload location.

    Supported types:

    * ``.txt .md .py .json .html .csv`` -- decoded as UTF-8; bytes that are
      not valid UTF-8 are silently dropped.
    * ``.pdf`` -- text of every page (via pdfplumber), pages separated by a
      newline; a page with no extractable text contributes an empty string.
    * ``.docx`` -- text of every paragraph, each followed by a newline.
    * ``.xlsx`` -- the sheet returned by ``pandas.read_excel`` with its
      default arguments (the first worksheet), rendered as a Markdown
      table.  ``DataFrame.to_markdown`` needs the ``tabulate`` package.

    Args:
        file_path: Path of the file to read.
        filename: Name whose extension selects the parser.

    Returns:
        The extracted text.  For an unrecognised extension the literal
        ``"Unsupported file type or binary content."`` is returned.  This
        function never raises for read/parse failures: any exception
        (including a missing parser package) is converted to
        ``"Error reading file: <message>"``.
    """
    ext = os.path.splitext(filename)[1].lower()
    try:
        if ext in _TEXT_EXTENSIONS:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()

        if ext == ".pdf":
            with _require(pdfplumber, "pdfplumber", ext).open(file_path) as pdf:
                # Join with a newline so the last word of one page is not
                # fused with the first word of the next.
                return "\n".join(page.extract_text() or "" for page in pdf.pages)

        if ext == ".docx":
            doc = _require(Document, "python-docx", ext)(file_path)
            return "".join(f"{para.text}\n" for para in doc.paragraphs)

        if ext == ".xlsx":
            df = _require(pd, "pandas", ext).read_excel(file_path)
            return df.to_markdown()

        return "Unsupported file type or binary content."
    except Exception as e:
        # Deliberate best-effort boundary: callers receive a description of
        # the failure as the content instead of an exception.
        return f"Error reading file: {e}"