file-operation / app /utils.py
rajeshbms's picture
feat: implement FastAPI service to extract and convert document content to Markdown
86a63d9
import os
import pdfplumber
import pandas as pd
from docx import Document
def extract_content(file_path: str, filename: str) -> str:
    """Extract the textual content of a file and return it as a string.

    The extraction strategy is selected from the extension of *filename*
    (compared case-insensitively), not from *file_path*, so the file on disk
    may be stored under any name (for example a temporary upload path).

    Args:
        file_path: Location of the file to read.
        filename: Name whose extension decides how the file is parsed.

    Returns:
        The extracted text. An extension that is not handled yields the fixed
        notice ``"Unsupported file type or binary content."``. Any exception
        raised while reading or parsing is not propagated; it is reported as
        an ``"Error reading file: ..."`` string instead.
    """
    ext = os.path.splitext(filename)[1].lower()
    try:
        if ext in (".txt", ".md", ".py", ".json", ".html", ".csv"):
            # Plain-text formats are returned verbatim; bytes that are not
            # valid UTF-8 are dropped rather than aborting the read.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()

        if ext == ".pdf":
            with pdfplumber.open(file_path) as pdf:
                # Join pages with a newline so the last line of one page is
                # not fused with the first line of the next. A falsy result
                # from extract_text() is treated as an empty page.
                return "\n".join(
                    page.extract_text() or "" for page in pdf.pages
                )

        if ext == ".docx":
            doc = Document(file_path)
            # One paragraph per line, each terminated by a newline.
            return "".join(para.text + "\n" for para in doc.paragraphs)

        if ext == ".xlsx":
            # NOTE: read_excel is called with its default sheet selection, so
            # only one sheet of a multi-sheet workbook is converted.
            return pd.read_excel(file_path).to_markdown()

        return "Unsupported file type or binary content."
    except Exception as e:
        # Deliberate best-effort boundary: callers always receive a string,
        # so failures are surfaced as text instead of being raised.
        return f"Error reading file: {e}"