Spaces:

Notionhive
/

mdtohtml

Sleeping

File size: 4,313 Bytes

from fastapi import FastAPI, Request, UploadFile, File
from fastapi.responses import StreamingResponse, JSONResponse
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches
from docx.enum.section import WD_ORIENT
from PIL import Image
from io import BytesIO
import markdown2
import io

# ASGI application object; the route handler in this module registers itself
# on it through the @app.post decorator.
app = FastAPI()

# === Page Layout Utility ===

def set_a4_page_layout(doc):
    """Make the first section of *doc* a portrait A4 page with 1-inch margins.

    Only ``doc.sections[0]`` is touched; the section is modified in place and
    nothing is returned.
    """
    first_section = doc.sections[0]

    # A4 expressed in inches (8.27 wide by 11.69 tall), height set first.
    first_section.page_height = Inches(11.69)
    first_section.page_width = Inches(8.27)
    first_section.orientation = WD_ORIENT.PORTRAIT

    # Same one-inch margin on every edge.
    for edge in ("top", "bottom", "left", "right"):
        setattr(first_section, f"{edge}_margin", Inches(1))

# === HTML Cleaning Utilities ===

def remove_empty_paragraphs_around(soup, tag_names):
    """Remove the blank ``<p>`` adjacent to each matching tag, on both sides.

    For every tag in *soup* whose name is in *tag_names*, the tags that
    precede it and then the tags that follow it are walked in document
    order.  ``<br>`` tags are skipped, the first blank paragraph met is
    decomposed, and any other element ends the walk, so at most one
    paragraph is removed per side.

    A paragraph is blank only when it has no text AND no child element other
    than ``<br>``.  Judging by text alone would treat ``<p><img/></p>`` as
    blank and delete the image together with its wrapper.

    Args:
        soup: Parsed BeautifulSoup tree; modified in place.
        tag_names: Iterable of tag names whose surroundings are cleaned.
    """

    def is_blank_paragraph(node):
        if node.name != "p" or node.get_text().strip():
            return False
        return all(child.name == "br" for child in node.find_all(True))

    def contains(outer, inner):
        # Identity rather than ==: bs4 tags compare equal whenever their
        # markup matches, which would confuse look-alike elements.
        return any(parent is outer for parent in inner.parents)

    for tag_name in tag_names:
        for tag in soup.find_all(tag_name):
            for prev in tag.find_all_previous():
                if prev.name == "br":
                    continue
                # find_all_previous() also yields ancestors.  A <p> that
                # wraps the tag is a container, not preceding content, so
                # look past it instead of deleting it (and the tag with it).
                if prev.name == "p" and contains(prev, tag):
                    continue
                if is_blank_paragraph(prev):
                    prev.decompose()
                # Blank paragraph removed, or real content reached.
                break

            for next_ in tag.find_all_next():
                # find_all_next() starts with the tag's own descendants
                # (table rows, inline markup inside a heading); skip them so
                # the walk reaches what actually follows the tag.
                if next_.name == "br" or contains(tag, next_):
                    continue
                if is_blank_paragraph(next_):
                    next_.decompose()
                break

def clean_extra_spacing_around_tables(soup):
    """Drop blank paragraphs everywhere and blank siblings right after tables.

    First every blank ``<p>`` in *soup* is decomposed.  Then, for each
    ``<table>``, the run of ``<br>`` tags and blank paragraphs that directly
    follows it as siblings is removed.

    A paragraph is blank only when it has no text AND no child element other
    than ``<br>``; a paragraph that holds just an image has no text but is
    real content and is kept.

    Args:
        soup: Parsed BeautifulSoup tree; modified in place.
    """

    def is_blank_paragraph(node):
        if node.name != "p" or node.get_text().strip():
            return False
        return all(child.name == "br" for child in node.find_all(True))

    for p in soup.find_all("p"):
        if is_blank_paragraph(p):
            p.decompose()

    for table in soup.find_all("table"):
        next_sibling = table.find_next_sibling()
        while next_sibling is not None and (
            next_sibling.name == "br" or is_blank_paragraph(next_sibling)
        ):
            # Fetch the follower before decomposing; the removed node loses
            # its links into the tree.
            following = next_sibling.find_next_sibling()
            next_sibling.decompose()
            next_sibling = following

def add_table_borders_to_html(html_content: str) -> str:
    """Return *html_content* with every table given borders and even columns.

    For each ``<table>`` that has at least one row:

    * ``border`` and ``style`` attributes are set on the table;
    * a ``<colgroup>`` with one equally wide ``<col>`` per cell of the first
      row is inserted as the table's first child;
    * the first row is placed in a new ``<thead>`` and the remaining rows in
      a new ``<tbody>``; section wrappers left empty by that move are removed
      so the table does not end up with duplicate sections;
    * border and padding declarations are appended to each cell's style.

    Only rows that belong to the table itself are restructured; rows of a
    nested table are handled when the loop reaches that table.

    Args:
        html_content: HTML markup to process.

    Returns:
        The modified markup serialised back to a string.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    cell_style = "border: 1px solid black; padding: 6px;"

    for table in soup.find_all("table"):
        table['border'] = "1"
        table['style'] = "border: 1px solid black; border-collapse: collapse; width: 100%;"

        # find_all is recursive, so filter out rows owned by nested tables;
        # moving them would flatten the inner table into this one.
        rows = [
            row for row in table.find_all("tr")
            if row.find_parent("table") is table
        ]
        if not rows:
            continue

        col_count = len(rows[0].find_all(["td", "th"], recursive=False))
        colgroup = soup.new_tag("colgroup")
        for _ in range(col_count):
            col = soup.new_tag("col")
            col['style'] = "width: {}%;".format(round(100 / col_count))
            colgroup.append(col)
        table.insert(0, colgroup)

        # append() detaches each row from wherever it currently sits.
        thead = soup.new_tag("thead")
        thead.append(rows[0])
        tbody = soup.new_tag("tbody")
        for row in rows[1:]:
            tbody.append(row)

        # Markup that already had <thead>/<tbody> (markdown2 tables do) is
        # now left with empty shells; remove them before adding the new
        # sections so the table has exactly one of each.
        for section in table.find_all(["thead", "tbody", "tfoot"]):
            if section.find_parent("table") is table and section.find(True) is None:
                section.decompose()

        table.append(thead)
        table.append(tbody)

        for row in rows:
            for cell in row.find_all(["th", "td"], recursive=False):
                existing_style = cell.get('style', '').strip()
                # Without a terminating ';' the appended declarations would
                # fuse with the last existing one.
                if existing_style and not existing_style.endswith(";"):
                    existing_style += ";"
                cell['style'] = f"{existing_style} {cell_style}".strip()

    return str(soup)

# === API 1: Markdown to HTML ===

@app.post("/convert-md-to-html")
async def convert_md_to_html(request: Request):
    """Convert posted Markdown to a cleaned HTML file offered as a download.

    Expects a JSON object with:

    * ``markdown`` - the Markdown source (required, non-empty string);
    * ``client_name`` - used in the download filename (optional; falls back
      to ``"Client"`` when missing, not a string, or empty once sanitised).

    Returns:
        A ``StreamingResponse`` carrying the HTML with a
        ``Content-Disposition: attachment`` header, or a ``{"error": ...}``
        dict when the body is unusable.
    """
    # Function-scope import so this handler does not depend on edits to the
    # module's import block.
    from urllib.parse import quote

    try:
        data = await request.json()
    except ValueError:
        # Empty or malformed body (JSON and Unicode decode errors are both
        # ValueError subclasses).
        data = None
    if not isinstance(data, dict):
        return {"error": "Request body must be a JSON object"}

    md_text = data.get("markdown", "")
    # NOTE(review): error payloads are returned as plain dicts, i.e. with the
    # default success status, exactly as before. Switching to
    # JSONResponse(status_code=400, ...) would be cleaner but changes what
    # existing clients observe - confirm before changing.
    if not md_text or not isinstance(md_text, str):
        return {"error": "No markdown text provided"}

    raw_name = data.get("client_name", "Client")
    client_name = raw_name.strip() if isinstance(raw_name, str) else "Client"

    html = markdown2.markdown(md_text, extras=[
        "tables",
        "fenced-code-blocks",
        "cuddled-lists",
        "footnotes"
    ])

    soup = BeautifulSoup(html, "html.parser")
    remove_empty_paragraphs_around(soup, ["table", "img", "h1", "h2", "h3", "h4", "h5", "h6"])
    clean_extra_spacing_around_tables(soup)

    cleaned_html = add_table_borders_to_html(str(soup))
    html_io = BytesIO(cleaned_html.encode("utf-8"))

    # Keep only characters that are safe inside a quoted filename.
    safe_client_name = "".join(
        c for c in client_name if c.isalnum() or c in (" ", "_", "-")
    ).strip()
    if not safe_client_name:
        safe_client_name = "Client"
    filename = f"Proposal for {safe_client_name}.html"

    content_disposition = f'attachment; filename="{filename}"'
    if not filename.isascii():
        # str.isalnum() accepts non-ASCII letters, but header values are
        # encoded as latin-1 when sent, so such names would raise. Send an
        # ASCII fallback plus the RFC 5987 percent-encoded UTF-8 form.
        ascii_name = (
            safe_client_name.encode("ascii", "ignore").decode("ascii").strip()
            or "Client"
        )
        content_disposition = (
            f'attachment; filename="Proposal for {ascii_name}.html"; '
            f"filename*=UTF-8''{quote(filename)}"
        )

    headers = {
        'Content-Disposition': content_disposition
    }

    return StreamingResponse(
        html_io,
        media_type='text/html',
        headers=headers
    )