import streamlit as st
from docx import Document
from io import BytesIO
import re

st.set_page_config(page_title="Doc Cleaner", layout="wide")

st.title("📄 DOCX Heading Cleaner")

# Compiled once here rather than looked up for every paragraph in the loop.
# It is applied to lower-cased text, so "Segment 3" and "SEGMENT3" both match.
_SEGMENT_HEADING_RE = re.compile(r"(segment\s*\d+)")

# A line ending in ":" only counts as a title when it has at most this many
# whitespace-separated words (short titles like "Introduction:").
_MAX_COLON_TITLE_WORDS = 6


def _is_heading(text: str, style_name: str) -> bool:
    """Return True when a paragraph should be treated as a heading.

    Args:
        text: Paragraph text, already stripped and non-empty.
        style_name: Name of the paragraph's style; may be empty.

    A paragraph is a heading when any of these holds:
      * its style name contains "heading" (case-insensitive);
      * its text starts with "#";
      * its text starts with "segment", optional whitespace and digits
        (case-insensitive);
      * its text ends with ":" and has at most ``_MAX_COLON_TITLE_WORDS`` words.
    """
    if "heading" in style_name.lower():
        return True
    if text.startswith("#"):
        return True
    if _SEGMENT_HEADING_RE.match(text.lower()):
        return True
    return text.endswith(":") and len(text.split()) <= _MAX_COLON_TITLE_WORDS


def _split_headings(paragraphs):
    """Split paragraphs into (removed heading texts, kept body texts).

    Args:
        paragraphs: Iterable of objects exposing ``.text`` and ``.style``
            (here: the paragraphs of the uploaded document).

    Returns:
        A ``(removed, kept)`` tuple of lists of stripped paragraph texts, each
        in document order. Paragraphs that are empty after stripping are
        dropped from both lists.
    """
    removed = []
    kept = []
    for para in paragraphs:
        text = para.text.strip()
        if not text:
            continue

        # Guard against a paragraph whose style, or whose style name, is
        # missing (None); calling .lower() on it would otherwise raise.
        style_name = getattr(para.style, "name", None) or ""

        if _is_heading(text, style_name):
            removed.append(text)
        else:
            kept.append(text)
    return removed, kept


def _build_docx(paragraphs) -> bytes:
    """Return the bytes of a new .docx with one paragraph per given text.

    Only the text is written: the document is built from scratch, so nothing
    else from the uploaded file is carried over.
    """
    new_doc = Document()
    for text in paragraphs:
        new_doc.add_paragraph(text)

    buffer = BytesIO()
    new_doc.save(buffer)
    return buffer.getvalue()


uploaded_file = st.file_uploader("Upload a .docx file", type=["docx"])

if uploaded_file:
    # The upload is untrusted input: the extension filter does not guarantee
    # valid contents. This is the top-level UI boundary, so any parse failure
    # is reported to the user and the run is halted instead of crashing.
    try:
        doc = Document(uploaded_file)
    except Exception as exc:
        st.error(f"Could not read the uploaded file as a .docx document: {exc}")
        st.stop()

    removed_headings, cleaned_paragraphs = _split_headings(doc.paragraphs)
    cleaned_text = "\n\n".join(cleaned_paragraphs)

    # 🗑️ Removed Headings Section
    st.subheader("🗑️ Removed Headings")

    if removed_headings:
        st.code("\n".join(f"{i}. {h}" for i, h in enumerate(removed_headings, 1)))
    else:
        st.info("No headings found")

    # 📝 Cleaned Text Section
    st.subheader("📝 Cleaned Text")

    st.text_area(
        "Paragraph Content",
        value=cleaned_text,
        height=400
    )

    # 📥 Download cleaned DOCX
    st.download_button(
        label="⬇️ Download Cleaned DOCX",
        data=_build_docx(cleaned_paragraphs),
        file_name="cleaned.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )