"""Streamlit app that removes heading-like paragraphs from an uploaded .docx.

The app shows which paragraphs were treated as headings, previews the
remaining text, and offers the remaining paragraphs as a new .docx download.
Only paragraph text is carried over to the new document.
"""

import re
from io import BytesIO

import streamlit as st
from docx import Document

# Matches "segment", optional whitespace, then digits at the start of the
# (already lower-cased) paragraph text, e.g. "segment 3" or "segment12".
_SEGMENT_HEADING_RE = re.compile(r"(segment\s*\d+)")

# A paragraph ending in ":" counts as a title only if it has at most this
# many whitespace-separated words (short titles like "Introduction:").
_MAX_COLON_TITLE_WORDS = 6


def _is_heading(text: str, style_name: str) -> bool:
    """Return True if a paragraph should be treated as a heading.

    Args:
        text: Paragraph text, already stripped and non-empty.
        style_name: Lower-cased name of the paragraph style ("" if unknown).

    A paragraph is a heading when any of these hold:
      * its style name contains "heading";
      * its text starts with "#" (Markdown-style heading);
      * its lower-cased text starts with "segment" followed by digits;
      * its text ends with ":" and has at most six words.
    """
    if "heading" in style_name:
        return True
    if text.startswith("#"):
        return True
    if _SEGMENT_HEADING_RE.match(text.lower()):
        return True
    return text.endswith(":") and len(text.split()) <= _MAX_COLON_TITLE_WORDS


def _split_headings(paragraphs) -> tuple:
    """Split paragraphs into (removed_headings, cleaned_paragraphs) text lists.

    Paragraphs whose stripped text is empty are dropped from both lists.
    Document order is preserved within each list.
    """
    removed = []
    kept = []
    for para in paragraphs:
        text = para.text.strip()
        if not text:
            continue

        # Guard against a missing style or a style without a name instead of
        # crashing on ``.lower()``; such paragraphs simply never match the
        # style-based rule.
        style_name = (getattr(para.style, "name", None) or "").lower()

        if _is_heading(text, style_name):
            removed.append(text)
        else:
            kept.append(text)
    return removed, kept


def _build_docx(paragraph_texts) -> BytesIO:
    """Return an in-memory .docx with one paragraph per given text.

    The returned buffer is rewound to position 0 so it can be read directly.
    """
    new_doc = Document()
    for paragraph_text in paragraph_texts:
        new_doc.add_paragraph(paragraph_text)

    buffer = BytesIO()
    new_doc.save(buffer)
    buffer.seek(0)
    return buffer


st.set_page_config(page_title="Doc Cleaner", layout="wide")
st.title("📄 DOCX Heading Cleaner")

uploaded_file = st.file_uploader("Upload a .docx file", type=["docx"])

if uploaded_file:
    try:
        doc = Document(uploaded_file)
    except Exception as exc:  # UI boundary: report the failure, don't crash.
        # The uploader only filters by file extension, so the content may
        # still be something the DOCX parser cannot open.
        doc = None
        st.error(f"Could not read the uploaded file as a .docx document: {exc}")

    if doc is not None:
        removed_headings, cleaned_paragraphs = _split_headings(doc.paragraphs)
        cleaned_text = "\n\n".join(cleaned_paragraphs)

        # Removed headings section: numbered list, or a notice if none.
        st.subheader("🗑️ Removed Headings")
        if removed_headings:
            st.code("\n".join(f"{i}. {h}" for i, h in enumerate(removed_headings, 1)))
        else:
            st.info("No headings found")

        # Cleaned text section: remaining paragraphs separated by blank lines.
        st.subheader("📝 Cleaned Text")
        st.text_area(
            "Paragraph Content",
            value=cleaned_text,
            height=400,
        )

        # Download section: the remaining paragraphs as a fresh .docx.
        st.download_button(
            label="⬇️ Download Cleaned DOCX",
            data=_build_docx(cleaned_paragraphs),
            file_name="cleaned.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )