Heading_Removed / src /streamlit_app.py
rohitdiwane's picture
Update src/streamlit_app.py
5e6bcca verified
import streamlit as st
from docx import Document
from io import BytesIO
import re
# Streamlit app: upload a .docx, drop paragraphs that look like headings, show
# what was removed and what remains, and offer the remainder as a new .docx.

# Matches text that opens with "segment <number>"; applied to lowercased text.
# Compiled once here rather than being looked up for every paragraph.
_SEGMENT_HEADING = re.compile(r"(segment\s*\d+)")

# A paragraph ending in ":" is only treated as a title when it has at most
# this many whitespace-separated words (e.g. "Introduction:").
_MAX_COLON_TITLE_WORDS = 6


def _is_heading(text, style_name):
    """Return True when a paragraph should be dropped as a heading.

    Args:
        text: Paragraph text, already stripped of surrounding whitespace.
        style_name: Name of the paragraph's Word style. ``None`` is treated
            like an empty name so a nameless style cannot crash the check.

    Returns:
        True if any heuristic fires: the style name contains "heading", the
        text starts with "#", the text starts with "segment <number>"
        (case-insensitive via lowercasing), or the text is a short line
        ending in ":".
    """
    if "heading" in (style_name or "").lower():
        return True
    if text.startswith("#"):
        return True
    if _SEGMENT_HEADING.match(text.lower()):
        return True
    return text.endswith(":") and len(text.split()) <= _MAX_COLON_TITLE_WORDS


st.set_page_config(page_title="Doc Cleaner", layout="wide")
st.title("📄 DOCX Heading Cleaner")

uploaded_file = st.file_uploader("Upload a .docx file", type=["docx"])

if uploaded_file:
    doc = Document(uploaded_file)

    removed_headings = []
    cleaned_paragraphs = []

    # Only body paragraphs exposed by doc.paragraphs are examined.
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            # Blank paragraphs are neither reported nor kept.
            continue

        # ✅ Detection logic
        if _is_heading(text, para.style.name):
            removed_headings.append(text)
        else:
            cleaned_paragraphs.append(text)

    cleaned_text = "\n\n".join(cleaned_paragraphs)

    # 🗑️ Removed Headings Section
    st.subheader("🗑️ Removed Headings")
    if removed_headings:
        st.code("\n".join(f"{i}. {h}" for i, h in enumerate(removed_headings, 1)))
    else:
        st.info("No headings found")

    # 📝 Cleaned Text Section
    st.subheader("📝 Cleaned Text")
    st.text_area(
        "Paragraph Content",
        value=cleaned_text,
        height=400,
    )

    # 📥 Download cleaned DOCX
    new_doc = Document()
    for paragraph_text in cleaned_paragraphs:
        new_doc.add_paragraph(paragraph_text)

    buffer = BytesIO()
    new_doc.save(buffer)
    # Rewind so the download starts from the first byte of the document.
    buffer.seek(0)

    st.download_button(
        label="⬇️ Download Cleaned DOCX",
        data=buffer,
        file_name="cleaned.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )