# Extracted from a web file viewer; page chrome read "Spaces: Sleeping", file size 1,903 bytes.
# The viewer's blame gutter listed commits c163d1f, f30e472, 5e6bcca and aee62e4,
# followed by a line-number gutter (1-70); neither is part of the program.
import streamlit as st
from docx import Document
from io import BytesIO
import re
# Streamlit app: upload a .docx file, drop paragraphs that look like headings,
# show what was removed and what was kept, and offer the kept text as a new
# .docx download.

# Matches text that begins with "segment <number>"; callers lowercase first.
_SEGMENT_HEADING_RE = re.compile(r"(segment\s*\d+)")
# A paragraph ending in ":" is treated as a title only up to this many words
# (short titles like "Introduction:").
_MAX_COLON_TITLE_WORDS = 6


def _style_name(para):
    """Return the paragraph's style name lowercased, or "" if it has none.

    python-docx can hand back ``None`` for the style (no default paragraph
    style in the document) or for the style's name; both would otherwise
    crash the ``.lower()`` call.
    """
    style = para.style
    name = style.name if style is not None else None
    return (name or "").lower()


def _is_heading(text, style_name):
    """Return True if a paragraph should be treated as a heading.

    Args:
        text: Stripped, non-empty paragraph text.
        style_name: Lowercased name of the paragraph's style.

    A paragraph counts as a heading when any of these hold: its style name
    contains "heading", it starts with "#", it starts with "segment <number>"
    (case-insensitive), or it ends with ":" and is at most
    ``_MAX_COLON_TITLE_WORDS`` words long.
    """
    if "heading" in style_name:
        return True
    if text.startswith("#"):
        return True
    if _SEGMENT_HEADING_RE.match(text.lower()):
        return True
    return text.endswith(":") and len(text.split()) <= _MAX_COLON_TITLE_WORDS


def _split_headings(paragraphs):
    """Partition paragraphs into ``(removed_headings, cleaned_paragraphs)``.

    Both lists hold stripped paragraph text in document order; paragraphs
    that are empty after stripping are skipped entirely.
    """
    removed = []
    kept = []
    for para in paragraphs:
        text = para.text.strip()
        if not text:
            continue
        if _is_heading(text, _style_name(para)):
            removed.append(text)
        else:
            kept.append(text)
    return removed, kept


st.set_page_config(page_title="Doc Cleaner", layout="wide")

# NOTE(review): the emoji in the UI labels below were mis-encoded in this file
# (only fragments such as a leading "π"/"β" survived). The glyphs used here are
# best-guess restorations chosen from the surviving bytes and the label text --
# confirm against the intended ones.
st.title("📄 DOCX Heading Cleaner")

uploaded_file = st.file_uploader("Upload a .docx file", type=["docx"])

if uploaded_file:
    doc = Document(uploaded_file)

    # Detection logic lives in _is_heading(); here we only partition.
    removed_headings, cleaned_paragraphs = _split_headings(doc.paragraphs)
    cleaned_text = "\n\n".join(cleaned_paragraphs)

    # Removed headings section
    st.subheader("🗑️ Removed Headings")
    if removed_headings:
        st.code("\n".join(f"{i}. {h}" for i, h in enumerate(removed_headings, 1)))
    else:
        st.info("No headings found")

    # Cleaned text section
    st.subheader("📝 Cleaned Text")
    st.text_area(
        "Paragraph Content",
        value=cleaned_text,
        height=400,
    )

    # Download cleaned DOCX: rebuild a document from the kept text only
    # (one plain paragraph per kept paragraph).
    new_doc = Document()
    for para_text in cleaned_paragraphs:
        new_doc.add_paragraph(para_text)

    buffer = BytesIO()
    new_doc.save(buffer)
    # Rewind so the download starts at the beginning of the written bytes.
    buffer.seek(0)

    st.download_button(
        label="⬇️ Download Cleaned DOCX",
        data=buffer,
        file_name="cleaned.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )