# Spaces: Sleeping
import re
import zipfile
from io import BytesIO

import streamlit as st
from docx import Document
# Matches paragraphs that open with "segment <number>"; applied to lowercased
# text so the check is case-insensitive.
SEGMENT_HEADING_RE = re.compile(r"(segment\s*\d+)")

# A paragraph ending in ":" is treated as a title only when it has at most
# this many whitespace-separated words (e.g. "Introduction:").
MAX_COLON_TITLE_WORDS = 6


def is_heading(text, style_name=""):
    """Return True when a paragraph looks like a heading rather than body text.

    A paragraph is classified as a heading when any of these hold:

    * its style name contains "heading" (case-insensitive),
    * it starts with "#" (Markdown-style heading),
    * it starts with "segment <number>" (case-insensitive),
    * it ends with ":" and has at most ``MAX_COLON_TITLE_WORDS`` words.

    Args:
        text: Paragraph text, already stripped of surrounding whitespace.
        style_name: Name of the paragraph style; ``None`` or "" when unknown.

    Returns:
        bool: whether the paragraph should be removed as a heading.
    """
    style_name = (style_name or "").lower()
    return (
        "heading" in style_name
        or text.startswith("#")
        or SEGMENT_HEADING_RE.match(text.lower()) is not None
        or (text.endswith(":") and len(text.split()) <= MAX_COLON_TITLE_WORDS)
    )


def split_headings(paragraphs):
    """Partition paragraphs into removed headings and kept body text.

    Args:
        paragraphs: Iterable of ``(text, style_name)`` pairs in document order.

    Returns:
        tuple: ``(removed_headings, cleaned_paragraphs)``, two lists of
        stripped strings. Paragraphs that are empty after stripping are
        dropped from both lists.
    """
    removed_headings = []
    cleaned_paragraphs = []
    for raw_text, style_name in paragraphs:
        text = raw_text.strip()
        if not text:
            continue
        if is_heading(text, style_name):
            removed_headings.append(text)
        else:
            cleaned_paragraphs.append(text)
    return removed_headings, cleaned_paragraphs


def _style_name(paragraph):
    """Return the paragraph's style name, or "" if the style or name is missing."""
    # Guard against a missing style object or an unnamed style so one odd
    # paragraph cannot crash the whole run with AttributeError.
    return getattr(paragraph.style, "name", None) or ""


def build_docx(paragraphs):
    """Return a rewound BytesIO holding a new .docx, one paragraph per string."""
    new_doc = Document()
    for text in paragraphs:
        new_doc.add_paragraph(text)
    buffer = BytesIO()
    new_doc.save(buffer)
    buffer.seek(0)  # rewind so the download starts at the first byte
    return buffer


def main():
    """Render the page: upload, removed-heading report, cleaned text, download."""
    st.set_page_config(page_title="Doc Cleaner", layout="wide")
    st.title("📄 DOCX Heading Cleaner")

    uploaded_file = st.file_uploader("Upload a .docx file", type=["docx"])
    if uploaded_file is None:
        return

    try:
        doc = Document(uploaded_file)
    except (zipfile.BadZipFile, KeyError, ValueError):
        # Not a zip archive, a zip missing required package parts, or a
        # package that is not a Word document: report it instead of
        # surfacing a traceback.
        st.error("Could not read the uploaded file as a .docx document.")
        return

    removed_headings, cleaned_paragraphs = split_headings(
        (para.text, _style_name(para)) for para in doc.paragraphs
    )
    cleaned_text = "\n\n".join(cleaned_paragraphs)

    # Removed headings section
    st.subheader("🗑️ Removed Headings")
    if removed_headings:
        st.code("\n".join(f"{i}. {h}" for i, h in enumerate(removed_headings, 1)))
    else:
        st.info("No headings found")

    # Cleaned text section
    st.subheader("📝 Cleaned Text")
    st.text_area(
        "Paragraph Content",
        value=cleaned_text,
        height=400,
    )

    # Download the cleaned paragraphs as a new DOCX
    st.download_button(
        label="⬇️ Download Cleaned DOCX",
        data=build_docx(cleaned_paragraphs),
        file_name="cleaned.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )


# Keep the page logic behind the __main__ guard so importing this module
# (e.g. to test is_heading) has no UI side effects.
if __name__ == "__main__":
    main()