File size: 1,903 Bytes
c163d1f
f30e472
 
5e6bcca
c163d1f
f30e472
 
 
 
 
 
 
 
 
 
 
 
 
aee62e4
f30e472
aee62e4
 
 
5e6bcca
 
 
aee62e4
 
5e6bcca
 
aee62e4
5e6bcca
aee62e4
f30e472
aee62e4
f30e472
 
 
5e6bcca
f30e472
aee62e4
f30e472
aee62e4
f30e472
 
 
5e6bcca
f30e472
 
 
 
 
 
 
 
5e6bcca
aee62e4
 
 
 
5e6bcca
aee62e4
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import streamlit as st
from docx import Document
from io import BytesIO
import re

# Page chrome: browser-tab title and a wide layout for the text preview.
st.set_page_config(page_title="Doc Cleaner", layout="wide")

# The title emoji was stored as mis-decoded UTF-8 bytes; use the real character.
st.title("📄 DOCX Heading Cleaner")

# Restrict uploads to .docx; the rest of the script only runs once this is truthy.
uploaded_file = st.file_uploader("Upload a .docx file", type=["docx"])

# Paragraphs whose lower-cased text starts with "segment <number>" are treated
# as headings. Compiled once here instead of being looked up per paragraph.
_SEGMENT_HEADING_RE = re.compile(r"(segment\s*\d+)")

# A line ending in ":" only counts as a title when it is this short
# (e.g. "Introduction:"); longer colon-terminated lines are kept as body text.
_MAX_COLON_TITLE_WORDS = 6


def _style_name(para):
    """Return the paragraph's lower-cased style name, or "" if unavailable.

    Both the style object and its name are guarded against ``None`` so that a
    document lacking a default paragraph style (or a style without a name)
    does not crash the app with an AttributeError.
    """
    style = para.style
    name = style.name if style is not None else None
    return (name or "").lower()


def _is_heading(text, style_name):
    """Return True when stripped, non-empty ``text`` looks like a heading.

    A paragraph is classified as a heading when any of the following holds:

    * its lower-cased Word style name contains "heading";
    * it starts with "#" (Markdown-style heading);
    * its lower-cased text starts with "segment" followed by digits;
    * it ends with ":" and has at most ``_MAX_COLON_TITLE_WORDS`` words.
    """
    if "heading" in style_name:
        return True
    if text.startswith("#"):
        return True
    if _SEGMENT_HEADING_RE.match(text.lower()):
        return True
    return text.endswith(":") and len(text.split()) <= _MAX_COLON_TITLE_WORDS


if uploaded_file:
    doc = Document(uploaded_file)

    removed_headings = []
    cleaned_paragraphs = []

    # Split the document's paragraphs into headings (dropped) and body text.
    for para in doc.paragraphs:
        text = para.text.strip()

        # Empty / whitespace-only paragraphs are discarded entirely.
        if not text:
            continue

        if _is_heading(text, _style_name(para)):
            removed_headings.append(text)
        else:
            cleaned_paragraphs.append(text)

    cleaned_text = "\n\n".join(cleaned_paragraphs)

    # Removed headings section: numbered list of everything that was dropped.
    st.subheader("🗑️ Removed Headings")

    if removed_headings:
        st.code("\n".join(f"{i}. {h}" for i, h in enumerate(removed_headings, 1)))
    else:
        st.info("No headings found")

    # Cleaned text section: preview of the remaining body paragraphs.
    st.subheader("📝 Cleaned Text")

    st.text_area(
        "Paragraph Content",
        value=cleaned_text,
        height=400
    )

    # Download: rebuild a fresh DOCX that contains only the kept paragraphs
    # as plain text.
    new_doc = Document()
    for para in cleaned_paragraphs:
        new_doc.add_paragraph(para)

    # Serialize in memory and rewind so the download starts at byte 0.
    buffer = BytesIO()
    new_doc.save(buffer)
    buffer.seek(0)

    st.download_button(
        label="⬇️ Download Cleaned DOCX",
        data=buffer,
        file_name="cleaned.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )