Spaces:

Skoob
/

Heading_Removed

Sleeping

App Files Files Community

Heading_Removed / src /streamlit_app.py

rohitdiwane

Update src/streamlit_app.py

5e6bcca verified about 2 months ago

raw

history blame contribute delete

1.9 kB

	import streamlit as st
	from docx import Document
	from io import BytesIO
	import re

	st.set_page_config(page_title="Doc Cleaner", layout="wide")

	st.title("📄 DOCX Heading Cleaner")

	# Compiled once at import time instead of being looked up for every paragraph.
	# Applied to the lower-cased paragraph text, anchored at its start.
	_SEGMENT_HEADING = re.compile(r"(segment\s*\d+)")

	# A paragraph ending in ":" counts as a title only if it is this short,
	# e.g. "Introduction:".
	_MAX_COLON_TITLE_WORDS = 6


	def _style_name(para):
	    """Return the paragraph's style name lower-cased, or "" if it has none.

	    python-docx's API allows both ``para.style`` and ``style.name`` to be
	    ``None``; such paragraphs are treated as unstyled instead of raising
	    ``AttributeError``.
	    """
	    style = para.style
	    name = style.name if style is not None else None
	    return (name or "").lower()


	def _is_heading(text, style_name):
	    """Return True if a paragraph should be removed as a heading.

	    Args:
	        text: The paragraph text, already stripped and non-empty.
	        style_name: The lower-cased style name ("" when unknown).

	    A paragraph is a heading when any of these hold:
	      * its style name contains "heading",
	      * it starts with "#" (markdown-style heading),
	      * it starts with "segment <number>" (case-insensitive),
	      * it ends with ":" and has at most ``_MAX_COLON_TITLE_WORDS`` words.
	    """
	    if "heading" in style_name:
	        return True
	    if text.startswith("#"):
	        return True
	    if _SEGMENT_HEADING.match(text.lower()):
	        return True
	    return text.endswith(":") and len(text.split()) <= _MAX_COLON_TITLE_WORDS


	def _split_paragraphs(doc):
	    """Partition the document's non-empty paragraphs.

	    Returns:
	        A ``(removed_headings, cleaned_paragraphs)`` pair of string lists,
	        each in document order. Whitespace-only paragraphs are dropped.
	    """
	    removed_headings = []
	    cleaned_paragraphs = []
	    for para in doc.paragraphs:
	        text = para.text.strip()
	        if not text:
	            continue
	        if _is_heading(text, _style_name(para)):
	            removed_headings.append(text)
	        else:
	            cleaned_paragraphs.append(text)
	    return removed_headings, cleaned_paragraphs


	def _build_docx(paragraphs):
	    """Write ``paragraphs`` into a new document and return it as a BytesIO.

	    The returned buffer is rewound to offset 0 so it can be read directly.
	    """
	    new_doc = Document()
	    for paragraph_text in paragraphs:
	        new_doc.add_paragraph(paragraph_text)

	    buffer = BytesIO()
	    new_doc.save(buffer)
	    buffer.seek(0)
	    return buffer


	uploaded_file = st.file_uploader("Upload a .docx file", type=["docx"])

	if uploaded_file is not None:
	    # The uploader only filters on file extension, so the payload may still be
	    # corrupt or not a Word document at all. python-docx reports that through
	    # several unrelated exception types (e.g. zipfile.BadZipFile for a non-zip
	    # payload), so this UI boundary catches broadly and shows a message
	    # instead of a raw traceback.
	    try:
	        doc = Document(uploaded_file)
	    except Exception as exc:
	        st.error(f"Could not read the uploaded file as a .docx document: {exc}")
	        st.stop()

	    removed_headings, cleaned_paragraphs = _split_paragraphs(doc)
	    cleaned_text = "\n\n".join(cleaned_paragraphs)

	    # 🗑️ Removed Headings Section
	    st.subheader("🗑️ Removed Headings")

	    if removed_headings:
	        # Numbered list, one heading per line, starting at 1.
	        st.code("\n".join(f"{i}. {h}" for i, h in enumerate(removed_headings, 1)))
	    else:
	        st.info("No headings found")

	    # 📝 Cleaned Text Section
	    st.subheader("📝 Cleaned Text")

	    st.text_area(
	        "Paragraph Content",
	        value=cleaned_text,
	        height=400,
	    )

	    # 📥 Download cleaned DOCX
	    # The new document holds only the kept paragraphs as plain text.
	    st.download_button(
	        label="⬇️ Download Cleaned DOCX",
	        data=_build_docx(cleaned_paragraphs),
	        file_name="cleaned.docx",
	        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	    )