# rohitdiwane's picture
# Update app.py
# cefcafd verified
import html
import re

import gradio as gr
from docx import Document
def clean_markdown_heading(text):
    """Return *text* with its leading Markdown heading marker removed.

    Strips an initial run of ``#`` characters together with any whitespace
    around it, then trims the result. Text that does not start with ``#``
    (after optional leading whitespace) is only trimmed.

    Args:
        text: A single line of text, e.g. ``"## Title"``.

    Returns:
        The heading text without the ``#`` prefix, e.g. ``"Title"``.
    """
    # Allow leading whitespace before the hashes so an indented heading such
    # as "  ## Title" is cleaned too instead of keeping its marker.
    return re.sub(r"^\s*#+\s*", "", text).strip()
def get_color(tag):
    """Look up the display colour for a removed-heading category.

    Args:
        tag: Category name (``"docx"``, ``"markdown"``, ``"segment"``,
            ``"colon"`` or ``"stage"``).

    Returns:
        The colour string for *tag*, or ``"black"`` for any other tag.
    """
    palette = {
        "docx": "#1f77b4",
        "markdown": "#2ca02c",
        "segment": "#ff7f0e",
        "colon": "#9467bd",
        "stage": "#d62728",
    }
    if tag in palette:
        return palette[tag]
    return "black"
def _classify_paragraph(text, style_name):
    """Classify one paragraph as a heading-like line to strip, or as body text.

    The checks run in a fixed priority order (markdown, stage direction,
    Word heading style, "segment N", short colon-terminated title); the
    first match wins.

    Args:
        text: Paragraph text, already stripped and non-empty.
        style_name: Lower-cased name of the paragraph's style.

    Returns:
        ``(display_text, tag)`` when the paragraph should be removed, where
        *tag* is one of the keys understood by ``get_color``; ``None`` when
        the paragraph is ordinary body text.
    """
    if re.match(r"^#+\s+", text):
        return clean_markdown_heading(text), "markdown"
    if re.match(r"^\*\[.*\]\*$", text):
        # Drop the surrounding "*[" and "]*" so only the direction remains.
        return re.sub(r"^\*\[|\]\*$", "", text).strip(), "stage"
    if "heading" in style_name:
        return text, "docx"
    # NOTE(review): this matches any paragraph that merely *starts* with
    # "segment <n>", including long body paragraphs - confirm that is intended.
    if re.match(r"(segment\s*\d+)", text.lower()):
        return text, "segment"
    if text.endswith(":") and len(text.split()) <= 6:
        return text, "colon"
    return None


def process_doc(file):
    """Split a DOCX document into removed headings and remaining body text.

    Args:
        file: The uploaded file, either an object exposing the path as
            ``.name`` or a plain path; ``None`` when nothing was uploaded.

    Returns:
        A ``(headings_output, cleaned_text)`` tuple. ``headings_output`` is an
        HTML fragment listing each removed heading in its category colour
        (or a grey "No headings found" notice). ``cleaned_text`` is the
        remaining paragraphs joined by blank lines.
    """
    removed_headings = []
    cleaned_paragraphs = []

    if file is not None:
        # Accept both an upload object carrying ``.name`` and a bare path.
        doc = Document(getattr(file, "name", file))
        for para in doc.paragraphs:
            text = para.text.strip()
            if not text:
                continue

            # Guard against a missing style or style name so one odd
            # paragraph cannot abort the whole document.
            style = para.style
            style_name = (style.name or "").lower() if style is not None else ""

            removed = _classify_paragraph(text, style_name)
            if removed is None:
                cleaned_paragraphs.append(text)
            else:
                removed_headings.append(removed)

    cleaned_text = "\n\n".join(cleaned_paragraphs)

    if removed_headings:
        # Heading text comes from the uploaded document, so escape it before
        # embedding it in markup; otherwise "<", ">" and "&" corrupt the list.
        headings_output = "<br><br>".join(
            f"<span style='color:{get_color(tag)}; font-weight:600'>"
            f"{html.escape(heading, quote=False)}</span>"
            for heading, tag in removed_headings
        )
    else:
        headings_output = "<span style='color:gray'> 🔴 No headings found 🔴 </span>"

    return headings_output, cleaned_text
# Custom CSS that targets only the cleaned-output textbox (matched through
# the elem_id given to that component below).
custom_css = """
#cleaned_output textarea {
background-color: black !important;
color: white !important;
}
"""

with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as app:
    # Title row: empty side columns centre the middle column.
    with gr.Row():
        with gr.Column(scale=1): pass
        with gr.Column(scale=2):
            gr.Markdown("## Upload Docx Here to Remove Titles")
        with gr.Column(scale=1): pass

    # Upload row: file picker and the button that triggers processing.
    with gr.Row():
        with gr.Column(scale=1): pass
        with gr.Column(scale=2):
            file_input = gr.File(
                file_types=[".docx"],
                label="📀 Upload DOCX File",
                height=60
            )
            process_btn = gr.Button("🚀 Click to Process", variant="primary")
        with gr.Column(scale=1): pass

    gr.Markdown("---")

    # Results row: removed headings on the left, cleaned text on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### ===========-------> Removed Headings <-------===========")
            headings_output = gr.Markdown()
        with gr.Column():
            # elem_id lets custom_css style this textbox specifically.
            cleaned_output = gr.Textbox(lines=25, elem_id="cleaned_output", buttons=["copy"])

    # process_doc returns (headings HTML, cleaned text), in the same order
    # as the two output components.
    process_btn.click(
        fn=process_doc,
        inputs=[file_input],
        outputs=[headings_output, cleaned_output]
    )

app.launch()