# Hugging Face Space (status: Running)
import html
import re

import gradio as gr
from docx import Document
def clean_markdown_heading(text: str) -> str:
    """Return *text* without its leading Markdown ``#`` heading marker.

    The run of ``#`` characters at the very start of the string, plus any
    whitespace directly after it, is removed; the result is then stripped of
    surrounding whitespace. Text that does not start with ``#`` is only
    stripped.
    """
    return re.sub(r"^#+\s*", "", text).strip()
# Display colour for each heading tag produced by process_doc.
_TAG_COLORS = {
    "docx": "#1f77b4",
    "markdown": "#2ca02c",
    "segment": "#ff7f0e",
    "colon": "#9467bd",
    "stage": "#d62728",
}


def get_color(tag: str) -> str:
    """Return the CSS colour used to render a removed heading of kind *tag*.

    Unknown tags fall back to ``"black"``.
    """
    return _TAG_COLORS.get(tag, "black")
# Patterns that mark a paragraph as heading-like; compiled once at import
# instead of on every paragraph.
_MARKDOWN_HEADING_RE = re.compile(r"^#+\s+")
_SEGMENT_HEADING_RE = re.compile(r"(segment\s*\d+)")
_STAGE_DIRECTION_RE = re.compile(r"^\*\[.*\]\*$")
_STAGE_WRAPPER_RE = re.compile(r"^\*\[|\]\*$")


def _classify_paragraph(text, style_name):
    """Return ``(display_text, tag)`` if *text* is heading-like, else ``None``.

    *text* is the stripped paragraph text and *style_name* the lower-cased
    name of its DOCX style. When several rules match, the first one in this
    order wins: markdown, stage, docx, segment, colon.
    """
    if _MARKDOWN_HEADING_RE.match(text):
        return clean_markdown_heading(text), "markdown"
    if _STAGE_DIRECTION_RE.match(text):
        # Drop the surrounding "*[" and "]*" so only the direction remains.
        return _STAGE_WRAPPER_RE.sub("", text).strip(), "stage"
    if "heading" in style_name:
        return text, "docx"
    if _SEGMENT_HEADING_RE.match(text.lower()):
        return text, "segment"
    # A short line ending in a colon is treated as a title.
    if text.endswith(":") and len(text.split()) <= 6:
        return text, "colon"
    return None


def process_doc(file):
    """Split a DOCX file into removed heading lines and remaining body text.

    Args:
        file: The uploaded file as handed over by the ``gr.File`` input:
            either an object exposing the path as ``.name`` or a plain path
            string. ``None`` means nothing was uploaded.

    Returns:
        A ``(headings_html, cleaned_text)`` tuple. ``headings_html`` holds one
        coloured ``<span>`` per removed heading, joined by ``<br><br>``, or a
        grey placeholder when none were found. ``cleaned_text`` is the
        remaining non-empty paragraphs joined by blank lines.
    """
    removed_headings = []
    cleaned_paragraphs = []

    if file is not None:
        # Accept both an object with ``.name`` and a bare path string.
        doc = Document(getattr(file, "name", file))
        for para in doc.paragraphs:
            text = para.text.strip()
            if not text:
                continue
            # Tolerate a paragraph whose style or style name is missing.
            style_name = (getattr(para.style, "name", None) or "").lower()
            heading = _classify_paragraph(text, style_name)
            if heading is None:
                cleaned_paragraphs.append(text)
            else:
                removed_headings.append(heading)

    cleaned_text = "\n\n".join(cleaned_paragraphs)

    if removed_headings:
        # The heading text comes from the uploaded document and is rendered as
        # HTML, so escape it to keep document content from injecting markup.
        headings_output = "<br><br>".join(
            f"<span style='color:{get_color(tag)}; font-weight:600'>"
            f"{html.escape(text)}</span>"
            for text, tag in removed_headings
        )
    else:
        # NOTE(review): the characters around the message look like a
        # mis-encoded emoji; left unchanged until the intended one is confirmed.
        headings_output = "<span style='color:gray'> π΄ No headings found π΄ </span>"

    return headings_output, cleaned_text
# Custom CSS applied only to the cleaned-output textbox, which is selected by
# its element id "cleaned_output": white text on a black background.
custom_css = """
#cleaned_output textarea {
    background-color: black !important;
    color: white !important;
}
"""
# Build the UI: a centred title, a centred upload/process area, then two
# side-by-side result panes (removed headings on the left, cleaned text on the
# right).
# NOTE(review): the leading characters in the File label and Button text look
# like mis-encoded emoji; they are kept as found until the intended ones are
# confirmed.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as app:
    # Title row; the empty outer columns only centre the middle one.
    with gr.Row():
        with gr.Column(scale=1):
            pass
        with gr.Column(scale=2):
            gr.Markdown("## Upload Docx Here to Remove Titles")
        with gr.Column(scale=1):
            pass

    # Upload row, centred the same way.
    with gr.Row():
        with gr.Column(scale=1):
            pass
        with gr.Column(scale=2):
            file_input = gr.File(
                file_types=[".docx"],
                label="π€ Upload DOCX File",
                height=60,
            )
            process_btn = gr.Button("π Click to Process", variant="primary")
        with gr.Column(scale=1):
            pass

    gr.Markdown("---")

    # Result row.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### ===========-------> Removed Headings <-------===========")
            headings_output = gr.Markdown()
        with gr.Column():
            # elem_id lets custom_css target this textbox specifically.
            cleaned_output = gr.Textbox(
                lines=25,
                elem_id="cleaned_output",
                buttons=["copy"],
            )

    # Run process_doc on the uploaded file and fill both result panes.
    process_btn.click(
        fn=process_doc,
        inputs=[file_input],
        outputs=[headings_output, cleaned_output],
    )

app.launch()