"""Gradio demo that restores punctuation and capitalization with XLM-RoBERTa.

The app runs the same input through three strategies (conservative, sentence
boundary detection, and a "balanced" mode that chunks long inputs) and shows
the three results side by side so the user can copy or select one.
"""

import gradio as gr
from punctuators.models import PunctCapSegModelONNX

MODEL_NAME = "1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase"

# Inputs longer than this many characters are split into chunks by the
# "Balanced" strategy before being sent to the model.
MAX_CHUNK_CHARS = 500

# Load the punctuation model once at import time so every request reuses it.
print("Loading XLM-RoBERTa punctuation model...")
model = PunctCapSegModelONNX.from_pretrained(MODEL_NAME)
print("Model loaded successfully!")


def _as_text(result):
    """Return a model result as one string, joining sentence lists with newlines."""
    return "\n".join(result) if isinstance(result, list) else result


def _split_into_chunks(text, max_chars=MAX_CHUNK_CHARS):
    """Split *text* into non-overlapping chunks of at most *max_chars* characters.

    Cuts are made at the last whitespace inside each window so words are not
    split in half. When a window contains no whitespace at all (for example
    text in a script that does not separate words with spaces) the window is
    cut at exactly *max_chars*. Chunks never overlap, so re-joining them does
    not duplicate any text.
    """
    chunks = []
    length = len(text)
    start = 0
    while start < length:
        end = min(start + max_chars, length)
        if end < length:
            # Walk back to the nearest whitespace so the cut lands between words.
            cut = end
            while cut > start and not text[cut].isspace():
                cut -= 1
            if cut > start:
                end = cut
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        start = end
    return chunks


def _punctuate_conservative(text):
    """Punctuate *text* as a single unit, without sentence boundary detection."""
    result = model.infer(texts=[text], apply_sbd=False)
    return _as_text(result[0])


def _punctuate_with_boundaries(text):
    """Punctuate *text* with sentence boundary detection, one sentence per line."""
    result = model.infer(texts=[text], apply_sbd=True)
    return _as_text(result[0])


def _punctuate_balanced(text):
    """Punctuate *text*, chunking it first when it exceeds MAX_CHUNK_CHARS.

    Short inputs are handled exactly like the sentence-boundary strategy.
    Long inputs are split into non-overlapping chunks, punctuated without
    sentence boundary detection, and re-joined with single spaces.
    """
    if len(text) <= MAX_CHUNK_CHARS:
        return _punctuate_with_boundaries(text)

    chunks = _split_into_chunks(text, MAX_CHUNK_CHARS)
    # One batched call instead of one call per chunk.
    results = model.infer(texts=chunks, apply_sbd=False)
    return " ".join(_as_text(item) for item in results)


def punctuate_text(input_text, progress=gr.Progress()):
    """Generate three punctuation corrections using different strategies.

    Args:
        input_text: Raw text to correct. ``None`` or blank input is accepted.
        progress: Gradio progress tracker; updated before and after each
            strategy runs.

    Returns:
        A list of three strings in the order Conservative, With Sentence
        Boundaries, Balanced. All three are empty strings when the input is
        empty or contains only whitespace.
    """
    if not input_text or not input_text.strip():
        return ["", "", ""]

    strategies = [
        ("Conservative", _punctuate_conservative),
        ("With Sentence Boundaries", _punctuate_with_boundaries),
        ("Balanced", _punctuate_balanced),
    ]
    total = len(strategies)

    corrections = []
    for index, (name, strategy) in enumerate(strategies):
        progress((index + 0.5) / total, desc=f"Generating {name} version...")
        corrections.append(strategy(input_text))
        progress((index + 1) / total, desc=f"{name} version complete")

    progress(1.0, desc="All corrections generated!")
    return corrections


def _copy_to_clipboard_js(elem_id, label):
    """Build the client-side handler that copies the textarea in *elem_id*."""
    return f"""
    () => {{
        const outputText = document.querySelector('#{elem_id} textarea').value;
        navigator.clipboard.writeText(outputText);
        alert('{label} version copied to clipboard!');
    }}
    """


# Create Gradio interface
with gr.Blocks(
    title="Multilingual Punctuation & Capitalization Correction",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown("""
    # 🌍 Multilingual Punctuation & Capitalization Correction

    This tool uses **XLM-RoBERTa** to restore punctuation, fix capitalization, and detect sentence boundaries in **47 languages**.

    Enter text without proper punctuation or capitalization, and get 3 different correction styles:

    - **📝 Conservative**: Minimal changes, preserves original flow
    - **📖 With Sentence Boundaries**: Splits text into clear sentences
    - **⚖️ Balanced**: Smart chunking for longer texts
    """)

    with gr.Row():
        with gr.Column(scale=2):
            input_text = gr.Textbox(
                label="Input Text (any of 47 supported languages)",
                placeholder="enter text without punctuation or capitalization like this example here it will be fixed",
                lines=12,
                max_lines=20,
            )
            correct_btn = gr.Button(
                "🚀 Add Punctuation & Capitalization",
                variant="primary",
                size="lg",
            )

    # Output section with 3 versions
    gr.Markdown("### 📝 Correction Options")

    with gr.Row():
        with gr.Column():
            gr.Markdown("#### 📝 Conservative")
            output_conservative = gr.Textbox(
                label="Conservative Correction",
                lines=10,
                max_lines=15,
                interactive=True,
                elem_id="conservative_output",
            )
            copy_btn_1 = gr.Button("📋 Copy", variant="secondary", size="sm")

        with gr.Column():
            gr.Markdown("#### 📖 With Sentence Boundaries")
            output_boundaries = gr.Textbox(
                label="Sentence Boundary Detection",
                lines=10,
                max_lines=15,
                interactive=True,
                elem_id="boundaries_output",
            )
            copy_btn_2 = gr.Button("📋 Copy", variant="secondary", size="sm")

        with gr.Column():
            gr.Markdown("#### ⚖️ Balanced")
            output_balanced = gr.Textbox(
                label="Balanced Correction",
                lines=10,
                max_lines=15,
                interactive=True,
                elem_id="balanced_output",
            )
            copy_btn_3 = gr.Button("📋 Copy", variant="secondary", size="sm")

    # Selected version display
    # NOTE(review): the original indentation was lost; heading and textbox are
    # both placed inside the row here -- confirm against the intended layout.
    with gr.Row():
        gr.Markdown("### ✅ Selected Correction")
        selected_text = gr.Textbox(
            label="Your Selected Correction",
            lines=5,
            interactive=True,
            placeholder="Click 'Use This' under any correction to select it",
        )

    # Add selection buttons (one per correction, in the same column order)
    with gr.Row():
        with gr.Column():
            select_btn_1 = gr.Button("✅ Use This", variant="primary", size="sm")
        with gr.Column():
            select_btn_2 = gr.Button("✅ Use This", variant="primary", size="sm")
        with gr.Column():
            select_btn_3 = gr.Button("✅ Use This", variant="primary", size="sm")

    # Add examples
    gr.Examples(
        examples=[
            ["hello there how are you doing today i hope everything is going well"],
            ["the quick brown fox jumps over the lazy dog this is a test sentence"],
            ["machine learning is revolutionizing many industries from healthcare to finance"],
            ["bonjour comment allez vous aujourdhui jespere que tout va bien"],
            ["hola como estas espero que todo este bien contigo y tu familia"],
        ],
        inputs=input_text,
        label="Example sentences (click to try)",
    )

    # Set up event handlers
    outputs = [output_conservative, output_boundaries, output_balanced]

    correct_btn.click(fn=punctuate_text, inputs=input_text, outputs=outputs)
    input_text.submit(fn=punctuate_text, inputs=input_text, outputs=outputs)

    # Selection handlers: copy the chosen correction into the "selected" box.
    for select_btn, source_box in zip(
        (select_btn_1, select_btn_2, select_btn_3),
        outputs,
    ):
        select_btn.click(fn=lambda x: x, inputs=source_box, outputs=selected_text)

    # JavaScript for copy functionality (runs in the browser only; no Python fn).
    for copy_btn, elem_id, label in (
        (copy_btn_1, "conservative_output", "Conservative"),
        (copy_btn_2, "boundaries_output", "Sentence boundaries"),
        (copy_btn_3, "balanced_output", "Balanced"),
    ):
        copy_btn.click(None, None, None, js=_copy_to_clipboard_js(elem_id, label))

    gr.Markdown("""
    ---
    **Model:** [1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase](https://huggingface.co/1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase)

    **Supports 47 languages** including: English, French, Spanish, German, Italian, Portuguese, Russian, Turkish, Chinese, Japanese, Arabic, and many more!
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )