asdfasdfdsafdsa's picture
Upload 3 files
06027df verified
import gradio as gr
from punctuators.models import PunctCapSegModelONNX
# Load the punctuation model once at import time so every request reuses it.
_MODEL_ID = "1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase"

print("Loading XLM-RoBERTa punctuation model...")
model = PunctCapSegModelONNX.from_pretrained(_MODEL_ID)
print("Model loaded successfully!")
def _split_into_chunks(text, max_chars=500):
    """Split *text* into non-overlapping, whitespace-delimited chunks.

    Each chunk is at most *max_chars* characters long, except that a single
    word longer than *max_chars* is kept whole as its own chunk. Words are
    never cut in half and no text is repeated between chunks.
    """
    chunks = []
    current_words = []
    current_len = 0
    for word in text.split():
        # The extra 1 is the space that joins this word to the previous one.
        needed = len(word) + (1 if current_words else 0)
        if current_words and current_len + needed > max_chars:
            chunks.append(" ".join(current_words))
            current_words = [word]
            current_len = len(word)
        else:
            current_words.append(word)
            current_len += needed
    if current_words:
        chunks.append(" ".join(current_words))
    return chunks


def _as_text(prediction):
    """Return one model prediction as a single string.

    A list prediction (presumably one entry per detected sentence when
    sentence boundary detection is on) is joined with newlines; anything
    else is returned unchanged.
    """
    if isinstance(prediction, list):
        return "\n".join(prediction)
    return prediction


def punctuate_text(input_text, progress=gr.Progress()):
    """Generate three punctuation/capitalization corrections of *input_text*.

    The three variants are, in order:

    1. "Conservative" - one inference call with ``apply_sbd=False``.
    2. "With Sentence Boundaries" - one call with ``apply_sbd=True``; the
       resulting sentences are joined with newlines.
    3. "Balanced" - inputs longer than 500 characters are split into
       word-aligned chunks that are each punctuated with ``apply_sbd=False``
       and joined with spaces; shorter inputs are processed like variant 2.

    Args:
        input_text: Raw text to correct. ``None``, empty, or whitespace-only
            input short-circuits to three empty strings.
        progress: Gradio progress tracker, called with a fraction and a
            description before and after each variant.

    Returns:
        A list of three strings, one per variant, in the order above.
    """
    # Guard against None as well as blank input so `.strip()` cannot raise.
    if not input_text or not input_text.strip():
        return ["", "", ""]

    max_chunk_chars = 500
    configs = [
        {"name": "Conservative", "apply_sbd": False},
        {"name": "With Sentence Boundaries", "apply_sbd": True},
        {"name": "Balanced", "apply_sbd": True},
    ]
    total = len(configs)

    corrections = []
    for i, config in enumerate(configs):
        name = config["name"]
        progress((i + 0.5) / total, desc=f"Generating {name} version...")

        if name == "Balanced" and len(input_text) > max_chunk_chars:
            # Chunks must not overlap: the previous fixed-stride slicing
            # repeated the overlapping text in the joined output and could
            # split words in the middle.
            pieces = [
                _as_text(model.infer(texts=[chunk], apply_sbd=False)[0])
                for chunk in _split_into_chunks(input_text, max_chunk_chars)
            ]
            corrected_text = " ".join(pieces)
        else:
            result = model.infer(texts=[input_text], apply_sbd=config["apply_sbd"])
            corrected_text = _as_text(result[0])

        corrections.append(corrected_text)
        progress((i + 1) / total, desc=f"{name} version complete")

    progress(1.0, desc="All corrections generated!")
    return corrections
# Create Gradio interface.
# Layout: intro text, input box + run button, three side-by-side correction
# outputs (each with a copy button), a "selected correction" box with three
# "Use This" buttons, example inputs, and a footer.
# The emoji in the labels below were restored from mis-encoded text
# (e.g. "πŸ“–" -> "📖").
# NOTE(review): the "📝" emoji is a best guess; the original byte sequence
# was truncated.
with gr.Blocks(title="Multilingual Punctuation & Capitalization Correction", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🌍 Multilingual Punctuation & Capitalization Correction
    This tool uses **XLM-RoBERTa** to restore punctuation, fix capitalization, and detect sentence boundaries in **47 languages**.
    Enter text without proper punctuation or capitalization, and get 3 different correction styles:
    - **📝 Conservative**: Minimal changes, preserves original flow
    - **📖 With Sentence Boundaries**: Splits text into clear sentences
    - **⚖️ Balanced**: Smart chunking for longer texts
    """)

    with gr.Row():
        with gr.Column(scale=2):
            input_text = gr.Textbox(
                label="Input Text (any of 47 supported languages)",
                placeholder="enter text without punctuation or capitalization like this example here it will be fixed",
                lines=12,
                max_lines=20,
            )
            correct_btn = gr.Button("🚀 Add Punctuation & Capitalization", variant="primary", size="lg")

    # Output section with 3 versions
    gr.Markdown("### 📝 Correction Options")
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### 📝 Conservative")
            output_conservative = gr.Textbox(
                label="Conservative Correction",
                lines=10,
                max_lines=15,
                interactive=True,
                # elem_id is the DOM hook used by the copy-button JavaScript.
                elem_id="conservative_output",
            )
            copy_btn_1 = gr.Button("📋 Copy", variant="secondary", size="sm")
        with gr.Column():
            gr.Markdown("#### 📖 With Sentence Boundaries")
            output_boundaries = gr.Textbox(
                label="Sentence Boundary Detection",
                lines=10,
                max_lines=15,
                interactive=True,
                elem_id="boundaries_output",
            )
            copy_btn_2 = gr.Button("📋 Copy", variant="secondary", size="sm")
        with gr.Column():
            gr.Markdown("#### ⚖️ Balanced")
            output_balanced = gr.Textbox(
                label="Balanced Correction",
                lines=10,
                max_lines=15,
                interactive=True,
                elem_id="balanced_output",
            )
            copy_btn_3 = gr.Button("📋 Copy", variant="secondary", size="sm")

    # Selected version display
    # NOTE(review): the original nesting was lost; the heading and the textbox
    # are assumed to share this row - confirm against the intended layout.
    with gr.Row():
        gr.Markdown("### ✅ Selected Correction")
        selected_text = gr.Textbox(
            label="Your Selected Correction",
            lines=5,
            interactive=True,
            placeholder="Click 'Use This' under any correction to select it",
        )

    # Add selection buttons (one per correction, in the same column order)
    with gr.Row():
        with gr.Column():
            select_btn_1 = gr.Button("✅ Use This", variant="primary", size="sm")
        with gr.Column():
            select_btn_2 = gr.Button("✅ Use This", variant="primary", size="sm")
        with gr.Column():
            select_btn_3 = gr.Button("✅ Use This", variant="primary", size="sm")

    # Add examples
    gr.Examples(
        examples=[
            ["hello there how are you doing today i hope everything is going well"],
            ["the quick brown fox jumps over the lazy dog this is a test sentence"],
            ["machine learning is revolutionizing many industries from healthcare to finance"],
            ["bonjour comment allez vous aujourdhui jespere que tout va bien"],
            ["hola como estas espero que todo este bien contigo y tu familia"],
        ],
        inputs=input_text,
        label="Example sentences (click to try)",
    )

    # Set up event handlers: the button click and pressing Enter in the input
    # both run the same function and fill the three output boxes in order.
    outputs = [output_conservative, output_boundaries, output_balanced]
    correct_btn.click(fn=punctuate_text, inputs=input_text, outputs=outputs)
    input_text.submit(fn=punctuate_text, inputs=input_text, outputs=outputs)

    # Selection handlers: copy the chosen output box into the selected box.
    select_btn_1.click(fn=lambda x: x, inputs=output_conservative, outputs=selected_text)
    select_btn_2.click(fn=lambda x: x, inputs=output_boundaries, outputs=selected_text)
    select_btn_3.click(fn=lambda x: x, inputs=output_balanced, outputs=selected_text)

    # JavaScript for copy functionality. No Python function is attached
    # (fn/inputs/outputs are None); each handler reads the textarea inside the
    # matching elem_id and writes its value to the clipboard.
    copy_btn_1.click(
        None,
        None,
        None,
        js="""
        () => {
            const outputText = document.querySelector('#conservative_output textarea').value;
            navigator.clipboard.writeText(outputText);
            alert('Conservative version copied to clipboard!');
        }
        """,
    )
    copy_btn_2.click(
        None,
        None,
        None,
        js="""
        () => {
            const outputText = document.querySelector('#boundaries_output textarea').value;
            navigator.clipboard.writeText(outputText);
            alert('Sentence boundaries version copied to clipboard!');
        }
        """,
    )
    copy_btn_3.click(
        None,
        None,
        None,
        js="""
        () => {
            const outputText = document.querySelector('#balanced_output textarea').value;
            navigator.clipboard.writeText(outputText);
            alert('Balanced version copied to clipboard!');
        }
        """,
    )

    gr.Markdown("""
    ---
    **Model:** [1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase](https://huggingface.co/1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase)
    **Supports 47 languages** including: English, French, Spanish, German, Italian, Portuguese, Russian, Turkish, Chinese, Japanese, Arabic, and many more!
    """)
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    # Serve on host 0.0.0.0, port 7860, with Gradio's share link disabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)