Spaces:
Runtime error
Runtime error
File size: 8,073 Bytes
06027df |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
import textwrap

import gradio as gr
from punctuators.models import PunctCapSegModelONNX
# Hugging Face repository id of the pretrained punctuation checkpoint.
_MODEL_ID = "1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase"

# Load the model once at import time; the request handler reuses this
# module-level instance instead of reloading it per call.
print("Loading XLM-RoBERTa punctuation model...")
model = PunctCapSegModelONNX.from_pretrained(_MODEL_ID)
print("Model loaded successfully!")
# Inputs longer than this many characters are chunked by the "Balanced" strategy.
_BALANCED_CHUNK_CHARS = 500


def _split_into_chunks(text, max_chars=_BALANCED_CHUNK_CHARS):
    """Split *text* into non-overlapping chunks of at most *max_chars* characters.

    Breaks fall on whitespace where possible so words are not cut in half;
    only a single run of non-whitespace longer than *max_chars* is split
    mid-run. Whitespace at chunk edges is dropped and the remaining
    whitespace characters are normalised to plain spaces.
    """
    return textwrap.wrap(
        text,
        width=max_chars,
        break_long_words=True,
        # Keep hyphenated words intact: chunks are re-joined with a space,
        # which would otherwise turn "well-known" into "well- known".
        break_on_hyphens=False,
    )


def _flatten_prediction(prediction):
    """Return one model prediction as a single string.

    A prediction may be a plain string or a list of sentence strings; a list
    is rendered with one sentence per line.
    """
    if isinstance(prediction, list):
        return "\n".join(prediction)
    return prediction


def _infer_single(text, apply_sbd):
    """Run the model on one text and return its flattened prediction."""
    predictions = model.infer(texts=[text], apply_sbd=apply_sbd)
    return _flatten_prediction(predictions[0])


def _infer_balanced(text):
    """Return the "Balanced" correction for *text*.

    Short inputs are processed whole with sentence boundary detection.
    Longer inputs are split into non-overlapping chunks, processed without
    sentence boundary detection, and re-joined with single spaces.
    """
    if len(text) <= _BALANCED_CHUNK_CHARS:
        return _infer_single(text, apply_sbd=True)
    chunks = _split_into_chunks(text)
    # One batched call; the model returns one prediction per input text.
    predictions = model.infer(texts=chunks, apply_sbd=False)
    return " ".join(_flatten_prediction(item) for item in predictions)


def punctuate_text(input_text, progress=gr.Progress()):
    """Generate three punctuation/capitalization corrections of *input_text*.

    Args:
        input_text: Raw text to correct. ``None``, empty, or whitespace-only
            input short-circuits without calling the model.
        progress: Gradio progress tracker, called with a completed fraction
            and a description before and after each strategy runs.

    Returns:
        A list of three strings, in this order: the "Conservative" result
        (no sentence boundary detection), the "With Sentence Boundaries"
        result (one detected sentence per line), and the "Balanced" result
        (see ``_infer_balanced``). All three are empty strings when the
        input is blank.
    """
    if not input_text or not input_text.strip():
        return ["", "", ""]

    strategies = (
        ("Conservative", lambda text: _infer_single(text, apply_sbd=False)),
        ("With Sentence Boundaries", lambda text: _infer_single(text, apply_sbd=True)),
        ("Balanced", _infer_balanced),
    )
    total = len(strategies)

    corrections = []
    for index, (name, run) in enumerate(strategies):
        progress((index + 0.5) / total, desc=f"Generating {name} version...")
        corrections.append(run(input_text))
        progress((index + 1) / total, desc=f"{name} version complete")

    progress(1.0, desc="All corrections generated!")
    return corrections
# Create Gradio interface
#
# NOTE(review): the "π" glyphs in the user-facing strings below look like
# mis-decoded emoji. The intended characters cannot be recovered from this
# file, so they are left exactly as found - confirm against the original.

# Introductory text shown at the top of the page.
_INTRO_MARKDOWN = """
# π Multilingual Punctuation & Capitalization Correction
This tool uses **XLM-RoBERTa** to restore punctuation, fix capitalization, and detect sentence boundaries in **47 languages**.
Enter text without proper punctuation or capitalization, and get 3 different correction styles:
- **π Conservative**: Minimal changes, preserves original flow
- **π With Sentence Boundaries**: Splits text into clear sentences
- **⚖️ Balanced**: Smart chunking for longer texts
"""

# Footer with a link to the model card.
_FOOTER_MARKDOWN = """
---
**Model:** [1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase](https://huggingface.co/1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase)
**Supports 47 languages** including: English, French, Spanish, German, Italian, Portuguese, Russian, Turkish, Chinese, Japanese, Arabic, and many more!
"""


def _correction_column(heading, label, elem_id):
    """Create one output column: heading, editable textbox, and copy button.

    Call this inside the ``with gr.Row():`` that should contain the column.

    Args:
        heading: Markdown heading rendered above the textbox.
        label: Label of the textbox.
        elem_id: DOM id given to the textbox; the copy handler uses it to
            locate the ``<textarea>`` in the browser.

    Returns:
        A ``(textbox, copy_button)`` tuple.
    """
    with gr.Column():
        gr.Markdown(heading)
        textbox = gr.Textbox(
            label=label,
            lines=10,
            max_lines=15,
            interactive=True,
            elem_id=elem_id,
        )
        copy_button = gr.Button("π Copy", variant="secondary", size="sm")
    return textbox, copy_button


def _copy_to_clipboard_js(elem_id, version_name):
    """Return the browser-side function that copies one output to the clipboard.

    Args:
        elem_id: DOM id of the textbox whose ``<textarea>`` value is copied.
        version_name: Name used in the confirmation alert shown to the user.
    """
    return f"""
    () => {{
        const outputText = document.querySelector('#{elem_id} textarea').value;
        navigator.clipboard.writeText(outputText);
        alert('{version_name} version copied to clipboard!');
    }}
    """


with gr.Blocks(
    title="Multilingual Punctuation & Capitalization Correction",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Input section
    with gr.Row():
        with gr.Column(scale=2):
            input_text = gr.Textbox(
                label="Input Text (any of 47 supported languages)",
                placeholder="enter text without punctuation or capitalization like this example here it will be fixed",
                lines=12,
                max_lines=20,
            )
            correct_btn = gr.Button(
                "π Add Punctuation & Capitalization",
                variant="primary",
                size="lg",
            )

    # Output section with 3 versions
    gr.Markdown("### π Correction Options")
    with gr.Row():
        output_conservative, copy_btn_1 = _correction_column(
            "#### π Conservative",
            "Conservative Correction",
            "conservative_output",
        )
        output_boundaries, copy_btn_2 = _correction_column(
            "#### π With Sentence Boundaries",
            "Sentence Boundary Detection",
            "boundaries_output",
        )
        output_balanced, copy_btn_3 = _correction_column(
            "#### ⚖️ Balanced",
            "Balanced Correction",
            "balanced_output",
        )

    # Selected version display
    with gr.Row():
        gr.Markdown("### ✅ Selected Correction")
    selected_text = gr.Textbox(
        label="Your Selected Correction",
        lines=5,
        interactive=True,
        placeholder="Click 'Use This' under any correction to select it",
    )

    # Selection buttons, one per correction, in the same left-to-right order
    # as the output columns above.
    with gr.Row():
        with gr.Column():
            select_btn_1 = gr.Button("✅ Use This", variant="primary", size="sm")
        with gr.Column():
            select_btn_2 = gr.Button("✅ Use This", variant="primary", size="sm")
        with gr.Column():
            select_btn_3 = gr.Button("✅ Use This", variant="primary", size="sm")

    # Add examples
    gr.Examples(
        examples=[
            ["hello there how are you doing today i hope everything is going well"],
            ["the quick brown fox jumps over the lazy dog this is a test sentence"],
            ["machine learning is revolutionizing many industries from healthcare to finance"],
            ["bonjour comment allez vous aujourdhui jespere que tout va bien"],
            ["hola como estas espero que todo este bien contigo y tu familia"],
        ],
        inputs=input_text,
        label="Example sentences (click to try)",
    )

    # Set up event handlers: the button and pressing Enter in the input box
    # both run the same correction function and fill all three outputs.
    outputs = [output_conservative, output_boundaries, output_balanced]
    correct_btn.click(fn=punctuate_text, inputs=input_text, outputs=outputs)
    input_text.submit(fn=punctuate_text, inputs=input_text, outputs=outputs)

    # Selection handlers: copy the chosen output verbatim into the
    # "selected" textbox.
    select_btn_1.click(fn=lambda x: x, inputs=output_conservative, outputs=selected_text)
    select_btn_2.click(fn=lambda x: x, inputs=output_boundaries, outputs=selected_text)
    select_btn_3.click(fn=lambda x: x, inputs=output_balanced, outputs=selected_text)

    # JavaScript for copy functionality: no Python function, inputs, or
    # outputs are involved; only the browser-side snippet runs.
    copy_btn_1.click(
        None,
        None,
        None,
        js=_copy_to_clipboard_js("conservative_output", "Conservative"),
    )
    copy_btn_2.click(
        None,
        None,
        None,
        js=_copy_to_clipboard_js("boundaries_output", "Sentence boundaries"),
    )
    copy_btn_3.click(
        None,
        None,
        None,
        js=_copy_to_clipboard_js("balanced_output", "Balanced"),
    )

    gr.Markdown(_FOOTER_MARKDOWN)
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch(
        # Listen on all network interfaces rather than only localhost.
        server_name="0.0.0.0",
        server_port=7860,
        # Do not create a public share link.
        share=False,
    )