File size: 8,073 Bytes
06027df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import gradio as gr
from punctuators.models import PunctCapSegModelONNX

# Build the shared punctuation model once, at import time, so every request
# handled by the UI below reuses the same instance. The prints bracket the
# load so its start and end are visible in the process output.
print("Loading XLM-RoBERTa punctuation model...")
model = PunctCapSegModelONNX.from_pretrained("1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase")
print("Model loaded successfully!")

def _split_on_whitespace(text, max_chars=500):
    """Split ``text`` into consecutive, non-overlapping chunks of whole words.

    Words come from ``text.split()`` and are re-joined with single spaces.
    Each chunk is at most ``max_chars`` characters long, except that a single
    whitespace-free run longer than ``max_chars`` is kept intact as its own
    chunk instead of being cut in the middle.
    """
    chunks = []
    words = []
    length = 0
    for word in text.split():
        # One extra character for the joining space once the chunk is non-empty.
        needed = len(word) + (1 if words else 0)
        if words and length + needed > max_chars:
            chunks.append(" ".join(words))
            words, length = [word], len(word)
        else:
            words.append(word)
            length += needed
    if words:
        chunks.append(" ".join(words))
    return chunks


def _join_sentences(result):
    """Return ``result`` as one string, joining a list of sentences with newlines."""
    return "\n".join(result) if isinstance(result, list) else result


def punctuate_text(input_text, progress=gr.Progress()):
    """
    Generate 3 different punctuation corrections with varying strategies.

    Args:
        input_text: Raw text to correct. ``None``, empty, or whitespace-only
            input short-circuits to three empty strings.
        progress: Gradio progress tracker; called with a completion fraction
            and a description before and after each variant.

    Returns:
        A list of three strings, in this order:
        "Conservative" (``apply_sbd=False``),
        "With Sentence Boundaries" (``apply_sbd=True``, sentences joined by
        newlines), and "Balanced" (same as the previous one for inputs of up
        to 500 characters; longer inputs are split into word-aligned chunks,
        inferred with ``apply_sbd=False`` and re-joined with spaces).
    """
    # The textbox may deliver None as well as "", so guard before .strip().
    if not input_text or not input_text.strip():
        return ["", "", ""]

    max_chunk_chars = 500

    # Three different approaches
    configs = [
        {"name": "Conservative", "apply_sbd": False, "chunk_long_text": False},
        {"name": "With Sentence Boundaries", "apply_sbd": True, "chunk_long_text": False},
        {"name": "Balanced", "apply_sbd": True, "chunk_long_text": True},
    ]
    total = len(configs)
    corrections = []

    for i, config in enumerate(configs):
        progress((i + 0.5) / total, desc=f"Generating {config['name']} version...")

        if config["chunk_long_text"] and len(input_text) > max_chunk_chars:
            # Chunks must not overlap: the results are concatenated, so any
            # overlapping region would appear twice in the output. Splitting
            # on whitespace also avoids handing the model half-words.
            chunks = _split_on_whitespace(input_text, max_chunk_chars)
            # One batched call; with apply_sbd=False each result is expected
            # to be a single string per input text.
            chunk_results = model.infer(texts=chunks, apply_sbd=False)
            corrected_text = " ".join(chunk_results)
        else:
            result = model.infer(texts=[input_text], apply_sbd=config["apply_sbd"])
            # result[0] is either one string or a list of sentences,
            # depending on apply_sbd; normalise both to a single string.
            corrected_text = _join_sentences(result[0])

        corrections.append(corrected_text)
        progress((i + 1) / total, desc=f"{config['name']} version complete")

    progress(1.0, desc="All corrections generated!")
    return corrections

# Create Gradio interface
#
# Layout: intro text, input box + run button, three side-by-side result
# columns (each with its own Copy / Use This buttons), a "selected" box that
# receives whichever result the user picks, clickable examples, and a footer.
#
# NOTE(review): the emoji in the labels below were restored from mis-decoded
# text. The memo emoji (📝) is a best guess because its last byte had been
# lost in the garbled form - confirm against the intended design.
with gr.Blocks(title="Multilingual Punctuation & Capitalization Correction", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🌍 Multilingual Punctuation & Capitalization Correction

    This tool uses **XLM-RoBERTa** to restore punctuation, fix capitalization, and detect sentence boundaries in **47 languages**.

    Enter text without proper punctuation or capitalization, and get 3 different correction styles:
    - **📝 Conservative**: Minimal changes, preserves original flow
    - **📖 With Sentence Boundaries**: Splits text into clear sentences
    - **⚖️ Balanced**: Smart chunking for longer texts
    """)

    with gr.Row():
        with gr.Column(scale=2):
            input_text = gr.Textbox(
                label="Input Text (any of 47 supported languages)",
                placeholder="enter text without punctuation or capitalization like this example here it will be fixed",
                lines=12,
                max_lines=20
            )
            correct_btn = gr.Button("🚀 Add Punctuation & Capitalization", variant="primary", size="lg")

    # Output section with 3 versions
    gr.Markdown("### 📝 Correction Options")

    # Each column owns its result box plus its Copy and Use This buttons, so
    # the "Use This" buttons really do sit under the correction they select
    # (as the placeholder of the selected-text box tells the user).
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### 📝 Conservative")
            output_conservative = gr.Textbox(
                label="Conservative Correction",
                lines=10,
                max_lines=15,
                interactive=True,
                elem_id="conservative_output"
            )
            copy_btn_1 = gr.Button("📋 Copy", variant="secondary", size="sm")
            select_btn_1 = gr.Button("✅ Use This", variant="primary", size="sm")

        with gr.Column():
            gr.Markdown("#### 📖 With Sentence Boundaries")
            output_boundaries = gr.Textbox(
                label="Sentence Boundary Detection",
                lines=10,
                max_lines=15,
                interactive=True,
                elem_id="boundaries_output"
            )
            copy_btn_2 = gr.Button("📋 Copy", variant="secondary", size="sm")
            select_btn_2 = gr.Button("✅ Use This", variant="primary", size="sm")

        with gr.Column():
            gr.Markdown("#### ⚖️ Balanced")
            output_balanced = gr.Textbox(
                label="Balanced Correction",
                lines=10,
                max_lines=15,
                interactive=True,
                elem_id="balanced_output"
            )
            copy_btn_3 = gr.Button("📋 Copy", variant="secondary", size="sm")
            select_btn_3 = gr.Button("✅ Use This", variant="primary", size="sm")

    # Selected version display
    with gr.Row():
        gr.Markdown("### ✅ Selected Correction")
        selected_text = gr.Textbox(
            label="Your Selected Correction",
            lines=5,
            interactive=True,
            placeholder="Click 'Use This' under any correction to select it"
        )

    # Add examples
    gr.Examples(
        examples=[
            ["hello there how are you doing today i hope everything is going well"],
            ["the quick brown fox jumps over the lazy dog this is a test sentence"],
            ["machine learning is revolutionizing many industries from healthcare to finance"],
            ["bonjour comment allez vous aujourdhui jespere que tout va bien"],
            ["hola como estas espero que todo este bien contigo y tu familia"],
        ],
        inputs=input_text,
        label="Example sentences (click to try)"
    )

    # Set up event handlers: both the button and pressing Enter in the input
    # box run the same function and fill the three result boxes in order.
    outputs = [output_conservative, output_boundaries, output_balanced]
    correct_btn.click(fn=punctuate_text, inputs=input_text, outputs=outputs)
    input_text.submit(fn=punctuate_text, inputs=input_text, outputs=outputs)

    # Selection handlers: copy the chosen result box into the selected box.
    select_btn_1.click(fn=lambda x: x, inputs=output_conservative, outputs=selected_text)
    select_btn_2.click(fn=lambda x: x, inputs=output_boundaries, outputs=selected_text)
    select_btn_3.click(fn=lambda x: x, inputs=output_balanced, outputs=selected_text)

    # JavaScript for copy functionality. The three handlers differ only in
    # the element id they read from and the name shown in the alert, so they
    # are generated from one template. No Python function is attached; the
    # snippet runs in the browser only.
    for _copy_btn, _elem_id, _label in (
        (copy_btn_1, "conservative_output", "Conservative"),
        (copy_btn_2, "boundaries_output", "Sentence boundaries"),
        (copy_btn_3, "balanced_output", "Balanced"),
    ):
        _copy_btn.click(
            None,
            None,
            None,
            # Doubled braces are literal braces in the emitted JavaScript.
            js=f"""
        () => {{
            const outputText = document.querySelector('#{_elem_id} textarea').value;
            navigator.clipboard.writeText(outputText);
            alert('{_label} version copied to clipboard!');
        }}
        """
        )

    gr.Markdown("""
    ---
    **Model:** [1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase](https://huggingface.co/1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase)

    **Supports 47 languages** including: English, French, Spanish, German, Italian, Portuguese, Russian, Turkish, Chinese, Japanese, Arabic, and many more!
    """)

# Start the web server only when this file is executed directly, not when it
# is imported. The host, port and share settings are fixed here.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)