|
|
import gradio as gr |
|
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
|
import torch |
|
|
|
|
|
|
|
|
# Hugging Face Hub identifier of the checkpoint this app serves.
model_name = "ufal/byt5-large-geccc-mate"

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the tokenizer and the seq2seq model once at import time and place the
# model on the selected device so every request reuses the same instance.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
|
|
|
|
|
def correct_text(input_text):
    """Correct grammatical errors in the input text using the ByT5 GEC model.

    Args:
        input_text: Text to correct. ``None`` and empty or whitespace-only
            strings are treated as "nothing to correct".

    Returns:
        The highest-scoring beam decoded with special tokens removed, or an
        empty string when there is no input to correct.
    """
    # Guard against None as well as blank text so the UI callback never
    # raises AttributeError on a missing value.
    if not input_text or not input_text.strip():
        return ""

    # Tokenize as a single-item PyTorch batch; anything beyond 512 tokens is
    # truncated, so very long inputs are only partially corrected.
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    # Move every input tensor to the device the model lives on.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        # NOTE: no `no_repeat_ngram_size` here. ByT5 operates on bytes, so
        # banning repeated 2-grams would forbid any two-byte sequence from
        # occurring twice in the output and mangle ordinary text.
        outputs = model.generate(
            **inputs,
            max_length=512,
            num_beams=5,
            early_stopping=True,
        )

    # outputs[0] is the first returned sequence for the single input.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
|
|
|
|
|
|
|
# Build the Gradio UI: an input column, an output column, clickable examples,
# and the event wiring that connects them to `correct_text`.
with gr.Blocks(title="Czech Grammar Error Correction - ByT5") as demo:
    gr.Markdown("""
    # Czech Grammar Error Correction with ByT5

    This tool uses the **ByT5-large-geccc-mate** model to correct grammatical errors in Czech text.

    Simply enter your text below and click "Correct Text" to get the grammatically corrected version.
    """)

    with gr.Row():
        # Left column: text to be corrected and the trigger button.
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter Czech text with potential grammar errors...",
                lines=10,
                max_lines=20,
            )
            correct_btn = gr.Button("Correct Text", variant="primary")

        # Right column: the model output (editable) and a copy button.
        with gr.Column():
            output_text = gr.Textbox(
                label="Corrected Text",
                lines=10,
                max_lines=20,
                interactive=True,
                placeholder="Corrected text will appear here...",
            )
            copy_btn = gr.Button("📋 Copy to Clipboard", variant="secondary")

    # Clicking an example fills the input box; it does not run the model.
    gr.Examples(
        examples=[
            ["Včera jsem šel do obchodu a koupil jsem si rohlíky."],
            ["Chtěl bych se zeptat, jestli máte volno zítra."],
            ["Mám rád když svítí slunce a můžu jít ven."],
        ],
        inputs=input_text,
        label="Example sentences (click to try)",
    )

    # Run the correction on button click and on submit from the input box.
    correct_btn.click(fn=correct_text, inputs=input_text, outputs=output_text)
    input_text.submit(fn=correct_text, inputs=input_text, outputs=output_text)

    # Frontend-only handler (no Python fn). The current value of `output_text`
    # is passed in as the JS argument instead of being looked up through an
    # auto-generated DOM id such as '#component-5', which changes with layout
    # and Gradio version. The confirmation is shown only after the clipboard
    # write actually succeeds.
    copy_btn.click(
        fn=None,
        inputs=output_text,
        outputs=None,
        js="""
        (text) => {
            navigator.clipboard.writeText(text || '').then(
                () => alert('Text copied to clipboard!'),
                (err) => alert('Could not copy text: ' + err)
            );
        }
        """,
    )

    gr.Markdown("""
    ---
    **Model:** [ufal/byt5-large-geccc-mate](https://huggingface.co/ufal/byt5-large-geccc-mate)

    **Note:** This model is specifically trained for Czech language grammar correction.
    """)
|
|
|
|
|
# Start the Gradio server only when this file is executed as a script, so
# importing the module does not launch the app.
if __name__ == "__main__":
    demo.launch()