"""Gradio demo that detects and corrects spelling errors with a seq2seq model.

The script loads a tokenizer/model pair, corrects the input sentence by
sentence, diffs each sentence against its correction word by word, and shows
the result in a small Gradio UI.
"""

import difflib
import html
import re

import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Inline styles used when rendering diffs into the HTML output component.
_ERROR_STYLE = "color: red; text-decoration: underline;"
_FIX_STYLE = "color: green;"

# Splits on whitespace that follows sentence-ending punctuation.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")


def load_model(checkpoint_path):
    """Load the seq2seq model and its tokenizer from ``checkpoint_path``.

    Args:
        checkpoint_path: Identifier or path handed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
    return model, tokenizer


checkpoint_path = "Diezu/Batpho_v2"
model, tokenizer = load_model(checkpoint_path)


def correct_spelling(text):
    """Return the model's corrected version of ``text``.

    The input is truncated to 512 tokens and decoded from the best of five
    beams, with special tokens removed.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(**inputs, max_length=512, num_beams=5, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def _mark(words, style):
    """Wrap the space-joined, HTML-escaped ``words`` in a styled ``<span>``."""
    return f'<span style="{style}">{html.escape(" ".join(words))}</span>'


def find_spelling_errors(s1, s2):
    """Diff two sentences word by word and build highlighted HTML.

    Args:
        s1: Original (possibly misspelled) sentence.
        s2: Corrected sentence.

    Returns:
        A ``(highlighted_html, corrected_words)`` tuple. ``highlighted_html``
        is the word-level diff rendered as HTML: words replaced or deleted by
        the correction are marked as errors, words only present in ``s2`` are
        marked as insertions, and unchanged words are emitted as escaped text.
        ``corrected_words`` maps each replaced phrase of ``s1`` to its
        replacement in ``s2``.
    """
    a = s1.split()
    b = s2.split()
    # autojunk=False: the default heuristic treats frequent words as junk on
    # long inputs (200+ items), which would degrade the diff.
    matcher = difflib.SequenceMatcher(None, a, b, autojunk=False)
    highlighted_text = []
    corrected_words = {}
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "replace":
            incorrect_word = " ".join(a[i1:i2])
            correct_word = " ".join(b[j1:j2])
            # Show the misspelled words; previously an empty string was
            # appended here, so the words vanished from the output.
            highlighted_text.append(_mark(a[i1:i2], _ERROR_STYLE))
            corrected_words[incorrect_word] = correct_word
        elif tag == "delete":
            highlighted_text.append(_mark(a[i1:i2], _ERROR_STYLE))
        elif tag == "insert":
            highlighted_text.append(_mark(b[j1:j2], _FIX_STYLE))
        else:
            # The result is rendered as HTML, so plain text must be escaped.
            highlighted_text.append(html.escape(" ".join(a[i1:i2])))
    return " ".join(highlighted_text), corrected_words


def process_text(text):
    """Correct ``text`` sentence by sentence.

    Returns:
        A ``(highlighted_html, corrected_text, corrected_words)`` tuple:
        the per-sentence highlighted diffs joined with ``<br>``, the corrected
        sentences joined with newlines, and the merged mapping of replaced
        phrases. If the same phrase is replaced differently in two sentences,
        the later sentence wins.
    """
    corrected_sentences = []
    highlighted_sentences = []
    all_corrected_words = {}
    for sentence in _SENTENCE_BOUNDARY.split(text):
        if not sentence.strip():
            continue
        corrected_text = correct_spelling(sentence)
        highlighted_text, corrected_words = find_spelling_errors(sentence, corrected_text)
        corrected_sentences.append(corrected_text)
        highlighted_sentences.append(highlighted_text)
        all_corrected_words.update(corrected_words)
    # "<br>" because the first value is rendered by an HTML component, where
    # a bare newline would collapse into a space.
    return (
        "<br>".join(highlighted_sentences),
        "\n".join(corrected_sentences),
        all_corrected_words,
    )


def apply_full_correction(text, corrected_words):
    """Apply every recorded correction to ``text`` and return it as HTML.

    Corrections are applied in a single pass and only to whole
    whitespace-delimited phrases, so a correction can neither rewrite part of
    an unrelated word nor be rewritten again by a later rule.

    Args:
        text: The original text.
        corrected_words: Mapping of incorrect phrase -> correct phrase.

    Returns:
        HTML-escaped text with each applied correction wrapped in a span.
    """
    # Normalise keys to single-space-joined words and drop empty ones, which
    # would otherwise produce a pattern matching everywhere.
    lookup = {}
    for incorrect, correct in corrected_words.items():
        words = incorrect.split()
        if words:
            lookup[" ".join(words)] = correct
    if not lookup:
        return html.escape(text)

    # Longest phrases first so a multi-word correction beats its own prefix.
    alternatives = (
        r"\s+".join(re.escape(word) for word in phrase.split())
        for phrase in sorted(lookup, key=len, reverse=True)
    )
    pattern = re.compile(r"(?<!\S)(?:" + "|".join(alternatives) + r")(?!\S)")

    pieces = []
    cursor = 0
    for match in pattern.finditer(text):
        pieces.append(html.escape(text[cursor:match.start()]))
        matched = match.group(0)
        replacement = lookup.get(" ".join(matched.split()), matched)
        pieces.append(f'<span style="{_FIX_STYLE}">{html.escape(replacement)}</span>')
        cursor = match.end()
    pieces.append(html.escape(text[cursor:]))
    return "".join(pieces)


demo = gr.Blocks()
with demo:
    input_text = gr.Textbox(placeholder="Nhập văn bản có lỗi chính tả...")
    output_html = gr.HTML()
    corrected_textbox = gr.Textbox()
    # Holds the incorrect -> correct mapping produced by the last check.
    corrected_words_state = gr.State({})
    btn_check = gr.Button("Phát hiện lỗi")
    btn_correct_all = gr.Button("Sửa toàn bộ lỗi")

    def update_text(text):
        """Run the checker and fan its results out to the three outputs."""
        highlighted, corrected, corrected_words = process_text(text)
        return highlighted, corrected, corrected_words

    btn_check.click(
        update_text,
        inputs=input_text,
        outputs=[output_html, corrected_textbox, corrected_words_state],
    )
    btn_correct_all.click(
        apply_full_correction,
        inputs=[input_text, corrected_words_state],
        outputs=output_html,
    )

demo.launch()