| # app.py | |
| import os | |
| import html | |
| import difflib | |
| from typing import List, Tuple | |
| import gradio as gr | |
| # --- LanguageTool (rule-based grammar checker) --- | |
| import language_tool_python | |
| # --- Transformers model for grammar correction --- | |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
| import torch | |
# Language variant passed to LanguageTool.
LT_LANG = "en-US"

# Create the rule-based checker a single time, when the module is imported.
tool = language_tool_python.LanguageTool(LT_LANG)

# Small, CPU-friendly T5 correction model.
MODEL_NAME = "vennify/t5-base-grammar-correction"  # widely used GEC model on the Hugging Face Hub

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# ``eval()`` returns the module itself, so loading and switching to inference
# mode can be written as one expression.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).eval()
def t5_correct(text: str, max_new_tokens: int = 128) -> str:
    """Return a grammar-corrected version of English ``text`` from the T5 model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Raw English text to correct.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded model output with surrounding whitespace removed, or
        ``""`` when ``text`` is empty or whitespace-only.
    """
    if not text.strip():
        return ""

    # The model card for vennify/t5-base-grammar-correction instructs callers
    # to prepend "grammar: " to every input, so the task prefix is added here.
    # NOTE(review): revisit this prefix if MODEL_NAME is ever changed.
    prompt = f"grammar: {text}"

    inputs = tokenizer([prompt], return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_beams=4,
            early_stopping=True,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
def lt_find_issues(text: str):
    """Check ``text`` with LanguageTool and describe every match as a dict.

    Each dict holds the offending substring (``"Error"``), the checker's
    message (``"Message"``), the rule id (``"Rule"``), up to five suggested
    replacements joined by ``", "`` (``"Suggestions"``), and the match
    position (``"Start"``, ``"Length"``).
    """

    def describe(match):
        # Convert a single LanguageTool match into a plain row dict.
        start = match.offset
        length = match.errorLength
        candidates = match.replacements
        return {
            "Error": text[start : start + length],
            "Message": match.message,
            "Rule": match.ruleId,
            "Suggestions": ", ".join(candidates[:5]) if candidates else "",
            "Start": start,
            "Length": length,
        }

    return [describe(match) for match in tool.check(text)]
def highlight_html(text: str, rows: List[dict]) -> str:
    """Render ``text`` as HTML with every reported issue wrapped in ``<mark>``.

    The output is assembled in a single left-to-right pass over the original
    text, so every character is HTML-escaped exactly once and match offsets
    always refer to the unmodified input.

    Args:
        text: The original, unescaped text.
        rows: Issue dicts with ``"Start"``, ``"Length"`` and ``"Message"``
            keys and an optional ``"Suggestions"`` key.

    Returns:
        A ``<div>`` containing the escaped text. Each marked span carries a
        ``title`` tooltip with the message and suggestions. Spans that are
        empty, fall outside ``text``, or overlap an earlier span are skipped.
    """
    pieces = []
    cursor = 0  # index in ``text`` up to which output has already been emitted

    for row in sorted(rows, key=lambda r: r["Start"]):
        start = row["Start"]
        end = start + row["Length"]
        # ``start < cursor`` rejects negative offsets as well as spans that
        # overlap one already emitted; nested <mark> tags would be invalid.
        if start < cursor or end <= start or end > len(text):
            continue

        tip = html.escape(
            f'{row["Message"]} | Suggestions: {row.get("Suggestions", "")}'
        )
        pieces.append(html.escape(text[cursor:start]))
        pieces.append(
            f"<mark title='{tip}'>{html.escape(text[start:end])}</mark>"
        )
        cursor = end

    # Remaining tail after the last highlighted span (or the whole text).
    pieces.append(html.escape(text[cursor:]))
    body = "".join(pieces)
    return f"<div style='white-space:pre-wrap'>{body}</div>"
def diff_html(a: str, b: str) -> str:
    """Show the word-level difference between ``a`` (original) and ``b`` as HTML.

    Both texts are HTML-escaped, split on whitespace and compared with
    ``difflib.ndiff``. Added words are wrapped in ``<ins>``, removed words in
    ``<del>``, and unchanged words are emitted as-is.
    """
    templates = {"+ ": "<ins>{}</ins>", "- ": "<del>{}</del>"}
    old_words = html.escape(a).split()
    new_words = html.escape(b).split()

    rendered = []
    for entry in difflib.ndiff(old_words, new_words):
        marker, word = entry[:2], entry[2:]
        if marker == "? ":
            # Hint line produced by ndiff; it is not part of either text.
            continue
        rendered.append(templates.get(marker, "{}").format(word))

    joined = " ".join(rendered)
    return "<div style='line-height:1.9; word-wrap:break-word'>" + joined + "</div>"
def pipeline(text: str):
    """Run the full check for one button click.

    Produces, in order: the T5-corrected text, the LanguageTool issue table
    (one ``[Error, Message, Rule, Suggestions]`` row per issue), the
    highlighted HTML, the diff HTML, and a one-line summary. Empty or
    whitespace-only input yields empty values for all five outputs.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return "", [], "", "", ""

    columns = ("Error", "Message", "Rule", "Suggestions")

    # Rule-based pass: issue table and highlighted view.
    issues = lt_find_issues(cleaned)
    table = [[issue[column] for column in columns] for issue in issues]
    marked = highlight_html(cleaned, issues)

    # Model-based pass: corrected text and its diff against the input.
    fixed = t5_correct(cleaned)
    changes = diff_html(cleaned, fixed)

    summary = f"Found {len(issues)} potential issue(s)."
    return fixed, table, marked, changes, summary
# Gradio UI: one input box and one button that runs `pipeline` and fills the
# corrected text, issue table, summary, highlighted view and diff view.
# NOTE(review): the original indentation was lost; this layout assumes only
# the textbox sits inside the Row and everything else is a direct child of
# the Blocks container — confirm against the intended layout.
with gr.Blocks(title="Grammar Classroom (HF Space)") as demo:
    gr.Markdown(
        """
# 🧑🏫 English Grammar Classroom
- **Rule-based check** (LanguageTool): errors, explanations, suggestions
- **AI correction** (T5): corrected version
- **Diff view**: see changes compared to your original
"""
    )
    with gr.Row():
        inp = gr.Textbox(
            label="Enter English text",
            placeholder="Paste or type your sentence/paragraph here…",
            lines=7,
        )
    run_btn = gr.Button("Check & Correct", variant="primary")
    gr.Markdown("### ✅ AI-corrected Text")
    corrected_out = gr.Textbox(label="Corrected", lines=6)
    gr.Markdown("### 📋 Grammar Issues (LanguageTool)")
    # Columns match the row layout built in `pipeline`.
    issues_df = gr.Dataframe(
        headers=["Error", "Message", "Rule", "Suggestions"],
        datatype=["str", "str", "str", "str"],
        wrap=True,
        interactive=False,
        row_count=(0, "dynamic"),
    )
    issues_summary = gr.Markdown()
    gr.Markdown("### ✨ Highlighted Issues")
    highlighted_out = gr.HTML()
    gr.Markdown("### 🔍 Diff (Original vs Corrected)")
    diff_out = gr.HTML()
    # The order of `outputs` must match the order of the tuple returned by
    # `pipeline`: corrected text, table rows, highlighted HTML, diff HTML,
    # summary.
    run_btn.click(
        fn=pipeline,
        inputs=[inp],
        outputs=[corrected_out, issues_df, highlighted_out, diff_out, issues_summary],
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()