"""Russian <-> Vietnamese dictionary app.

Translates single words with Helsinki-NLP MarianMT models and, for
Russian lookups, enriches the result with definitions and usage
examples scraped from ru.wiktionary.org. Served as a Gradio web UI.
"""

import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# =========================
# MODELS
# =========================

# RU -> VI
ruvi_model_name = "Helsinki-NLP/opus-mt-ru-vi"
ruvi_tokenizer = AutoTokenizer.from_pretrained(ruvi_model_name)
ruvi_model = AutoModelForSeq2SeqLM.from_pretrained(ruvi_model_name)

# VI -> RU
viru_model_name = "Helsinki-NLP/opus-mt-vi-ru"
viru_tokenizer = AutoTokenizer.from_pretrained(viru_model_name)
viru_model = AutoModelForSeq2SeqLM.from_pretrained(viru_model_name)


# =========================
# WIKTIONARY SCRAPER
# =========================

def get_russian_info(word):
    """Scrape ru.wiktionary.org for definitions and examples of *word*.

    Returns a ``(definition_text, example_text)`` tuple of
    newline-joined strings. Never raises: on any failure it returns
    Vietnamese fallback/error messages instead, so the UI stays usable.
    """
    try:
        url = f"https://ru.wiktionary.org/wiki/{word}"
        # A browser-like UA; Wikimedia sites may reject the default one.
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")

        # --- Definitions: the first ordered lists of the entry. ---
        definitions = []
        for ol in soup.find_all("ol")[:2]:
            for item in ol.find_all("li")[:5]:
                text = item.get_text(" ", strip=True)
                if len(text) > 20:  # skip stubs / navigation items
                    definitions.append(text)

        # --- Examples: dedicated example spans first. ---
        examples = []
        for quote in soup.find_all("span", class_="example-block")[:5]:
            txt = quote.get_text(" ", strip=True)
            if txt:
                examples.append(txt)

        # Fallback: italicized text containing the word often holds usage.
        if not examples:
            for italic in soup.find_all("i")[:10]:
                txt = italic.get_text(" ", strip=True)
                if len(txt) > 15 and word.lower() in txt.lower():
                    examples.append(txt)

        definition_text = (
            "\n\n".join(definitions[:5])
            if definitions
            else "Không tìm thấy định nghĩa"
        )
        example_text = (
            "\n\n".join(examples[:5]) if examples else "Không có ví dụ"
        )
        return definition_text, example_text
    except Exception as e:
        # Best-effort boundary: surface the error in the UI, never crash.
        return f"Lỗi định nghĩa: {str(e)}", "Không có ví dụ"


# =========================
# TRANSLATION FUNCTIONS
# =========================

def _translate(tokenizer, model, text):
    """Run one seq2seq translation of *text* and return the decoded string."""
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def ru_to_vi(word):
    """Translate a Russian word to Vietnamese and fetch dictionary info.

    Returns ``(translation, definition_text, example_text)`` — the three
    values wired to the RU->VI tab's output textboxes.
    """
    translated = _translate(ruvi_tokenizer, ruvi_model, word)
    definition, example = get_russian_info(word)
    return translated, definition, example


def vi_to_ru(word):
    """Translate a Vietnamese word to Russian; returns the translation."""
    return _translate(viru_tokenizer, viru_model, word)


# =========================
# UI
# =========================

with gr.Blocks() as demo:
    gr.Markdown("# 🇷🇺⇄🇻🇳 Russian ↔ Vietnamese Dictionary")

    with gr.Tab("🇷🇺 Nga → Việt"):
        ru_input = gr.Textbox(placeholder="Nhập tiếng Nga...")
        vi_output = gr.Textbox(label="🇻🇳 Nghĩa tiếng Việt")
        definition_output = gr.Textbox(label="📖 Định nghĩa")
        example_output = gr.Textbox(label="💬 Ví dụ")
        ru_btn = gr.Button("Tra cứu")
        ru_btn.click(
            ru_to_vi,
            inputs=ru_input,
            outputs=[vi_output, definition_output, example_output],
        )

    with gr.Tab("🇻🇳 Việt → Nga"):
        vi_input = gr.Textbox(placeholder="Nhập tiếng Việt...")
        ru_output = gr.Textbox(label="🇷🇺 Nghĩa tiếng Nga")
        vi_btn = gr.Button("Dịch")
        vi_btn.click(vi_to_ru, inputs=vi_input, outputs=ru_output)

# Bind to all interfaces so the app is reachable from outside a container.
demo.launch(server_name="0.0.0.0")