Spaces:

Jay4769
/

Trans

Running

File size: 4,810 Bytes

f93918a
db77a3e
6a31219
db77a3e
 
 
 
 
 
 
 
 
f93918a
f91d2dd
 
db77a3e
 
 
 
 
 
 
 
d519129
f91d2dd
 
db77a3e
 
 
 
 
 
 
 
 
 
6a31219
db77a3e
 
b1c5cca
db77a3e
 
 
6a31219
 
 
 
 
db77a3e
 
 
6a31219
db77a3e
 
 
6a31219
 
 
 
db77a3e
6a31219
 
 
db77a3e
6a31219
db77a3e
6a31219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85e9fae
 
 
6a31219
 
 
 
 
 
85e9fae
6a31219
 
 
85e9fae
 
6a31219
 
85e9fae
6a31219
 
db77a3e
6a31219
db77a3e
6a31219
 
 
85e9fae
 
6a31219
 
 
 
 
 
85e9fae
 
 
 
 
b1c5cca
db77a3e
85e9fae
 
 
b1c5cca
 
db77a3e
85e9fae
 
 
 
db77a3e
6a31219
db77a3e
b1c5cca
6a31219
b1c5cca
 
db77a3e
 
 
 
9638a20
 
 
db77a3e
 
 
 
9638a20
f91d2dd
d519129
9638a20
f91d2dd
 
 
 
b1c5cca
 
 
9638a20
b1c5cca
 
 
 
 
9638a20
 
 
db77a3e
 
 
 
9638a20
f91d2dd
d519129
9638a20
d519129
 
 
 
db77a3e
9638a20
db77a3e
 
 
9638a20
 
f91d2dd
db77a3e
 
 
 
f91d2dd
9638a20
 
f91d2dd
 
 
9638a20
db77a3e
9638a20
f91d2dd
b1c5cca
 
 
 
 
 
9638a20
f91d2dd
9638a20
 
 
f91d2dd
9638a20
 
 
b1c5cca
 
9638a20
f91d2dd
 
 
9638a20
 
f91d2dd
 
 
9638a20
db77a3e
9638a20
 
db77a3e
f91d2dd
9638a20
f91d2dd
9638a20
db77a3e
f91d2dd
f93918a
9638a20

import gradio as gr
import requests
from bs4 import BeautifulSoup

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM
)

# =========================
# MODELS
# =========================

# Both translation directions use Helsinki-NLP OPUS-MT checkpoints. The
# tokenizers and models are created at module import, so every request
# reuses the same objects.

# Russian -> Vietnamese
ruvi_model_name = "Helsinki-NLP/opus-mt-ru-vi"
ruvi_tokenizer = AutoTokenizer.from_pretrained(ruvi_model_name)
ruvi_model = AutoModelForSeq2SeqLM.from_pretrained(ruvi_model_name)

# Vietnamese -> Russian
viru_model_name = "Helsinki-NLP/opus-mt-vi-ru"
viru_tokenizer = AutoTokenizer.from_pretrained(viru_model_name)
viru_model = AutoModelForSeq2SeqLM.from_pretrained(viru_model_name)

# =========================
# WIKTIONARY SCRAPER
# =========================

def _collect_definitions(soup):
    """Return definition candidates from the first two ordered lists."""
    definitions = []
    for ol in soup.find_all("ol")[:2]:
        for item in ol.find_all("li")[:5]:
            text = item.get_text(" ", strip=True)
            # List items of 20 characters or fewer are skipped.
            if len(text) > 20:
                definitions.append(text)
    return definitions


def _collect_examples(soup, word):
    """Return usage examples, falling back to italic text mentioning *word*."""
    examples = []
    for block in soup.find_all("span", class_="example-block")[:5]:
        text = block.get_text(" ", strip=True)
        if text:
            examples.append(text)

    if examples:
        return examples

    # Fallback: no dedicated example blocks were found, so scan the first
    # ten <i> elements for longer text that contains the looked-up word.
    needle = word.lower()
    for italic in soup.find_all("i")[:10]:
        text = italic.get_text(" ", strip=True)
        if len(text) > 15 and needle in text.lower():
            examples.append(text)
    return examples


def get_russian_info(word):
    """Scrape definitions and usage examples for *word* from ru.wiktionary.org.

    Args:
        word: The Russian word to look up. Surrounding whitespace is
            ignored; ``None`` is treated like an empty string.

    Returns:
        A ``(definition_text, example_text)`` tuple of strings. Each holds up
        to five entries separated by blank lines, or a Vietnamese placeholder
        when nothing was found. Blank input and a 404 response both yield the
        two "not found" placeholders without parsing anything. Any other
        failure (network error, non-2xx status, parsing error) is reported in
        the first element instead of being raised, so the UI callback never
        crashes.
    """
    from urllib.parse import quote

    not_found = ("Không tìm thấy định nghĩa", "Không có ví dụ")

    word = (word or "").strip()
    if not word:
        # Nothing to look up; avoid requesting the site root.
        return not_found

    try:
        # Percent-encode the title so characters such as '?', '#', '%' or
        # spaces cannot alter the structure of the request URL.
        url = f"https://ru.wiktionary.org/wiki/{quote(word)}"

        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=10,
        )

        if response.status_code == 404:
            # Missing entry: report "not found" instead of scraping the
            # error page as if it were a dictionary entry.
            return not_found

        # Other HTTP errors are surfaced through the except branch below.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        definitions = _collect_definitions(soup)
        examples = _collect_examples(soup, word)

    except Exception as e:
        # Deliberate best-effort boundary: the lookup is auxiliary to the
        # translation, so failures are reported as text rather than raised.
        return (
            f"Lỗi định nghĩa: {e}",
            "Không có ví dụ",
        )

    definition_text = (
        "\n\n".join(definitions[:5])
        if definitions
        else "Không tìm thấy định nghĩa"
    )
    example_text = (
        "\n\n".join(examples[:5])
        if examples
        else "Không có ví dụ"
    )
    return (definition_text, example_text)

# =========================
# TRANSLATION FUNCTIONS
# =========================

def ru_to_vi(word):
    """Translate Russian text to Vietnamese and look it up on Wiktionary.

    Args:
        word: Russian text typed by the user. Surrounding whitespace is
            ignored; ``None`` is treated like an empty string.

    Returns:
        A ``(translated, definition, example)`` tuple of strings, in the
        order the UI expects for its three output boxes. Blank input yields
        three empty strings without running the model or hitting the network.
    """
    word = (word or "").strip()
    if not word:
        # Empty submission: skip the model and the Wiktionary request.
        return ("", "", "")

    # truncation=True cuts over-long input to the tokenizer's configured
    # maximum length instead of passing an oversized sequence to the model.
    inputs = ruvi_tokenizer(
        word,
        return_tensors="pt",
        truncation=True,
    )

    outputs = ruvi_model.generate(**inputs)

    translated = ruvi_tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
    )

    definition, example = get_russian_info(word)

    return (translated, definition, example)

def vi_to_ru(word):
    """Translate Vietnamese text to Russian.

    Args:
        word: Vietnamese text typed by the user. Surrounding whitespace is
            ignored; ``None`` is treated like an empty string.

    Returns:
        The Russian translation as a string, or an empty string when the
        input is blank (the model is not run in that case).
    """
    word = (word or "").strip()
    if not word:
        # Empty submission: nothing to translate.
        return ""

    # truncation=True cuts over-long input to the tokenizer's configured
    # maximum length instead of passing an oversized sequence to the model.
    inputs = viru_tokenizer(
        word,
        return_tensors="pt",
        truncation=True,
    )

    outputs = viru_model.generate(**inputs)

    return viru_tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
    )

# =========================
# UI
# =========================

# Two-tab Gradio interface: one tab per translation direction.
with gr.Blocks() as demo:
    gr.Markdown("# 🇷🇺⇄🇻🇳 Russian ↔ Vietnamese Dictionary")

    # Russian -> Vietnamese: translation plus Wiktionary definition/examples.
    with gr.Tab("🇷🇺 Nga → Việt"):
        ru_input = gr.Textbox(placeholder="Nhập tiếng Nga...")
        vi_output = gr.Textbox(label="🇻🇳 Nghĩa tiếng Việt")
        definition_output = gr.Textbox(label="📖 Định nghĩa")
        example_output = gr.Textbox(label="💬 Ví dụ")
        ru_btn = gr.Button("Tra cứu")

        # ru_to_vi returns three strings, one per output box, in this order.
        ru_btn.click(
            fn=ru_to_vi,
            inputs=ru_input,
            outputs=[vi_output, definition_output, example_output],
        )

    # Vietnamese -> Russian: translation only.
    with gr.Tab("🇻🇳 Việt → Nga"):
        vi_input = gr.Textbox(placeholder="Nhập tiếng Việt...")
        ru_output = gr.Textbox(label="🇷🇺 Nghĩa tiếng Nga")
        vi_btn = gr.Button("Dịch")

        vi_btn.click(fn=vi_to_ru, inputs=vi_input, outputs=ru_output)

# Bind to all network interfaces rather than only localhost.
demo.launch(server_name="0.0.0.0")