Translate-100-languages

Runtime error

App Files Files Community

lanzhiwang commited on Dec 11, 2023

Commit

e90feb1

1 Parent(s): d5bdfe9

test

Browse files

Files changed (4) hide show

01.ipynb +0 -0
app-bak.py +0 -172
app.py +8 -15
requirements.txt +4 -5

01.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

app-bak.py DELETED Viewed

@@ -1,172 +0,0 @@
-import streamlit as st
-import os
-import io
-from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
-import time
-import json
-from typing import List
-import torch
-import random
-import logging
-if torch.cuda.is_available():
-    device = torch.device("cuda:0")
-else:
-    device = torch.device("cpu")
-    logging.warning("GPU not found, using CPU, translation will be very slow.")
-st.cache(suppress_st_warning=True, allow_output_mutation=True)
-st.set_page_config(page_title="M2M100 Translator")
-lang_id = {
-    "Afrikaans": "af",
-    "Amharic": "am",
-    "Arabic": "ar",
-    "Asturian": "ast",
-    "Azerbaijani": "az",
-    "Bashkir": "ba",
-    "Belarusian": "be",
-    "Bulgarian": "bg",
-    "Bengali": "bn",
-    "Breton": "br",
-    "Bosnian": "bs",
-    "Catalan": "ca",
-    "Cebuano": "ceb",
-    "Czech": "cs",
-    "Welsh": "cy",
-    "Danish": "da",
-    "German": "de",
-    "Greeek": "el",
-    "English": "en",
-    "Spanish": "es",
-    "Estonian": "et",
-    "Persian": "fa",
-    "Fulah": "ff",
-    "Finnish": "fi",
-    "French": "fr",
-    "Western Frisian": "fy",
-    "Irish": "ga",
-    "Gaelic": "gd",
-    "Galician": "gl",
-    "Gujarati": "gu",
-    "Hausa": "ha",
-    "Hebrew": "he",
-    "Hindi": "hi",
-    "Croatian": "hr",
-    "Haitian": "ht",
-    "Hungarian": "hu",
-    "Armenian": "hy",
-    "Indonesian": "id",
-    "Igbo": "ig",
-    "Iloko": "ilo",
-    "Icelandic": "is",
-    "Italian": "it",
-    "Japanese": "ja",
-    "Javanese": "jv",
-    "Georgian": "ka",
-    "Kazakh": "kk",
-    "Central Khmer": "km",
-    "Kannada": "kn",
-    "Korean": "ko",
-    "Luxembourgish": "lb",
-    "Ganda": "lg",
-    "Lingala": "ln",
-    "Lao": "lo",
-    "Lithuanian": "lt",
-    "Latvian": "lv",
-    "Malagasy": "mg",
-    "Macedonian": "mk",
-    "Malayalam": "ml",
-    "Mongolian": "mn",
-    "Marathi": "mr",
-    "Malay": "ms",
-    "Burmese": "my",
-    "Nepali": "ne",
-    "Dutch": "nl",
-    "Norwegian": "no",
-    "Northern Sotho": "ns",
-    "Occitan": "oc",
-    "Oriya": "or",
-    "Panjabi": "pa",
-    "Polish": "pl",
-    "Pushto": "ps",
-    "Portuguese": "pt",
-    "Romanian": "ro",
-    "Russian": "ru",
-    "Sindhi": "sd",
-    "Sinhala": "si",
-    "Slovak": "sk",
-    "Slovenian": "sl",
-    "Somali": "so",
-    "Albanian": "sq",
-    "Serbian": "sr",
-    "Swati": "ss",
-    "Sundanese": "su",
-    "Swedish": "sv",
-    "Swahili": "sw",
-    "Tamil": "ta",
-    "Thai": "th",
-    "Tagalog": "tl",
-    "Tswana": "tn",
-    "Turkish": "tr",
-    "Ukrainian": "uk",
-    "Urdu": "ur",
-    "Uzbek": "uz",
-    "Vietnamese": "vi",
-    "Wolof": "wo",
-    "Xhosa": "xh",
-    "Yiddish": "yi",
-    "Yoruba": "yo",
-    "Chinese": "zh",
-    "Zulu": "zu",
-}
-@st.cache(suppress_st_warning=True, allow_output_mutation=True)
-def load_model(
-    pretrained_model: str = "facebook/m2m100_1.2B",
-    cache_dir: str = "models/",
-):
-    tokenizer = M2M100Tokenizer.from_pretrained(pretrained_model, cache_dir=cache_dir)
-    model = M2M100ForConditionalGeneration.from_pretrained(
-        pretrained_model, cache_dir=cache_dir
-    ).to(device)
-    model.eval()
-    return tokenizer, model
-st.title("M2M100 Translator")
-st.write("M2M100 is a multilingual encoder-decoder (seq-to-seq) model trained for Many-to-Many multilingual translation. It was introduced in this paper https://arxiv.org/abs/2010.11125 and first released in https://github.com/pytorch/fairseq/tree/master/examples/m2m_100 repository. The model that can directly translate between the 9,900 directions of 100 languages.\n")
-st.write(" This demo uses the facebook/m2m100_1.2B model. For local inference see https://github.com/ikergarcia1996/Easy-Translate")
-user_input: str = st.text_area(
-    "Input text",
-    height=200,
-    max_chars=5120,
-)
-source_lang = st.selectbox(label="Source language", options=list(lang_id.keys()))
-target_lang = st.selectbox(label="Target language", options=list(lang_id.keys()))
-if st.button("Run"):
-    time_start = time.time()
-    tokenizer, model = load_model()
-    src_lang = lang_id[source_lang]
-    trg_lang = lang_id[target_lang]
-    tokenizer.src_lang = src_lang
-    with torch.no_grad():
-        encoded_input = tokenizer(user_input, return_tensors="pt").to(device)
-        generated_tokens = model.generate(
-            **encoded_input, forced_bos_token_id=tokenizer.get_lang_id(trg_lang)
-        )
-        translated_text = tokenizer.batch_decode(
-            generated_tokens, skip_special_tokens=True
-        )[0]
-    time_end = time.time()
-    st.success(translated_text)
-    st.write(f"Computation time: {round((time_end-time_start),3)} segs")

app.py CHANGED Viewed

@@ -2,13 +2,13 @@ import streamlit as st
 from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
 import time
 import torch
-import logging
 if torch.cuda.is_available():
     device = torch.device("cuda:0")
 else:
     device = torch.device("cpu")
-    logging.warning("GPU not found, using CPU, translation will be very slow.")
 st.set_page_config(page_title="M2M100 Translator")
@@ -125,25 +125,18 @@ def load_model(
     model = M2M100ForConditionalGeneration.from_pretrained(
         pretrained_model, cache_dir=cache_dir
     ).to(device)
-    """
-    在PyTorch中，`model.eval()`是用来将模型设置为评估（evaluation）模式的方法。在深度学习中，训练和评估两个阶段的模型行为可能会有所不同。以下是`model.eval()`的主要作用：
-    1. **Batch Normalization和Dropout的影响：**
-    - 在训练阶段，`Batch Normalization`和`Dropout`等层的行为通常是不同的。在训练时，`Batch Normalization`使用批次统计信息来规范化输入，而`Dropout`层会随机丢弃一些神经元。在评估阶段，我们通常希望使用整个数据集的统计信息来规范化，而不是每个批次的统计信息，并且不再需要随机丢弃神经元。因此，通过执行`model.eval()`，模型会切换到评估模式，从而确保这些层的行为在评估时是正确的。
-    2. **梯度计算的关闭：**
-    - 在评估模式下，PyTorch会关闭自动求导（autograd）的计算图，这样可以避免不必要的梯度计算和内存消耗。在训练时，我们通常需要计算梯度以进行反向传播和参数更新，而在评估时，我们只对模型的前向传播感兴趣，因此关闭梯度计算可以提高评估的速度和减少内存使用。
-    总的来说，执行`model.eval()`是为了确保在评估阶段模型的行为和性能是正确的，并且可以提高评估时的效率。
-    """
     model.eval()
     return tokenizer, model
 st.title("M2M100 Translator")
-st.write("M2M100 is a multilingual encoder-decoder (seq-to-seq) model trained for Many-to-Many multilingual translation. It was introduced in this paper https://arxiv.org/abs/2010.11125 and first released in https://github.com/pytorch/fairseq/tree/master/examples/m2m_100 repository. The model that can directly translate between the 9,900 directions of 100 languages.\n")
-st.write(" This demo uses the facebook/m2m100_1.2B model. For local inference see https://github.com/ikergarcia1996/Easy-Translate")
 user_input: str = st.text_area(

 from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
 import time
 import torch
 if torch.cuda.is_available():
     device = torch.device("cuda:0")
 else:
     device = torch.device("cpu")
 st.set_page_config(page_title="M2M100 Translator")
     model = M2M100ForConditionalGeneration.from_pretrained(
         pretrained_model, cache_dir=cache_dir
     ).to(device)
     model.eval()
     return tokenizer, model
 st.title("M2M100 Translator")
+st.write(
+    "M2M100 is a multilingual encoder-decoder (seq-to-seq) model trained for Many-to-Many multilingual translation. It was introduced in this paper https://arxiv.org/abs/2010.11125 and first released in https://github.com/pytorch/fairseq/tree/master/examples/m2m_100 repository. The model that can directly translate between the 9,900 directions of 100 languages.\n"
+)
+st.write(
+    " This demo uses the facebook/m2m100_1.2B model. For local inference see https://github.com/ikergarcia1996/Easy-Translate"
+)
 user_input: str = st.text_area(

requirements.txt CHANGED Viewed

@@ -1,5 +1,4 @@
-streamlit
-torch
-transformers
-sentencepiece
-# transformers[sentencepiece]

+streamlit==1.29.0
+torch==2.1.1
+transformers==4.35.2
+sentencepiece==0.1.99