repleeka committed on
Commit
7632073
·
verified ·
1 Parent(s): 8c340e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -38
app.py CHANGED
@@ -1,7 +1,6 @@
1
- from transformers import pipeline
2
- from datasets import Dataset
3
  import streamlit as st
4
  import torch
 
5
 
6
  # Set the background color and layout with set_page_config
7
  st.set_page_config(
@@ -10,7 +9,30 @@ st.set_page_config(
10
  layout="wide",
11
  )
12
 
13
- # Streamlit app setup
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  st.title(":repeat: English to Tagin Translator")
15
  st.markdown("Welcome to the English to Tagin Translator. :sparkles: Simply enter your text in English, and get the translation in Tagin instantly! :thumbsup:")
16
 
@@ -19,32 +41,31 @@ if 'text_input' not in st.session_state:
19
  st.session_state.text_input = ""
20
  text_input = st.text_area("Enter English text to translate", height=150, value=st.session_state.text_input)
21
 
22
- # Define your model from Hugging Face
23
- model_directory = "repleeka/eng-tagin-nmt"
24
-
25
- device = 0 if torch.cuda.is_available() else -1
26
- translation_pipeline = pipeline(
27
- task="translation",
28
- model="repleeka/eng-tagin-nmt",
29
- tokenizer="repleeka/eng-tagin-nmt",
30
- device=device
31
- )
32
-
33
  # Translate button
34
  if st.button("Translate", key="translate_button"):
35
  if text_input:
36
  with st.spinner("Translating... Please wait"):
37
- # Prepare data for translation
38
- sentences = [text_input]
39
- data = Dataset.from_dict({"text": sentences})
40
-
41
- # Apply translation
42
  try:
43
- results = data.map(lambda x: {"translation": translation_pipeline(x["text"])})
44
- result = results[0]["translation"][0]['translation_text']
45
 
46
- # Capitalize the first letter of the result
47
- result = result.capitalize()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  # Display translation result with custom styling
50
  st.markdown("#### Translated text:")
@@ -58,29 +79,23 @@ if st.button("Translate", key="translate_button"):
58
  # Clear input button
59
  if st.button("Clear Input"):
60
  st.session_state.text_input = ""
 
61
 
62
- st.markdown("""❗❗❗ **Please note:** The English-to-Tagin translator is still in its initial phase, so it may provide incorrect translations at times. Your understanding is appreciated!
63
 
 
64
  🤝 **For contributions or inquiries, feel free to contact me!**
65
-
66
  """)
67
 
68
- # Additional sections remain unchanged
69
  st.markdown("""### Tagin Language
70
-
71
  Tagin is a beautiful language spoken by the Tagin tribe and belongs to the Tani group of Sino-Tibetan languages. You'll mainly find Tagin speakers in the Upper Subansiri, Shiyomi, and in some parts Kara Dadi, and Kurung Kumey districts of Arunachal Pradesh, India. While about 63,000 (according to 2011 Census of India) people speak Tagin as their mother tongue, UNESCO has marked it as 'definitely endangered', which means it's at risk of disappearing. Unfortunately, very few written materials exist in Tagin, which makes it hard to study and preserve the language.
72
-
73
  As a small contribution to preserving this rich cultural heritage, I've developed this English-Tagin translator using the GinLish Corpus v0.1.0. By creating this digital tool, I hope to help keep the Tagin language alive and make it more accessible to both the Tagin community and language enthusiasts. This project is my way of giving back to society and helping protect an important piece of our cultural diversity.
74
-
75
  """)
76
 
77
  st.markdown("""### GinLish Corpus v0.1.0 (2024)
78
-
79
  I'm excited:satisfied: to share that I have created the GinLish Corpus v0.1.0, which is actually the first-ever collection of matched Tagin and English sentences. The corpus contains 60,000 carefully paired sentences that captures how these languages relate to each other. To build this, I translated English sentences from the Tatoeba website into Tagin and included traditional Tagin folk stories too.
80
  What makes this special is that I made sure to keep the true essence of the Tagin language alive in the translations. This means including Tagin sayings, cultural elements, and the unique way Tagin people express themselves. All this careful attention to detail makes the corpus really valuable for building translation tools, studying the language, and helping people learn Tagin. It's not just a simple word-for-word translation - it's a bridge between these two languages that respects and preserves Tagin's cultural identity.
81
-
82
  Good news:smiley: for researchers and language enthusiasts - I plan to release this dataset for non-commercial use once I complete my PhD!🎓 This way, others can also contribute to preserving and studying this beautiful language.
83
-
84
  """)
85
 
86
  # Sidebar for About and Contact information
@@ -88,13 +103,9 @@ st.sidebar.header("About the Developer")
88
 
89
  st.sidebar.markdown("""
90
  Hey there! 👋
91
-
92
  I’m **Tungon Dugi**.
93
-
94
  Right now, I’m doing my PhD in Computer Science and Engineering at NIT Arunachal Pradesh. 🎓
95
-
96
  I’ve got a keen interest in Natural Language Processing (NLP), Machine Translation (MT), Deep Learning, and Linguistics.
97
-
98
  πŸ’»βœ¨ I love exploring how tech can help preserve and promote low-resource languages, especially my own language, **Tagin**! πŸŒπŸ’¬""")
99
 
100
  # Create some space between main sidebar content and footer
@@ -106,6 +117,6 @@ st.sidebar.caption("Contact: tungondugi@gmail.com")
106
  # Or using columns in sidebar:
107
  col1, col2 = st.sidebar.columns(2)
108
  with col1:
109
- st.caption("© 2024")
110
  with col2:
111
- st.caption("v0.1.0")
 
 
 
1
  import streamlit as st
2
  import torch
3
+ from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
4
 
5
  # Set the background color and layout with set_page_config
6
  st.set_page_config(
 
9
  layout="wide",
10
  )
11
 
12
+ # --- Model Loading with Caching ---
13
+ # We use @st.cache_resource so the model loads only once, not on every user action.
14
+ @st.cache_resource
15
+ def load_model():
16
+ model_name = "Repleeka/mBART-tgj-final"
17
+ tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
18
+ model = MBartForConditionalGeneration.from_pretrained(model_name)
19
+ return tokenizer, model
20
+
21
+ # Load model and tokenizer
22
+ try:
23
+ tokenizer, model = load_model()
24
+ # Set the source language to English
25
+ tokenizer.src_lang = "en_XX"
26
+ except Exception as e:
27
+ st.error(f"Error loading model: {e}")
28
+
29
+ # Determine device (GPU or CPU)
30
+ device = "cuda" if torch.cuda.is_available() else "cpu"
31
+ if 'model' in locals():
32
+ model = model.to(device)
33
+
34
+ # --- Streamlit App Layout ---
35
+
36
  st.title(":repeat: English to Tagin Translator")
37
  st.markdown("Welcome to the English to Tagin Translator. :sparkles: Simply enter your text in English, and get the translation in Tagin instantly! :thumbsup:")
38
 
 
41
  st.session_state.text_input = ""
42
  text_input = st.text_area("Enter English text to translate", height=150, value=st.session_state.text_input)
43
 
 
 
 
 
 
 
 
 
 
 
 
44
  # Translate button
45
  if st.button("Translate", key="translate_button"):
46
  if text_input:
47
  with st.spinner("Translating... Please wait"):
 
 
 
 
 
48
  try:
49
+ # 1. Tokenize the input text
50
+ inputs = tokenizer(text_input, return_tensors="pt")
51
 
52
+ # Move inputs to the correct device (GPU/CPU)
53
+ inputs = {k: v.to(device) for k, v in inputs.items()}
54
+
55
+ # 2. Generate translation using the specific Tagin token ID
56
+ generated_tokens = model.generate(
57
+ **inputs,
58
+ forced_bos_token_id=tokenizer.convert_tokens_to_ids("<tgj_IN>"),
59
+ num_beams=5,
60
+ max_length=128,
61
+ )
62
+
63
+ # 3. Decode the generated tokens
64
+ result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
65
+
66
+ # Capitalize the first letter of the result for aesthetics
67
+ if result:
68
+ result = result[0].upper() + result[1:]
69
 
70
  # Display translation result with custom styling
71
  st.markdown("#### Translated text:")
 
79
  # Clear input button
80
  if st.button("Clear Input"):
81
  st.session_state.text_input = ""
82
+ st.rerun() # Rerun to refresh the text area immediately
83
 
84
+ # --- Static Content ---
85
 
86
+ st.markdown("""❗❗❗ **Please note:** The English-to-Tagin translator is still in its initial phase, so it may provide incorrect translations at times. Your understanding is appreciated!
87
  🤝 **For contributions or inquiries, feel free to contact me!**
 
88
  """)
89
 
 
90
  st.markdown("""### Tagin Language
 
91
  Tagin is a beautiful language spoken by the Tagin tribe and belongs to the Tani group of Sino-Tibetan languages. You'll mainly find Tagin speakers in the Upper Subansiri, Shiyomi, and in some parts Kara Dadi, and Kurung Kumey districts of Arunachal Pradesh, India. While about 63,000 (according to 2011 Census of India) people speak Tagin as their mother tongue, UNESCO has marked it as 'definitely endangered', which means it's at risk of disappearing. Unfortunately, very few written materials exist in Tagin, which makes it hard to study and preserve the language.
 
92
  As a small contribution to preserving this rich cultural heritage, I've developed this English-Tagin translator using the GinLish Corpus v0.1.0. By creating this digital tool, I hope to help keep the Tagin language alive and make it more accessible to both the Tagin community and language enthusiasts. This project is my way of giving back to society and helping protect an important piece of our cultural diversity.
 
93
  """)
94
 
95
  st.markdown("""### GinLish Corpus v0.1.0 (2024)
 
96
  I'm excited:satisfied: to share that I have created the GinLish Corpus v0.1.0, which is actually the first-ever collection of matched Tagin and English sentences. The corpus contains 60,000 carefully paired sentences that captures how these languages relate to each other. To build this, I translated English sentences from the Tatoeba website into Tagin and included traditional Tagin folk stories too.
97
  What makes this special is that I made sure to keep the true essence of the Tagin language alive in the translations. This means including Tagin sayings, cultural elements, and the unique way Tagin people express themselves. All this careful attention to detail makes the corpus really valuable for building translation tools, studying the language, and helping people learn Tagin. It's not just a simple word-for-word translation - it's a bridge between these two languages that respects and preserves Tagin's cultural identity.
 
98
  Good news:smiley: for researchers and language enthusiasts - I plan to release this dataset for non-commercial use once I complete my PhD!🎓 This way, others can also contribute to preserving and studying this beautiful language.
 
99
  """)
100
 
101
  # Sidebar for About and Contact information
 
103
 
104
  st.sidebar.markdown("""
105
  Hey there! 👋
 
106
  I’m **Tungon Dugi**.
 
107
  Right now, I’m doing my PhD in Computer Science and Engineering at NIT Arunachal Pradesh. 🎓
 
108
  I’ve got a keen interest in Natural Language Processing (NLP), Machine Translation (MT), Deep Learning, and Linguistics.
 
109
  πŸ’»βœ¨ I love exploring how tech can help preserve and promote low-resource languages, especially my own language, **Tagin**! πŸŒπŸ’¬""")
110
 
111
  # Create some space between main sidebar content and footer
 
117
  # Or using columns in sidebar:
118
  col1, col2 = st.sidebar.columns(2)
119
  with col1:
120
+ st.caption("© 2026")
121
  with col2:
122
+ st.caption("v0.1.1")