ErikDaska committed on
Commit
416e378
·
verified ·
1 Parent(s): 070212f

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +106 -135
src/streamlit_app.py CHANGED
@@ -3,162 +3,133 @@ import transformers
3
  from transformers import pipeline
4
  import os
5
 
6
- # Read token from environment (set as secret in Space settings)
7
- token = os.environ.get("token")
8
 
 
 
9
 
10
- def instantiate_gpt2(model_name: str,max_length_ : int, num_return_sequences : int, text : str) -> dict:
11
- pipe = pipeline(task='text-generation', model=f'Iscte-Sintra/{model_name}', tokenizer=f'Iscte-Sintra/{model_name}',
12
- token=token, truncation=True, device_map="cpu")
13
- if model_name == "Qwen_v0.1":
14
- results = pipe(
15
- text,
16
- max_new_tokens=max_length_,
17
- num_return_sequences=num_return_sequences,
18
- do_sample=True,
19
- top_p=0.95,
20
- top_k=50
21
- )
 
 
 
22
  else:
23
- results = pipe(
24
- text,
25
- max_length=max_length_,
26
- num_return_sequences=num_return_sequences,
27
- do_sample=True,
28
- top_p=0.95,
29
- top_k=50
30
- )
31
- return results
32
-
33
- def instantiate_encoder(model_name: str, top_k : int, text : str) -> dict:
34
- pipe = pipeline("fill-mask", model=f"Iscte-Sintra/{model_name}", tokenizer=f"Iscte-Sintra/{model_name}", token=token)
35
  return pipe(text, top_k=top_k)
36
 
37
- def instantiate_translation_model(model_name: str, text: str, selected_input_lg:str, selected_output_lg:str) -> dict:
38
- if model_name=="Modelo-Traducao-kea-ptpt-v1.0" or model_name=="mbart-v0.2":
39
- # Initialize the translation pipeline
40
- pipe = pipeline(
41
- "translation",
42
- model=f'Iscte-Sintra/{model_name}',
43
- tokenizer=f'Iscte-Sintra/{model_name}',
44
- token=token,
45
- use_fast=False,
46
- src_lang=selected_input_lg,
47
- tgt_lang=selected_output_lg
48
- )
49
-
50
- elif model_name=="m2m100-v1.0":
51
- # Initialize the translation pipeline
52
- pipe = pipeline(
53
- "translation",
54
- model=f'Iscte-Sintra/{model_name}',
55
- tokenizer=f'Iscte-Sintra/{model_name}',
56
- token=token,
57
- use_fast=False,
58
- src_lang="en", # source: Kabuverdianu
59
- tgt_lang="pt" # target: Portuguese
60
- )
61
 
62
  result = pipe(text)
63
  return result[0]["translation_text"]
64
 
 
65
 
66
  def build_translation_page(model_name):
67
- try:
68
- st.title(f"{model_name} : Tarefa de Tradução (Kabuverdianu → Português)")
 
 
 
 
 
69
 
70
- text = st.text_area("Introduza texto em Kabuverdiano", "Katxór sta trás di pórta.", height=100)
71
-
72
- input_supported_languages = {"pt": "pt_XX", "kea": "en_XX"}
73
- selected_input_lg = st.sidebar.selectbox("Língua (Entrada)", list(input_supported_languages.keys()))
74
- selected_output_lg = st.sidebar.selectbox("Lingua (Saída)", list(input_supported_languages.keys()))
 
 
 
 
 
 
 
75
 
76
- if st.button("Traduzir"):
77
- if not text.strip():
78
- st.warning("Por favor, introduza texto para ser traduzido!", icon="⚠️")
79
- return
80
-
81
- # Call your translation function
82
- result = instantiate_translation_model(model_name, text, input_supported_languages[selected_input_lg], input_supported_languages[selected_output_lg])
83
-
84
- if result:
85
- st.subheader("Texto Traduzido (Português)")
86
  st.write(result)
87
-
88
- except Exception as e:
89
- st.warning("Ocorreu um erro durante a tradução", icon="⚠️")
90
- st.warning(e)
91
-
92
 
93
  def build_decoder_page(model_name):
94
- try:
95
- st.title(f"{model_name} - Tarefa Geração de Texto")
96
- max_length : int = st.sidebar.slider("Tamanho Máximo da frase", 10, 200)
97
- num_return_sequences : int = st.sidebar.number_input('Número de sequências/frases desejadas', min_value=1, max_value=10, value=1, step=1)
98
- text : str = st.text_area("Texto", "Katxór sta trás di pórta.", height=75)
99
-
100
- if st.button("Submeter"):
101
- results = instantiate_gpt2(model_name ,max_length, num_return_sequences, text)
102
- if results:
103
- for result in results:
104
- st.write(f"**Texto Gerado:**: {result['generated_text']}")
105
- except Exception as e:
106
- st.warning('Tamanho máximo de tokens deve ser maior do que o número de tokens presentes na frase atual!', icon="⚠️")
107
- st.warning(e)
108
-
109
- def build_encoder_page(model_name:str):
110
-
111
- st.title(f"{model_name} - Tarefa Fill-Mask")
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
- top_k = st.sidebar.number_input('Número de sequências/frases desejadas', min_value=1, max_value=5, value=1, step=1)
114
 
115
- results = None
 
 
 
 
 
 
 
116
 
117
- col1, col2 = st.columns(2)
 
118
 
119
- with col1:
120
- st.subheader("Texto")
121
- if model_name=="Albertina-Kriolu":
122
- st.write("Digite uma frase com um token **[MASK]**, e o modelo irá prever a palavra em falta.")
123
- input_text = st.text_input("Frase de entrada", "Katxór sta trás di [MASK].")
124
- else:
125
- st.write("Digite uma frase com um token **<MASK>**, e o modelo irá prever a palavra em falta.")
126
- input_text = st.text_input("Frase de entrada", "Katxór sta trás di <mask>.")
127
-
128
- submit = st.button("Submeter")
129
- try:
130
- if submit and input_text:
131
- results = instantiate_encoder(model_name, top_k, input_text)
132
- except Exception as e:
133
- st.warning('Atenção, deve de haver um token especial "<mask>" na frase!', icon="⚠️")
134
- st.warning(e)
135
-
136
- with col2:
137
- st.subheader("Previsões")
138
- if results:
139
- predicted_text = st.text_input("Token Previsto", value=results[0]['sequence'], disabled=True)
140
- for result in results:
141
- st.write(f"**Previsão**: {result['token_str']} | **Confiança**: {round(result['score'], 4)}")
142
- else:
143
- predicted_text = st.text_input("Token previsto", disabled=True)
144
-
145
-
146
- # Your dictionary of models
147
- model_dict = {'RoBERTa-Kriolu': "Encoder",
148
- "GPT2_v1.18":"Decoder",
149
- "LLM-kea-v1.0": "Decoder",
150
- "Modelo-Traducao-kea-ptpt-v1.0": "Encoder-Decoder",
151
- "nllb-v1.0": "Encoder-Decoder",
152
- "m2m100-v1.0": "Encoder-Decoder"
153
- }
154
-
155
- # Always appears at the top of the sidebar
156
- selected_model = st.sidebar.selectbox("Arquitetura", list(model_dict.keys()))
157
-
158
- if model_dict[selected_model] == "Encoder":
159
  build_encoder_page(selected_model)
160
-
161
- elif model_dict[selected_model] == "Encoder-Decoder":
162
  build_translation_page(selected_model)
163
  else:
164
  build_decoder_page(selected_model)
 
3
  from transformers import pipeline
4
  import os
5
 
6
# Configure the Streamlit page for a wider layout.
# NOTE: st.set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="Kriolu AI Hub", layout="wide")

# Read the Hugging Face access token from the environment
# (set as a secret named "token" in the Space settings).
token = os.environ.get("token")
11
 
12
# --- Model Loading with Caching ---
# st.cache_resource keeps one pipeline object per (task, model_path, kwargs)
# combination, so the model is not reloaded on every widget interaction.
@st.cache_resource
def load_pipeline(task, model_path, **pipeline_kwargs):
    """Build and cache a Hugging Face pipeline for *task* backed by *model_path*."""
    return pipeline(
        task,
        model=model_path,
        tokenizer=model_path,
        token=token,
        **pipeline_kwargs,
    )
17
+
18
def instantiate_gpt2(model_name: str, max_length_: int, num_return_sequences: int, text: str):
    """Generate text with an Iscte-Sintra decoder model.

    model_name: repo name under the Iscte-Sintra HF organization.
    max_length_: length budget; passed as max_new_tokens (generated tokens
        only) for Qwen models, otherwise as max_length (prompt + generation).
    num_return_sequences: number of sampled generations to return.
    text: the prompt.

    Returns the pipeline output (a list of {"generated_text": ...} dicts,
    as consumed by build_decoder_page).
    """
    model_path = f'Iscte-Sintra/{model_name}'
    # NOTE(review): an earlier comment claimed device_map="auto" is used, but
    # no device_map is passed here — the pipeline default applies. Confirm intent.
    pipe = load_pipeline('text-generation', model_path)

    # Qwen models take max_new_tokens; the other decoders take max_length,
    # which must exceed the prompt's token count or generation fails.
    if "Qwen" in model_name:
        return pipe(text, max_new_tokens=max_length_, num_return_sequences=num_return_sequences,
                    do_sample=True, top_p=0.95, top_k=50)
    else:
        return pipe(text, max_length=max_length_, num_return_sequences=num_return_sequences,
                    do_sample=True, top_p=0.95, top_k=50)
30
+
31
def instantiate_encoder(model_name: str, top_k: int, text: str):
    """Run fill-mask inference and return the top_k predictions for the mask."""
    fill_mask = load_pipeline("fill-mask", f"Iscte-Sintra/{model_name}")
    return fill_mask(text, top_k=top_k)
34
 
35
def instantiate_translation_model(model_name: str, text: str, src_lg: str, tgt_lg: str):
    """Translate *text* with the given Iscte-Sintra translation model.

    model_name: repo name under the Iscte-Sintra HF organization.
    text: input sentence.
    src_lg / tgt_lg: language codes already in the format the selected model
        expects (NLLB-style "kea_Latn"/"por_Latn" or MBart-style
        "pt_XX"/"en_XX"), as produced by build_translation_page's lang_map.

    Returns the translated string.
    """
    model_path = f'Iscte-Sintra/{model_name}'

    # BUG FIX: the previous NLLB branch tested `"en" in src_lg` and
    # `"pt" in tgt_lg`, which is never true for the NLLB codes the caller
    # supplies ("kea_Latn"/"por_Latn"), so NLLB always translated por -> kea
    # regardless of the user's selection. The caller already maps labels to
    # model-appropriate codes, so pass them through unchanged.
    # Also use the cached loader for consistency with the other
    # instantiate_* helpers (avoids reloading the model on every click).
    pipe = load_pipeline("translation", model_path, src_lang=src_lg, tgt_lang=tgt_lg)

    result = pipe(text)
    return result[0]["translation_text"]
51
 
52
+ # --- UI Build Functions ---
53
 
54
def build_translation_page(model_name):
    """Render the translation UI (Kabuverdianu <-> Portuguese) for *model_name*."""
    st.title(f"🌍 {model_name}: Tradução")

    # Pick the language-code scheme the selected model was trained with.
    nllb_codes = {"Português": "por_Latn", "Kabuverdianu": "kea_Latn"}
    mbart_codes = {"Português": "pt_XX", "Kabuverdianu": "en_XX"}
    lang_map = nllb_codes if "nllb" in model_name else mbart_codes
    labels = list(lang_map.keys())

    left, right = st.columns(2)
    src_label = left.selectbox("Língua de Origem", labels, index=1)
    tgt_label = right.selectbox("Língua de Destino", labels, index=0)

    text = st.text_area("Texto de entrada", "Katxór sta trás di pórta.", height=100)

    if not st.button("Traduzir"):
        return
    if not text.strip():
        st.warning("Introduza texto!")
        return

    with st.spinner("A traduzir..."):
        try:
            result = instantiate_translation_model(
                model_name, text, lang_map[src_label], lang_map[tgt_label]
            )
            st.success("Resultado:")
            st.write(result)
        except Exception as e:
            st.error(f"Erro: {e}")
 
 
 
83
 
84
def build_decoder_page(model_name):
    """Render the free-text generation UI for a decoder-only model."""
    st.title(f"✍️ {model_name}: Geração de Texto")

    # Generation controls live in the sidebar.
    max_length = st.sidebar.slider("Máximo de Tokens", 10, 200, 50)
    num_seq = st.sidebar.number_input('Sequências', 1, 5, 1)
    text = st.text_area("Prompt", "Katxór sta trás di pórta.")

    if not st.button("Gerar"):
        return
    with st.spinner("A processar..."):
        try:
            for generation in instantiate_gpt2(model_name, max_length, num_seq, text):
                st.info(generation['generated_text'])
        except Exception as e:
            st.error(f"Erro: {e}")
98
+
99
def build_encoder_page(model_name):
    """Render the fill-mask UI: predict the word hidden behind the mask token."""
    st.title(f"🔍 {model_name}: Fill-Mask")
    top_k = st.sidebar.slider("Top K sugestões", 1, 5, 3)

    # RoBERTa-style tokenizers expect "<mask>"; the other encoders use "[MASK]".
    mask_token = "<mask>" if "RoBERTa" in model_name else "[MASK]"
    st.write(f"Use o token **{mask_token}** para a palavra em falta.")

    input_text = st.text_input("Frase", f"Katxór sta trás di {mask_token}.")

    if not st.button("Prever"):
        return
    try:
        for prediction in instantiate_encoder(model_name, top_k, input_text):
            st.write(f"✅ **{prediction['token_str']}** (Confiança: {prediction['score']:.2%})")
    except Exception:
        # Most likely the user removed the mask token from the sentence.
        st.error(f"Certifique-se que usou o token {mask_token}")
115
 
116
# --- Main App Logic ---

# Model repo name -> architecture family; the family decides which page to render.
model_dict = {
    'RoBERTa-Kriolu': "Encoder",
    "GPT2_v1.18": "Decoder",
    "LLM-kea-v1.0": "Decoder",
    "Modelo-Traducao-kea-ptpt-v1.0": "Encoder-Decoder",
    "nllb-v1.0": "Encoder-Decoder",
    "m2m100-v1.0": "Encoder-Decoder",
}

selected_model = st.sidebar.selectbox("Escolha o Modelo", list(model_dict.keys()))
arch = model_dict[selected_model]

# Route to the page builder for the selected architecture;
# anything that is not an Encoder or Encoder-Decoder falls back to the decoder page.
page_builders = {
    "Encoder": build_encoder_page,
    "Encoder-Decoder": build_translation_page,
}
page_builders.get(arch, build_decoder_page)(selected_model)