File size: 6,237 Bytes
c531824 cb232b1 9ccf6e6 2022753 416e378 cb232b1 2022753 416e378 2022753 416e378 2022753 416e378 3fc0fa8 2022753 416e378 2022753 416e378 2022753 f2eb0c4 1cca777 f2eb0c4 1cca777 f2eb0c4 1cca777 2022753 f2eb0c4 2022753 416e378 2022753 10ff159 2022753 10ff159 416e378 55ad4cc 416e378 55ad4cc f2eb0c4 55ad4cc 45fd469 55ad4cc 10ff159 416e378 2022753 416e378 2022753 416e378 2022753 416e378 2022753 416e378 2022753 416e378 2cdd040 416e378 10ff159 2022753 b6dc44c 416e378 2022753 416e378 2022753 416e378 2022753 416e378 2022753 416e378 2022753 416e378 2022753 416e378 cb232b1 2022753 416e378 2022753 416e378 d5fdb26 a6a5f47 416e378 cb232b1 416e378 cb232b1 416e378 c6ea9d6 416e378 10ff159 cb232b1 2022753 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 | import streamlit as st
from transformers import pipeline
import os
# ---------------- CONFIG ----------------
# Page chrome must be configured before any other Streamlit call.
st.set_page_config(page_title="Kriolu AI Hub", layout="wide")
# Hugging Face access token read from the env var literally named "token";
# None when unset, in which case pipelines load anonymously.
# NOTE(review): unconventional env-var name — HF_TOKEN is the usual choice; confirm.
token = os.environ.get("token")
# ---------------- CACHE ----------------
@st.cache_resource
def load_pipeline(task, model_path, **kwargs):
    """Build a Hugging Face pipeline and cache it for the server process.

    st.cache_resource guarantees each (task, model_path, kwargs)
    combination is only loaded once across Streamlit reruns.
    """
    return pipeline(
        task,
        model=model_path,
        tokenizer=model_path,
        token=token,
        **kwargs,
    )
# ---------------- DECODER ----------------
def instantiate_gpt2(model_name, max_length_, num_return_sequences, text):
    """Run sampled text generation on an Iscte-Sintra decoder model.

    Returns the raw pipeline output: a list of dicts with a
    "generated_text" key, one per requested sequence.
    """
    generator = load_pipeline("text-generation", f'Iscte-Sintra/{model_name}')
    # Nucleus + top-k sampling with the UI-selected length/sequence count.
    sampling_kwargs = dict(
        max_new_tokens=max_length_,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_p=0.95,
        top_k=50,
    )
    return generator(text, **sampling_kwargs)
# ---------------- ENCODER ----------------
def instantiate_encoder(model_name, top_k, text):
    """Return the top_k fill-mask predictions for *text*."""
    model_path = f'Iscte-Sintra/{model_name}'
    masked_lm = load_pipeline("fill-mask", model_path)
    return masked_lm(text, top_k=top_k)
# ---------------- TRANSLATION ----------------
def instantiate_translation_model(model_name, text, src_lg, tgt_lg):
    """Translate *text* from src_lg to tgt_lg with an Iscte-Sintra model.

    Parameters
    ----------
    model_name : hub repo name; the family ("m2m100", "nllb", mBART) is
        detected by substring and decides how language codes are applied.
    text : input sentence.
    src_lg, tgt_lg : language codes in the family's own convention
        (e.g. "por_Latn", "__pt__", "pt_XX").

    Returns the translated string, or None when an M2M100 target-language
    token is missing from the model vocabulary (an error is shown in the UI).
    """
    model_path = f'Iscte-Sintra/{model_name}'

    # ---- M2M100 ----
    if "m2m100" in model_name:
        pipe = load_pipeline("translation", model_path)
        # On customized models the src_lang sometimes has to be the full token.
        pipe.tokenizer.src_lang = src_lg
        # Resolve the numeric ID of the target token (e.g. __pt__) directly;
        # convert_tokens_to_ids bypasses the tokenizer's internal
        # language-lookup logic.
        tgt_lang_id = pipe.tokenizer.convert_tokens_to_ids(tgt_lg)
        if tgt_lang_id == pipe.tokenizer.unk_token_id:
            st.error(f"Erro: O token {tgt_lg} não foi encontrado no vocabulário do modelo!")
            return None
        # Force the decoder to start generation with the target-language
        # token (forced BOS).
        result = pipe(
            text,
            forced_bos_token_id=tgt_lang_id
        )
        return result[0]["translation_text"]

    # ---- NLLB / mBART ----
    # Both families take src_lang/tgt_lang directly. The two original
    # branches were identical; routing them through load_pipeline also
    # caches the model instead of reloading it on every button press.
    pipe = load_pipeline(
        "translation",
        model_path,
        src_lang=src_lg,
        tgt_lang=tgt_lg,
    )
    return pipe(text)[0]["translation_text"]
# ---------------- UI: TRANSLATION ----------------
def build_translation_page(model_name):
    """Render the Streamlit translation page for *model_name*."""
    st.title(f"🌍 {model_name}: Tradução")

    # Each model family exposes its own language-code convention.
    if "nllb" in model_name:
        lang_map = {"Português": "por_Latn", "Kabuverdianu": "kea_Latn"}
    elif "m2m100" in model_name:
        # Kabuverdianu is proxied through the __en__ token.
        lang_map = {"Português": "__pt__", "Kabuverdianu": "__en__"}
    else:  # mBART
        lang_map = {"Português": "pt_XX", "Kabuverdianu": "en_XX"}

    labels = list(lang_map.keys())
    col1, col2 = st.columns(2)
    with col1:
        src_label = st.selectbox("Língua de Origem", labels)
    with col2:
        tgt_label = st.selectbox("Língua de Destino", labels)

    text = st.text_area("Texto de entrada", "Katxór sta trás di pórta.", height=100)

    if st.button("Traduzir"):
        if not text.strip():
            st.warning("Introduza texto!")
            return
        with st.spinner("A traduzir..."):
            try:
                translated = instantiate_translation_model(
                    model_name, text, lang_map[src_label], lang_map[tgt_label]
                )
                st.success("Resultado:")
                st.write(translated)
            except Exception as e:
                st.error(f"Erro: {e}")
# ---------------- UI: DECODER ----------------
def build_decoder_page(model_name):
    """Render the Streamlit text-generation page for *model_name*."""
    st.title(f"✍️ {model_name}: Geração de Texto")

    # Sampling controls live in the sidebar.
    max_length = st.sidebar.slider("Máximo de Tokens", 10, 200, 50)
    num_seq = st.sidebar.number_input("Sequências", 1, 5, 1)

    text = st.text_area("Prompt", "Katxór sta trás di pórta.")

    if not st.button("Gerar"):
        return
    with st.spinner("A processar..."):
        try:
            for generated in instantiate_gpt2(model_name, max_length, num_seq, text):
                st.info(generated["generated_text"])
        except Exception as e:
            st.error(f"Erro: {e}")
# ---------------- UI: ENCODER ----------------
def build_encoder_page(model_name):
    """Render the Streamlit fill-mask page for *model_name*."""
    st.title(f"🔍 {model_name}: Fill-Mask")
    top_k = st.sidebar.slider("Top K sugestões", 1, 5, 3)
    # RoBERTa-style tokenizers use <mask>; BERT-style use [MASK].
    mask_token = "<mask>" if "RoBERTa" in model_name else "[MASK]"
    st.write(f"Use o token **{mask_token}** para a palavra em falta.")
    input_text = st.text_input("Frase", f"Katxór sta trás di {mask_token}.")
    if st.button("Prever"):
        try:
            results = instantiate_encoder(model_name, top_k, input_text)
            for res in results:
                st.write(f"✅ **{res['token_str']}** ({res['score']:.2%})")
        except Exception as e:
            # Surface the real error (consistent with the other pages) instead
            # of swallowing it; keep the mask-token hint as the likely cause.
            st.error(f"Erro: {e}")
            st.warning(f"Certifique-se que usou o token {mask_token}")
# ---------------- MAIN ----------------
# Model catalogue: display name -> architecture family.
model_dict = {
    "RoBERTa-Kriolu": "Encoder",
    "GPT2_v1.18": "Decoder",
    "LLM-kea-v1.0": "Decoder",
    "Modelo-Traducao-kea-ptpt-v1.0": "Encoder-Decoder",
    "nllb-v1.0": "Encoder-Decoder",
    "m2m100-v1.0": "Encoder-Decoder",
    "mbart-v2.0": "Encoder-Decoder",
    "m2m100-v2.0": "Encoder-Decoder",
    "mbart-v2.1": "Encoder-Decoder",
    "nllb-v2.0": "Encoder-Decoder",
}

selected_model = st.sidebar.selectbox("Escolha o Modelo", list(model_dict.keys()))

# Dispatch to the page builder that matches the model's architecture;
# anything not listed here is a plain decoder.
_page_builders = {
    "Encoder": build_encoder_page,
    "Encoder-Decoder": build_translation_page,
}
_page_builders.get(model_dict[selected_model], build_decoder_page)(selected_model)
|