import streamlit as st
from transformers import pipeline
import os

# ---------------- CONFIG ----------------
st.set_page_config(page_title="Kriolu AI Hub", layout="wide")

# Hugging Face access token for the private "Iscte-Sintra" models.
# NOTE(review): read from env var "token" — unconventional name (HF_TOKEN is
# the usual one); kept as-is because the deployment presumably sets it.
token = os.environ.get("token")


# ---------------- CACHE ----------------
@st.cache_resource
def load_pipeline(task, model_path, **kwargs):
    """Build and cache a transformers pipeline.

    Cached per (task, model_path, kwargs) by st.cache_resource so each model
    is loaded from the Hub only once per server process.
    """
    return pipeline(task, model=model_path, tokenizer=model_path, token=token, **kwargs)


# ---------------- DECODER ----------------
def instantiate_gpt2(model_name, max_length_, num_return_sequences, text):
    """Run sampled text generation with a decoder model from Iscte-Sintra.

    Returns the raw pipeline output: a list of dicts with "generated_text".
    """
    model_path = f'Iscte-Sintra/{model_name}'
    pipe = load_pipeline("text-generation", model_path)
    return pipe(
        text,
        max_new_tokens=max_length_,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_p=0.95,
        top_k=50,
    )


# ---------------- ENCODER ----------------
def instantiate_encoder(model_name, top_k, text):
    """Run fill-mask with an encoder model; returns the top_k suggestions."""
    pipe = load_pipeline("fill-mask", f"Iscte-Sintra/{model_name}")
    return pipe(text, top_k=top_k)


# ---------------- TRANSLATION ----------------
def instantiate_translation_model(model_name, text, src_lg, tgt_lg):
    """Translate `text` from `src_lg` to `tgt_lg` with the selected model.

    `src_lg` / `tgt_lg` are model-specific language codes (see the lang_map
    in build_translation_page). Returns the translated string, or None when
    the target language token is unknown to an M2M100 tokenizer.
    """
    model_path = f'Iscte-Sintra/{model_name}'

    # ---- M2M100 ----
    if "m2m100" in model_name:
        pipe = load_pipeline("translation", model_path)

        # 1. Set the source language. On customized models, src_lang sometimes
        #    needs to be the full language token (e.g. "__pt__").
        pipe.tokenizer.src_lang = src_lg

        # 2. Resolve the numeric ID of the target-language token (e.g. __pt__).
        #    convert_tokens_to_ids is used because it bypasses the tokenizer's
        #    internal language-lookup logic.
        tgt_lang_id = pipe.tokenizer.convert_tokens_to_ids(tgt_lg)
        if tgt_lang_id == pipe.tokenizer.unk_token_id:
            st.error(f"Erro: O token {tgt_lg} não foi encontrado no vocabulário do modelo!")
            return None

        # 3. Translate, forcing the target language as the BOS token.
        result = pipe(text, forced_bos_token_id=tgt_lang_id)
        return result[0]["translation_text"]

    # ---- NLLB / MBART ----
    # Both take src_lang / tgt_lang at pipeline construction time; the two
    # branches were identical, so they share one cached code path here.
    pipe = load_pipeline("translation", model_path, src_lang=src_lg, tgt_lang=tgt_lg)
    return pipe(text)[0]["translation_text"]


# ---------------- UI: TRANSLATION ----------------
def build_translation_page(model_name):
    """Render the translation page: language pickers, input box, result."""
    st.title(f"🌍 {model_name}: Tradução")

    # Each model family uses its own language-code scheme.
    if "nllb" in model_name:
        lang_map = {
            "Português": "por_Latn",
            "Kabuverdianu": "kea_Latn",
        }
    elif "m2m100" in model_name:
        lang_map = {
            "Português": "__pt__",
            "Kabuverdianu": "__en__",  # Proxying kea as __en__
        }
    else:  # mBART
        lang_map = {
            "Português": "pt_XX",
            "Kabuverdianu": "en_XX",
        }

    col1, col2 = st.columns(2)
    with col1:
        src_label = st.selectbox("Língua de Origem", list(lang_map.keys()))
    with col2:
        tgt_label = st.selectbox("Língua de Destino", list(lang_map.keys()))

    text = st.text_area("Texto de entrada", "Katxór sta trás di pórta.", height=100)

    if st.button("Traduzir"):
        if not text.strip():
            st.warning("Introduza texto!")
            return
        with st.spinner("A traduzir..."):
            try:
                result = instantiate_translation_model(
                    model_name, text, lang_map[src_label], lang_map[tgt_label]
                )
                st.success("Resultado:")
                st.write(result)
            except Exception as e:
                st.error(f"Erro: {e}")


# ---------------- UI: DECODER ----------------
def build_decoder_page(model_name):
    """Render the text-generation page with sidebar sampling controls."""
    st.title(f"✍️ {model_name}: Geração de Texto")
    max_length = st.sidebar.slider("Máximo de Tokens", 10, 200, 50)
    num_seq = st.sidebar.number_input("Sequências", 1, 5, 1)
    text = st.text_area("Prompt", "Katxór sta trás di pórta.")

    if st.button("Gerar"):
        with st.spinner("A processar..."):
            try:
                results = instantiate_gpt2(model_name, max_length, num_seq, text)
                for res in results:
                    st.info(res["generated_text"])
            except Exception as e:
                st.error(f"Erro: {e}")


# ---------------- UI: ENCODER ----------------
def build_encoder_page(model_name):
    """Render the fill-mask page for encoder models."""
    st.title(f"🔍 {model_name}: Fill-Mask")
    top_k = st.sidebar.slider("Top K sugestões", 1, 5, 3)

    # BUG FIX: the mask token for RoBERTa-family models was "" (empty), which
    # made the hint text blank and the default prompt contain no mask at all,
    # so prediction always failed. RoBERTa tokenizers use "<mask>".
    mask_token = "<mask>" if "RoBERTa" in model_name else "[MASK]"
    st.write(f"Use o token **{mask_token}** para a palavra em falta.")
    input_text = st.text_input("Frase", f"Katxór sta trás di {mask_token}.")

    if st.button("Prever"):
        try:
            results = instantiate_encoder(model_name, top_k, input_text)
            for res in results:
                st.write(f"✅ **{res['token_str']}** ({res['score']:.2%})")
        except Exception:
            # Best-effort: the most common failure is a missing/typoed mask token.
            st.error(f"Certifique-se que usou o token {mask_token}")


# ---------------- MAIN ----------------
# Model registry: name -> architecture, which decides the page to render.
model_dict = {
    "RoBERTa-Kriolu": "Encoder",
    "GPT2_v1.18": "Decoder",
    "LLM-kea-v1.0": "Decoder",
    "Modelo-Traducao-kea-ptpt-v1.0": "Encoder-Decoder",
    "nllb-v1.0": "Encoder-Decoder",
    "m2m100-v1.0": "Encoder-Decoder",
    "mbart-v2.0": "Encoder-Decoder",
    "m2m100-v2.0": "Encoder-Decoder",
    "mbart-v2.1": "Encoder-Decoder",
    "nllb-v2.0": "Encoder-Decoder",
}

selected_model = st.sidebar.selectbox("Escolha o Modelo", list(model_dict.keys()))
arch = model_dict[selected_model]

if arch == "Encoder":
    build_encoder_page(selected_model)
elif arch == "Encoder-Decoder":
    build_translation_page(selected_model)
else:
    build_decoder_page(selected_model)