Spaces:

CEIA-POSITIVO
/

public_chat

Sleeping

App Files Files Community

Daniel Machado Pedrozo committed on Nov 11, 2025

Commit

91c131d

1 Parent(s): 3d96e49

Implement initial project structure with Dockerfile, requirements, and Streamlit app. Added model loading and inference utilities, along with chat management features. Updated entry point and added new dependencies.

Browse files

Files changed (10) hide show

Dockerfile +2 -1
requirements.txt +4 -1
src/app.py +201 -0
src/backend/__init__.py +20 -0
src/backend/chat.py +208 -0
src/backend/chat_model.py +188 -0
src/backend/inference.py +162 -0
src/backend/model_loader.py +138 -0
src/config.py +71 -0
src/streamlit_app.py +0 -40

Dockerfile CHANGED Viewed

@@ -11,10 +11,11 @@ RUN apt-get update && apt-get install -y \
 COPY requirements.txt ./
 COPY src/ ./src/
 RUN pip3 install -r requirements.txt
 EXPOSE 8501
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]

 COPY requirements.txt ./
 COPY src/ ./src/
+RUN pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu
 RUN pip3 install -r requirements.txt
 EXPOSE 8501
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0"]

requirements.txt CHANGED Viewed

@@ -1,3 +1,6 @@
 altair
 pandas
-streamlit

 altair
 pandas
+streamlit
+dotenv
+transformers
+pydantic

src/app.py ADDED Viewed

	@@ -0,0 +1,201 @@

+"""Interface de chat com modelo de linguagem."""
+import streamlit as st
+import base64
+from pathlib import Path
+from backend import load_model, ChatModel
+from config import get_model_options, GATED_MODELS
+st.set_page_config(page_title="Small LLM - Chat", layout="wide")
+# Logo path, resolved relative to the project root (the parent of this file's directory)
+PROJECT_ROOT = Path(__file__).parent.parent
+LOGO_PATH = PROJECT_ROOT / "positivo-logo.png"
+# The logo is treated as optional: if the file cannot be read (for example because it
+# was not shipped with the deployed image), render a text-only header instead of
+# letting an unhandled OSError take down the whole page.
+try:
+    img_base64 = base64.b64encode(LOGO_PATH.read_bytes()).decode()
+except OSError:
+    img_base64 = None
+logo_html = (
+    f'<img src="data:image/png;base64,{img_base64}" />' if img_base64 else ""
+)
+# Header with logo and title, rendered through HTML/CSS for finer layout control
+st.markdown(f"""
+<style>
+.logo-header {{
+    display: flex;
+    align-items: center;
+    gap: 20px;
+    margin-bottom: 0.5rem;
+}}
+.logo-header img {{
+    width: 90px;
+    height: 90px;
+    object-fit: contain;
+    flex-shrink: 0;
+}}
+</style>
+<div class="logo-header">
+    {logo_html}
+    <h1 style="margin: 0; padding: 0; display: inline-block;">Small LLM - Chat</h1>
+</div>
+""", unsafe_allow_html=True)
+# ============================================================================
+# FUNÇÕES AUXILIARES
+# ============================================================================
+def handle_model_load_error(model_name: str, error_msg: str):
+    """
+    Show a Streamlit error for a failed model load.
+    When the model is listed in GATED_MODELS and the error text looks like an
+    authorization failure, setup instructions for HF_TOKEN are shown; otherwise the
+    raw error message is displayed.
+    Args:
+        model_name: Hugging Face model id that failed to load
+        error_msg: Text of the exception raised while loading
+    """
+    lowered = error_msg.lower()
+    auth_hints = ("gated", "access", "restricted")
+    looks_like_auth_failure = "401" in error_msg or any(hint in lowered for hint in auth_hints)
+    # Guard clause: anything that is not a gated-model auth failure gets the generic message
+    if model_name not in GATED_MODELS or not looks_like_auth_failure:
+        st.error(f"❌ Erro ao carregar modelo: {error_msg}")
+        return
+    st.error(
+        f"⚠️ **Modelo gated detectado!**\n\n"
+        f"O modelo `{model_name}` requer autenticação.\n\n"
+        f"**No Hugging Face Spaces:**\n"
+        f"1. Vá em Settings → Repository secrets\n"
+        f"2. Adicione `HF_TOKEN` com seu token do Hugging Face\n"
+        f"3. Aceite os termos em: https://huggingface.co/{model_name}"
+    )
+# ============================================================================
+# INTERFACE DE CHAT
+# ============================================================================
+# Sidebar: model selection, loading and conversation statistics
+with st.sidebar:
+    st.header("⚙️ Configurações")
+    model_options = get_model_options()
+    selected_label = st.selectbox(
+        "Selecione um Modelo",
+        options=[opt[0] for opt in model_options],
+        index=0,
+        help="Modelos pré-selecionados para teste"
+    )
+    # Map the chosen label back to its model id
+    selected_model = next(opt[1] for opt in model_options if opt[0] == selected_label)
+    use_custom = st.checkbox("Usar modelo customizado")
+    if use_custom:
+        model_name = st.text_input(
+            "Nome do Modelo (Hugging Face)",
+            value="gpt2",
+            help="Digite o nome completo do modelo no Hugging Face"
+        )
+    else:
+        model_name = selected_model
+    use_quantization = st.checkbox(
+        "Usar Quantização (8-bit)",
+        value=False,
+        help="Reduz uso de memória, mas pode ser mais lento"
+    )
+    if st.button("🔄 Carregar Modelo", type="primary"):
+        with st.spinner(f"Carregando {model_name}..."):
+            try:
+                pipeline, model_info = load_model(
+                    model_name,
+                    load_in_8bit=use_quantization
+                )
+                chat_model = ChatModel(pipeline)
+                st.session_state.chat_model = chat_model
+                st.session_state.model_info = model_info
+                st.session_state.model_name = model_name
+                st.success("✅ Modelo carregado!")
+                # A freshly loaded model starts with an empty transcript
+                if "messages" in st.session_state:
+                    del st.session_state.messages
+            except Exception as e:
+                handle_model_load_error(model_name, str(e))
+    if "model_info" in st.session_state:
+        st.divider()
+        st.subheader("📊 Informações do Modelo")
+        st.json(st.session_state.model_info)
+        if "chat_model" in st.session_state:
+            chat_model = st.session_state.chat_model
+            st.divider()
+            st.subheader("💭 Estatísticas da Conversa")
+            st.metric("Mensagens", len(chat_model.conversation))
+            if st.button("🗑️ Limpar Histórico", use_container_width=True):
+                chat_model.clear_history()
+                if "messages" in st.session_state:
+                    del st.session_state.messages
+                st.rerun()
+# Main area: chat
+if "chat_model" not in st.session_state:
+    st.info("👈 Use a sidebar para carregar um modelo primeiro.")
+    st.markdown("""
+    ### Modelos disponíveis:
+    **Google Gemma:**
+    - `google/gemma-3-4b-it` - 4 bilhões de parâmetros
+    - `google/gemma-3-1b-it` - 1 bilhão de parâmetros
+    - `google/gemma-3-270m-it` - 270 milhões de parâmetros
+    **Qwen:**
+    - `Qwen/Qwen3-0.6B` - 600 milhões de parâmetros
+    - `Qwen/Qwen2.5-0.5B-Instruct` - 500 milhões (instruct)
+    - `Qwen/Qwen2.5-0.5B` - 500 milhões
+    **Facebook:**
+    - `facebook/MobileLLM-R1-950M` - 950 milhões de parâmetros
+    """)
+else:
+    chat_model = st.session_state.chat_model
+    if "messages" not in st.session_state:
+        st.session_state.messages = []
+    # The conversation object is the source of truth; rebuild the UI transcript if they drift
+    if len(chat_model.conversation.messages) != len(st.session_state.messages):
+        st.session_state.messages = [
+            {"role": msg.role, "content": msg.content}
+            for msg in chat_model.conversation.messages
+        ]
+    chat_container = st.container()
+    with chat_container:
+        for message in st.session_state.messages:
+            role = message["role"]
+            content = message["content"]
+            # System prompts are part of the history but are not shown to the user
+            if role == "system":
+                continue
+            with st.chat_message(role):
+                st.markdown(content)
+    # Message validation rejects blank content, so whitespace-only input is ignored
+    # here instead of raising outside any error handler.
+    if (user_input := st.chat_input("Digite sua mensagem...")) and user_input.strip():
+        chat_model.add_user_message(user_input)
+        st.session_state.messages.append({"role": "user", "content": user_input})
+        with st.chat_message("user"):
+            st.markdown(user_input)
+        with st.chat_message("assistant"):
+            response_placeholder = st.empty()
+            full_response = ""
+            try:
+                for token in chat_model.generate_streaming(max_new_tokens=512):
+                    full_response += token
+                    response_placeholder.markdown(full_response)
+                chat_model.add_assistant_message(full_response)
+                st.session_state.messages.append({"role": "assistant", "content": full_response})
+            except Exception as e:
+                error_msg = f"Erro na geração: {str(e)}"
+                st.error(error_msg)
+                # Roll back the unanswered user turn in both stores. Leaving it behind
+                # would put two consecutive user messages in the history on the next
+                # prompt, which role-alternating chat templates refuse to render.
+                history = chat_model.conversation.messages
+                if history and history[-1].role == "user":
+                    history.pop()
+                if st.session_state.messages and st.session_state.messages[-1]["role"] == "user":
+                    st.session_state.messages.pop()

src/backend/__init__.py ADDED Viewed

	@@ -0,0 +1,20 @@

+"""Backend module for LLM model loading and inference."""
+from .model_loader import load_model
+from .chat import Conversation, Message
+from .chat_model import ChatModel
+from .inference import generate_streaming, generate_simple
+__all__ = [
+    # Model loading
+    "load_model",
+    # OOP classes (recomendado)
+    "Conversation",
+    "ChatModel",
+    # Functions (compatibilidade)
+    "generate_streaming",
+    "generate_simple",
+    # Types
+    "Message",
+]

src/backend/chat.py ADDED Viewed

	@@ -0,0 +1,208 @@

+"""Chat utilities for managing conversation history with chat templates."""
+from typing import List, Optional, Literal
+from pydantic import BaseModel, Field, field_validator
+from transformers import PreTrainedTokenizer
+class Message(BaseModel):
+    """
+    Chat message in the OpenAI-compatible role/content format.
+    Example:
+        msg = Message(role="user", content="Olá!")
+        msg_dict = msg.model_dump()  # {"role": "user", "content": "Olá!"}
+    """
+    # Pydantic v2 configuration. This replaces the nested ``class Config``, which v2
+    # only accepts as a deprecated compatibility form; a plain dict is valid here.
+    model_config = {
+        "json_schema_extra": {
+            "example": {
+                "role": "user",
+                "content": "Olá! Como você está?"
+            }
+        }
+    }
+    role: Literal["user", "assistant", "system"] = Field(
+        ...,
+        description="Role da mensagem: user, assistant ou system"
+    )
+    content: str = Field(
+        ...,
+        min_length=1,
+        description="Conteúdo da mensagem"
+    )
+    @field_validator("content")
+    @classmethod
+    def validate_content(cls, v: str) -> str:
+        """Reject content that is empty once surrounding whitespace is removed."""
+        # min_length=1 alone would still accept strings such as "   "
+        if not v.strip():
+            raise ValueError("Content não pode estar vazio")
+        return v
+    def model_dump_dict(self) -> dict:
+        """Return the message as a plain ``{"role", "content"}`` dict."""
+        return {"role": self.role, "content": self.content}
+def _format_chat_prompt(
+    tokenizer: PreTrainedTokenizer,
+    messages: List[Message],
+    add_generation_prompt: bool = True,
+) -> str:
+    """
+    Render a chat history into a single prompt string (internal helper).
+    Uses the tokenizer's chat template when one is configured. Otherwise falls back
+    to plain ``role: content`` lines, followed by an ``assistant:`` cue when
+    ``add_generation_prompt`` is True.
+    Args:
+        tokenizer: Model tokenizer; a configured chat_template is used when present
+        messages: List of messages (Message instances or role/content dicts)
+        add_generation_prompt: If True, end the prompt with the cue for the model's turn
+    Returns:
+        Formatted string ready to be sent to the model
+    """
+    # Normalize Message instances to dicts; dict entries pass through unchanged
+    messages_dict = [
+        msg.model_dump_dict() if isinstance(msg, Message) else msg
+        for msg in messages
+    ]
+    # getattr keeps tokenizer-like objects without a chat_template attribute on the
+    # fallback path instead of raising AttributeError.
+    has_template = (
+        hasattr(tokenizer, "apply_chat_template")
+        and getattr(tokenizer, "chat_template", None) is not None
+    )
+    if has_template:
+        return tokenizer.apply_chat_template(
+            messages_dict,
+            tokenize=False,
+            add_generation_prompt=add_generation_prompt,
+        )
+    # Fallback: one "role: content" line per message
+    formatted = "\n".join(
+        f"{msg.get('role', 'user')}: {msg.get('content', '')}"
+        for msg in messages_dict
+    ).strip()
+    if add_generation_prompt:
+        # Honor the flag on this path too, so the model is cued to answer as the assistant
+        formatted = f"{formatted}\nassistant:".lstrip()
+    return formatted
+def _get_conversation_summary(messages: List[Message], max_length: int = 100) -> str:
+    """
+    Build a short one-line digest of the most recent messages (internal helper).
+    Args:
+        messages: List of messages (Message instances or role/content dicts)
+        max_length: Length at which the digest is cut before "..." is appended
+    Returns:
+        The last five messages as "role: first 50 chars..." joined by " | "
+    """
+    def _describe(entry) -> str:
+        # Message instances expose attributes; anything else is read like a dict
+        if isinstance(entry, Message):
+            speaker, text = entry.role, entry.content
+        else:
+            speaker, text = entry.get("role", "unknown"), entry.get("content", "")
+        return f"{speaker}: {text[:50]}..."
+    digest = " | ".join(_describe(entry) for entry in messages[-5:])
+    return digest if len(digest) <= max_length else digest[:max_length] + "..."
+class Conversation(BaseModel):
+    """
+    Object-oriented container for a chat history, backed by Pydantic.
+    The ``system_prompt`` field is kept in sync by ``add_message``,
+    ``set_system_prompt`` and ``clear``: it holds the content of the most recently
+    set system message, or None after a clear that drops system messages.
+    Example:
+        conv = Conversation()
+        conv.add_user_message("Olá")
+        conv.add_assistant_message("Oi! Como posso ajudar?")
+        messages = conv.messages
+    """
+    messages: List[Message] = Field(default_factory=list)
+    system_prompt: Optional[str] = Field(default=None)
+    def __init__(self, system_prompt: Optional[str] = None, **data):
+        """
+        Start a new conversation.
+        Args:
+            system_prompt: Optional system prompt; applied only when no messages
+                were supplied through ``data``
+        """
+        super().__init__(**data)
+        if system_prompt and not self.messages:
+            # set_system_prompt also records the value on the system_prompt field,
+            # which previously stayed None no matter what was passed here.
+            self.set_system_prompt(system_prompt)
+    def add_message(self, role: Literal["user", "assistant", "system"], content: str) -> None:
+        """
+        Append a message to the history.
+        Args:
+            role: Message role ("user", "assistant", "system")
+            content: Message content
+        """
+        message = Message(role=role, content=content)
+        if role == "system":
+            # System messages always go to the front of the history
+            self.messages.insert(0, message)
+            self.system_prompt = content
+        else:
+            self.messages.append(message)
+    def add_user_message(self, content: str) -> None:
+        """Append a user message."""
+        self.add_message("user", content)
+    def add_assistant_message(self, content: str) -> None:
+        """Append an assistant message."""
+        self.add_message("assistant", content)
+    def set_system_prompt(self, content: str) -> None:
+        """
+        Set or replace the system prompt.
+        Args:
+            content: System prompt text
+        """
+        system_message = Message(role="system", content=content)
+        # Drop any existing system messages, then put the new one first
+        self.messages = [system_message] + [
+            msg for msg in self.messages if msg.role != "system"
+        ]
+        self.system_prompt = content
+    def clear(self, keep_system: bool = True) -> None:
+        """
+        Clear the conversation history.
+        Args:
+            keep_system: If True, system messages are preserved
+        """
+        if keep_system:
+            self.messages = [msg for msg in self.messages if msg.role == "system"]
+        else:
+            self.messages = []
+            self.system_prompt = None
+    def get_summary(self, max_length: int = 100) -> str:
+        """
+        Return a short digest of the conversation.
+        Args:
+            max_length: Length at which the digest is cut
+        Returns:
+            Digest string of the most recent messages
+        """
+        return _get_conversation_summary(self.messages, max_length)
+    def model_dump_messages(self) -> List[dict]:
+        """Return the messages as a list of role/content dicts."""
+        return [msg.model_dump_dict() for msg in self.messages]
+    def __len__(self) -> int:
+        """Return the number of messages, system messages included."""
+        return len(self.messages)
+    def __repr__(self) -> str:
+        """Short string representation of the conversation."""
+        return f"Conversation({len(self.messages)} messages)"

src/backend/chat_model.py ADDED Viewed

	@@ -0,0 +1,188 @@

+"""ChatModel class that encapsulates pipeline + conversation history."""
+from typing import Iterator, Optional, Union, List
+from transformers import Pipeline
+from .chat import Conversation, _format_chat_prompt, Message
+from .inference import generate_streaming as _generate_streaming, generate_simple as _generate_simple
+class ChatModel:
+    """
+    Bundles a text-generation pipeline with a conversation history.
+    Example:
+        model = ChatModel(pipeline, system_prompt="Seja breve.")
+        model.add_user_message("Olá")
+        response = "".join(model.generate_streaming())
+        model.add_assistant_message(response)
+    """
+    def __init__(
+        self,
+        pipeline: Pipeline,
+        system_prompt: Optional[str] = None,
+    ):
+        """
+        Create a ChatModel.
+        Args:
+            pipeline: transformers pipeline; its ``tokenizer`` attribute is read here
+                and its ``model`` is used by the generation helpers
+            system_prompt: Optional system prompt for the new conversation
+        """
+        self.pipeline = pipeline
+        self.tokenizer = pipeline.tokenizer
+        self.conversation = Conversation(system_prompt=system_prompt)
+    @property
+    def messages(self) -> List[Message]:
+        """Messages currently in the history."""
+        return self.conversation.messages
+    @property
+    def messages_dict(self) -> List[dict]:
+        """Messages currently in the history, as role/content dicts."""
+        return self.conversation.model_dump_messages()
+    def add_user_message(self, content: str) -> None:
+        """Append a user message to the history."""
+        self.conversation.add_user_message(content)
+    def add_assistant_message(self, content: str) -> None:
+        """Append an assistant message to the history."""
+        self.conversation.add_assistant_message(content)
+    def set_system_prompt(self, content: str) -> None:
+        """Set or replace the system prompt."""
+        self.conversation.set_system_prompt(content)
+    def clear_history(self, keep_system: bool = True) -> None:
+        """
+        Clear the conversation history.
+        Args:
+            keep_system: If True, system messages are preserved
+        """
+        self.conversation.clear(keep_system=keep_system)
+    def get_formatted_prompt(self, add_generation_prompt: bool = True) -> str:
+        """
+        Return the full history rendered as a prompt string.
+        Args:
+            add_generation_prompt: If True, end the prompt with the generation cue
+        Returns:
+            Formatted string ready for the model
+        """
+        return _format_chat_prompt(
+            self.tokenizer,
+            self.conversation.messages,
+            add_generation_prompt=add_generation_prompt,
+        )
+    def generate_streaming(
+        self,
+        max_new_tokens: int = 512,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        top_k: Optional[int] = None,
+        do_sample: bool = True,
+        stop_sequences: Optional[list[str]] = None,
+    ) -> Iterator[str]:
+        """
+        Stream a reply generated from the full history.
+        The reply is NOT added to the history; the caller does that once the stream
+        has been consumed.
+        Args:
+            max_new_tokens: Maximum number of tokens to generate
+            temperature: Sampling temperature (optional)
+            top_p: Nucleus sampling (optional)
+            top_k: Top-k sampling (optional)
+            do_sample: If True, use sampling
+            stop_sequences: Sequences at which generation should stop
+        Yields:
+            Pieces of generated text as they become available
+        """
+        return _generate_streaming(
+            pipeline=self.pipeline,
+            prompt=self.conversation.messages,  # the list is formatted by the inference helper
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            do_sample=do_sample,
+            stop_sequences=stop_sequences,
+        )
+    def generate(
+        self,
+        max_new_tokens: int = 512,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        top_k: Optional[int] = None,
+        do_sample: bool = True,
+    ) -> str:
+        """
+        Generate a complete reply from the full history.
+        The reply is NOT added to the history.
+        Args:
+            max_new_tokens: Maximum number of tokens to generate
+            temperature: Sampling temperature (optional)
+            top_p: Nucleus sampling (optional)
+            top_k: Top-k sampling (optional)
+            do_sample: If True, use sampling
+        Returns:
+            The generated text
+        """
+        return _generate_simple(
+            pipeline=self.pipeline,
+            prompt=self.conversation.messages,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            do_sample=do_sample,
+        )
+    def chat(
+        self,
+        user_message: str,
+        max_new_tokens: int = 512,
+        temperature: Optional[float] = None,
+        streaming: bool = False,
+    ) -> Union[str, Iterator[str]]:
+        """
+        One-call chat turn: record the user message, generate, record the reply.
+        Args:
+            user_message: The user's message
+            max_new_tokens: Maximum number of tokens to generate
+            temperature: Sampling temperature (optional)
+            streaming: If True, return an iterator; otherwise return the full string
+        Returns:
+            The model reply. In streaming mode the reply is recorded in the history
+            only once the returned iterator has been fully consumed.
+        """
+        # Recorded eagerly in both modes, before any generation starts
+        self.add_user_message(user_message)
+        if streaming:
+            return self._stream_and_record(max_new_tokens, temperature)
+        response = self.generate(
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+        )
+        self.add_assistant_message(response)
+        return response
+    def _stream_and_record(
+        self,
+        max_new_tokens: int,
+        temperature: Optional[float],
+    ) -> Iterator[str]:
+        """Stream a reply and append it to the history when the stream ends."""
+        pieces = []
+        for piece in self.generate_streaming(
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+        ):
+            pieces.append(piece)
+            yield piece
+        reply = "".join(pieces)
+        # Message validation rejects blank content, so an empty reply is not recorded
+        if reply.strip():
+            self.add_assistant_message(reply)
+    def __repr__(self) -> str:
+        """Short string representation of the model wrapper."""
+        return f"ChatModel({len(self.conversation)} messages)"

src/backend/inference.py ADDED Viewed

	@@ -0,0 +1,162 @@

+"""Inference utilities with streaming support."""
+from typing import Iterator, Optional, Union, List
+from transformers import Pipeline, TextIteratorStreamer
+from threading import Thread
+from .chat import _format_chat_prompt, Message
+def _build_generation_kwargs(
+    max_new_tokens: int,
+    do_sample: bool,
+    temperature: Optional[float] = None,
+    top_p: Optional[float] = None,
+    top_k: Optional[int] = None,
+    **extra_kwargs
+) -> dict:
+    """
+    Assemble the keyword arguments for a generation call.
+    ``max_new_tokens``, ``do_sample`` and every extra keyword are always included;
+    ``temperature``, ``top_p`` and ``top_k`` are included only when not None.
+    """
+    optional_params = {"temperature": temperature, "top_p": top_p, "top_k": top_k}
+    built = dict(max_new_tokens=max_new_tokens, do_sample=do_sample)
+    built.update(extra_kwargs)
+    # An explicit None means "not provided", so the key is omitted entirely
+    built.update(
+        {name: value for name, value in optional_params.items() if value is not None}
+    )
+    return built
+def generate_streaming(
+    pipeline: Pipeline,
+    prompt: Union[str, List[Message]],
+    max_new_tokens: int = 512,
+    temperature: Optional[float] = None,
+    top_p: Optional[float] = None,
+    top_k: Optional[int] = None,
+    do_sample: bool = True,
+    stop_sequences: Optional[list[str]] = None,
+) -> Iterator[str]:
+    """
+    Generate text with streaming through TextIteratorStreamer.
+    Generation runs in a background thread. That thread is asked to stop and is
+    joined when a stop sequence is hit, when the consumer abandons the iterator,
+    or when the stream ends. An exception raised by generation is re-raised here.
+    Args:
+        pipeline: transformers pipeline (its ``model`` and ``tokenizer`` are used)
+        prompt: Input text (str) or list of messages (List[Message])
+        max_new_tokens: Maximum number of tokens to generate
+        temperature: Sampling temperature (optional; omitted from the call when None)
+        top_p: Nucleus sampling (optional; omitted from the call when None)
+        top_k: Top-k sampling (optional; omitted from the call when None)
+        do_sample: If True, use sampling; otherwise greedy decoding
+        stop_sequences: Strings that end the stream; text before the first match is
+            still yielded, the match itself and anything after it are not
+    Yields:
+        Pieces of generated text as they become available
+    """
+    # Function-scope imports: these names are only needed by this function
+    from threading import Event
+    from transformers import StoppingCriteria, StoppingCriteriaList
+    model = pipeline.model
+    tokenizer = pipeline.tokenizer
+    # A message list is rendered to a prompt string; a plain string is used as-is
+    if isinstance(prompt, list):
+        formatted_prompt = _format_chat_prompt(tokenizer, prompt, add_generation_prompt=True)
+    else:
+        formatted_prompt = prompt
+    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        skip_prompt=True,
+        skip_special_tokens=True,
+    )
+    cancel = Event()
+    class _StopWhenCancelled(StoppingCriteria):
+        """Ends generation at the next decoding step once ``cancel`` is set."""
+        def __call__(self, input_ids, scores, **kwargs):
+            return cancel.is_set()
+    generation_kwargs = _build_generation_kwargs(
+        max_new_tokens=max_new_tokens,
+        do_sample=do_sample,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        streamer=streamer,
+        use_cache=True,  # reuse the attention cache between steps
+        stopping_criteria=StoppingCriteriaList([_StopWhenCancelled()]),
+    )
+    generation_kwargs.update(inputs)
+    errors = []
+    def _run_generation():
+        try:
+            model.generate(**generation_kwargs)
+        except Exception as exc:
+            # Without this the consumer loop below would wait on the streamer forever
+            errors.append(exc)
+            streamer.end()
+    generation_thread = Thread(target=_run_generation)
+    generation_thread.start()
+    stops = [seq for seq in (stop_sequences or []) if seq]
+    # Hold back enough trailing text to recognize a stop sequence split across chunks
+    holdback = max((len(seq) for seq in stops), default=1) - 1
+    pending = ""
+    try:
+        for chunk in streamer:
+            if not stops:
+                yield chunk
+                continue
+            pending += chunk
+            hits = [pos for pos in (pending.find(seq) for seq in stops) if pos != -1]
+            if hits:
+                first_hit = min(hits)
+                if first_hit > 0:
+                    yield pending[:first_hit]
+                pending = ""
+                break
+            emit_upto = len(pending) - holdback
+            if emit_upto > 0:
+                yield pending[:emit_upto]
+                pending = pending[emit_upto:]
+        # Stream ended without a stop sequence: flush the held-back tail
+        if pending:
+            yield pending
+    finally:
+        # Also reached when the consumer closes the generator early
+        cancel.set()
+        generation_thread.join()
+    if errors:
+        raise errors[0]
+def generate_simple(
+    pipeline: Pipeline,
+    prompt: Union[str, List[Message]],
+    max_new_tokens: int = 512,
+    temperature: Optional[float] = None,
+    top_p: Optional[float] = None,
+    top_k: Optional[int] = None,
+    do_sample: bool = True,
+    num_return_sequences: int = 1,
+) -> Union[str, List[str]]:
+    """
+    Generate text without streaming (simpler, useful for tests).
+    Args:
+        pipeline: transformers pipeline
+        prompt: Input text (str) or list of messages (List[Message])
+        max_new_tokens: Maximum number of tokens to generate
+        temperature: Sampling temperature (optional; omitted from the call when None)
+        top_p: Nucleus sampling (optional; omitted from the call when None)
+        top_k: Top-k sampling (optional; omitted from the call when None)
+        do_sample: If True, use sampling; otherwise greedy decoding
+        num_return_sequences: Number of sequences to return
+    Returns:
+        The generated text as a string when ``num_return_sequences`` is 1,
+        otherwise a list with one string per returned sequence
+    """
+    tokenizer = pipeline.tokenizer
+    # A message list is rendered to a prompt string; a plain string is used as-is
+    if isinstance(prompt, list):
+        formatted_prompt = _format_chat_prompt(tokenizer, prompt, add_generation_prompt=True)
+    else:
+        formatted_prompt = prompt
+    pipeline_kwargs = _build_generation_kwargs(
+        max_new_tokens=max_new_tokens,
+        do_sample=do_sample,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        num_return_sequences=num_return_sequences,
+        return_full_text=False,  # ask the pipeline for the continuation only
+    )
+    outputs = pipeline(formatted_prompt, **pipeline_kwargs)
+    if num_return_sequences == 1:
+        return outputs[0]["generated_text"]
+    return [output["generated_text"] for output in outputs]

src/backend/model_loader.py ADDED Viewed

	@@ -0,0 +1,138 @@

+"""Model loading utilities with Streamlit caching."""
+import os
+import streamlit as st
+from pathlib import Path
+from typing import Optional, Dict, Any, Tuple
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    pipeline,
+    Pipeline,
+)
+import torch
+# Hugging Face token read from the environment; either variable name is accepted.
+# NOTE(review): the original comment says Spaces provides it automatically — confirm.
+HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")
+# Model cache lives in a "models" folder two levels above this module's directory
+PROJECT_ROOT = Path(__file__).parent.parent.parent
+MODELS_CACHE_DIR = PROJECT_ROOT / "models"
+# Created at import time; exist_ok avoids an error when the folder already exists
+MODELS_CACHE_DIR.mkdir(exist_ok=True)
+@st.cache_resource
+def load_model(
+    model_name: str,
+    device_map: Optional[str] = "auto",
+    torch_dtype: Optional[torch.dtype] = None,
+    load_in_8bit: bool = False,
+    load_in_4bit: bool = False,
+) -> Tuple[Pipeline, Dict[str, Any]]:
+    """
+    Load a Hugging Face causal LM as a text-generation pipeline, cached by Streamlit.
+    Args:
+        model_name: Model id on Hugging Face (e.g. 'microsoft/DialoGPT-medium')
+        device_map: Device mapping ('auto', 'cpu', 'cuda', etc.); 'auto' is dropped
+            when no CUDA device is available
+        torch_dtype: torch dtype; defaults to float16 with CUDA, float32 without
+        load_in_8bit: If True, request 8-bit quantization
+        load_in_4bit: If True, request 4-bit quantization
+    Returns:
+        Tuple of (pipeline, model_info). ``model_info["quantized"]`` reports whether
+        a quantization config was actually applied.
+    Raises:
+        Exception: any loading error is shown with st.error and re-raised
+    """
+    try:
+        has_cuda = torch.cuda.is_available()
+        if torch_dtype is None:
+            torch_dtype = torch.float16 if has_cuda else torch.float32
+        # Without a GPU, "auto" placement is dropped and the model is loaded plainly
+        if device_map == "auto" and not has_cuda:
+            device_map = None
+        model_kwargs = {
+            "torch_dtype": torch_dtype,
+        }
+        if device_map is not None:
+            model_kwargs["device_map"] = device_map
+        quantized = False
+        if load_in_8bit or load_in_4bit:
+            try:
+                from importlib.util import find_spec
+                # BitsAndBytesConfig imports fine without the bitsandbytes package, so
+                # probe for the package itself; otherwise this fallback never runs and
+                # the failure surfaces later, inside from_pretrained.
+                if find_spec("bitsandbytes") is None:
+                    raise ImportError("bitsandbytes")
+                from transformers import BitsAndBytesConfig
+                quantization_config = BitsAndBytesConfig(
+                    load_in_8bit=load_in_8bit,
+                    load_in_4bit=load_in_4bit,
+                )
+                model_kwargs["quantization_config"] = quantization_config
+                quantized = True
+            except ImportError:
+                st.warning("bitsandbytes não está instalado. Quantização desabilitada.")
+        # Tokenizer and model share the project-local cache directory
+        cache_dir = str(MODELS_CACHE_DIR)
+        hf_kwargs = {"cache_dir": cache_dir}
+        # Pass the auth token only when one is configured
+        if HF_TOKEN:
+            hf_kwargs["token"] = HF_TOKEN
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            **hf_kwargs
+        )
+        # Reuse EOS as the padding token when the tokenizer defines none
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            **hf_kwargs,
+            **model_kwargs
+        )
+        # Explicit CPU placement when neither a device map nor a GPU is in play
+        if device_map is None and not has_cuda:
+            model = model.to("cpu")
+        pipeline_kwargs = {
+            "model": model,
+            "tokenizer": tokenizer,
+        }
+        # Give the pipeline a device only when the model was loaded without a device map
+        if device_map is None:
+            pipeline_kwargs["device"] = 0 if has_cuda else -1
+        else:
+            pipeline_kwargs["device_map"] = device_map
+        pipe = pipeline("text-generation", **pipeline_kwargs)
+        model_info = {
+            "model_name": model_name,
+            "device": str(next(model.parameters()).device),
+            "dtype": str(torch_dtype),
+            "quantized": quantized,
+            "cache_dir": cache_dir,
+        }
+        return pipe, model_info
+    except Exception as e:
+        st.error(f"Erro ao carregar modelo {model_name}: {str(e)}")
+        raise

src/config.py ADDED Viewed

	@@ -0,0 +1,71 @@

+"""Configurações do projeto."""
+# Lista de modelos pré-selecionados para teste
+PRESELECTED_MODELS = [
+    "Qwen/Qwen3-0.6B",  # Modelo padrão
+    "google/gemma-3-4b-it",
+    "google/gemma-3-1b-it",
+    "google/gemma-3-270m-it",
+    "Qwen/Qwen2.5-0.5B-Instruct",
+    "Qwen/Qwen2.5-0.5B",
+    "facebook/MobileLLM-R1-950M",
+]
+# Modelos que requerem autenticação (gated)
+GATED_MODELS = {
+    "google/gemma-3-4b-it",
+    "google/gemma-3-1b-it",
+    "google/gemma-3-270m-it",
+}
+# Informações sobre os modelos (para exibição)
+MODEL_INFO = {
+    "google/gemma-3-4b-it": {
+        "name": "Gemma 3 4B IT",
+        "params": "4 bilhões",
+        "family": "Google Gemma",
+    },
+    "google/gemma-3-1b-it": {
+        "name": "Gemma 3 1B IT",
+        "params": "1 bilhão",
+        "family": "Google Gemma",
+    },
+    "google/gemma-3-270m-it": {
+        "name": "Gemma 3 270M IT",
+        "params": "270 milhões",
+        "family": "Google Gemma",
+    },
+    "Qwen/Qwen3-0.6B": {
+        "name": "Qwen3 0.6B",
+        "params": "600 milhões",
+        "family": "Qwen",
+    },
+    "Qwen/Qwen2.5-0.5B-Instruct": {
+        "name": "Qwen2.5 0.5B Instruct",
+        "params": "500 milhões",
+        "family": "Qwen",
+    },
+    "Qwen/Qwen2.5-0.5B": {
+        "name": "Qwen2.5 0.5B",
+        "params": "500 milhões",
+        "family": "Qwen",
+    },
+    "facebook/MobileLLM-R1-950M": {
+        "name": "MobileLLM R1 950M",
+        "params": "950 milhões",
+        "family": "Facebook",
+    },
+}
+def get_model_label(model_id: str) -> str:
+    """Return a display label "name (params)" for a known model, else the id itself."""
+    info = MODEL_INFO.get(model_id)
+    if info is None:
+        return model_id
+    return f"{info['name']} ({info['params']})"
+def get_model_options() -> list[tuple[str, str]]:
+    """Return (label, model_id) pairs for PRESELECTED_MODELS, in list order, for a selectbox."""
+    options = []
+    for model_id in PRESELECTED_MODELS:
+        options.append((get_model_label(model_id), model_id))
+    return options

src/streamlit_app.py DELETED Viewed

@@ -1,40 +0,0 @@
-import altair as alt
-import numpy as np
-import pandas as pd
-import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))