Spaces:

Ryan-PC
/

grok

Build error

App Files Community

Ryan-PC committed on Jan 5

Commit

61887df

verified ·

1 Parent(s): 8481230

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -26

app.py CHANGED Viewed

@@ -2,46 +2,72 @@ import gradio as gr
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
-# Baixa o modelo uncensored (só na primeira vez)
 model_path = hf_hub_download(
     repo_id="VibeStudio/Nidum-Llama-3.2-3B-Uncensored-GGUF",
     filename="Nidum-Llama-3.2-3B-Uncensored-GGUF.gguf"
 )
-# Carrega o modelo em CPU (n_gpu_layers=0 força CPU)
 llm = Llama(
     model_path=model_path,
-    n_ctx=8192,        # contexto grande
-    n_batch=512,
-    n_gpu_layers=0,    # 0 = só CPU (essencial pro basic)
     verbose=False
 )
-# Template simples pra chat (funciona bem com Llama)
 def format_prompt(message, history):
-    prompt = "<|begin_of_text|>"
-    for user, assistant in history:
-        prompt += f"<|start_header_id|>user<|end_header_id|>\n{user}<|eot_id|>"
-        prompt += f"<|start_header_id|>assistant<|end_header_id|>\n{assistant}<|eot_id|>"
-    prompt += f"<|start_header_id|>user<|end_header_id|>\n{message}<|eot_id|>"
-    prompt += "<|start_header_id|>assistant<|end_header_id|>\n"
     return prompt
 def chat(message, history):
     prompt = format_prompt(message, history)
-    output = llm(
-        prompt,
-        max_tokens=1024,
-        temperature=0.7,
-        top_p=0.95,
-        repeat_penalty=1.15,  # evita repetição
-        stop=["<|eot_id|>"]
-    )
-    return output["choices"][0]["text"].strip()
-# Interface Gradio
 gr.ChatInterface(
-    chat,
-    title="Llama 3.2 3B Uncensored (CPU Básica)",
-    description="IA sem censura rodando só em CPU gratuita! Respostas em ~10-20s."
-).launch()

 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
+# --------------------------
+# 1️⃣ Baixa o modelo uncensored (só na primeira vez)
+# --------------------------
 model_path = hf_hub_download(
     repo_id="VibeStudio/Nidum-Llama-3.2-3B-Uncensored-GGUF",
     filename="Nidum-Llama-3.2-3B-Uncensored-GGUF.gguf"
 )
+# --------------------------
+# 2️⃣ Carrega o modelo em CPU
+# --------------------------
 llm = Llama(
     model_path=model_path,
+    n_ctx=2048,        # menor contexto pra CPU básica
+    n_batch=256,       # batch menor
+    n_gpu_layers=0,    # força CPU
     verbose=False
 )
+# --------------------------
+# 3️⃣ Função de chat com histórico e continuação automática
+# --------------------------
 def format_prompt(message, history):
     """
     Build a plain-text "User:/Assistant:" transcript prompt.

     Args:
         message: The new user message to be answered.
         history: Earlier turns. Each entry is either a
             (user_message, assistant_message) pair or a message dict with
             "role" and "content" keys. None is treated as no history.

     Returns:
         The transcript text, ending with "Assistant: " so that the model
         continues with the assistant's reply.
     """
     speaker_by_role = {"user": "User", "assistant": "Assistant"}
     parts = []
     for turn in history or []:
         if isinstance(turn, dict):
             # Message-dict history: one entry per speaker turn. Entries with
             # a role other than user/assistant have no transcript label and
             # are skipped.
             speaker = speaker_by_role.get(turn.get("role"))
             if speaker is not None:
                 parts.append(f"{speaker}: {turn.get('content')}\n")
         else:
             # Pair history: one entry per user/assistant exchange.
             user_msg, assistant_msg = turn
             parts.append(f"User: {user_msg}\nAssistant: {assistant_msg}\n")
     parts.append(f"User: {message}\nAssistant: ")
     return "".join(parts)
 def chat(message, history):
     """
     Generate the assistant's reply to `message` given the chat `history`.

     The reply is generated in at most 3 chunks of up to 512 tokens each.
     A further chunk is requested only when the previous one was cut off by
     the token limit; the text generated so far is appended to the prompt so
     the model continues where it left off.

     Args:
         message: The new user message.
         history: Earlier turns, passed through to format_prompt.

     Returns:
         The concatenated reply text with surrounding whitespace stripped.
     """
     prompt = format_prompt(message, history)
     chunks = []
     for _ in range(3):
         output = llm(
             prompt + "".join(chunks),
             max_tokens=512,
             temperature=0.7,
             top_p=0.95,
             repeat_penalty=1.15,
             stop=["User:"],
         )
         choice = output["choices"][0]
         new_text = choice["text"]
         if not new_text.strip():
             # Nothing new was produced, so asking again would not help.
             break
         chunks.append(new_text)
         # "length" means the chunk hit max_tokens and was truncated. Any
         # other finish reason means generation ended on its own (stop string
         # or end of sequence), so the reply is complete. Trailing
         # punctuation is not a reliable signal either way.
         if choice.get("finish_reason") != "length":
             break
     return "".join(chunks).strip()
+# --------------------------
+# 4️⃣ Interface Gradio
+# --------------------------
 gr.ChatInterface(
+    fn=chat,
+    title="Nidum LLaMA 3.2 3B Uncensored (CPU Básica)",
+    description=(
+        "Chat com LLaMA 3B em CPU básica.\n"
+        "Respostas longas são geradas em blocos para não truncar.\n"
+        "Perguntas complexas podem ser divididas em partes."
+    )
+).launch()