Update app.py
app.py CHANGED
@@ -2,67 +2,69 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
-# 1.
+# 1. Download the model
+print("⬇️ Downloading model...")
 model_path = hf_hub_download(
     repo_id="XY26/dual-frame-llama-3",
     filename="meta-llama-3.1-8b-instruct.Q4_K_M.gguf"
 )
 
-# 2.
+# 2. Load the engine
+# n_ctx is cut to 2048 to make sure it fits in RAM without crashing
+print("⚙️ Loading engine...")
 llm = Llama(
     model_path=model_path,
-    n_ctx=
-    n_threads=2,
+    n_ctx=2048,     # Reduced for stability (memory)
+    n_threads=2,    # Use the 2 available cores
+    n_batch=512,    # Standard batch processing
     verbose=False
 )
 
 def smart_response(message, history):
-    system_prompt = """...
-    Answer normally, concisely, and directly. Do NOT use the Gain/Loss frames.
-    """
-
-    # 3. Build the Memory (Context)
-    # Loop through the history so the model remembers what happened before.
-    formatted_prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>"
-
-    for user_msg, bot_msg in history:
-        formatted_prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|>"
-        formatted_prompt += f"<|start_header_id|>assistant<|end_header_id|>\n\n{bot_msg}<|eot_id|>"
-
-    # Add the current message
-    formatted_prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
-
-    # 4. Generate Response
-    output = llm(
-        formatted_prompt,
-        max_tokens=1024,  # Allowed it to write a bit more
-        stop=["<|eot_id|>"],
-        echo=False
-    )
-    return output['choices'][0]['text']
-
-# 5. The Interface
+    try:
+        # System prompt
+        system_prompt = """You are a helpful AI assistant with a specific behavioral directive.
+1. IF asked for DECISION/ADVICE: Provide **The Gain Frame** and **The Loss Frame**.
+2. IF asked for FACTS/CHAT: Answer normally."""
+
+        # Build the history (Llama 3 format)
+        prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>"
+
+        # Append the previous messages
+        for user_msg, bot_msg in history:
+            if user_msg and bot_msg:  # Guard against empty messages
+                prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|>"
+                prompt += f"<|start_header_id|>assistant<|end_header_id|>\n\n{bot_msg}<|eot_id|>"
+
+        # Append the current message
+        prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+
+        # Generation
+        output = llm(
+            prompt,
+            max_tokens=1024,
+            stop=["<|eot_id|>"],
+            echo=False
+        )
+        return output['choices'][0]['text']
+
+    except Exception as e:
+        # On error, show it in the chat instead of crashing
+        print(f"❌ Error: {e}")
+        return f"⚠️ An error occurred: {str(e)}. Please click 'New Chat' to reset."
+
+# 3. Interface with queue handling
 demo = gr.ChatInterface(
     fn=smart_response,
     title="🤖 Smart Decision Architect",
-    description="
+    description="Ask factual questions (1+1) or advice (Should I...).",
+    theme="soft",
+    retry_btn="🔄 Retry",
+    undo_btn="↩️ Undo",
+    clear_btn="🗑️ New Chat",
 )
 
+# 4. Launch with a concurrency limit (vital for llama.cpp)
 if __name__ == "__main__":
-    demo.launch()
+    # concurrency_limit=1 stops the server from processing two requests at once and crashing
+    demo.queue(default_concurrency_limit=1).launch(server_name="0.0.0.0", server_port=7860)
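
The hand-built `<|start_header_id|>` string in smart_response mirrors the Llama 3 instruct chat template. As a cross-check, here is a minimal sketch of the same history folding done through llama-cpp-python's create_chat_completion, which applies the chat template stored in the GGUF metadata instead of concatenating special tokens by hand. `llm` is the object constructed above; SYSTEM_PROMPT and smart_response_chat_api are illustrative stand-ins, not part of this commit.

# Sketch only, not part of the commit: the same conversation folding via the
# chat-completion API. SYSTEM_PROMPT stands in for the app's full directive.
SYSTEM_PROMPT = "You are a helpful AI assistant with a specific behavioral directive."

def smart_response_chat_api(message, history):
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user_msg, bot_msg in history:      # Gradio's (user, bot) tuple history
        if user_msg and bot_msg:           # same empty-message guard as above
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    # create_chat_completion renders the model's chat template and generates in one call
    output = llm.create_chat_completion(messages=messages, max_tokens=1024)
    return output["choices"][0]["message"]["content"]

The (user, bot) tuple history assumed here matches gr.ChatInterface's default format in the Gradio 4.x line, which is also where the retry_btn/undo_btn/clear_btn keywords used above are available.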
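
Because the handler is a plain function, it can also be sanity-checked without the UI. A hypothetical smoke test (the questions are made-up values):

# Hypothetical smoke test: exercise the handler directly, no Gradio involved.
print(smart_response("What is 1+1?", history=[]))       # should take the FACTS path
print(smart_response("Should I switch careers?",
                     history=[("What is 1+1?", "2")]))  # should produce Gain/Loss frames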