Update app.py
app.py CHANGED
@@ -2,67 +2,69 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
-# 1.
+# 1. Download the model
+print("⬇️ Downloading model...")
 model_path = hf_hub_download(
     repo_id="XY26/dual-frame-llama-3",
     filename="meta-llama-3.1-8b-instruct.Q4_K_M.gguf"
 )
 
-# 2.
+# 2. Load the engine
+# n_ctx is cut to 2048 to make sure it fits in RAM without crashing
+print("⚙️ Loading engine...")
 llm = Llama(
     model_path=model_path,
-    n_ctx=
-    n_threads=2,
+    n_ctx=2048,     # Reduced for stability (memory)
+    n_threads=2,    # Use the 2 available cores
+    n_batch=512,    # Standard batch processing
     verbose=False
 )
 
 def smart_response(message, history):
-    system_prompt = """...
-    Answer normally, concisely, and directly. Do NOT use the Gain/Loss frames.
-    """
-
-    # 3. Build the Memory (Context)
-    # Loop through the history so the model remembers what happened before.
-    formatted_prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>"
-
-    for user_msg, bot_msg in history:
-        formatted_prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|>"
-        formatted_prompt += f"<|start_header_id|>assistant<|end_header_id|>\n\n{bot_msg}<|eot_id|>"
-
-    # Add the current message
-    formatted_prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
-
-    # 4. Generate Response
-    output = llm(
-        formatted_prompt,
-        max_tokens=1024,  # Allowed it to write a bit more
-        stop=["<|eot_id|>"],
-        echo=False
-    )
-    return output['choices'][0]['text']
-
-# 5. The Interface
+    try:
+        # System prompt
+        system_prompt = """You are a helpful AI assistant with a specific behavioral directive.
+1. IF asked for DECISION/ADVICE: Provide **The Gain Frame** and **The Loss Frame**.
+2. IF asked for FACTS/CHAT: Answer normally."""
+
+        # Build the history (Llama 3 format)
+        prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>"
+
+        # Append the previous messages
+        for user_msg, bot_msg in history:
+            if user_msg and bot_msg:  # Guard against empty messages
+                prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|>"
+                prompt += f"<|start_header_id|>assistant<|end_header_id|>\n\n{bot_msg}<|eot_id|>"
+
+        # Append the current message
+        prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+
+        # Generation
+        output = llm(
+            prompt,
+            max_tokens=1024,
+            stop=["<|eot_id|>"],
+            echo=False
+        )
+        return output['choices'][0]['text']
+
+    except Exception as e:
+        # On error, show it in the chat instead of crashing
+        print(f"❌ Error: {e}")
+        return f"⚠️ An error occurred: {str(e)}. Please click 'New Chat' to reset."
+
+# 3. Interface with queue handling
 demo = gr.ChatInterface(
     fn=smart_response,
     title="🤖 Smart Decision Architect",
-    description="
+    description="Ask factual questions (1+1) or advice (Should I...).",
+    theme="soft",
+    retry_btn="🔄 Retry",
+    undo_btn="↩️ Undo",
+    clear_btn="🗑️ New Chat",
 )
 
+# 4. Launch with a concurrency limit (vital for llama.cpp)
 if __name__ == "__main__":
-    demo.launch()
+    # concurrency_limit=1 stops the server from processing two requests at once and crashing
+    demo.queue(default_concurrency_limit=1).launch(server_name="0.0.0.0", server_port=7860)
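
The hand-built `<|start_header_id|>` string in smart_response mirrors the Llama 3 instruct chat template. As a cross-check, here is a minimal sketch of the same history folding done through llama-cpp-python's create_chat_completion, which applies the chat template stored in the GGUF metadata instead of concatenating special tokens by hand. `llm` is the object constructed above; SYSTEM_PROMPT and smart_response_chat_api are illustrative stand-ins, not part of this commit.

# Sketch only, not part of the commit: the same conversation folding via the
# chat-completion API. SYSTEM_PROMPT stands in for the app's full directive.
SYSTEM_PROMPT = "You are a helpful AI assistant with a specific behavioral directive."

def smart_response_chat_api(message, history):
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user_msg, bot_msg in history:      # Gradio's (user, bot) tuple history
        if user_msg and bot_msg:           # same empty-message guard as above
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    # create_chat_completion renders the model's chat template and generates in one call
    output = llm.create_chat_completion(messages=messages, max_tokens=1024)
    return output["choices"][0]["message"]["content"]

The (user, bot) tuple history assumed here matches gr.ChatInterface's default format in the Gradio 4.x line, which is also where the retry_btn/undo_btn/clear_btn keywords used above are available.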
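
Because the handler is a plain function, it can also be sanity-checked without the UI. A hypothetical smoke test (the questions are made-up values):

# Hypothetical smoke test: exercise the handler directly, no Gradio involved.
print(smart_response("What is 1+1?", history=[]))       # should take the FACTS path
print(smart_response("Should I switch careers?",
                     history=[("What is 1+1?", "2")]))  # should produce Gain/Loss frames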