import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# 1. Download the model
print("⬇️ Downloading model...")
model_path = hf_hub_download(
    repo_id="XY26/dual-frame-llama-3",
    filename="meta-llama-3.1-8b-instruct.Q4_K_M.gguf"
)

# 2. Load the engine
print("⚙️ Loading engine...")
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=2,
    verbose=False
)


def smart_response(message, history):
    try:
        messages = []

        # 1. System prompt
        system_prompt = """You are a helpful AI assistant.
- If asked for ADVICE/OPINION: Provide **Gain Frame** and **Loss Frame**.
- If asked for FACTS/CHAT: Answer directly.
Note: You are a small Quantized model, admit if you don't know."""
        messages.append({"role": "system", "content": system_prompt})

        # 2. Safe conversation memory
        for turn in history:
            # Only accept a list/tuple with at least 2 elements (user, bot)
            if isinstance(turn, (list, tuple)) and len(turn) >= 2:
                user_msg = turn[0]
                bot_msg = turn[1]
                # Only add the turn if neither message is empty
                if user_msg is not None and bot_msg is not None:
                    messages.append({"role": "user", "content": str(user_msg)})
                    messages.append({"role": "assistant", "content": str(bot_msg)})

        # 3. Current question
        messages.append({"role": "user", "content": message})

        # 4. Generation (streamed)
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=1024,
            stream=True,
            temperature=0.7
        )

        partial_message = ""
        for chunk in stream:
            delta = chunk['choices'][0]['delta']
            if 'content' in delta:
                token = delta['content']
                partial_message += token
                yield partial_message

    except Exception as e:
        # If generation fails, report the error cleanly instead of breaking the interface
        print(f"❌ Error: {e}")
        yield f"⚠️ Oops, a technical error occurred: {e}. Try clicking 'New Chat'."


# 3. Interface
demo = gr.ChatInterface(
    fn=smart_response,
    title="🤖 Smart Decision Architect (Safe Mode)",
    description="Ask factual questions or advice."
)

if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)