Create app.py
app.py
ADDED
@@ -0,0 +1,249 @@
import os
import re
import threading
import warnings
import gradio as gr
from llama_cpp import Llama

# -------------------------
# MODEL DOWNLOAD
# -------------------------
MODEL_REPO = "mradermacher/Alisia-7B-it-GGUF"
MODEL_NAME = "Alisia-7B-it.Q4_K_M.gguf"
MODEL_PATH = f"/tmp/{MODEL_NAME}"

# Download the model if it is not already present
if not os.path.exists(MODEL_PATH):
    print("📥 Downloading the model...")
    from huggingface_hub import hf_hub_download
    try:
        hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_NAME,
            local_dir="/tmp"
        )
        print("✅ Model downloaded successfully!")
    except Exception as e:
        print(f"❌ Download error: {e}")
        # The app cannot run without the model file, so fail fast
        raise
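
# Note: hf_hub_download returns the resolved local path, so a slightly more
# robust variant (sketch) would use its return value instead of assuming the
# /tmp/<filename> layout:
#
#   MODEL_PATH = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_NAME,
#                                local_dir="/tmp")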

# -------------------------
# LLAMA.CPP CONFIGURATION
# -------------------------
os.environ["LLAMA_CPP_LOG_LEVEL"] = "OFF"
warnings.filterwarnings("ignore")

print("⚡ Loading the model with llama.cpp...")
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,        # longer context window
    n_gpu_layers=0,    # 0 = CPU only (more stable)
    n_threads=8,       # use more CPU threads
    n_batch=512,       # tuned batch size
    verbose=False,
    use_mlock=True     # lock the model in RAM for steadier performance
)

print("✅ Model loaded and ready!")
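
# Rough context budget: n_ctx=4096 tokens in total; create_completion below
# reserves max_tokens=1024 for the reply, leaving about 4096 - 1024 = 3072
# tokens for the system prompt plus accumulated history.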

# -------------------------
# STATE & SYNCHRONIZATION
# -------------------------
lock = threading.Lock()
conversations = {"Conversation 1": []}  # chat name -> list of (user, assistant) pairs
stop_generation = threading.Event()     # set by the Stop button to abort streaming

# -------------------------
# UTILITY FUNCTIONS
# -------------------------
def clean_output(text: str) -> str:
    """Strip any ChatML-style <|im_...|> markers the model may emit."""
    return re.sub(r"<\|im_.*?\|>", "", text).strip()
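
# Example: clean_output("  Hello<|im_end|>  ") returns "Hello" (markers
# removed, surrounding whitespace stripped).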

def get_conv_names():
    """Return the conversation names (multi-chat support, not yet wired to the UI)."""
    with lock:
        return list(conversations.keys())

def build_conversation_prompt(history, new_message):
    """Build an Alpaca-style prompt from the history and the new message."""
    # The prompt is rebuilt from scratch on every call, so the system prompt
    # is always prepended
    prompt = """You are Alisia, a helpful and competent AI assistant. Answer precisely and concisely.

"""

    # Conversation history
    for user_msg, assistant_msg in history:
        prompt += f"### Instruction:\n{user_msg}\n\n### Response:\n{assistant_msg}\n\n"

    # New message
    prompt += f"### Instruction:\n{new_message}\n\n### Response:\n"
    return prompt
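
# Illustrative rendering: for history=[("Hello", "Hi! How can I help?")] and
# new_message="What is llama.cpp?", the function returns:
#
#   You are Alisia, a helpful and competent AI assistant. Answer precisely and concisely.
#
#   ### Instruction:
#   Hello
#
#   ### Response:
#   Hi! How can I help?
#
#   ### Instruction:
#   What is llama.cpp?
#
#   ### Response: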

def send_message_stream(user_message, displayed_history, current_chat_name):
    stop_generation.clear()

    if not user_message or not str(user_message).strip():
        yield displayed_history or [], ""
        return

    with lock:
        if current_chat_name not in conversations:
            conversations[current_chat_name] = []
        local_hist = conversations[current_chat_name].copy()

    # Show the user message immediately, with an empty assistant slot
    local_hist.append((str(user_message), ""))
    yield local_hist, ""

    # Build the full prompt from the stored history
    formatted_prompt = build_conversation_prompt(local_hist[:-1], str(user_message))

    partial = ""
    try:
        # Streaming generation with tuned sampling parameters
        stream = llm.create_completion(
            prompt=formatted_prompt,
            max_tokens=1024,     # capped for faster replies
            temperature=0.7,
            top_p=0.9,
            repeat_penalty=1.1,
            # Note: the "\n\n" stop also ends generation at the first blank
            # line, which caps replies at a single paragraph
            stop=["### Instruction:", "### Response:", "\n\n"],
            stream=True
        )

        for chunk in stream:
            if stop_generation.is_set():
                break

            if "choices" in chunk and chunk["choices"]:
                token = chunk["choices"][0].get("text", "")
                if token:
                    partial += token
                    cleaned = clean_output(partial)
                    local_hist[-1] = (str(user_message), cleaned)
                    yield local_hist, ""

    except Exception as e:
        err_text = f"[Error: {e}]"
        local_hist[-1] = (str(user_message), err_text)
        yield local_hist, ""

    finally:
        # Persist the final state of the conversation
        with lock:
            conversations[current_chat_name] = local_hist.copy()

    yield local_hist, ""
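
# Quick manual smoke test (illustrative; run in a REPL, not executed here):
#
#   final_hist = None
#   for hist, _ in send_message_stream("Hello!", [], "Conversation 1"):
#       final_hist = hist
#   print(final_hist[-1][1])   # the assistant's complete reply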

# -------------------------
# OPTIMIZED GRADIO INTERFACE
# -------------------------
css = """
:root {
    --primary-color: #4f46e5;
    --primary-hover: #4338ca;
    --chat-bg: #0f172a;
    --input-bg: #1e293b;
}

#chatbot {
    flex-grow: 1;
    height: 600px !important;
    background: var(--chat-bg);
    border-radius: 16px;
    padding: 20px;
    overflow-y: auto;
}

#input-container {
    display: flex;
    gap: 8px;
    padding: 16px 0;
    align-items: center;
}

#msg_input {
    flex-grow: 1;
    background: var(--input-bg);
    color: #fff;
    border: 1px solid #334155;
    border-radius: 24px;
    padding: 16px 20px;
    font-size: 16px;
}
"""

with gr.Blocks(css=css, title="Alisia Chat - Ultra Fast", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🚀 Alisia Chat - Optimized Version")
    gr.Markdown("Ultra-fast interface powered by llama.cpp")

    with gr.Row():
        # elem_id ties the component to the #chatbot rules in the CSS above
        chatbot = gr.Chatbot(height=500, show_label=False, elem_id="chatbot")

    with gr.Row(elem_id="input-container"):
        msg_input = gr.Textbox(
            placeholder="Ask Alisia your question...",
            lines=2,
            show_label=False,
            elem_id="msg_input"  # matches the #msg_input CSS selector
        )
        send_btn = gr.Button("Send", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop", visible=False)

    # Simplified event wiring: swap the buttons, stream the reply, swap back
    def toggle_buttons():
        return gr.update(visible=False), gr.update(visible=True)

    send_btn.click(
        fn=toggle_buttons,
        inputs=None,
        outputs=[send_btn, stop_btn],
        queue=False
    ).then(
        fn=send_message_stream,
        inputs=[msg_input, chatbot, gr.State("Conversation 1")],
        outputs=[chatbot, msg_input],
        queue=True
    ).then(
        fn=lambda: (gr.update(visible=True), gr.update(visible=False)),
        inputs=None,
        outputs=[send_btn, stop_btn],
        queue=False
    )

    msg_input.submit(
        fn=toggle_buttons,
        inputs=None,
        outputs=[send_btn, stop_btn],
        queue=False
    ).then(
        fn=send_message_stream,
        inputs=[msg_input, chatbot, gr.State("Conversation 1")],
        outputs=[chatbot, msg_input],
        queue=True
    ).then(
        fn=lambda: (gr.update(visible=True), gr.update(visible=False)),
        inputs=None,
        outputs=[send_btn, stop_btn],
        queue=False
    )

    stop_btn.click(
        fn=lambda: stop_generation.set(),
        inputs=None,
        outputs=None
    )
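
    # The click and submit chains above are intentionally identical; on
    # Gradio 4.x (assumption: this Space runs Gradio >= 4) they could be
    # merged into one listener with gr.on, e.g.:
    #
    #   gr.on(
    #       triggers=[send_btn.click, msg_input.submit],
    #       fn=toggle_buttons, inputs=None,
    #       outputs=[send_btn, stop_btn], queue=False,
    #   ).then(fn=send_message_stream, ...)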

# -------------------------
# LAUNCH
# -------------------------
if __name__ == "__main__":
    print("🚀 Launching the optimized interface...")
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        debug=False  # disabled for better performance
    )
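
# To try it locally (assuming gradio, llama-cpp-python and huggingface_hub
# are installed): run `python app.py`, then open http://localhost:7860.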