Gems234 committed on
Commit
a6acd24
·
verified ·
1 Parent(s): 5977267

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -37
app.py CHANGED
@@ -29,23 +29,31 @@ if not os.path.exists(MODEL_PATH):
29
  print(f"❌ Erreur téléchargement: {e}")
30
 
31
  # -------------------------
32
- # CONFIGURATION LLAMA.CPP
33
  # -------------------------
34
  os.environ["LLAMA_CPP_LOG_LEVEL"] = "OFF"
35
  warnings.filterwarnings("ignore")
36
 
37
- print("⚡ Chargement du modèle avec llama.cpp...")
 
 
 
 
 
 
38
  llm = Llama(
39
  model_path=MODEL_PATH,
40
- n_ctx=2028,
41
- n_gpu_layers=35,
42
- n_threads=6,
43
- #n_batch=512,
44
- verbose=False,
45
- use_mlock=True
 
 
46
  )
47
 
48
- print("✅ Modèle chargé et prêt!")
49
 
50
  # -------------------------
51
  # ÉTAT & SYNCHRONISATION
@@ -55,7 +63,7 @@ conversations = {"Conversation 1": []}
55
  stop_generation = threading.Event()
56
 
57
  # -------------------------
58
- # FONCTIONS UTILITAIRES
59
  # -------------------------
60
  def clean_output(text: str) -> str:
61
  return re.sub(r"<\|im_.*?\|>", "", text).strip()
@@ -64,8 +72,15 @@ def get_conv_names():
64
  with lock:
65
  return list(conversations.keys())
66
 
 
 
67
  def build_conversation_prompt(history, new_message):
68
- """Format de prompt optimisé pour Alpaca"""
 
 
 
 
 
69
  prompt = ""
70
 
71
  # System prompt seulement au début
@@ -74,12 +89,15 @@ def build_conversation_prompt(history, new_message):
74
 
75
  """
76
 
77
- # Historique de conversation
78
- for user_msg, assistant_msg in history:
 
79
  prompt += f"### Instruction:\n{user_msg}\n\n### Response:\n{assistant_msg}\n\n"
80
 
81
  # Nouveau message
82
  prompt += f"### Instruction:\n{new_message}\n\n### Response:\n"
 
 
83
  return prompt
84
 
85
  def send_message_stream(user_message, displayed_history, current_chat_name):
@@ -90,6 +108,8 @@ def send_message_stream(user_message, displayed_history, current_chat_name):
90
  yield displayed_history or [], ""
91
  return
92
 
 
 
93
  with lock:
94
  if current_chat_name not in conversations:
95
  conversations[current_chat_name] = []
@@ -101,21 +121,24 @@ def send_message_stream(user_message, displayed_history, current_chat_name):
101
  formatted_prompt = build_conversation_prompt(local_hist[:-1], str(user_message))
102
  partial = ""
103
 
104
- # PARAMÈTRES DE RÉACTIVITÉ HYBRIDE
105
  last_update = time.time()
106
  token_count = 0
107
- min_tokens = 2 # Minimum de tokens avant update
108
- max_delay = 0.12 # Maximum 120ms entre updates
 
109
 
110
  try:
111
  stream = llm.create_completion(
112
  prompt=formatted_prompt,
113
  stream=True,
114
- max_tokens=2028,
115
  temperature=0.7,
116
  top_p=0.9,
117
  repeat_penalty=1.1,
118
- stop=["### Instruction:", "### Response:", "\n\n", "<|endoftext|>"]
 
 
119
  )
120
 
121
  for chunk in stream:
@@ -125,24 +148,31 @@ def send_message_stream(user_message, displayed_history, current_chat_name):
125
  if "choices" in chunk and chunk["choices"]:
126
  token = chunk["choices"][0].get("text", "")
127
  if token:
128
- partial += token
129
  token_count += 1
130
 
131
- # STRATÉGIE DE RÉACTIVITÉ HYBRIDE
 
 
 
132
  should_update = (
133
  token_count >= min_tokens or
134
- time.time() - last_update > max_delay or
135
- token in [".", "!", "?", "\n", ",", ";", ":"]
136
  )
137
 
138
- if should_update:
 
139
  cleaned = clean_output(partial)
140
  local_hist[-1] = (str(user_message), cleaned)
141
  yield local_hist, ""
142
- last_update = time.time()
143
  token_count = 0
 
144
 
145
- # DERNIER FLUSH - Garantit que tout est affiché
 
 
146
  if partial:
147
  cleaned = clean_output(partial)
148
  local_hist[-1] = (str(user_message), cleaned)
@@ -154,6 +184,9 @@ def send_message_stream(user_message, displayed_history, current_chat_name):
154
  yield local_hist, ""
155
 
156
  finally:
 
 
 
157
  with lock:
158
  conversations[current_chat_name] = local_hist.copy()
159
  yield local_hist, ""
@@ -187,7 +220,7 @@ def clear_chat():
187
  return [], "Conversation 1"
188
 
189
  # -------------------------
190
- # INTERFACE GRADIO OPTIMISÉE
191
  # -------------------------
192
  css = """
193
  :root {
@@ -359,17 +392,35 @@ css = """
359
  .clear-btn:hover {
360
  background: #64748b;
361
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  """
363
 
364
- with gr.Blocks(css=css, title="Alisia Chat - Ultra Rapide", theme=gr.themes.Soft()) as demo:
365
  history_visible = gr.State(True)
366
  current_chat = gr.State("Conversation 1")
367
 
368
  with gr.Row(elem_id="topbar"):
369
  menu_btn = gr.Button("☰", elem_classes="hamburger")
370
- gr.Markdown("### 💬 Alisia <span class='alisia-badge'>AI Assistant</span>", elem_id="title")
371
  gr.HTML("<div style='flex:1'></div>")
372
- gr.Markdown("<small style='color:#94a3b8'>llama.cpp optimisé</small>")
373
 
374
  with gr.Row():
375
  with gr.Column(scale=1, visible=True, elem_id="leftcol") as left_column:
@@ -390,14 +441,24 @@ with gr.Blocks(css=css, title="Alisia Chat - Ultra Rapide", theme=gr.themes.Soft
390
  "🗑️ Effacer chat",
391
  elem_classes="clear-btn"
392
  )
393
- gr.Markdown("## 🚀 Mode Ultra-Rapide", elem_classes="conversation-header")
 
394
  gr.Markdown("""
395
- <div style="color: #94a3b8; font-size: 14px;">
396
- Streaming hybride<br>
397
- Réactivité 120ms<br>
398
- Optimisé llama.cpp
 
 
399
  </div>
400
- """, elem_classes="conversation-subheader")
 
 
 
 
 
 
 
401
 
402
  with gr.Column(scale=3, elem_id="chatcol"):
403
  with gr.Column(elem_id="chat-container"):
@@ -495,8 +556,14 @@ with gr.Blocks(css=css, title="Alisia Chat - Ultra Rapide", theme=gr.themes.Soft
495
  # LANCEMENT
496
  # -------------------------
497
  if __name__ == "__main__":
498
- print("🚀 Lancement de l'interface ultra-réactive...")
499
- print("⏱️ Mode streaming hybride activé (120ms)")
 
 
 
 
 
 
500
  demo.launch(
501
  share=True,
502
  server_name="0.0.0.0",
 
29
  print(f"❌ Erreur téléchargement: {e}")
30
 
31
  # -------------------------
32
+ # CONFIGURATION LLAMA.CPP OPTIMISÉE POUR CPU
33
  # -------------------------
34
  os.environ["LLAMA_CPP_LOG_LEVEL"] = "OFF"
35
  warnings.filterwarnings("ignore")
36
 
37
+ print("⚡ Chargement du modèle avec llama.cpp (CPU optimisé)...")
38
+
39
+ # Détection automatique du nombre de threads
40
+ import multiprocessing
41
+ cpu_count = multiprocessing.cpu_count()
42
+ n_threads = max(2, cpu_count - 1) # Utiliser tous les cores sauf un
43
+
44
  llm = Llama(
45
  model_path=MODEL_PATH,
46
+ n_ctx=2048, # Contexte réduit pour meilleure performance
47
+ n_gpu_layers=0, # Désactivé pour CPU uniquement
48
+ n_threads=n_threads, # Optimisé pour votre CPU
49
+ n_batch=512, # Batch adapté pour CPU
50
+ n_threads_batch=n_threads, # Même nombre de threads pour le batch
51
+ use_mlock=False, # Désactivé pour meilleures performances
52
+ vocab_only=False,
53
+ verbose=False
54
  )
55
 
56
+ print(f"✅ Modèle chargé! Threads: {n_threads} | CPU: {cpu_count} cores")
57
 
58
  # -------------------------
59
  # ÉTAT & SYNCHRONISATION
 
63
  stop_generation = threading.Event()
64
 
65
  # -------------------------
66
+ # FONCTIONS UTILITAIRES OPTIMISÉES
67
  # -------------------------
68
  def clean_output(text: str) -> str:
69
  return re.sub(r"<\|im_.*?\|>", "", text).strip()
 
72
  with lock:
73
  return list(conversations.keys())
74
 
75
+ # Cache pour éviter la reconstruction complète du prompt
76
+ prompt_cache = {}
77
  def build_conversation_prompt(history, new_message):
78
+ """Format de prompt optimisé avec cache"""
79
+ cache_key = str(len(history)) + new_message[:50]
80
+
81
+ if cache_key in prompt_cache:
82
+ return prompt_cache[cache_key]
83
+
84
  prompt = ""
85
 
86
  # System prompt seulement au début
 
89
 
90
  """
91
 
92
+ # Historique de conversation (limité aux derniers messages)
93
+ recent_history = history[-4:] # Limiter à 4 derniers échanges pour CPU
94
+ for user_msg, assistant_msg in recent_history:
95
  prompt += f"### Instruction:\n{user_msg}\n\n### Response:\n{assistant_msg}\n\n"
96
 
97
  # Nouveau message
98
  prompt += f"### Instruction:\n{new_message}\n\n### Response:\n"
99
+
100
+ prompt_cache[cache_key] = prompt
101
  return prompt
102
 
103
  def send_message_stream(user_message, displayed_history, current_chat_name):
 
108
  yield displayed_history or [], ""
109
  return
110
 
111
+ start_time = time.time()
112
+
113
  with lock:
114
  if current_chat_name not in conversations:
115
  conversations[current_chat_name] = []
 
121
  formatted_prompt = build_conversation_prompt(local_hist[:-1], str(user_message))
122
  partial = ""
123
 
124
+ # PARAMÈTRES OPTIMISÉS POUR CPU
125
  last_update = time.time()
126
  token_count = 0
127
+ min_tokens = 3 # Regroupement modéré pour CPU
128
+ max_delay = 0.3 # 300ms entre updates pour CPU
129
+ buffer = ""
130
 
131
  try:
132
  stream = llm.create_completion(
133
  prompt=formatted_prompt,
134
  stream=True,
135
+ max_tokens=384, # Réponse plus courte pour CPU
136
  temperature=0.7,
137
  top_p=0.9,
138
  repeat_penalty=1.1,
139
+ stop=["### Instruction:", "### Response:", "\n\n", "<|endoftext|>"],
140
+ min_p=0.05, # Acceleration CPU
141
+ typical_p=0.95 # Acceleration CPU
142
  )
143
 
144
  for chunk in stream:
 
148
  if "choices" in chunk and chunk["choices"]:
149
  token = chunk["choices"][0].get("text", "")
150
  if token:
151
+ buffer += token
152
  token_count += 1
153
 
154
+ # STRATÉGIE OPTIMISÉE POUR CPU
155
+ current_time = time.time()
156
+ time_since_update = current_time - last_update
157
+
158
  should_update = (
159
  token_count >= min_tokens or
160
+ time_since_update > max_delay or
161
+ token in [".", "!", "?", "\n", " "]
162
  )
163
 
164
+ if should_update and buffer.strip():
165
+ partial += buffer
166
  cleaned = clean_output(partial)
167
  local_hist[-1] = (str(user_message), cleaned)
168
  yield local_hist, ""
169
+ last_update = current_time
170
  token_count = 0
171
+ buffer = ""
172
 
173
+ # Dernier flush du buffer
174
+ if buffer:
175
+ partial += buffer
176
  if partial:
177
  cleaned = clean_output(partial)
178
  local_hist[-1] = (str(user_message), cleaned)
 
184
  yield local_hist, ""
185
 
186
  finally:
187
+ end_time = time.time()
188
+ generation_time = end_time - start_time
189
+ print(f"⏱️ Temps de génération: {generation_time:.2f}s - {len(partial)} caractères")
190
  with lock:
191
  conversations[current_chat_name] = local_hist.copy()
192
  yield local_hist, ""
 
220
  return [], "Conversation 1"
221
 
222
  # -------------------------
223
+ # INTERFACE GRADIO OPTIMISÉE POUR CPU
224
  # -------------------------
225
  css = """
226
  :root {
 
392
  .clear-btn:hover {
393
  background: #64748b;
394
  }
395
+
396
+ .perf-info {
397
+ color: #94a3b8;
398
+ font-size: 12px;
399
+ margin-top: 10px;
400
+ padding: 8px;
401
+ background: #1e293b;
402
+ border-radius: 8px;
403
+ }
404
+
405
+ .cpu-warning {
406
+ color: #fbbf24;
407
+ background: #431407;
408
+ padding: 8px;
409
+ border-radius: 8px;
410
+ margin-top: 10px;
411
+ font-size: 12px;
412
+ }
413
  """
414
 
415
+ with gr.Blocks(css=css, title="Alisia Chat - Optimisé CPU", theme=gr.themes.Soft()) as demo:
416
  history_visible = gr.State(True)
417
  current_chat = gr.State("Conversation 1")
418
 
419
  with gr.Row(elem_id="topbar"):
420
  menu_btn = gr.Button("☰", elem_classes="hamburger")
421
+ gr.Markdown("### 💬 Alisia <span class='alisia-badge'>CPU Mode</span>", elem_id="title")
422
  gr.HTML("<div style='flex:1'></div>")
423
+ gr.Markdown(f"<small style='color:#94a3b8'>CPU: {cpu_count} cores • Threads: {n_threads}</small>")
424
 
425
  with gr.Row():
426
  with gr.Column(scale=1, visible=True, elem_id="leftcol") as left_column:
 
441
  "🗑️ Effacer chat",
442
  elem_classes="clear-btn"
443
  )
444
+
445
+ # Informations de performance CPU
446
  gr.Markdown("""
447
+ <div class="perf-info">
448
+ <strong>⚡ Mode CPU Optimisé</strong><br>
449
+ Threads: {n_threads}/{cpu_count}<br>
450
+ Contexte: 2048 tokens<br>
451
+ • Latence: ~300ms<br>
452
+ • Réponses: 384 tokens max
453
  </div>
454
+ """.format(n_threads=n_threads, cpu_count=cpu_count))
455
+
456
+ gr.Markdown("""
457
+ <div class="cpu-warning">
458
+ ⚠️ Mode CPU - Les performances peuvent varier<br>
459
+ selon la puissance de votre processeur
460
+ </div>
461
+ """)
462
 
463
  with gr.Column(scale=3, elem_id="chatcol"):
464
  with gr.Column(elem_id="chat-container"):
 
556
  # LANCEMENT
557
  # -------------------------
558
  if __name__ == "__main__":
559
+ print("🚀 Lancement de l'interface optimisée CPU...")
560
+ print(f"💻 Configuration CPU:")
561
+ print(f" - Cores disponibles: {cpu_count}")
562
+ print(f" - Threads utilisés: {n_threads}")
563
+ print(f" - Contexte: 2048 tokens")
564
+ print(f" - Réponses limitées: 384 tokens")
565
+ print("⏱️ Patience - Le CPU peut être plus lent que le GPU")
566
+
567
  demo.launch(
568
  share=True,
569
  server_name="0.0.0.0",