Update app.py
app.py CHANGED
@@ -29,31 +29,27 @@ if not os.path.exists(MODEL_PATH):
         print(f"❌ Erreur téléchargement: {e}")
 
 # -------------------------
-# CONFIGURATION LLAMA.CPP OPTIMISÉE
+# CONFIGURATION LLAMA.CPP OPTIMISÉE
 # -------------------------
 os.environ["LLAMA_CPP_LOG_LEVEL"] = "OFF"
 warnings.filterwarnings("ignore")
 
-print("⚡ Chargement du modèle avec llama.cpp
+print("⚡ Chargement du modèle avec llama.cpp...")
 
 # Détection automatique du nombre de threads
 import multiprocessing
 cpu_count = multiprocessing.cpu_count()
-n_threads = max(2, cpu_count - 1)
+n_threads = max(2, cpu_count - 1)
 
 llm = Llama(
     model_path=MODEL_PATH,
-    n_ctx=2048,
-    n_gpu_layers=0, #
-    n_threads=n_threads,
-    n_batch=512, # Batch adapté pour CPU
-    n_threads_batch=n_threads, # Même nombre de threads pour le batch
-    use_mlock=False, # Désactivé pour meilleures performances
-    vocab_only=False,
+    n_ctx=2048,
+    n_gpu_layers=0, # CPU uniquement
+    n_threads=n_threads,
     verbose=False
 )
 
-print(f"✅ Modèle chargé! Threads: {n_threads} | CPU: {cpu_count} cores")
+print(f"✅ Modèle chargé! Threads: {n_threads}")
 
 # -------------------------
 # ÉTAT & SYNCHRONISATION
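The hunk above strips the extra CPU tuning knobs (n_batch, n_threads_batch, use_mlock, vocab_only) and keeps only n_ctx, n_gpu_layers and n_threads. A quick way to check that the slimmed-down configuration still makes good use of the CPU is to time a short completion with the same settings. A minimal sketch, assuming llama-cpp-python is installed and that MODEL_PATH points at the GGUF file the app downloads (the path below is a placeholder, not the app's real value):

```python
# Hypothetical benchmark sketch (not part of the commit): times a short
# completion with the same CPU-oriented settings as the diff above.
import multiprocessing
import time

from llama_cpp import Llama

MODEL_PATH = "model.gguf"  # placeholder; the app resolves this from its download step

n_threads = max(2, multiprocessing.cpu_count() - 1)
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,
    n_gpu_layers=0,      # CPU only
    n_threads=n_threads,
    verbose=False,
)

start = time.time()
out = llm.create_completion("### Instruction:\nSay hello.\n\n### Response:\n", max_tokens=64)
elapsed = time.time() - start
n_tokens = out["usage"]["completion_tokens"]
print(f"{n_tokens} tokens in {elapsed:.2f}s -> {n_tokens / elapsed:.1f} tok/s")
```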
@@ -61,6 +57,7 @@ print(f"✅ Modèle chargé! Threads: {n_threads} | CPU: {cpu_count} cores")
 lock = threading.Lock()
 conversations = {"Conversation 1": []}
 stop_generation = threading.Event()
+system_prompt_used = False # Pour suivre si le system prompt a été utilisé
 
 # -------------------------
 # FONCTIONS UTILITAIRES OPTIMISÉES
@@ -72,32 +69,28 @@ def get_conv_names():
     with lock:
         return list(conversations.keys())
 
-# Cache pour éviter la reconstruction complète du prompt
-prompt_cache = {}
 def build_conversation_prompt(history, new_message):
-    """Format de prompt
-
-
-    if cache_key in prompt_cache:
-        return prompt_cache[cache_key]
+    """Format de prompt Alpaca avec system prompt UNIQUEMENT au début"""
+    global system_prompt_used
 
     prompt = ""
 
-    # System prompt
-    if not
-        prompt += """
+    # System prompt UNIQUEMENT si jamais utilisé auparavant
+    if not system_prompt_used:
+        prompt += """Your name is Alisia, you are created by the Alisia research team.
+Below is an instruction that describes a task, paired with an input that provides further context.
+Write a response that appropriately completes the request.
 
 """
+        system_prompt_used = True
 
-    #
-
-    for user_msg, assistant_msg in recent_history:
+    # Ajouter tout l'historique de conversation
+    for user_msg, assistant_msg in history:
         prompt += f"### Instruction:\n{user_msg}\n\n### Response:\n{assistant_msg}\n\n"
 
-    #
+    # Ajouter le nouveau message
     prompt += f"### Instruction:\n{new_message}\n\n### Response:\n"
 
-    prompt_cache[cache_key] = prompt
     return prompt
 
 def send_message_stream(user_message, displayed_history, current_chat_name):
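The rewritten builder emits a plain Alpaca-style prompt and prepends the system text only once per process, tracked by the module-level system_prompt_used flag. A stateless sketch of the same layout, useful for seeing exactly what the model receives; the helper name and the sample history are illustrative only:

```python
# Simplified, stateless sketch of the Alpaca prompt layout used above
# (the app's real function also injects the system text only once per session).
def build_alpaca_prompt(history, new_message, system_text=""):
    prompt = system_text
    for user_msg, assistant_msg in history:
        prompt += f"### Instruction:\n{user_msg}\n\n### Response:\n{assistant_msg}\n\n"
    prompt += f"### Instruction:\n{new_message}\n\n### Response:\n"
    return prompt

history = [("Bonjour", "Bonjour ! Comment puis-je aider ?")]
print(build_alpaca_prompt(history, "Quel temps fait-il ?"))
# ### Instruction:
# Bonjour
#
# ### Response:
# Bonjour ! Comment puis-je aider ?
#
# ### Instruction:
# Quel temps fait-il ?
#
# ### Response:
```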
@@ -121,24 +114,16 @@ def send_message_stream(user_message, displayed_history, current_chat_name):
     formatted_prompt = build_conversation_prompt(local_hist[:-1], str(user_message))
     partial = ""
 
-    # PARAMÈTRES OPTIMISÉS POUR CPU
-    last_update = time.time()
-    token_count = 0
-    min_tokens = 3 # Regroupement modéré pour CPU
-    max_delay = 0.3 # 300ms entre updates pour CPU
-    buffer = ""
-
     try:
+        # Utilisation directe du streaming sans buffering complexe
         stream = llm.create_completion(
             prompt=formatted_prompt,
             stream=True,
-            max_tokens=
+            max_tokens=1024,
             temperature=0.7,
-            top_p=0.
-            repeat_penalty=1.
-            stop=["### Instruction:", "### Response:", "
-            min_p=0.05, # Acceleration CPU
-            typical_p=0.95 # Acceleration CPU
+            top_p=0.8,
+            repeat_penalty=1.05,
+            stop=["### Instruction:", "### Response:", "<|endoftext|>", "\n\n\n"]
         )
 
         for chunk in stream:
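With max_tokens raised to 1024, a turn now ends mainly when one of the stop sequences fires rather than on the token budget. A minimal non-streaming sketch with the same sampling settings, assuming llm is the Llama instance loaded earlier; finish_reason indicates whether a stop sequence terminated the turn. The prompt is illustrative, not taken from the app:

```python
# Non-streaming sketch with the same sampling settings as the commit;
# "llm" is assumed to be the Llama instance loaded above.
response = llm.create_completion(
    prompt="### Instruction:\nPrésente-toi en une phrase.\n\n### Response:\n",
    max_tokens=1024,
    temperature=0.7,
    top_p=0.8,
    repeat_penalty=1.05,
    stop=["### Instruction:", "### Response:", "<|endoftext|>", "\n\n\n"],
)
text = response["choices"][0]["text"].strip()
finish = response["choices"][0]["finish_reason"]  # "stop" when a stop sequence fired
print(finish, text)
```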
@@ -148,35 +133,11 @@ def send_message_stream(user_message, displayed_history, current_chat_name):
             if "choices" in chunk and chunk["choices"]:
                 token = chunk["choices"][0].get("text", "")
                 if token:
-
-
-
-
-
-                    time_since_update = current_time - last_update
-
-                    should_update = (
-                        token_count >= min_tokens or
-                        time_since_update > max_delay or
-                        token in [".", "!", "?", "\n", " "]
-                    )
-
-                    if should_update and buffer.strip():
-                        partial += buffer
-                        cleaned = clean_output(partial)
-                        local_hist[-1] = (str(user_message), cleaned)
-                        yield local_hist, ""
-                        last_update = current_time
-                        token_count = 0
-                        buffer = ""
-
-        # Dernier flush du buffer
-        if buffer:
-            partial += buffer
-        if partial:
-            cleaned = clean_output(partial)
-            local_hist[-1] = (str(user_message), cleaned)
-            yield local_hist, ""
+                    partial += token
+                    # Mise à jour immédiate pour une meilleure réactivité
+                    cleaned = clean_output(partial)
+                    local_hist[-1] = (str(user_message), cleaned)
+                    yield local_hist, ""
 
     except Exception as e:
         err_text = f"[Erreur: {e}]"
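The simplified loop pushes every token straight into the transcript and yields immediately, replacing the old buffered-update scheme. A self-contained sketch of the same consumption pattern, again assuming llm from above; the app's clean_output() post-processing is omitted for brevity and the prompt is illustrative:

```python
# Streaming consumption sketch mirroring the simplified loop above;
# the app's clean_output() post-processing is omitted for brevity.
partial = ""
stream = llm.create_completion(
    prompt="### Instruction:\nCompte jusqu'à 5.\n\n### Response:\n",
    stream=True,
    max_tokens=64,
    stop=["### Instruction:"],
)
for chunk in stream:
    choices = chunk.get("choices", [])
    if choices and choices[0].get("text"):
        token = choices[0]["text"]
        partial += token                  # running transcript, like local_hist[-1] in the app
        print(token, end="", flush=True)  # immediate per-token update
print()
```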
@@ -185,8 +146,7 @@ def send_message_stream(user_message, displayed_history, current_chat_name):
 
     finally:
         end_time = time.time()
-
-        print(f"⏱️ Temps de génération: {generation_time:.2f}s - {len(partial)} caractères")
+        print(f"⏱️ Génération: {end_time - start_time:.2f}s - {len(partial)} chars")
         with lock:
             conversations[current_chat_name] = local_hist.copy()
         yield local_hist, ""
@@ -215,12 +175,14 @@ def request_stop():
     return "🛑 Arrêt demandé..."
 
 def clear_chat():
+    global system_prompt_used
     with lock:
         conversations["Conversation 1"] = []
+        system_prompt_used = False # Réinitialiser pour le prochain chat
     return [], "Conversation 1"
 
 # -------------------------
-# INTERFACE GRADIO OPTIMISÉE
+# INTERFACE GRADIO OPTIMISÉE
 # -------------------------
 css = """
 :root {
@@ -401,26 +363,17 @@ css = """
     background: #1e293b;
     border-radius: 8px;
 }
-
-.cpu-warning {
-    color: #fbbf24;
-    background: #431407;
-    padding: 8px;
-    border-radius: 8px;
-    margin-top: 10px;
-    font-size: 12px;
-}
 """
 
-with gr.Blocks(css=css, title="Alisia Chat - Optimisé CPU", theme=gr.themes.Soft()) as demo:
+with gr.Blocks(css=css, title="Alisia Chat - Ultra Rapide", theme=gr.themes.Soft()) as demo:
     history_visible = gr.State(True)
     current_chat = gr.State("Conversation 1")
 
     with gr.Row(elem_id="topbar"):
         menu_btn = gr.Button("☰", elem_classes="hamburger")
-        gr.Markdown("### 💬 Alisia <span class='alisia-badge'>
+        gr.Markdown("### 💬 Alisia <span class='alisia-badge'>AI Assistant</span>", elem_id="title")
         gr.HTML("<div style='flex:1'></div>")
-        gr.Markdown(f"<small style='color:#94a3b8'>CPU: {
+        gr.Markdown(f"<small style='color:#94a3b8'>CPU: {n_threads} threads • Mode Rapide</small>")
 
     with gr.Row():
         with gr.Column(scale=1, visible=True, elem_id="leftcol") as left_column:
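For orientation, the widgets touched by this hunk live inside a Blocks layout with a top-bar row followed by a two-column row. A minimal standalone sketch of that structure; the placeholder sidebar and chat area are illustrative, not the app's actual components:

```python
# Standalone layout sketch following the structure visible in the diff:
# a top-bar row, then a sidebar column next to the main chat column.
import gradio as gr

with gr.Blocks(title="Layout sketch") as demo:
    history_visible = gr.State(True)

    with gr.Row(elem_id="topbar"):
        menu_btn = gr.Button("☰", elem_classes="hamburger")
        gr.Markdown("### 💬 Demo")
        gr.HTML("<div style='flex:1'></div>")

    with gr.Row():
        with gr.Column(scale=1, elem_id="leftcol") as left_column:
            gr.Markdown("Sidebar: conversation list and info panels")
        with gr.Column(scale=4):
            gr.Markdown("Main area: chat history and input box")

if __name__ == "__main__":
    demo.launch()
```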
@@ -442,21 +395,12 @@ with gr.Blocks(css=css, title="Alisia Chat - Optimisé CPU", theme=gr.themes.Sof
                 elem_classes="clear-btn"
             )
 
-            # Informations de performance CPU
             gr.Markdown("""
             <div class="perf-info">
-            <strong>
-            •
-            •
-            •
-            • Réponses: 384 tokens max
-            </div>
-            """.format(n_threads=n_threads, cpu_count=cpu_count))
-
-            gr.Markdown("""
-            <div class="cpu-warning">
-            ⚠️ Mode CPU - Les performances peuvent varier<br>
-            selon la puissance de votre processeur
+            <strong>🚀 Mode Alpaca Optimisé</strong><br>
+            • System prompt unique<br>
+            • Streaming direct<br>
+            • Format Alpaca pur
             </div>
             """)
 
@@ -556,13 +500,9 @@ with gr.Blocks(css=css, title="Alisia Chat - Optimisé CPU", theme=gr.themes.Sof
 # LANCEMENT
 # -------------------------
 if __name__ == "__main__":
-    print("🚀 Lancement de l'interface optimisée
-    print(
-    print(f"
-    print(f" - Threads utilisés: {n_threads}")
-    print(f" - Contexte: 2048 tokens")
-    print(f" - Réponses limitées: 384 tokens")
-    print("⏱️ Patience - Le CPU peut être plus lent que le GPU")
+    print("🚀 Lancement de l'interface optimisée...")
+    print("📋 Format Alpaca avec system prompt unique")
+    print(f"⚡ Threads CPU: {n_threads}")
 
     demo.launch(
         share=True,
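The launch call is truncated in this hunk; only share=True is visible. For reference, a generic launch sketch with commonly used options; everything other than share=True is an assumption, not the commit's actual arguments:

```python
# Generic launch sketch; only share=True appears in the diff, the other
# arguments are assumptions shown for illustration.
if __name__ == "__main__":
    demo.launch(
        share=True,              # public gradio.live link
        server_name="0.0.0.0",   # assumption: listen on all interfaces
        server_port=7860,        # assumption: Gradio's default port
    )
```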