Spaces:

Gems234
/

Quantization_Alisia

Sleeping

App Files Community

Gems234 committed on Sep 21, 2025

Commit

3505c55

verified ·

1 Parent(s): 47a640a

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -43

app.py CHANGED Viewed

@@ -64,41 +64,39 @@ def get_conv_names():
     with lock:
         return list(conversations.keys())
-# Format de prompt Alpaca
-def alpaca_prompt(instruction, input_text="", output_text=""):
-    """Format de prompt Alpaca standard"""
     if input_text:
-        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
-### Instruction:
-{instruction}
-### Input:
-{input_text}
-### Response:
-{output_text}"""
     else:
-        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
-### Instruction:
-{instruction}
-### Response:
-{output_text}"""
 def build_conversation_prompt(history, new_message):
-    """Construit le prompt avec l'historique de conversation"""
-    # System prompt seulement au début
-    system_prompt = "Tu es Alisia, une assistante IA utile et compétente. Réponds de manière précise et concise en français."
-    # Construire l'historique de conversation
-    conversation_history = ""
-    for user_msg, assistant_msg in history:
-        conversation_history += f"### Instruction:\n{user_msg}\n\n### Response:\n{assistant_msg}\n\n"
     # Ajouter le nouveau message
-    full_prompt = f"{system_prompt}\n\n{conversation_history}### Instruction:\n{new_message}\n\n### Response:\n"
     return full_prompt
 def send_message_stream(user_message, displayed_history, current_chat_name):
@@ -117,24 +115,27 @@ def send_message_stream(user_message, displayed_history, current_chat_name):
     local_hist.append((str(user_message), ""))
     yield local_hist, ""
     formatted_prompt = build_conversation_prompt(local_hist[:-1], str(user_message))
     partial = ""
-    # PARAMÈTRES DE RÉACTIVITÉ HYBRIDE
     last_update = time.time()
     token_count = 0
-    min_tokens = 2      # Minimum de tokens avant update
-    max_delay = 0.12    # Maximum 120ms entre updates
     try:
         stream = llm.create_completion(
             prompt=formatted_prompt,
             stream=True,
-            max_tokens=1024,
             temperature=0.7,
-            top_p=0.9,
-            repeat_penalty=1.1,
-            stop=["### Instruction:", "### Response:", "\n\n", "<|endoftext|>"]
         )
         for chunk in stream:
@@ -147,11 +148,11 @@ def send_message_stream(user_message, displayed_history, current_chat_name):
                     partial += token
                     token_count += 1
-                    # STRATÉGIE DE RÉACTIVITÉ HYBRIDE
                     should_update = (
                         token_count >= min_tokens or
                         time.time() - last_update > max_delay or
-                        token in [".", "!", "?", "\n", ",", ";", ":"]
                     )
                     if should_update:
@@ -178,7 +179,7 @@ def send_message_stream(user_message, displayed_history, current_chat_name):
         yield local_hist, ""
 # -------------------------
-# FONCTIONS POUR L'INTERFACE (inchangées)
 # -------------------------
 def toggle_history(visible_state):
     new_state = not bool(visible_state)
@@ -206,7 +207,7 @@ def clear_chat():
     return [], "Conversation 1"
 # -------------------------
-# INTERFACE GRADIO OPTIMISÉE (inchangée)
 # -------------------------
 css = """
 :root {
@@ -413,8 +414,8 @@ with gr.Blocks(css=css, title="Alisia Chat - Ultra Rapide", theme=gr.themes.Soft
                 gr.Markdown("""
                 <div style="color: #94a3b8; font-size: 14px;">
                 ✅ Streaming hybride<br>
-                ✅ Réactivité 120ms<br>
-                ✅ Optimisé llama.cpp
                 </div>
                 """, elem_classes="conversation-subheader")
@@ -514,8 +515,9 @@ with gr.Blocks(css=css, title="Alisia Chat - Ultra Rapide", theme=gr.themes.Soft
 # LANCEMENT
 # -------------------------
 if __name__ == "__main__":
-    print("🚀 Lancement de l'interface ultra-réactive...")
-    print("⏱️  Mode streaming hybride activé (120ms)")
     demo.launch(
         share=True,
         server_name="0.0.0.0",

     with lock:
         return list(conversations.keys())
# Lightweight Alpaca prompt format
def build_alpaca_prompt(instruction, input_text="", output_text=""):
    """Render a single Alpaca-style turn without the usual English preamble.

    Args:
        instruction: Text placed under the ``### Instruction:`` header.
        input_text: Optional extra context. When non-empty, an
            ``### Input:`` section is inserted between instruction and response.
        output_text: Text placed after the ``### Response:`` header. Leave it
            empty to end the string right where the model should continue.

    Returns:
        The formatted turn as a single string.
    """
    if input_text:
        return (
            f"### Instruction:\n{instruction}\n\n"
            f"### Input:\n{input_text}\n\n"
            f"### Response:\n{output_text}"
        )
    return f"### Instruction:\n{instruction}\n\n### Response:\n{output_text}"


def build_conversation_prompt(history, new_message):
    """Build the full completion prompt for one new user message.

    The prompt is the system prompt, followed by every past exchange rendered
    as an Alpaca turn, followed by an open turn for ``new_message`` whose
    response section is left empty.

    Args:
        history: Iterable of ``(user_message, assistant_message)`` pairs for
            the earlier exchanges; may be empty or ``None``.
        new_message: The user message to answer.

    Returns:
        The prompt string, ending with ``### Response:\\n``.
    """
    system_prompt = "Tu es Alisia, assistante IA compétente. Réponds en français de façon concise."

    # A missing side of an exchange (None) is rendered as empty text rather
    # than the literal string "None".
    turns = [
        build_alpaca_prompt(user_msg or "", "", assistant_msg or "")
        for user_msg, assistant_msg in (history or [])
    ]
    # Open turn for the message being answered.
    turns.append(build_alpaca_prompt(new_message, "", ""))

    # The prompt is rebuilt in full on every call, so the system prompt must
    # always lead it; emitting it only when history was empty removed the
    # instructions from every turn after the first.
    return f"{system_prompt}\n\n" + "\n\n".join(turns)
 def send_message_stream(user_message, displayed_history, current_chat_name):
     local_hist.append((str(user_message), ""))
     yield local_hist, ""
+    # Construction OPTIMISÉE du prompt
     formatted_prompt = build_conversation_prompt(local_hist[:-1], str(user_message))
     partial = ""
+    # PARAMÈTRES DE RÉACTIVITÉ ULTRA-RAPIDE
     last_update = time.time()
     token_count = 0
+    min_tokens = 1      # Minimum réduit pour plus de réactivité
+    max_delay = 0.08    # Réduit à 80ms pour plus de vitesse
     try:
+        # Paramètres de génération OPTIMISÉS
         stream = llm.create_completion(
             prompt=formatted_prompt,
             stream=True,
+            max_tokens=768,  # Réduit pour plus de vitesse
             temperature=0.7,
+            top_p=0.85,      # Légèrement réduit
+            repeat_penalty=1.15,  # Augmenté pour éviter la répétition
+            stop=["### Instruction:", "### Response:", "\n\n", "<|endoftext|>", "###"],
+            top_k=40         # Ajouté pour la vitesse
         )
         for chunk in stream:
                     partial += token
                     token_count += 1
+                    # STRATÉGIE ULTRA-RAPIDE
                     should_update = (
                         token_count >= min_tokens or
                         time.time() - last_update > max_delay or
+                        token in [".", "!", "?", "\n"]
                     )
                     if should_update:
         yield local_hist, ""
 # -------------------------
+# FONCTIONS POUR L'INTERFACE
 # -------------------------
 def toggle_history(visible_state):
     new_state = not bool(visible_state)
     return [], "Conversation 1"
 # -------------------------
+# INTERFACE GRADIO OPTIMISÉE
 # -------------------------
 css = """
 :root {
                 gr.Markdown("""
                 <div style="color: #94a3b8; font-size: 14px;">
                 ✅ Streaming hybride<br>
+                ✅ Réactivité 80ms<br>
+                ✅ Format Alpaca optimisé
                 </div>
                 """, elem_classes="conversation-subheader")
 # LANCEMENT
 # -------------------------
 if __name__ == "__main__":
+    print("🚀 Lancement de l'interface ULTRA-RAPIDE...")
+    print("⏱️  Mode streaming optimisé (80ms)")
+    print("🎯 Format Alpaca accéléré")
     demo.launch(
         share=True,
         server_name="0.0.0.0",