Gems234 committed on
Commit
a6acd24
·
verified ·
1 Parent(s): 5977267

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -37
app.py CHANGED
@@ -29,23 +29,31 @@ if not os.path.exists(MODEL_PATH):
29
  print(f"❌ Erreur téléchargement: {e}")
30
 
31
  # -------------------------
32
- # CONFIGURATION LLAMA.CPP
33
  # -------------------------
34
  os.environ["LLAMA_CPP_LOG_LEVEL"] = "OFF"
35
  warnings.filterwarnings("ignore")
36
 
37
- print("⚡ Chargement du modèle avec llama.cpp...")
 
 
 
 
 
 
38
  llm = Llama(
39
  model_path=MODEL_PATH,
40
- n_ctx=2028,
41
- n_gpu_layers=35,
42
- n_threads=6,
43
- #n_batch=512,
44
- verbose=False,
45
- use_mlock=True
 
 
46
  )
47
 
48
- print("✅ Modèle chargé et prêt!")
49
 
50
  # -------------------------
51
  # ÉTAT & SYNCHRONISATION
@@ -55,7 +63,7 @@ conversations = {"Conversation 1": []}
55
  stop_generation = threading.Event()
56
 
57
  # -------------------------
58
- # FONCTIONS UTILITAIRES
59
  # -------------------------
60
  def clean_output(text: str) -> str:
61
  return re.sub(r"<\|im_.*?\|>", "", text).strip()
@@ -64,8 +72,15 @@ def get_conv_names():
64
  with lock:
65
  return list(conversations.keys())
66
 
 
 
67
  def build_conversation_prompt(history, new_message):
68
- """Format de prompt optimisé pour Alpaca"""
 
 
 
 
 
69
  prompt = ""
70
 
71
  # System prompt seulement au début
@@ -74,12 +89,15 @@ def build_conversation_prompt(history, new_message):
74
 
75
  """
76
 
77
- # Historique de conversation
78
- for user_msg, assistant_msg in history:
 
79
  prompt += f"### Instruction:\n{user_msg}\n\n### Response:\n{assistant_msg}\n\n"
80
 
81
  # Nouveau message
82
  prompt += f"### Instruction:\n{new_message}\n\n### Response:\n"
 
 
83
  return prompt
84
 
85
  def send_message_stream(user_message, displayed_history, current_chat_name):
@@ -90,6 +108,8 @@ def send_message_stream(user_message, displayed_history, current_chat_name):
90
  yield displayed_history or [], ""
91
  return
92
 
 
 
93
  with lock:
94
  if current_chat_name not in conversations:
95
  conversations[current_chat_name] = []
@@ -101,21 +121,24 @@ def send_message_stream(user_message, displayed_history, current_chat_name):
101
  formatted_prompt = build_conversation_prompt(local_hist[:-1], str(user_message))
102
  partial = ""
103
 
104
- # PARAMÈTRES DE RÉACTIVITÉ HYBRIDE
105
  last_update = time.time()
106
  token_count = 0
107
- min_tokens = 2 # Minimum de tokens avant update
108
- max_delay = 0.12 # Maximum 120ms entre updates
 
109
 
110
  try:
111
  stream = llm.create_completion(
112
  prompt=formatted_prompt,
113
  stream=True,
114
- max_tokens=2028,
115
  temperature=0.7,
116
  top_p=0.9,
117
  repeat_penalty=1.1,
118
- stop=["### Instruction:", "### Response:", "\n\n", "<|endoftext|>"]
 
 
119
  )
120
 
121
  for chunk in stream:
@@ -125,24 +148,31 @@ def send_message_stream(user_message, displayed_history, current_chat_name):
125
  if "choices" in chunk and chunk["choices"]:
126
  token = chunk["choices"][0].get("text", "")
127
  if token:
128
- partial += token
129
  token_count += 1
130
 
131
- # STRATÉGIE DE RÉACTIVITÉ HYBRIDE
 
 
 
132
  should_update = (
133
  token_count >= min_tokens or
134
- time.time() - last_update > max_delay or
135
- token in [".", "!", "?", "\n", ",", ";", ":"]
136
  )
137
 
138
- if should_update:
 
139
  cleaned = clean_output(partial)
140
  local_hist[-1] = (str(user_message), cleaned)
141
  yield local_hist, ""
142
- last_update = time.time()
143
  token_count = 0
 
144
 
145
- # DERNIER FLUSH - Garantit que tout est affiché
 
 
146
  if partial:
147
  cleaned = clean_output(partial)
148
  local_hist[-1] = (str(user_message), cleaned)
@@ -154,6 +184,9 @@ def send_message_stream(user_message, displayed_history, current_chat_name):
154
  yield local_hist, ""
155
 
156
  finally:
 
 
 
157
  with lock:
158
  conversations[current_chat_name] = local_hist.copy()
159
  yield local_hist, ""
@@ -187,7 +220,7 @@ def clear_chat():
187
  return [], "Conversation 1"
188
 
189
  # -------------------------
190
- # INTERFACE GRADIO OPTIMISÉE
191
  # -------------------------
192
  css = """
193
  :root {
@@ -359,17 +392,35 @@ css = """
359
  .clear-btn:hover {
360
  background: #64748b;
361
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  """
363
 
364
- with gr.Blocks(css=css, title="Alisia Chat - Ultra Rapide", theme=gr.themes.Soft()) as demo:
365
  history_visible = gr.State(True)
366
  current_chat = gr.State("Conversation 1")
367
 
368
  with gr.Row(elem_id="topbar"):
369
  menu_btn = gr.Button("☰", elem_classes="hamburger")
370
- gr.Markdown("### 💬 Alisia <span class='alisia-badge'>AI Assistant</span>", elem_id="title")
371
  gr.HTML("<div style='flex:1'></div>")
372
- gr.Markdown("<small style='color:#94a3b8'>llama.cpp optimisé</small>")
373
 
374
  with gr.Row():
375
  with gr.Column(scale=1, visible=True, elem_id="leftcol") as left_column:
@@ -390,14 +441,24 @@ with gr.Blocks(css=css, title="Alisia Chat - Ultra Rapide", theme=gr.themes.Soft
390
  "🗑️ Effacer chat",
391
  elem_classes="clear-btn"
392
  )
393
- gr.Markdown("## 🚀 Mode Ultra-Rapide", elem_classes="conversation-header")
 
394
  gr.Markdown("""
395
- <div style="color: #94a3b8; font-size: 14px;">
396
- Streaming hybride<br>
397
- Réactivité 120ms<br>
398
- Optimisé llama.cpp
 
 
399
  </div>
400
- """, elem_classes="conversation-subheader")
 
 
 
 
 
 
 
401
 
402
  with gr.Column(scale=3, elem_id="chatcol"):
403
  with gr.Column(elem_id="chat-container"):
@@ -495,8 +556,14 @@ with gr.Blocks(css=css, title="Alisia Chat - Ultra Rapide", theme=gr.themes.Soft
495
  # LANCEMENT
496
  # -------------------------
497
  if __name__ == "__main__":
498
- print("🚀 Lancement de l'interface ultra-réactive...")
499
- print("⏱️ Mode streaming hybride activé (120ms)")
 
 
 
 
 
 
500
  demo.launch(
501
  share=True,
502
  server_name="0.0.0.0",
 
29
  print(f"❌ Erreur téléchargement: {e}")
30
 
31
  # -------------------------
32
+ # CONFIGURATION LLAMA.CPP OPTIMISÉE POUR CPU
33
  # -------------------------
34
  os.environ["LLAMA_CPP_LOG_LEVEL"] = "OFF"
35
  warnings.filterwarnings("ignore")
36
 
37
+ print("⚡ Chargement du modèle avec llama.cpp (CPU optimisé)...")
38
+
39
+ # Détection automatique du nombre de threads
40
+ import multiprocessing
41
+ cpu_count = multiprocessing.cpu_count()
42
+ n_threads = max(2, cpu_count - 1) # Utiliser tous les cores sauf un
43
+
44
  llm = Llama(
45
  model_path=MODEL_PATH,
46
+ n_ctx=2048, # Contexte réduit pour meilleure performance
47
+ n_gpu_layers=0, # Désactivé pour CPU uniquement
48
+ n_threads=n_threads, # Optimisé pour votre CPU
49
+ n_batch=512, # Batch adapté pour CPU
50
+ n_threads_batch=n_threads, # Même nombre de threads pour le batch
51
+ use_mlock=False, # Désactivé pour meilleures performances
52
+ vocab_only=False,
53
+ verbose=False
54
  )
55
 
56
+ print(f"✅ Modèle chargé! Threads: {n_threads} | CPU: {cpu_count} cores")
57
 
58
  # -------------------------
59
  # ÉTAT & SYNCHRONISATION
 
63
  stop_generation = threading.Event()
64
 
65
  # -------------------------
66
+ # FONCTIONS UTILITAIRES OPTIMISÉES
67
  # -------------------------
68
  def clean_output(text: str) -> str:
69
  return re.sub(r"<\|im_.*?\|>", "", text).strip()
 
72
  with lock:
73
  return list(conversations.keys())
74
 
75
+ # Cache pour éviter la reconstruction complète du prompt
76
+ prompt_cache = {}
77
  def build_conversation_prompt(history, new_message):
78
+ """Format de prompt optimisé avec cache"""
79
+ cache_key = str(len(history)) + new_message[:50]
80
+
81
+ if cache_key in prompt_cache:
82
+ return prompt_cache[cache_key]
83
+
84
  prompt = ""
85
 
86
  # System prompt seulement au début
 
89
 
90
  """
91
 
92
+ # Historique de conversation (limité aux derniers messages)
93
+ recent_history = history[-4:] # Limiter à 4 derniers échanges pour CPU
94
+ for user_msg, assistant_msg in recent_history:
95
  prompt += f"### Instruction:\n{user_msg}\n\n### Response:\n{assistant_msg}\n\n"
96
 
97
  # Nouveau message
98
  prompt += f"### Instruction:\n{new_message}\n\n### Response:\n"
99
+
100
+ prompt_cache[cache_key] = prompt
101
  return prompt
102
 
103
  def send_message_stream(user_message, displayed_history, current_chat_name):
 
108
  yield displayed_history or [], ""
109
  return
110
 
111
+ start_time = time.time()
112
+
113
  with lock:
114
  if current_chat_name not in conversations:
115
  conversations[current_chat_name] = []
 
121
  formatted_prompt = build_conversation_prompt(local_hist[:-1], str(user_message))
122
  partial = ""
123
 
124
+ # PARAMÈTRES OPTIMISÉS POUR CPU
125
  last_update = time.time()
126
  token_count = 0
127
+ min_tokens = 3 # Regroupement modéré pour CPU
128
+ max_delay = 0.3 # 300ms entre updates pour CPU
129
+ buffer = ""
130
 
131
  try:
132
  stream = llm.create_completion(
133
  prompt=formatted_prompt,
134
  stream=True,
135
+ max_tokens=384, # Réponse plus courte pour CPU
136
  temperature=0.7,
137
  top_p=0.9,
138
  repeat_penalty=1.1,
139
+ stop=["### Instruction:", "### Response:", "\n\n", "<|endoftext|>"],
140
+ min_p=0.05, # Acceleration CPU
141
+ typical_p=0.95 # Acceleration CPU
142
  )
143
 
144
  for chunk in stream:
 
148
  if "choices" in chunk and chunk["choices"]:
149
  token = chunk["choices"][0].get("text", "")
150
  if token:
151
+ buffer += token
152
  token_count += 1
153
 
154
+ # STRATÉGIE OPTIMISÉE POUR CPU
155
+ current_time = time.time()
156
+ time_since_update = current_time - last_update
157
+
158
  should_update = (
159
  token_count >= min_tokens or
160
+ time_since_update > max_delay or
161
+ token in [".", "!", "?", "\n", " "]
162
  )
163
 
164
+ if should_update and buffer.strip():
165
+ partial += buffer
166
  cleaned = clean_output(partial)
167
  local_hist[-1] = (str(user_message), cleaned)
168
  yield local_hist, ""
169
+ last_update = current_time
170
  token_count = 0
171
+ buffer = ""
172
 
173
+ # Dernier flush du buffer
174
+ if buffer:
175
+ partial += buffer
176
  if partial:
177
  cleaned = clean_output(partial)
178
  local_hist[-1] = (str(user_message), cleaned)
 
184
  yield local_hist, ""
185
 
186
  finally:
187
+ end_time = time.time()
188
+ generation_time = end_time - start_time
189
+ print(f"⏱️ Temps de génération: {generation_time:.2f}s - {len(partial)} caractères")
190
  with lock:
191
  conversations[current_chat_name] = local_hist.copy()
192
  yield local_hist, ""
 
220
  return [], "Conversation 1"
221
 
222
  # -------------------------
223
+ # INTERFACE GRADIO OPTIMISÉE POUR CPU
224
  # -------------------------
225
  css = """
226
  :root {
 
392
  .clear-btn:hover {
393
  background: #64748b;
394
  }
395
+
396
+ .perf-info {
397
+ color: #94a3b8;
398
+ font-size: 12px;
399
+ margin-top: 10px;
400
+ padding: 8px;
401
+ background: #1e293b;
402
+ border-radius: 8px;
403
+ }
404
+
405
+ .cpu-warning {
406
+ color: #fbbf24;
407
+ background: #431407;
408
+ padding: 8px;
409
+ border-radius: 8px;
410
+ margin-top: 10px;
411
+ font-size: 12px;
412
+ }
413
  """
414
 
415
+ with gr.Blocks(css=css, title="Alisia Chat - Optimisé CPU", theme=gr.themes.Soft()) as demo:
416
  history_visible = gr.State(True)
417
  current_chat = gr.State("Conversation 1")
418
 
419
  with gr.Row(elem_id="topbar"):
420
  menu_btn = gr.Button("☰", elem_classes="hamburger")
421
+ gr.Markdown("### 💬 Alisia <span class='alisia-badge'>CPU Mode</span>", elem_id="title")
422
  gr.HTML("<div style='flex:1'></div>")
423
+ gr.Markdown(f"<small style='color:#94a3b8'>CPU: {cpu_count} cores • Threads: {n_threads}</small>")
424
 
425
  with gr.Row():
426
  with gr.Column(scale=1, visible=True, elem_id="leftcol") as left_column:
 
441
  "🗑️ Effacer chat",
442
  elem_classes="clear-btn"
443
  )
444
+
445
+ # Informations de performance CPU
446
  gr.Markdown("""
447
+ <div class="perf-info">
448
+ <strong>⚡ Mode CPU Optimisé</strong><br>
449
+ Threads: {n_threads}/{cpu_count}<br>
450
+ Contexte: 2048 tokens<br>
451
+ • Latence: ~300ms<br>
452
+ • Réponses: 384 tokens max
453
  </div>
454
+ """.format(n_threads=n_threads, cpu_count=cpu_count))
455
+
456
+ gr.Markdown("""
457
+ <div class="cpu-warning">
458
+ ⚠️ Mode CPU - Les performances peuvent varier<br>
459
+ selon la puissance de votre processeur
460
+ </div>
461
+ """)
462
 
463
  with gr.Column(scale=3, elem_id="chatcol"):
464
  with gr.Column(elem_id="chat-container"):
 
556
  # LANCEMENT
557
  # -------------------------
558
  if __name__ == "__main__":
559
+ print("🚀 Lancement de l'interface optimisée CPU...")
560
+ print(f"💻 Configuration CPU:")
561
+ print(f" - Cores disponibles: {cpu_count}")
562
+ print(f" - Threads utilisés: {n_threads}")
563
+ print(f" - Contexte: 2048 tokens")
564
+ print(f" - Réponses limitées: 384 tokens")
565
+ print("⏱️ Patience - Le CPU peut être plus lent que le GPU")
566
+
567
  demo.launch(
568
  share=True,
569
  server_name="0.0.0.0",