MarcosFRGames committed
Commit ad055c8 · verified · 1 parent: 6c9c1ac

Update app.py

Files changed (1): app.py (+91 -6)
app.py CHANGED
@@ -123,9 +123,30 @@ class LLMManager:
 
         def generate():
             try:
+                # Convert OpenAI-style parameters to llama.cpp parameters
+                llama_kwargs = kwargs.copy()
+
+                # Map repetition_penalty to repeat_penalty
+                if 'repetition_penalty' in llama_kwargs:
+                    llama_kwargs['repeat_penalty'] = llama_kwargs.pop('repetition_penalty')
+
+                # presence_penalty and frequency_penalty are not supported directly by llama.cpp;
+                # they are kept in the kwargs but not passed to the call,
+                # since llama.cpp only accepts repeat_penalty
+
+                # Keep only the parameters llama.cpp accepts
+                valid_params = [
+                    'temperature', 'top_p', 'top_k', 'max_tokens', 'stream',
+                    'stop', 'seed', 'repeat_penalty', 'frequency_penalty',
+                    'presence_penalty', 'repetition_penalty'  # the last ones, for compatibility
+                ]
+
+                filtered_kwargs = {k: v for k, v in llama_kwargs.items()
+                                   if k in valid_params and not (k in ['frequency_penalty', 'presence_penalty', 'repetition_penalty'] and k != 'repeat_penalty')}
+
                 result[0] = model_data["instance"].create_chat_completion(
                     messages=messages,
-                    **kwargs
+                    **filtered_kwargs
                 )
             except Exception as e:
                 exception[0] = e
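The filter above can be checked in isolation. Below is a minimal sketch of its net effect; the helper name map_openai_kwargs is hypothetical. Since 'repeat_penalty' never appears in the exclusion list, the commit's `k != 'repeat_penalty'` clause is redundant, so the sketch folds the whole condition into one allow-set:

    # Hedged sketch of the commit's kwarg mapping/filtering (helper name is hypothetical).
    def map_openai_kwargs(kwargs):
        llama_kwargs = dict(kwargs)
        # OpenAI-style repetition_penalty becomes llama.cpp-style repeat_penalty
        if 'repetition_penalty' in llama_kwargs:
            llama_kwargs['repeat_penalty'] = llama_kwargs.pop('repetition_penalty')
        # Net effect of valid_params plus the exclusion clause in the commit
        allowed = {'temperature', 'top_p', 'top_k', 'max_tokens', 'stream',
                   'stop', 'seed', 'repeat_penalty'}
        return {k: v for k, v in llama_kwargs.items() if k in allowed}

    assert map_openai_kwargs({'temperature': 0.7, 'repetition_penalty': 1.2}) == \
           {'temperature': 0.7, 'repeat_penalty': 1.2}
    assert 'presence_penalty' not in map_openai_kwargs({'presence_penalty': 0.5})

Whether presence_penalty and frequency_penalty are truly unsupported depends on the llama-cpp-python version; the sketch only mirrors what the commit does.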
@@ -213,7 +234,10 @@ def home():
             • top_p= (0.0-1.0)<br>
             • top_k= (0-100)<br>
             • model= (model name)<br>
-            • max_tokens= (maximum tokens to generate, default: {MAX_GENERATION_TOKENS})
+            • max_tokens= (maximum tokens to generate, default: {MAX_GENERATION_TOKENS})<br>
+            • repetition_penalty= (repetition penalty, 1.0-2.0)<br>
+            • presence_penalty= (presence penalty, -2.0 to 2.0)<br>
+            • frequency_penalty= (frequency penalty, -2.0 to 2.0)
         </div>
 
         <div class="endpoint">
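For reference, a client-side call exercising the documented parameters. The route shape, host, port, and model name below are assumptions (they are not shown in this hunk); only the query parameter names and ranges come from the commit:

    import requests

    # Assumed route shape and model name; adjust to the actual app.py routes.
    resp = requests.get(
        "http://localhost:7860/generate/hello",
        params={
            "model": "my-model",
            "max_tokens": 128,
            "repetition_penalty": 1.2,   # 1.0-2.0
            "presence_penalty": 0.5,     # -2.0 to 2.0
            "frequency_penalty": 0.5,    # -2.0 to 2.0
        },
    )
    print(resp.status_code, resp.text)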
@@ -279,6 +303,11 @@ def generate_endpoint(user_message):
     model_name = request.args.get('model', MODELS[0]["name"])
     max_tokens = int(request.args.get('max_tokens', MAX_GENERATION_TOKENS))
 
+    # New penalty parameters
+    repetition_penalty = request.args.get('repetition_penalty')
+    presence_penalty = request.args.get('presence_penalty')
+    frequency_penalty = request.args.get('frequency_penalty')
+
     # Validate ranges
     if not 0 <= temperature <= 2:
         return Response(
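request.args.get returns the raw query-string value as a str (or None when absent), which is why the next hunk converts each penalty with float() before range-checking. A dependency-free sketch of that contract:

    # Stand-in for request.args: Flask hands query values over as strings.
    args = {"repetition_penalty": "1.2"}
    raw = args.get("presence_penalty")          # None when the parameter is absent
    value = float(args["repetition_penalty"])   # explicit conversion is required
    assert raw is None and value == 1.2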
@@ -301,6 +330,55 @@ def generate_endpoint(user_message):
             mimetype='text/plain'
         )
 
+    # Validate penalties when present
+    if repetition_penalty:
+        try:
+            repetition_penalty = float(repetition_penalty)
+            if not 1.0 <= repetition_penalty <= 2.0:
+                return Response(
+                    "Error: parameter 'repetition_penalty' must be between 1.0 and 2.0",
+                    status=400,
+                    mimetype='text/plain'
+                )
+        except ValueError:
+            return Response(
+                "Error: repetition_penalty must be a valid number",
+                status=400,
+                mimetype='text/plain'
+            )
+
+    if presence_penalty:
+        try:
+            presence_penalty = float(presence_penalty)
+            if not -2.0 <= presence_penalty <= 2.0:
+                return Response(
+                    "Error: parameter 'presence_penalty' must be between -2.0 and 2.0",
+                    status=400,
+                    mimetype='text/plain'
+                )
+        except ValueError:
+            return Response(
+                "Error: presence_penalty must be a valid number",
+                status=400,
+                mimetype='text/plain'
+            )
+
+    if frequency_penalty:
+        try:
+            frequency_penalty = float(frequency_penalty)
+            if not -2.0 <= frequency_penalty <= 2.0:
+                return Response(
+                    "Error: parameter 'frequency_penalty' must be between -2.0 and 2.0",
+                    status=400,
+                    mimetype='text/plain'
+                )
+        except ValueError:
+            return Response(
+                "Error: frequency_penalty must be a valid number",
+                status=400,
+                mimetype='text/plain'
+            )
+
     # Cap max_tokens at the configured maximum
     if max_tokens > MAX_GENERATION_TOKENS:
         max_tokens = MAX_GENERATION_TOKENS
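The three validation blocks differ only in parameter name and bounds. A hedged refactor sketch, not part of the commit and with a hypothetical helper name, that would collapse them into one function:

    def parse_penalty(raw, name, lo, hi):
        """Return (value, error); value is None when the parameter is absent."""
        if not raw:
            return None, None
        try:
            value = float(raw)
        except ValueError:
            return None, f"Error: {name} must be a valid number"
        if not lo <= value <= hi:
            return None, f"Error: '{name}' must be between {lo} and {hi}"
        return value, None

    assert parse_penalty("1.5", "repetition_penalty", 1.0, 2.0) == (1.5, None)
    assert parse_penalty(None, "presence_penalty", -2.0, 2.0) == (None, None)
    assert parse_penalty("abc", "frequency_penalty", -2.0, 2.0)[1] is not None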
@@ -328,10 +406,17 @@ def generate_endpoint(user_message):
     }
 
     if top_k:
-        try:
-            kwargs["top_k"] = int(top_k)
-        except ValueError:
-            return Response("Error: top_k must be an integer", status=400)
+        kwargs["top_k"] = int(top_k)
+
+    # Add penalties when present
+    if repetition_penalty:
+        kwargs["repetition_penalty"] = repetition_penalty
+
+    if presence_penalty:
+        kwargs["presence_penalty"] = presence_penalty
+
+    if frequency_penalty:
+        kwargs["frequency_penalty"] = frequency_penalty
 
     # Generate the response
     result = llm_manager.chat_completion(model_name, messages, **kwargs)
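Two side effects of this hunk are worth noting: removing the try/except means a non-numeric top_k now raises ValueError inside the handler (surfacing as a 500 rather than the previous 400), and the truthiness checks silently skip an explicit penalty of 0.0. A small guard sketch (hypothetical helper) that lets the caller keep returning a 400 on bad input:

    def safe_int(raw):
        """Return int(raw), or None when raw is missing or not an integer."""
        try:
            return int(raw)
        except (TypeError, ValueError):
            return None

    assert safe_int("40") == 40
    assert safe_int("abc") is None   # the commit's bare int() would raise here
    assert safe_int(None) is None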
 