MarcosFRGames committed on
Commit ac9909f · verified · 1 Parent(s): ad055c8

Update app.py

Files changed (1)
  1. app.py +6 -65
app.py CHANGED
@@ -32,7 +32,6 @@ class LLMManager:
         self.load_all_models()
 
     def load_all_models(self):
-        """Load all models into RAM"""
         for model_config in self.models_config:
             try:
                 model_name = model_config["name"]
@@ -81,7 +80,6 @@ class LLMManager:
                 }
 
     def _download_model(self, model_url):
-        """Download a model"""
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".gguf")
         temp_path = temp_file.name
         temp_file.close()
@@ -101,11 +99,9 @@ class LLMManager:
         return temp_path
 
     def get_model(self, model_name):
-        """Get a model instance by name"""
         return self.models.get(model_name)
 
     def chat_completion(self, model_name, messages, **kwargs):
-        """Generate a response with a specific model"""
        if not self.generation_lock.acquire(blocking=False):
            return {"error": "Servidor ocupado - Generación en progreso"}
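The busy check kept by this hunk relies on threading.Lock.acquire(blocking=False), which returns False immediately when another request holds the lock instead of queueing. A minimal sketch of the same pattern in isolation (the stub function and its return payloads are illustrative, not part of app.py):

import threading

generation_lock = threading.Lock()

def chat_completion_stub():
    # Non-blocking acquire: a second concurrent caller gets the busy
    # error right away instead of waiting for the first to finish.
    if not generation_lock.acquire(blocking=False):
        return {"error": "Servidor ocupado - Generación en progreso"}
    try:
        return {"content": "ok"}  # stands in for the real generation
    finally:
        generation_lock.release()  # always release, even on exceptions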
 
@@ -123,30 +119,12 @@
 
         def generate():
             try:
-                # Convert OpenAI parameters to llama.cpp parameters
-                llama_kwargs = kwargs.copy()
-
-                # Map repetition_penalty to repeat_penalty
-                if 'repetition_penalty' in llama_kwargs:
-                    llama_kwargs['repeat_penalty'] = llama_kwargs.pop('repetition_penalty')
-
-                # presence_penalty and frequency_penalty are not directly supported by llama.cpp
-                # They stay in the kwargs but are not passed on to the call,
-                # since llama.cpp only accepts repeat_penalty
-
-                # Keep only the parameters that llama.cpp accepts
-                valid_params = [
-                    'temperature', 'top_p', 'top_k', 'max_tokens', 'stream',
-                    'stop', 'seed', 'repeat_penalty', 'frequency_penalty',
-                    'presence_penalty', 'repetition_penalty'  # the last ones for compatibility
-                ]
-
-                filtered_kwargs = {k: v for k, v in llama_kwargs.items()
-                                   if k in valid_params and not (k in ['frequency_penalty', 'presence_penalty', 'repetition_penalty'] and k != 'repeat_penalty')}
+                if 'repetition_penalty' in kwargs:
+                    kwargs['repeat_penalty'] = kwargs.pop('repetition_penalty')
 
                 result[0] = model_data["instance"].create_chat_completion(
                     messages=messages,
-                    **filtered_kwargs
+                    **kwargs
                 )
             except Exception as e:
                 exception[0] = e
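Net effect of this hunk: the parameter whitelist and the dict-comprehension filter are gone; the only remaining translation is renaming the OpenAI-style repetition_penalty key to the repeat_penalty name that llama-cpp-python's create_chat_completion expects, after which every kwarg is forwarded untouched. A standalone sketch of that rename (the helper name and sample values are hypothetical):

def to_llama_kwargs(openai_kwargs):
    # Copy, then rename the one key llama.cpp spells differently.
    out = dict(openai_kwargs)
    if 'repetition_penalty' in out:
        out['repeat_penalty'] = out.pop('repetition_penalty')
    return out

print(to_llama_kwargs({'temperature': 0.7, 'repetition_penalty': 1.1}))
# -> {'temperature': 0.7, 'repeat_penalty': 1.1}

Note that presence_penalty and frequency_penalty are no longer filtered out either; they now reach create_chat_completion as-is.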
@@ -170,7 +148,6 @@ class LLMManager:
             gc.collect()
 
     def get_loaded_models(self):
-        """Get the list of loaded models"""
         loaded = []
         for name, data in self.models.items():
             if data["loaded"]:
@@ -178,7 +155,6 @@ class LLMManager:
         return loaded
 
     def get_all_models_status(self):
-        """Get the status of every model"""
         status = {}
         for name, data in self.models.items():
             status[name] = {
@@ -189,7 +165,6 @@ class LLMManager:
                 status[name]["error"] = data["error"]
         return status
 
-# Initialize the manager with all the models
 llm_manager = LLMManager(MODELS)
 
 @app.route('/')
@@ -235,9 +210,9 @@ def home():
         • top_k= (0-100)<br>
         • model= (nombre del modelo)<br>
         • max_tokens= (máximo tokens a generar, default: {MAX_GENERATION_TOKENS})<br>
-        • repetition_penalty= (penalización de repetición, 1.0-2.0)<br>
-        • presence_penalty= (penalización de presencia, -2.0 a 2.0)<br>
-        • frequency_penalty= (penalización de frecuencia, -2.0 a 2.0)
+        • repetition_penalty= (penalización de repetición)<br>
+        • presence_penalty= (penalización de presencia)<br>
+        • frequency_penalty= (penalización de frecuencia)
         </div>
 
         <div class="endpoint">
@@ -273,11 +248,9 @@ def chat_completions():
             if key not in ['messages', 'model']:
                 kwargs[key] = data[key]
 
-        # Apply the token limit if none was given
         if 'max_tokens' not in kwargs:
             kwargs['max_tokens'] = MAX_GENERATION_TOKENS
         else:
-            # Validate that max_tokens does not exceed the allowed maximum
             if kwargs['max_tokens'] > MAX_GENERATION_TOKENS:
                 kwargs['max_tokens'] = MAX_GENERATION_TOKENS
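With the clamp above, a client may ask for any max_tokens and silently receives at most MAX_GENERATION_TOKENS. A hedged example request (the /v1/chat/completions path, host, port, and model name are assumptions; the route decorator is outside this hunk):

import requests

resp = requests.post(
    "http://localhost:7860/v1/chat/completions",  # path and host assumed
    json={
        "model": "mi-modelo",  # hypothetical model name
        "messages": [{"role": "user", "content": "Hola"}],
        "max_tokens": 999999,  # clamped server-side to MAX_GENERATION_TOKENS
        "repetition_penalty": 1.15,  # renamed to repeat_penalty before generation
    },
    timeout=120,
)
print(resp.json())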
 
@@ -293,9 +266,7 @@
 
 @app.route('/generate/<path:user_message>', methods=['GET'])
 def generate_endpoint(user_message):
-    """GET endpoint to generate responses - returns plain text only"""
     try:
-        # Read the GET parameters with default values
         system_instruction = request.args.get('system', '')
         temperature = float(request.args.get('temperature', 0.7))
         top_p = float(request.args.get('top_p', 0.95))
@@ -303,12 +274,10 @@ def generate_endpoint(user_message):
         model_name = request.args.get('model', MODELS[0]["name"])
         max_tokens = int(request.args.get('max_tokens', MAX_GENERATION_TOKENS))
 
-        # New parameters for the penalties
         repetition_penalty = request.args.get('repetition_penalty')
         presence_penalty = request.args.get('presence_penalty')
         frequency_penalty = request.args.get('frequency_penalty')
 
-        # Validate the ranges
         if not 0 <= temperature <= 2:
             return Response(
                 f"Error: El parámetro 'temperature' debe estar entre 0 y 2",
@@ -330,16 +299,9 @@ def generate_endpoint(user_message):
             mimetype='text/plain'
         )
 
-        # Validate the penalties when present
         if repetition_penalty:
             try:
                 repetition_penalty = float(repetition_penalty)
-                if not 1.0 <= repetition_penalty <= 2.0:
-                    return Response(
-                        f"Error: El parámetro 'repetition_penalty' debe estar entre 1.0 y 2.0",
-                        status=400,
-                        mimetype='text/plain'
-                    )
             except ValueError:
                 return Response(
                     "Error: repetition_penalty debe ser número válido",
@@ -350,12 +312,6 @@
         if presence_penalty:
             try:
                 presence_penalty = float(presence_penalty)
-                if not -2.0 <= presence_penalty <= 2.0:
-                    return Response(
-                        f"Error: El parámetro 'presence_penalty' debe estar entre -2.0 y 2.0",
-                        status=400,
-                        mimetype='text/plain'
-                    )
             except ValueError:
                 return Response(
                     "Error: presence_penalty debe ser número válido",
@@ -366,12 +322,6 @@
         if frequency_penalty:
             try:
                 frequency_penalty = float(frequency_penalty)
-                if not -2.0 <= frequency_penalty <= 2.0:
-                    return Response(
-                        f"Error: El parámetro 'frequency_penalty' debe estar entre -2.0 y 2.0",
-                        status=400,
-                        mimetype='text/plain'
-                    )
             except ValueError:
                 return Response(
                     "Error: frequency_penalty debe ser número válido",
@@ -379,11 +329,9 @@
             mimetype='text/plain'
         )
 
-        # Cap max_tokens at the configured maximum
         if max_tokens > MAX_GENERATION_TOKENS:
             max_tokens = MAX_GENERATION_TOKENS
 
-        # Validate that the model exists
         if model_name not in llm_manager.models:
             return Response(
                 f"Error: Modelo '{model_name}' no encontrado. Modelos disponibles: {', '.join(llm_manager.models.keys())}",
@@ -391,13 +339,11 @@
             mimetype='text/plain'
         )
 
-        # Build the messages
         messages = [
             {"role": "system", "content": system_instruction},
             {"role": "user", "content": user_message}
         ]
 
-        # Assemble the parameters
         kwargs = {
             "temperature": temperature,
             "top_p": top_p,
@@ -408,7 +354,6 @@
         if top_k:
             kwargs["top_k"] = int(top_k)
 
-        # Add the penalties when present
         if repetition_penalty:
             kwargs["repetition_penalty"] = repetition_penalty
 
@@ -418,7 +363,6 @@
         if frequency_penalty:
             kwargs["frequency_penalty"] = frequency_penalty
 
-        # Generate the response
         result = llm_manager.chat_completion(model_name, messages, **kwargs)
 
         if "error" in result:
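Because the range checks on the three penalties were deleted earlier in this commit, any float now flows through kwargs into chat_completion; only non-numeric values still fail, via the float() parse. An illustrative call to the GET endpoint (host and port are assumptions):

import requests

resp = requests.get(
    "http://localhost:7860/generate/Hola",  # host/port assumed
    params={
        "repetition_penalty": "2.5",  # outside the old 1.0-2.0 range: now forwarded
        "presence_penalty": "3.0",    # outside the old -2.0..2.0 range: now forwarded
        "max_tokens": "64",
    },
    timeout=120,
)
print(resp.text)  # generate_endpoint returns plain text
# A non-numeric value, e.g. repetition_penalty=abc, still returns a 400.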
@@ -433,7 +377,6 @@
         if not response_text:
             response_text = "No se generó respuesta"
 
-        # Return just the plain text
         return Response(
             response_text,
             status=200,
@@ -468,7 +411,6 @@ def health():
 
 @app.route('/models', methods=['GET'])
 def list_models():
-    """Endpoint to list every model and its status"""
     return jsonify({
         "available_models": MODELS,
         "status": llm_manager.get_all_models_status(),
@@ -480,7 +422,6 @@ def list_models():
 
 @app.route('/models/<model_name>', methods=['GET'])
 def get_model_status(model_name):
-    """Endpoint to get the status of a specific model"""
     model_data = llm_manager.get_model(model_name)
     if not model_data:
         return jsonify({"error": f"Modelo '{model_name}' no encontrado"}), 404
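For completeness, the two model-inspection routes above, touched only in their docstrings, can be exercised like this (host, port, and model name are assumptions):

import requests

base = "http://localhost:7860"  # assumed host/port

print(requests.get(f"{base}/models", timeout=10).json())            # all models + status
print(requests.get(f"{base}/models/mi-modelo", timeout=10).json())  # hypothetical name; JSON 404 error if unknown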
 