MarcosFRGames committed on
Commit e7e485e · verified · 1 Parent(s): f2ddcf1

Update app.py

Files changed (1)
  1. app.py +52 -26
app.py CHANGED
@@ -13,7 +13,7 @@ from concurrent.futures import ThreadPoolExecutor
 app = Flask(__name__)
 logging.basicConfig(level=logging.INFO)
 
-MAX_CONTEXT_TOKENS = 1024 * 10
+MAX_CONTEXT_TOKENS = 1024 * 4
 MAX_GENERATION_TOKENS = 1024 * 4
 
 with open('engines.json', 'r') as f:
@@ -32,7 +32,6 @@ class LLMManager:
         self.load_all_models()
 
     def load_all_models(self):
-        """Cargar todos los modelos en RAM"""
        for model_config in self.models_config:
            try:
                model_name = model_config["name"]
@@ -81,7 +80,6 @@ class LLMManager:
                 }
 
     def _download_model(self, model_url):
-        """Descargar modelo"""
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".gguf")
         temp_path = temp_file.name
         temp_file.close()
@@ -101,11 +99,9 @@ class LLMManager:
         return temp_path
 
     def get_model(self, model_name):
-        """Obtener instancia de modelo por nombre"""
         return self.models.get(model_name)
 
     def chat_completion(self, model_name, messages, **kwargs):
-        """Generar respuesta con modelo específico"""
         if not self.generation_lock.acquire(blocking=False):
             return {"error": "Servidor ocupado - Generación en progreso"}
 
@@ -123,6 +119,9 @@
 
         def generate():
             try:
+                if 'repetition_penalty' in kwargs:
+                    kwargs['repeat_penalty'] = kwargs.pop('repetition_penalty')
+
                 result[0] = model_data["instance"].create_chat_completion(
                     messages=messages,
                     **kwargs
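Note on the hunk above: llama-cpp-python's create_chat_completion expects the penalty under the name repeat_penalty, while OpenAI-style clients usually send repetition_penalty, so the new lines rename the key before the call. A minimal standalone sketch of the same normalization, assuming a local GGUF file (the path and values are placeholders, not taken from this repository):

from llama_cpp import Llama

llm = Llama(model_path="model.gguf")  # placeholder path to a local GGUF file

kwargs = {"temperature": 0.7, "repetition_penalty": 1.15}
if "repetition_penalty" in kwargs:
    # llama-cpp-python uses 'repeat_penalty', not 'repetition_penalty'
    kwargs["repeat_penalty"] = kwargs.pop("repetition_penalty")

out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hola"}],
    **kwargs,
)
print(out["choices"][0]["message"]["content"])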
@@ -149,7 +148,6 @@ class LLMManager:
             gc.collect()
 
     def get_loaded_models(self):
-        """Obtener lista de modelos cargados"""
         loaded = []
         for name, data in self.models.items():
             if data["loaded"]:
@@ -157,7 +155,6 @@ class LLMManager:
         return loaded
 
     def get_all_models_status(self):
-        """Obtener estado de todos los modelos"""
         status = {}
         for name, data in self.models.items():
             status[name] = {
@@ -168,7 +165,6 @@ class LLMManager:
                 status[name]["error"] = data["error"]
         return status
 
-# Inicializar el gestor con todos los modelos
 llm_manager = LLMManager(MODELS)
 
 @app.route('/')
@@ -213,7 +209,10 @@ def home():
         • top_p= (0.0-1.0)<br>
         • top_k= (0-100)<br>
         • model= (nombre del modelo)<br>
-        • max_tokens= (máximo tokens a generar, default: {MAX_GENERATION_TOKENS})
+        • max_tokens= (máximo tokens a generar, default: {MAX_GENERATION_TOKENS})<br>
+        • repetition_penalty= (penalización de repetición)<br>
+        • presence_penalty= (penalización de presencia)<br>
+        • frequency_penalty= (penalización de frecuencia)
     </div>
 
     <div class="endpoint">
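The list above documents the query string of the /generate/<path:user_message> endpoint changed later in this diff. A hypothetical client call using the new parameters might look like this (the base URL and model name are placeholders, not values from engines.json):

import requests

params = {
    "model": "my-model",          # placeholder; real names come from engines.json
    "temperature": 0.7,
    "top_p": 0.95,
    "top_k": 40,
    "max_tokens": 256,
    "repetition_penalty": 1.15,
    "presence_penalty": 0.5,
    "frequency_penalty": 0.5,
}
resp = requests.get("http://localhost:5000/generate/Hola mundo", params=params)
print(resp.text)  # the endpoint returns plain text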
@@ -249,11 +248,9 @@ def chat_completions():
             if key not in ['messages', 'model']:
                 kwargs[key] = data[key]
 
-        # Aplicar límite de tokens si no se especifica
         if 'max_tokens' not in kwargs:
             kwargs['max_tokens'] = MAX_GENERATION_TOKENS
         else:
-            # Validar que max_tokens no exceda el máximo permitido
             if kwargs['max_tokens'] > MAX_GENERATION_TOKENS:
                 kwargs['max_tokens'] = MAX_GENERATION_TOKENS
 
@@ -269,9 +266,7 @@ def chat_completions():
 
 @app.route('/generate/<path:user_message>', methods=['GET'])
 def generate_endpoint(user_message):
-    """Endpoint GET para generar respuestas - Devuelve solo texto"""
     try:
-        # Obtener parámetros GET con valores por defecto
         system_instruction = request.args.get('system', '')
         temperature = float(request.args.get('temperature', 0.7))
         top_p = float(request.args.get('top_p', 0.95))
@@ -279,7 +274,10 @@ def generate_endpoint(user_message):
         model_name = request.args.get('model', MODELS[0]["name"])
         max_tokens = int(request.args.get('max_tokens', MAX_GENERATION_TOKENS))
 
-        # Validar rangos
+        repetition_penalty = request.args.get('repetition_penalty')
+        presence_penalty = request.args.get('presence_penalty')
+        frequency_penalty = request.args.get('frequency_penalty')
+
         if not 0 <= temperature <= 2:
             return Response(
                 f"Error: El parámetro 'temperature' debe estar entre 0 y 2",
@@ -301,11 +299,39 @@ def generate_endpoint(user_message):
                 mimetype='text/plain'
             )
 
-        # Limitar max_tokens a la configuración máxima
+        if repetition_penalty:
+            try:
+                repetition_penalty = float(repetition_penalty)
+            except ValueError:
+                return Response(
+                    "Error: repetition_penalty debe ser número válido",
+                    status=400,
+                    mimetype='text/plain'
+                )
+
+        if presence_penalty:
+            try:
+                presence_penalty = float(presence_penalty)
+            except ValueError:
+                return Response(
+                    "Error: presence_penalty debe ser número válido",
+                    status=400,
+                    mimetype='text/plain'
+                )
+
+        if frequency_penalty:
+            try:
+                frequency_penalty = float(frequency_penalty)
+            except ValueError:
+                return Response(
+                    "Error: frequency_penalty debe ser número válido",
+                    status=400,
+                    mimetype='text/plain'
+                )
+
         if max_tokens > MAX_GENERATION_TOKENS:
             max_tokens = MAX_GENERATION_TOKENS
 
-        # Validar que el modelo existe
         if model_name not in llm_manager.models:
             return Response(
                 f"Error: Modelo '{model_name}' no encontrado. Modelos disponibles: {', '.join(llm_manager.models.keys())}",
@@ -313,13 +339,11 @@ def generate_endpoint(user_message):
                 mimetype='text/plain'
             )
 
-        # Crear mensajes
         messages = [
             {"role": "system", "content": system_instruction},
             {"role": "user", "content": user_message}
         ]
 
-        # Configurar parámetros
         kwargs = {
             "temperature": temperature,
             "top_p": top_p,
@@ -328,12 +352,17 @@ def generate_endpoint(user_message):
         }
 
         if top_k:
-            try:
-                kwargs["top_k"] = int(top_k)
-            except ValueError:
-                return Response("Error: top_k debe ser número entero", status=400)
+            kwargs["top_k"] = int(top_k)
+
+        if repetition_penalty:
+            kwargs["repetition_penalty"] = repetition_penalty
+
+        if presence_penalty:
+            kwargs["presence_penalty"] = presence_penalty
+
+        if frequency_penalty:
+            kwargs["frequency_penalty"] = frequency_penalty
 
-        # Generar respuesta
         result = llm_manager.chat_completion(model_name, messages, **kwargs)
 
         if "error" in result:
@@ -348,7 +377,6 @@ def generate_endpoint(user_message):
         if not response_text:
             response_text = "No se generó respuesta"
 
-        # Devolver solo el texto plano
         return Response(
             response_text,
             status=200,
@@ -383,7 +411,6 @@ def health():
 
 @app.route('/models', methods=['GET'])
 def list_models():
-    """Endpoint para listar todos los modelos y su estado"""
     return jsonify({
         "available_models": MODELS,
         "status": llm_manager.get_all_models_status(),
@@ -395,7 +422,6 @@ def list_models():
 
 @app.route('/models/<model_name>', methods=['GET'])
 def get_model_status(model_name):
-    """Endpoint para obtener el estado de un modelo específico"""
     model_data = llm_manager.get_model(model_name)
     if not model_data:
         return jsonify({"error": f"Modelo '{model_name}' no encontrado"}), 404
 