from flask import Flask, request, jsonify, Response
import os
import logging
import threading
import time
import tempfile
import json
from concurrent.futures import ThreadPoolExecutor

import requests
from llama_cpp import Llama

app = Flask(__name__)
logging.basicConfig(level=logging.INFO)

# Server-wide hard ceilings applied to every generation request.
MAX_CONTEXT_TOKENS = 1024 * 8
MAX_GENERATION_TOKENS = 1024 * 4

# Model catalog: a JSON list of {"name": ..., "url": ...} entries.
with open('engines.json', 'r') as f:
    MODELS = json.load(f)


class LLMManager:
    """Downloads GGUF models, keeps them resident in RAM via llama.cpp,
    and serializes generation so only one completion runs at a time."""

    def __init__(self, models_config):
        self.models = {}
        self.models_config = models_config
        self.executor = ThreadPoolExecutor(max_workers=2)
        # BUGFIX: the original read `theading.Lock()` (typo) and `threading`
        # was never imported, so construction raised NameError at startup.
        self.generation_lock = threading.Lock()
        self.load_all_models()

    def load_all_models(self):
        """Load every configured model into RAM; a failure is recorded in the
        model's entry (loaded=False, error=...) instead of aborting startup."""
        for model_config in self.models_config:
            try:
                model_name = model_config["name"]
                logging.info(f"🚀 Cargando modelo: {model_name}")
                temp_path = self._download_model(model_config["url"])
                actual_size = os.path.getsize(temp_path)
                actual_gb = actual_size / (1024 * 1024 * 1024)
                logging.info(f"📊 Tamaño descargado para {model_name}: {actual_gb:.2f} GB")
                logging.info(f"🔄 Cargando {model_name} en RAM…")
                llm_instance = Llama(
                    model_path=temp_path,
                    n_ctx=MAX_CONTEXT_TOKENS,
                    n_batch=128,
                    n_threads=2,
                    n_threads_batch=2,
                    use_mlock=True,
                    # BUGFIX: llama-cpp-python's keyword is `use_mmap`; the
                    # original passed `mmap=True`, silently ignored via **kwargs.
                    use_mmap=True,
                    low_vram=False,
                    vocab_only=False
                )
                # Unlinking after load frees the path; assumes POSIX semantics
                # where the open mapping keeps the inode alive — TODO confirm.
                os.remove(temp_path)
                self.models[model_name] = {
                    "instance": llm_instance,
                    "loaded": True,
                    "config": model_config
                }
                logging.info(f"✅ Modelo {model_name} cargado")
            except Exception as e:
                logging.error(f"❌ Error cargando modelo {model_config['name']}: {e}")
                self.models[model_config["name"]] = {
                    "instance": None,
                    "loaded": False,
                    "config": model_config,
                    "error": str(e)
                }

    def _download_model(self, model_url):
        """Stream-download a model to a temporary .gguf file; return its path."""
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".gguf")
        temp_path = temp_file.name
        temp_file.close()
        logging.info("📥 Descargando modelo…")
        response = requests.get(model_url, stream=True, timeout=300)
        response.raise_for_status()
        with open(temp_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
        return temp_path

    def get_model(self, model_name):
        """Return the internal record dict for *model_name*, or None."""
        return self.models.get(model_name)

    def chat_completion(self, model_name, messages, **kwargs):
        """Run one chat completion on the named model.

        Returns the llama.cpp response dict (tagged with provider/model keys)
        or an {"error": ...} dict when the model is missing or a generation
        is already in progress.
        """
        # Non-blocking acquire: reject immediately rather than queue a second
        # generation behind the running one.
        if not self.generation_lock.acquire(blocking=False):
            return {"error": "Servidor ocupado - Generación en progreso"}
        try:
            model_data = self.get_model(model_name)
            if not model_data or not model_data["loaded"]:
                error_msg = f"Modelo {model_name} no cargado"
                if model_data and "error" in model_data:
                    error_msg += f": {model_data['error']}"
                return {"error": error_msg}
            response = model_data["instance"].create_chat_completion(
                messages=messages,
                **kwargs
            )
            response["provider"] = "telechars-ai"
            response["model"] = model_name
            return response
        finally:
            # Always release the lock, even on error paths.
            self.generation_lock.release()

    def get_loaded_models(self):
        """Return the names of all successfully loaded models."""
        return [name for name, data in self.models.items() if data["loaded"]]

    def get_all_models_status(self):
        """Return a {name: {loaded, url[, error]}} status map for all models."""
        status = {}
        for name, data in self.models.items():
            status[name] = {
                "loaded": data["loaded"],
                "url": data["config"]["url"]
            }
            if "error" in data:
                status[name]["error"] = data["error"]
        return status


# Initialize the manager with every configured model.
# NOTE: this downloads and loads all models at import time.
llm_manager = LLMManager(MODELS)


@app.route('/')
def home():
    """Landing page describing configuration and available endpoints."""
    loaded_models = llm_manager.get_loaded_models()
    # BUGFIX: status_html was always "" — the "Modelos cargados" heading
    # rendered with nothing under it. List the loaded models instead.
    status_html = "\n".join(f"✅ {name}" for name in loaded_models)
    return f''' TeleChars AI API

TeleChars AI API

⚙️ Configuración

Max Context Tokens: {MAX_CONTEXT_TOKENS}

Max Generation Tokens: {MAX_GENERATION_TOKENS}

📦 Modelos cargados:

{status_html}

Total modelos: {len(loaded_models)}/{len(MODELS)}

🔗 Endpoints disponibles:

GET /generate/<mensaje>[?params]
Devuelve solo el texto generado. Parámetros opcionales:
• system= (instrucciones del sistema)
• temperature= (0.0-2.0)
• top_p= (0.0-1.0)
• model= (nombre del modelo)
• max_tokens= (máximo tokens a generar, default: {MAX_GENERATION_TOKENS})
POST /v1/chat/completions
Compatible con OpenAI API
GET /health
Estado del servicio
GET /models
Lista todos los modelos disponibles
'''


@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
    """OpenAI-compatible chat completions endpoint."""
    try:
        data = request.get_json()
        messages = data.get('messages', [])
        model_name = data.get('model', MODELS[0]["name"])
        if model_name not in llm_manager.models:
            return jsonify({"error": f"Modelo '{model_name}' no encontrado. Modelos disponibles: {list(llm_manager.models.keys())}"}), 400
        # Forward every other body field (temperature, top_p, …) verbatim.
        kwargs = {k: v for k, v in data.items() if k not in ('messages', 'model')}
        # Apply the server-wide generation-length ceiling.
        if 'max_tokens' not in kwargs or kwargs['max_tokens'] > MAX_GENERATION_TOKENS:
            kwargs['max_tokens'] = MAX_GENERATION_TOKENS
        result = llm_manager.chat_completion(model_name, messages, **kwargs)
        if "error" in result:
            return jsonify(result), 500
        return jsonify(result), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route('/generate/<path:user_message>', methods=['GET'])
def generate_endpoint(user_message):
    """GET endpoint that returns only the generated text as text/plain.

    BUGFIX: the route was registered as '/generate/' with no URL variable,
    so Flask could never bind `user_message` and every request failed.
    `<path:...>` lets the message contain slashes.
    """
    try:
        # Query parameters with defaults.
        system_instruction = request.args.get('system', '')
        temperature = float(request.args.get('temperature', 0.7))
        top_p = float(request.args.get('top_p', 0.95))
        model_name = request.args.get('model', MODELS[0]["name"])
        max_tokens = int(request.args.get('max_tokens', MAX_GENERATION_TOKENS))

        # Validate ranges.
        if not 0 <= temperature <= 2:
            return Response(
                "Error: El parámetro 'temperature' debe estar entre 0 y 2",
                status=400,
                mimetype='text/plain'
            )
        if not 0 <= top_p <= 1:
            return Response(
                "Error: El parámetro 'top_p' debe estar entre 0 y 1",
                status=400,
                mimetype='text/plain'
            )
        # Clamp to the configured maximum.
        if max_tokens > MAX_GENERATION_TOKENS:
            max_tokens = MAX_GENERATION_TOKENS

        # Validate the model exists.
        if model_name not in llm_manager.models:
            return Response(
                f"Error: Modelo '{model_name}' no encontrado. Modelos disponibles: {', '.join(llm_manager.models.keys())}",
                status=400,
                mimetype='text/plain'
            )

        messages = [
            {"role": "system", "content": system_instruction},
            {"role": "user", "content": user_message}
        ]
        kwargs = {
            "temperature": temperature,
            "top_p": top_p,
            "max_tokens": max_tokens,
            "stream": False
        }
        result = llm_manager.chat_completion(model_name, messages, **kwargs)
        if "error" in result:
            return Response(
                f"Error: {result['error']}",
                status=500,
                mimetype='text/plain'
            )
        response_text = result.get("choices", [{}])[0].get("message", {}).get("content", "")
        if not response_text:
            response_text = "No se generó respuesta"
        # Return only the plain generated text.
        return Response(
            response_text,
            status=200,
            mimetype='text/plain'
        )
    except ValueError as e:
        # float()/int() conversion of query params failed.
        return Response(
            f"Error: Parámetros inválidos - {str(e)}. Asegúrate de que temperature, top_p y max_tokens sean números válidos.",
            status=400,
            mimetype='text/plain'
        )
    except Exception as e:
        return Response(
            f"Error: {str(e)}",
            status=500,
            mimetype='text/plain'
        )


@app.route('/health', methods=['GET'])
def health():
    """Health probe: 'healthy' iff at least one model loaded."""
    loaded_models = llm_manager.get_loaded_models()
    return jsonify({
        "status": "healthy" if len(loaded_models) > 0 else "error",
        "loaded_models": loaded_models,
        "total_models": len(MODELS),
        "config": {
            "max_context_tokens": MAX_CONTEXT_TOKENS,
            "max_generation_tokens": MAX_GENERATION_TOKENS
        }
    })


@app.route('/models', methods=['GET'])
def list_models():
    """List every configured model with its load status."""
    return jsonify({
        "available_models": MODELS,
        "status": llm_manager.get_all_models_status(),
        "config": {
            "max_context_tokens": MAX_CONTEXT_TOKENS,
            "max_generation_tokens": MAX_GENERATION_TOKENS
        }
    })


@app.route('/models/<model_name>', methods=['GET'])
def get_model_status(model_name):
    """Status of one specific model.

    BUGFIX: the route was registered as '/models/' with no URL variable,
    so Flask could never bind `model_name`.
    """
    model_data = llm_manager.get_model(model_name)
    if not model_data:
        return jsonify({"error": f"Modelo '{model_name}' no encontrado"}), 404
    return jsonify({
        "model": model_name,
        "loaded": model_data["loaded"],
        "url": model_data["config"]["url"],
        "error": model_data.get("error"),
        "config": {
            "max_context_tokens": MAX_CONTEXT_TOKENS,
            "max_generation_tokens": MAX_GENERATION_TOKENS
        }
    })


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860, debug=False)