from flask import Flask, request, jsonify, Response
import os
import logging
import threading
import subprocess
import sys

# Install the inference backend at startup (a common pattern in hosted
# environments where dependencies cannot be baked into the image).
subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python==0.3.16"])
from llama_cpp import Llama

import requests
import tempfile
import json
import gc
from concurrent.futures import ThreadPoolExecutor

app = Flask(__name__)
logging.basicConfig(level=logging.INFO)

MAX_CONTEXT_TOKENS = 1024 * 8
MAX_GENERATION_TOKENS = 1024 * 4

with open('engines.json', 'r') as f:
    MODELS = json.load(f)
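# Illustrative shape of engines.json, inferred from the keys read below
# ("name", "url", optional "n_batch"); the name and URL are placeholders,
# not real entries:
#
#   [
#     {
#       "name": "example-model",
#       "url": "https://example.com/models/example-model.Q4_K_M.gguf",
#       "n_batch": 96
#     }
#   ]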

class LLMManager:
    def __init__(self, models_config):
        self.models = {}
        self.models_config = models_config
        self.executor = ThreadPoolExecutor(max_workers=2)
        self.generation_lock = threading.Lock()
        self.session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_connections=2, pool_maxsize=2)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)
        self.load_all_models()

    def load_all_models(self):
        for model_config in self.models_config:
            try:
                model_name = model_config["name"]
                logging.info(f"🚀 Loading model: {model_name}")
                temp_path = self._download_model(model_config["url"])
                actual_size = os.path.getsize(temp_path)
                actual_gb = actual_size / (1024 * 1024 * 1024)
                logging.info(f"📊 Downloaded size for {model_name}: {actual_gb:.2f} GB")
                n_batch = model_config.get("n_batch", 96)
                logging.info(f"🔄 Loading {model_name} into RAM…")
                llm_instance = Llama(
                    model_path=temp_path,
                    n_ctx=MAX_CONTEXT_TOKENS,
                    n_batch=n_batch,
                    n_threads=2,
                    n_threads_batch=2,
                    use_mlock=False,
                    use_mmap=True,  # flag name in llama-cpp-python 0.3.x (the
                                    # original's mmap/low_vram/mul_mat_q are not
                                    # supported parameters and are dropped here)
                    vocab_only=False,
                    verbose=False,
                    logits_all=False,
                )
                os.remove(temp_path)
                self.models[model_name] = {
                    "instance": llm_instance,
                    "loaded": True,
                    "config": model_config,
                }
                logging.info(f"✅ Model {model_name} loaded")
            except Exception as e:
                logging.error(f"❌ Error loading model {model_config['name']}: {e}")
                self.models[model_config["name"]] = {
                    "instance": None,
                    "loaded": False,
                    "config": model_config,
                    "error": str(e),
                }

    def _download_model(self, model_url):
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".gguf")
        temp_path = temp_file.name
        temp_file.close()
        logging.info("📥 Downloading model…")
        response = self.session.get(model_url, stream=True, timeout=300)
        response.raise_for_status()
        downloaded = 0
        with open(temp_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=32768):
                if chunk:
                    f.write(chunk)
                    downloaded += len(chunk)
        return temp_path

    def get_model(self, model_name):
        return self.models.get(model_name)

    def chat_completion(self, model_name, messages, **kwargs):
        # Only one generation at a time: fail fast instead of queueing.
        if not self.generation_lock.acquire(blocking=False):
            return {"error": "Server busy - generation in progress"}
        try:
            model_data = self.get_model(model_name)
            if not model_data or not model_data["loaded"]:
                error_msg = f"Model {model_name} not loaded"
                if model_data and "error" in model_data:
                    error_msg += f": {model_data['error']}"
                return {"error": error_msg}

            result = [None]
            exception = [None]

            def generate():
                try:
                    # llama-cpp-python expects repeat_penalty, not repetition_penalty.
                    if 'repetition_penalty' in kwargs:
                        kwargs['repeat_penalty'] = kwargs.pop('repetition_penalty')
                    result[0] = model_data["instance"].create_chat_completion(
                        messages=messages,
                        **kwargs
                    )
                except Exception as e:
                    exception[0] = e

            # Run generation in a daemon thread so a hard timeout can be enforced.
            gen_thread = threading.Thread(target=generate, daemon=True)
            gen_thread.start()
            gen_thread.join(timeout=120)
            if gen_thread.is_alive():
                return {"error": "Generation timed out (120 seconds)"}
            if exception[0]:
                raise exception[0]

            result[0]["provider"] = "telechars-ai"
            result[0]["model"] = model_name
            return result[0]
        finally:
            self.generation_lock.release()
            gc.collect()

    def get_loaded_models(self):
        loaded = []
        for name, data in self.models.items():
            if data["loaded"]:
                loaded.append(name)
        return loaded

    def get_all_models_status(self):
        status = {}
        for name, data in self.models.items():
            status[name] = {
                "loaded": data["loaded"],
                "url": data["config"]["url"],
            }
            if "error" in data:
                status[name]["error"] = data["error"]
        return status


llm_manager = LLMManager(MODELS)


@app.route('/')
def home():
    loaded_models = llm_manager.get_loaded_models()
    # Render one list item per model (reconstructed: the original initialized
    # status_html but never populated it).
    status_html = ""
    for name, data in llm_manager.get_all_models_status().items():
        icon = "✅" if data["loaded"] else "❌"
        status_html += f"<li>{icon} {name}</li>"
    return f'''
    <html>
    <head><title>TeleChars AI API</title></head>
    <body>

        <h1>TeleChars AI API</h1>

        <h2>⚙️ Configuration</h2>
        <p>Max Context Tokens: {MAX_CONTEXT_TOKENS}</p>
        <p>Max Generation Tokens: {MAX_GENERATION_TOKENS}</p>

        <h2>📦 Loaded models:</h2>
        <ul>{status_html}</ul>
        <p>Total models: {len(loaded_models)}/{len(MODELS)}</p>

        <h2>🔗 Available endpoints:</h2>
        <ul>
            <li><b>GET /generate/&lt;message&gt;[?params]</b><br>
                Returns only the generated text. Optional parameters:<br>
                • system= (system instructions)<br>
                • temperature= (0.0-2.0)<br>
                • top_p= (0.0-1.0)<br>
                • top_k= (0-100)<br>
                • model= (model name)<br>
                • max_tokens= (maximum tokens to generate, default: {MAX_GENERATION_TOKENS})<br>
                • repetition_penalty= (repetition penalty)<br>
                • presence_penalty= (presence penalty)<br>
                • frequency_penalty= (frequency penalty)</li>
            <li><b>POST /v1/chat/completions</b><br>OpenAI API compatible</li>
            <li><b>GET /health</b><br>Service status</li>
            <li><b>GET /models</b><br>Lists all available models</li>
        </ul>
    </body>
    </html>
    '''
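# Illustrative client usage for the OpenAI-compatible route below. This is a
# sketch, not part of the service: it assumes the official `openai` Python
# package (v1+) and a placeholder model name; this server ignores the API key.
#
#   from openai import OpenAI
#
#   client = OpenAI(base_url="http://localhost:7860/v1", api_key="unused")
#   resp = client.chat.completions.create(
#       model="example-model",  # placeholder; use a name from engines.json
#       messages=[{"role": "user", "content": "Hello"}],
#   )
#   print(resp.choices[0].message.content)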

@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
    try:
        data = request.get_json()
        messages = data.get('messages', [])
        model_name = data.get('model', MODELS[0]["name"])
        if model_name not in llm_manager.models:
            return jsonify({
                "error": f"Model '{model_name}' not found. Available models: {list(llm_manager.models.keys())}"
            }), 400
        # Forward every extra field (temperature, top_p, …) to the backend.
        kwargs = {}
        for key in data.keys():
            if key not in ['messages', 'model']:
                kwargs[key] = data[key]
        if 'max_tokens' not in kwargs:
            kwargs['max_tokens'] = MAX_GENERATION_TOKENS
        elif kwargs['max_tokens'] > MAX_GENERATION_TOKENS:
            kwargs['max_tokens'] = MAX_GENERATION_TOKENS
        result = llm_manager.chat_completion(model_name, messages, **kwargs)
        if "error" in result:
            return jsonify(result), 500
        return jsonify(result), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500


# The URL converter was missing from the decorator (the home page documents
# this endpoint as /generate/<message>), so it is restored here.
@app.route('/generate/<path:user_message>', methods=['GET'])
def generate_endpoint(user_message):
    try:
        system_instruction = request.args.get('system', '')
        temperature = float(request.args.get('temperature', 0.7))
        top_p = float(request.args.get('top_p', 0.95))
        top_k = int(request.args.get('top_k', 0))
        model_name = request.args.get('model', MODELS[0]["name"])
        max_tokens = int(request.args.get('max_tokens', MAX_GENERATION_TOKENS))
        repetition_penalty = request.args.get('repetition_penalty')
        presence_penalty = request.args.get('presence_penalty')
        frequency_penalty = request.args.get('frequency_penalty')

        if not 0 <= temperature <= 2:
            return Response(
                "Error: 'temperature' parameter must be between 0 and 2",
                status=400, mimetype='text/plain'
            )
        if not 0 <= top_p <= 1:
            return Response(
                "Error: 'top_p' parameter must be between 0 and 1",
                status=400, mimetype='text/plain'
            )
        if not 0 <= top_k <= 100:
            return Response(
                "Error: 'top_k' parameter must be between 0 and 100",
                status=400, mimetype='text/plain'
            )
        if repetition_penalty:
            try:
                repetition_penalty = float(repetition_penalty)
            except ValueError:
                return Response(
                    "Error: repetition_penalty must be a valid number",
                    status=400, mimetype='text/plain'
                )
        if presence_penalty:
            try:
                presence_penalty = float(presence_penalty)
            except ValueError:
                return Response(
                    "Error: presence_penalty must be a valid number",
                    status=400, mimetype='text/plain'
                )
        if frequency_penalty:
            try:
                frequency_penalty = float(frequency_penalty)
            except ValueError:
                return Response(
                    "Error: frequency_penalty must be a valid number",
                    status=400, mimetype='text/plain'
                )
        if max_tokens > MAX_GENERATION_TOKENS:
            max_tokens = MAX_GENERATION_TOKENS
        if model_name not in llm_manager.models:
            return Response(
                f"Error: Model '{model_name}' not found. "
                f"Available models: {', '.join(llm_manager.models.keys())}",
                status=400, mimetype='text/plain'
            )

        messages = [
            {"role": "system", "content": system_instruction},
            {"role": "user", "content": user_message}
        ]
        kwargs = {
            "temperature": temperature,
            "top_p": top_p,
            "max_tokens": max_tokens,
            "stream": False
        }
        if top_k:
            kwargs["top_k"] = top_k
        if repetition_penalty:
            kwargs["repetition_penalty"] = repetition_penalty
        if presence_penalty:
            kwargs["presence_penalty"] = presence_penalty
        if frequency_penalty:
            kwargs["frequency_penalty"] = frequency_penalty

        result = llm_manager.chat_completion(model_name, messages, **kwargs)
        if "error" in result:
            return Response(
                f"Error: {result['error']}",
                status=500, mimetype='text/plain'
            )
        response_text = result.get("choices", [{}])[0].get("message", {}).get("content", "")
        if not response_text:
            response_text = "No response was generated"
        return Response(response_text, status=200, mimetype='text/plain')
    except ValueError as e:
        return Response(
            f"Error: Invalid parameters - {str(e)}. Make sure temperature, top_p and max_tokens are valid numbers.",
            status=400, mimetype='text/plain'
        )
    except Exception as e:
        return Response(f"Error: {str(e)}", status=500, mimetype='text/plain')


@app.route('/health', methods=['GET'])
def health():
    loaded_models = llm_manager.get_loaded_models()
    return jsonify({
        "status": "healthy" if len(loaded_models) > 0 else "error",
        "loaded_models": loaded_models,
        "total_models": len(MODELS),
        "config": {
            "max_context_tokens": MAX_CONTEXT_TOKENS,
            "max_generation_tokens": MAX_GENERATION_TOKENS
        }
    })


@app.route('/models', methods=['GET'])
def list_models():
    return jsonify({
        "available_models": MODELS,
        "status": llm_manager.get_all_models_status(),
        "config": {
            "max_context_tokens": MAX_CONTEXT_TOKENS,
            "max_generation_tokens": MAX_GENERATION_TOKENS
        }
    })


# As above, the <model_name> converter is restored in the decorator.
@app.route('/models/<model_name>', methods=['GET'])
def get_model_status(model_name):
    model_data = llm_manager.get_model(model_name)
    if not model_data:
        return jsonify({"error": f"Model '{model_name}' not found"}), 404
    return jsonify({
        "model": model_name,
        "loaded": model_data["loaded"],
        "url": model_data["config"]["url"],
        "error": model_data.get("error"),
        "config": {
            "max_context_tokens": MAX_CONTEXT_TOKENS,
            "max_generation_tokens": MAX_GENERATION_TOKENS
        }
    })


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860, debug=False)
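# Example requests (illustrative; assumes the server is running locally on
# port 7860, and "example-model" is a placeholder for a name from engines.json):
#
#   curl "http://localhost:7860/generate/Hello%20world?temperature=0.8&max_tokens=256"
#
#   curl -X POST "http://localhost:7860/v1/chat/completions" \
#        -H "Content-Type: application/json" \
#        -d '{"model": "example-model", "messages": [{"role": "user", "content": "Hello"}]}'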