from flask import Flask, request, jsonify, Response
import os
import logging
import threading
import time
import subprocess
import sys

# Install the inference backend at startup, before importing it
subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python==0.3.16"])
from llama_cpp import Llama
import requests
import tempfile
import json
import gc
from concurrent.futures import ThreadPoolExecutor


app = Flask(__name__)
logging.basicConfig(level=logging.INFO)


MAX_CONTEXT_TOKENS = 1024 * 16      # context window passed to llama.cpp (16K tokens)
MAX_GENERATION_TOKENS = 1024 * 8    # hard cap on tokens generated per request


with open('engines.json', 'r') as f:
    MODELS = json.load(f)
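# Illustrative engines.json layout assumed by this server: a list of entries with
# "name", "url" and an optional "n_batch". The names and URLs below are placeholders,
# not real endpoints:
# [
#     {"name": "example-7b-q4", "url": "https://example.com/example-7b.Q4_K_M.gguf", "n_batch": 96},
#     {"name": "example-3b-q5", "url": "https://example.com/example-3b.Q5_K_M.gguf"}
# ]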


class LLMManager:
    def __init__(self, models_config):
        self.models = {}
        self.models_config = models_config
        self.executor = ThreadPoolExecutor(max_workers=2)
        # Only one generation may run at a time; concurrent requests are rejected.
        self.generation_lock = threading.Lock()
        self.session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_connections=2, pool_maxsize=2)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)
        self.load_all_models()

    def load_all_models(self):
        for model_config in self.models_config:
            try:
                model_name = model_config["name"]
                logging.info(f"🚀 Loading model: {model_name}")

                temp_path = self._download_model(model_config["url"])

                actual_size = os.path.getsize(temp_path)
                actual_gb = actual_size / (1024 * 1024 * 1024)
                logging.info(f"📊 Downloaded size for {model_name}: {actual_gb:.2f} GB")

                n_batch = model_config.get("n_batch", 96)

                logging.info(f"🔄 Loading {model_name} into RAM…")
                llm_instance = Llama(
                    model_path=temp_path,
                    n_ctx=MAX_CONTEXT_TOKENS,
                    n_batch=n_batch,
                    n_threads=2,
                    n_threads_batch=2,
                    use_mlock=True,
                    use_mmap=False,   # load the weights fully into RAM instead of memory-mapping the file
                    vocab_only=False,
                    verbose=False,
                    logits_all=False
                )

                # The weights are now in RAM, so the temporary GGUF file can be deleted.
                os.remove(temp_path)

                self.models[model_name] = {
                    "instance": llm_instance,
                    "loaded": True,
                    "config": model_config
                }
                logging.info(f"✅ Model {model_name} loaded")

            except Exception as e:
                logging.error(f"❌ Error loading model {model_config['name']}: {e}")
                self.models[model_config["name"]] = {
                    "instance": None,
                    "loaded": False,
                    "config": model_config,
                    "error": str(e)
                }

    def _download_model(self, model_url):
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".gguf")
        temp_path = temp_file.name
        temp_file.close()

        logging.info("📥 Downloading model…")

        response = self.session.get(model_url, stream=True, timeout=300)
        response.raise_for_status()

        downloaded = 0
        with open(temp_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=32768):
                if chunk:
                    f.write(chunk)
                    downloaded += len(chunk)

        return temp_path

    def get_model(self, model_name):
        return self.models.get(model_name)

    def chat_completion(self, model_name, messages, **kwargs):
        if not self.generation_lock.acquire(blocking=False):
            return {"error": "Server busy - a generation is already in progress"}

        try:
            model_data = self.get_model(model_name)

            if not model_data or not model_data["loaded"]:
                error_msg = f"Model {model_name} is not loaded"
                if model_data and "error" in model_data:
                    error_msg += f": {model_data['error']}"
                return {"error": error_msg}

            result = [None]
            exception = [None]

            def generate():
                try:
                    # llama-cpp-python expects 'repeat_penalty'; accept the OpenAI-style name as well
                    if 'repetition_penalty' in kwargs:
                        kwargs['repeat_penalty'] = kwargs.pop('repetition_penalty')

                    result[0] = model_data["instance"].create_chat_completion(
                        messages=messages,
                        **kwargs
                    )
                except Exception as e:
                    exception[0] = e

            gen_thread = threading.Thread(target=generate, daemon=True)
            gen_thread.start()
            gen_thread.join(timeout=120)

            if gen_thread.is_alive():
                return {"error": "Generation timed out (120 seconds)"}

            if exception[0]:
                raise exception[0]

            result[0]["provider"] = "telechars-ai"
            result[0]["model"] = model_name
            return result[0]

        finally:
            self.generation_lock.release()
            gc.collect()

    def get_loaded_models(self):
        loaded = []
        for name, data in self.models.items():
            if data["loaded"]:
                loaded.append(name)
        return loaded

    def get_all_models_status(self):
        status = {}
        for name, data in self.models.items():
            status[name] = {
                "loaded": data["loaded"],
                "url": data["config"]["url"]
            }
            if "error" in data:
                status[name]["error"] = data["error"]
        return status


llm_manager = LLMManager(MODELS)


@app.route('/')
def home():
    loaded_models = llm_manager.get_loaded_models()
    status_html = "<ul>"
    for model_name, model_data in llm_manager.models.items():
        status = "✅" if model_data["loaded"] else "❌"
        status_html += f"<li>{model_name}: {status}</li>"
    status_html += "</ul>"

    return f'''
    <!DOCTYPE html>
    <html>
    <head>
        <title>TeleChars AI API</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 40px; }}
            .config {{ background: #f0f0f0; padding: 15px; border-radius: 5px; margin-bottom: 20px; }}
            .endpoint {{ background: #e8f4f8; padding: 10px; border-left: 4px solid #2196F3; margin: 10px 0; }}
        </style>
    </head>
    <body>
        <h1>TeleChars AI API</h1>

        <div class="config">
            <h3>⚙️ Configuration</h3>
            <p><strong>Max Context Tokens:</strong> {MAX_CONTEXT_TOKENS}</p>
            <p><strong>Max Generation Tokens:</strong> {MAX_GENERATION_TOKENS}</p>
        </div>

        <h2>📦 Loaded models:</h2>
        {status_html}
        <p>Total models: {len(loaded_models)}/{len(MODELS)}</p>

        <h2>🔗 Available endpoints:</h2>
        <div class="endpoint">
            <strong>GET /generate/&lt;message&gt;[?params]</strong><br>
            Returns only the generated text. Optional parameters:<br>
            • system= (system instructions)<br>
            • temperature= (0.0-2.0)<br>
            • top_p= (0.0-1.0)<br>
            • top_k= (0-100)<br>
            • model= (model name)<br>
            • max_tokens= (maximum tokens to generate, default: {MAX_GENERATION_TOKENS})<br>
            • repetition_penalty= (repetition penalty)<br>
            • presence_penalty= (presence penalty)<br>
            • frequency_penalty= (frequency penalty)
        </div>

        <div class="endpoint">
            <strong>POST /v1/chat/completions</strong><br>
            OpenAI API compatible
        </div>

        <div class="endpoint">
            <strong>GET /health</strong><br>
            Service status
        </div>

        <div class="endpoint">
            <strong>GET /models</strong><br>
            Lists all available models
        </div>
    </body>
    </html>
    '''


@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
    try:
        data = request.get_json()
        messages = data.get('messages', [])
        model_name = data.get('model', MODELS[0]["name"])

        if model_name not in llm_manager.models:
            return jsonify({"error": f"Model '{model_name}' not found. Available models: {list(llm_manager.models.keys())}"}), 400

        kwargs = {}
        for key in data.keys():
            if key not in ['messages', 'model']:
                kwargs[key] = data[key]

        # Streaming responses are not supported by this synchronous design
        kwargs['stream'] = False

        if 'max_tokens' not in kwargs:
            kwargs['max_tokens'] = MAX_GENERATION_TOKENS
        elif kwargs['max_tokens'] > MAX_GENERATION_TOKENS:
            kwargs['max_tokens'] = MAX_GENERATION_TOKENS

        result = llm_manager.chat_completion(model_name, messages, **kwargs)

        if "error" in result:
            return jsonify(result), 500

        return jsonify(result), 200

    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route('/generate/<path:user_message>', methods=['GET'])
def generate_endpoint(user_message):
    try:
        system_instruction = request.args.get('system', '')
        temperature = float(request.args.get('temperature', 0.7))
        top_p = float(request.args.get('top_p', 0.95))
        top_k = int(request.args.get('top_k', 0))
        model_name = request.args.get('model', MODELS[0]["name"])
        max_tokens = int(request.args.get('max_tokens', MAX_GENERATION_TOKENS))

        repetition_penalty = request.args.get('repetition_penalty')
        presence_penalty = request.args.get('presence_penalty')
        frequency_penalty = request.args.get('frequency_penalty')

        if not 0 <= temperature <= 2:
            return Response(
                "Error: the 'temperature' parameter must be between 0 and 2",
                status=400,
                mimetype='text/plain'
            )

        if not 0 <= top_p <= 1:
            return Response(
                "Error: the 'top_p' parameter must be between 0 and 1",
                status=400,
                mimetype='text/plain'
            )

        if not 0 <= top_k <= 100:
            return Response(
                "Error: the 'top_k' parameter must be between 0 and 100",
                status=400,
                mimetype='text/plain'
            )

        if repetition_penalty:
            try:
                repetition_penalty = float(repetition_penalty)
            except ValueError:
                return Response(
                    "Error: repetition_penalty must be a valid number",
                    status=400,
                    mimetype='text/plain'
                )

        if presence_penalty:
            try:
                presence_penalty = float(presence_penalty)
            except ValueError:
                return Response(
                    "Error: presence_penalty must be a valid number",
                    status=400,
                    mimetype='text/plain'
                )

        if frequency_penalty:
            try:
                frequency_penalty = float(frequency_penalty)
            except ValueError:
                return Response(
                    "Error: frequency_penalty must be a valid number",
                    status=400,
                    mimetype='text/plain'
                )

        if max_tokens > MAX_GENERATION_TOKENS:
            max_tokens = MAX_GENERATION_TOKENS

        if model_name not in llm_manager.models:
            return Response(
                f"Error: model '{model_name}' not found. Available models: {', '.join(llm_manager.models.keys())}",
                status=400,
                mimetype='text/plain'
            )

        messages = [
            {"role": "system", "content": system_instruction},
            {"role": "user", "content": user_message}
        ]

        kwargs = {
            "temperature": temperature,
            "top_p": top_p,
            "max_tokens": max_tokens,
            "stream": False
        }

        if top_k:
            kwargs["top_k"] = top_k

        if repetition_penalty:
            kwargs["repetition_penalty"] = repetition_penalty

        if presence_penalty:
            kwargs["presence_penalty"] = presence_penalty

        if frequency_penalty:
            kwargs["frequency_penalty"] = frequency_penalty

        result = llm_manager.chat_completion(model_name, messages, **kwargs)

        if "error" in result:
            return Response(
                f"Error: {result['error']}",
                status=500,
                mimetype='text/plain'
            )

        response_text = result.get("choices", [{}])[0].get("message", {}).get("content", "")

        if not response_text:
            response_text = "No response was generated"

        return Response(
            response_text,
            status=200,
            mimetype='text/plain'
        )

    except ValueError as e:
        return Response(
            f"Error: invalid parameters - {str(e)}. Make sure temperature, top_p and max_tokens are valid numbers.",
            status=400,
            mimetype='text/plain'
        )
    except Exception as e:
        return Response(
            f"Error: {str(e)}",
            status=500,
            mimetype='text/plain'
        )


@app.route('/health', methods=['GET'])
def health():
    loaded_models = llm_manager.get_loaded_models()
    return jsonify({
        "status": "healthy" if len(loaded_models) > 0 else "error",
        "loaded_models": loaded_models,
        "total_models": len(MODELS),
        "config": {
            "max_context_tokens": MAX_CONTEXT_TOKENS,
            "max_generation_tokens": MAX_GENERATION_TOKENS
        }
    })


@app.route('/models', methods=['GET'])
def list_models():
    return jsonify({
        "available_models": MODELS,
        "status": llm_manager.get_all_models_status(),
        "config": {
            "max_context_tokens": MAX_CONTEXT_TOKENS,
            "max_generation_tokens": MAX_GENERATION_TOKENS
        }
    })


@app.route('/models/<model_name>', methods=['GET'])
def get_model_status(model_name):
    model_data = llm_manager.get_model(model_name)
    if not model_data:
        return jsonify({"error": f"Model '{model_name}' not found"}), 404

    return jsonify({
        "model": model_name,
        "loaded": model_data["loaded"],
        "url": model_data["config"]["url"],
        "error": model_data.get("error"),
        "config": {
            "max_context_tokens": MAX_CONTEXT_TOKENS,
            "max_generation_tokens": MAX_GENERATION_TOKENS
        }
    })


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860, debug=False)
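# Illustrative client calls (host/port taken from app.run above; the model name is a
# placeholder that must match an entry in engines.json):
#   curl "http://localhost:7860/generate/Hello%20there?temperature=0.7&max_tokens=256"
#   curl -X POST http://localhost:7860/v1/chat/completions \
#        -H "Content-Type: application/json" \
#        -d '{"model": "example-7b-q4", "messages": [{"role": "user", "content": "Hello"}]}'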