from flask import Flask, request, jsonify, Response
import os
import logging
import threading
import subprocess
import sys

# Install llama-cpp-python at startup, a common pattern on hosted Spaces
# where compile-time dependencies are not always available at build time.
subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python==0.3.15"])
from llama_cpp import Llama

import requests
import tempfile
import json
import gc

app = Flask(__name__)
logging.basicConfig(level=logging.INFO)

MAX_CONTEXT_TOKENS = 1024 * 4
MAX_GENERATION_TOKENS = 1024 * 4

with open('engines.json', 'r') as f:
    MODELS = json.load(f)
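
# Expected shape of engines.json (illustrative; the field names follow the
# lookups in LLMManager below — "name" and "url" are required, "n_batch" is
# optional; the model name and URL here are hypothetical):
# [
#   {"name": "example-model", "url": "https://example.com/model.gguf", "n_batch": 96}
# ]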

class LLMManager:
    def __init__(self, models_config):
        self.models = {}
        self.models_config = models_config
        # Only one generation runs at a time; chat_completion acquires this
        # lock non-blocking and rejects concurrent requests outright.
        self.generation_lock = threading.Lock()
        self.session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_connections=2, pool_maxsize=2)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)
        self.load_all_models()

    def load_all_models(self):
        for model_config in self.models_config:
            try:
                model_name = model_config["name"]
                logging.info(f"🚀 Loading model: {model_name}")
                temp_path = self._download_model(model_config["url"])
                actual_size = os.path.getsize(temp_path)
                actual_gb = actual_size / (1024 * 1024 * 1024)
                logging.info(f"📊 Downloaded size for {model_name}: {actual_gb:.2f} GB")
                n_batch = model_config.get("n_batch", 96)
                logging.info(f"🔄 Loading {model_name} into RAM…")
                llm_instance = Llama(
                    model_path=temp_path,
                    n_ctx=MAX_CONTEXT_TOKENS,
                    n_batch=n_batch,
                    n_threads=2,
                    n_threads_batch=2,
                    use_mlock=True,  # lock weights in RAM
                    use_mmap=True,   # was `mmap=True`, which is not a Llama() parameter
                    vocab_only=False,
                    verbose=False,
                    logits_all=False
                )
                # `low_vram` and `mul_mat_q` were dropped: current
                # llama-cpp-python builds no longer accept them.
                # With use_mmap=True the mapping keeps the unlinked file's
                # pages alive, so this frees the path, not resident memory.
                os.remove(temp_path)
                self.models[model_name] = {
                    "instance": llm_instance,
                    "loaded": True,
                    "config": model_config
                }
                logging.info(f"✅ Model {model_name} loaded")
            except Exception as e:
                logging.error(f"❌ Error loading model {model_config['name']}: {e}")
                self.models[model_config["name"]] = {
                    "instance": None,
                    "loaded": False,
                    "config": model_config,
                    "error": str(e)
                }

    def _download_model(self, model_url):
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".gguf")
        temp_path = temp_file.name
        temp_file.close()
        logging.info("📥 Downloading model…")
        # Stream in chunks so multi-GB GGUF files are never buffered in memory.
        response = self.session.get(model_url, stream=True, timeout=300)
        response.raise_for_status()
        with open(temp_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=32768):
                if chunk:
                    f.write(chunk)
        return temp_path

    def get_model(self, model_name):
        return self.models.get(model_name)

    def chat_completion(self, model_name, messages, **kwargs):
        # Reject concurrent requests instead of queueing them.
        if not self.generation_lock.acquire(blocking=False):
            return {"error": "Server busy - generation in progress"}
        try:
            model_data = self.get_model(model_name)
            if not model_data or not model_data["loaded"]:
                error_msg = f"Model {model_name} not loaded"
                if model_data and "error" in model_data:
                    error_msg += f": {model_data['error']}"
                return {"error": error_msg}
            result = [None]
            exception = [None]

            def generate():
                try:
                    # Map the OpenAI-style name to llama.cpp's parameter.
                    if 'repetition_penalty' in kwargs:
                        kwargs['repeat_penalty'] = kwargs.pop('repetition_penalty')
                    result[0] = model_data["instance"].create_chat_completion(
                        messages=messages,
                        **kwargs
                    )
                except Exception as e:
                    exception[0] = e

            # Run generation in a daemon thread so a hung call cannot block
            # the server forever; give up after 120 seconds.
            gen_thread = threading.Thread(target=generate, daemon=True)
            gen_thread.start()
            gen_thread.join(timeout=120)
            if gen_thread.is_alive():
                return {"error": "Generation timed out (120 seconds)"}
            if exception[0]:
                raise exception[0]
            result[0]["provider"] = "telechars-ai"
            result[0]["model"] = model_name
            return result[0]
        finally:
            self.generation_lock.release()
            gc.collect()
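
    # Illustrative call (hypothetical model name). Note that the OpenAI-style
    # `repetition_penalty` is remapped to llama.cpp's `repeat_penalty` above
    # before it reaches create_chat_completion:
    #
    #   llm_manager.chat_completion(
    #       "example-model",
    #       [{"role": "user", "content": "Hello"}],
    #       temperature=0.7,
    #       repetition_penalty=1.1,  # forwarded as repeat_penalty
    #   )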

    def get_loaded_models(self):
        loaded = []
        for name, data in self.models.items():
            if data["loaded"]:
                loaded.append(name)
        return loaded

    def get_all_models_status(self):
        status = {}
        for name, data in self.models.items():
            status[name] = {
                "loaded": data["loaded"],
                "url": data["config"]["url"]
            }
            if "error" in data:
                status[name]["error"] = data["error"]
        return status


llm_manager = LLMManager(MODELS)

@app.route('/')
def home():
    loaded_models = llm_manager.get_loaded_models()
    status_html = "<ul>"
    for model_name, model_data in llm_manager.models.items():
        status = "✅" if model_data["loaded"] else "❌"
        status_html += f"<li>{model_name}: {status}</li>"
    status_html += "</ul>"
    return f'''
    <!DOCTYPE html>
    <html>
    <head>
        <title>TeleChars AI API</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 40px; }}
            .config {{ background: #f0f0f0; padding: 15px; border-radius: 5px; margin-bottom: 20px; }}
            .endpoint {{ background: #e8f4f8; padding: 10px; border-left: 4px solid #2196F3; margin: 10px 0; }}
        </style>
    </head>
    <body>
        <h1>TeleChars AI API</h1>
        <div class="config">
            <h3>⚙️ Configuration</h3>
            <p><strong>Max Context Tokens:</strong> {MAX_CONTEXT_TOKENS}</p>
            <p><strong>Max Generation Tokens:</strong> {MAX_GENERATION_TOKENS}</p>
        </div>
        <h2>📦 Loaded models:</h2>
        {status_html}
        <p>Total models: {len(loaded_models)}/{len(MODELS)}</p>
        <h2>🔗 Available endpoints:</h2>
        <div class="endpoint">
            <strong>GET /generate/&lt;message&gt;[?params]</strong><br>
            Returns only the generated text. Optional parameters:<br>
            • system= (system instructions)<br>
            • temperature= (0.0-2.0)<br>
            • top_p= (0.0-1.0)<br>
            • top_k= (0-100)<br>
            • model= (model name)<br>
            • max_tokens= (maximum tokens to generate, default: {MAX_GENERATION_TOKENS})<br>
            • repetition_penalty= (repetition penalty)<br>
            • presence_penalty= (presence penalty)<br>
            • frequency_penalty= (frequency penalty)
        </div>
        <div class="endpoint">
            <strong>POST /v1/chat/completions</strong><br>
            OpenAI API compatible
        </div>
        <div class="endpoint">
            <strong>GET /health</strong><br>
            Service status
        </div>
        <div class="endpoint">
            <strong>GET /models</strong><br>
            Lists all available models
        </div>
    </body>
    </html>
    '''

@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
    try:
        data = request.get_json()
        messages = data.get('messages', [])
        model_name = data.get('model', MODELS[0]["name"])
        if model_name not in llm_manager.models:
            return jsonify({"error": f"Model '{model_name}' not found. Available models: {list(llm_manager.models.keys())}"}), 400
        # Forward every extra field as a generation parameter, clamping
        # max_tokens to the server-wide ceiling.
        kwargs = {key: value for key, value in data.items() if key not in ('messages', 'model')}
        kwargs['max_tokens'] = min(kwargs.get('max_tokens', MAX_GENERATION_TOKENS), MAX_GENERATION_TOKENS)
        result = llm_manager.chat_completion(model_name, messages, **kwargs)
        if "error" in result:
            return jsonify(result), 500
        return jsonify(result), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500
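
# Example request (hypothetical model name; any extra OpenAI-style fields are
# forwarded to the model as-is):
#
#   curl -X POST http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "example-model",
#          "messages": [{"role": "user", "content": "Hello"}],
#          "temperature": 0.7, "max_tokens": 256}'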

# <path:...> lets the message itself contain slashes.
@app.route('/generate/<path:user_message>')
def generate_endpoint(user_message):
    try:
        system_instruction = request.args.get('system', '')
        temperature = float(request.args.get('temperature', 0.7))
        top_p = float(request.args.get('top_p', 0.95))
        top_k = int(request.args.get('top_k', 0))
        model_name = request.args.get('model', MODELS[0]["name"])
        max_tokens = int(request.args.get('max_tokens', MAX_GENERATION_TOKENS))
        repetition_penalty = request.args.get('repetition_penalty')
        presence_penalty = request.args.get('presence_penalty')
        frequency_penalty = request.args.get('frequency_penalty')
        if not 0 <= temperature <= 2:
            return Response(
                "Error: 'temperature' must be between 0 and 2",
                status=400,
                mimetype='text/plain'
            )
        if not 0 <= top_p <= 1:
            return Response(
                "Error: 'top_p' must be between 0 and 1",
                status=400,
                mimetype='text/plain'
            )
        if not 0 <= top_k <= 100:
            return Response(
                "Error: 'top_k' must be between 0 and 100",
                status=400,
                mimetype='text/plain'
            )
        if repetition_penalty:
            try:
                repetition_penalty = float(repetition_penalty)
            except ValueError:
                return Response(
                    "Error: repetition_penalty must be a valid number",
                    status=400,
                    mimetype='text/plain'
                )
        if presence_penalty:
            try:
                presence_penalty = float(presence_penalty)
            except ValueError:
                return Response(
                    "Error: presence_penalty must be a valid number",
                    status=400,
                    mimetype='text/plain'
                )
        if frequency_penalty:
            try:
                frequency_penalty = float(frequency_penalty)
            except ValueError:
                return Response(
                    "Error: frequency_penalty must be a valid number",
                    status=400,
                    mimetype='text/plain'
                )
        max_tokens = min(max_tokens, MAX_GENERATION_TOKENS)
        if model_name not in llm_manager.models:
            return Response(
                f"Error: Model '{model_name}' not found. Available models: {', '.join(llm_manager.models.keys())}",
                status=400,
                mimetype='text/plain'
            )
        messages = [
            {"role": "system", "content": system_instruction},
            {"role": "user", "content": user_message}
        ]
        kwargs = {
            "temperature": temperature,
            "top_p": top_p,
            "max_tokens": max_tokens,
            "stream": False
        }
        if top_k:
            kwargs["top_k"] = top_k
        if repetition_penalty:
            kwargs["repetition_penalty"] = repetition_penalty
        if presence_penalty:
            kwargs["presence_penalty"] = presence_penalty
        if frequency_penalty:
            kwargs["frequency_penalty"] = frequency_penalty
        result = llm_manager.chat_completion(model_name, messages, **kwargs)
        if "error" in result:
            return Response(
                f"Error: {result['error']}",
                status=500,
                mimetype='text/plain'
            )
        response_text = result.get("choices", [{}])[0].get("message", {}).get("content", "")
        if not response_text:
            response_text = "No response was generated"
        return Response(
            response_text,
            status=200,
            mimetype='text/plain'
        )
    except ValueError as e:
        return Response(
            f"Error: Invalid parameters - {str(e)}. Make sure temperature, top_p and max_tokens are valid numbers.",
            status=400,
            mimetype='text/plain'
        )
    except Exception as e:
        return Response(
            f"Error: {str(e)}",
            status=500,
            mimetype='text/plain'
        )
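
# Example request (the message goes in the URL path, options as query
# parameters; the response body is plain text):
#
#   curl "http://localhost:7860/generate/Hello%20there?temperature=0.8&max_tokens=128"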

@app.route('/health')
def health():
    loaded_models = llm_manager.get_loaded_models()
    return jsonify({
        "status": "healthy" if len(loaded_models) > 0 else "error",
        "loaded_models": loaded_models,
        "total_models": len(MODELS),
        "config": {
            "max_context_tokens": MAX_CONTEXT_TOKENS,
            "max_generation_tokens": MAX_GENERATION_TOKENS
        }
    })
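
# Example /health response (shape follows the jsonify call above; the model
# name shown is hypothetical):
# {
#   "status": "healthy",
#   "loaded_models": ["example-model"],
#   "total_models": 1,
#   "config": {"max_context_tokens": 4096, "max_generation_tokens": 4096}
# }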

@app.route('/models')
def list_models():
    return jsonify({
        "available_models": MODELS,
        "status": llm_manager.get_all_models_status(),
        "config": {
            "max_context_tokens": MAX_CONTEXT_TOKENS,
            "max_generation_tokens": MAX_GENERATION_TOKENS
        }
    })

# Route path assumed; this endpoint is not listed in the HTML docs above.
@app.route('/models/<model_name>')
def get_model_status(model_name):
    model_data = llm_manager.get_model(model_name)
    if not model_data:
        return jsonify({"error": f"Model '{model_name}' not found"}), 404
    return jsonify({
        "model": model_name,
        "loaded": model_data["loaded"],
        "url": model_data["config"]["url"],
        "error": model_data.get("error"),
        "config": {
            "max_context_tokens": MAX_CONTEXT_TOKENS,
            "max_generation_tokens": MAX_GENERATION_TOKENS
        }
    })

if __name__ == '__main__':
    # Port 7860 is the default exposed port on Hugging Face Spaces.
    app.run(host='0.0.0.0', port=7860, debug=False)
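
# For production, a WSGI server is the usual choice over Flask's built-in
# server. A hypothetical invocation, assuming this file is app.py (one worker,
# since the models and generation lock live in process memory):
#
#   gunicorn --workers 1 --threads 4 --timeout 180 -b 0.0.0.0:7860 app:app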