from flask import Flask, request, jsonify, Response
import os
import logging
import threading
import time
import subprocess
import sys
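# A pinned llama-cpp-python is installed at startup so that the `from llama_cpp import Llama`
# import below works even on hosts where the package is not preinstalled.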
subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python==0.3.16"])
from llama_cpp import Llama
import requests
import tempfile
import json
import gc
from concurrent.futures import ThreadPoolExecutor
app = Flask(__name__)
logging.basicConfig(level=logging.INFO)
MAX_CONTEXT_TOKENS = 1024 * 8
MAX_GENERATION_TOKENS = 1024 * 4
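# engines.json is expected to contain a list of model entries. Judging from how the fields
# are read below, each entry presumably looks something like this (illustrative values only):
# [
#   {"name": "my-model", "url": "https://example.com/my-model.Q4_K_M.gguf", "n_batch": 96}
# ]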
with open('engines.json', 'r') as f:
MODELS = json.load(f)
class LLMManager:
def __init__(self, models_config):
self.models = {}
self.models_config = models_config
self.executor = ThreadPoolExecutor(max_workers=2)
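        # A single lock serializes inference: only one generation runs at a time across all models.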
self.generation_lock = threading.Lock()
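        # One shared requests.Session with a small connection pool, reused for all model downloads.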
self.session = requests.Session()
adapter = requests.adapters.HTTPAdapter(pool_connections=2, pool_maxsize=2)
self.session.mount('http://', adapter)
self.session.mount('https://', adapter)
self.load_all_models()
def load_all_models(self):
for model_config in self.models_config:
try:
model_name = model_config["name"]
logging.info(f"🚀 Cargando modelo: {model_name}")
temp_path = self._download_model(model_config["url"])
actual_size = os.path.getsize(temp_path)
actual_gb = actual_size / (1024*1024*1024)
logging.info(f"📊 Tamaño descargado para {model_name}: {actual_gb:.2f} GB")
n_batch = model_config.get("n_batch", 96)
logging.info(f"🔄 Cargando {model_name} en RAM…")
                llm_instance = Llama(
                    model_path=temp_path,
                    n_ctx=MAX_CONTEXT_TOKENS,   # context window size
                    n_batch=n_batch,            # prompt-evaluation batch size
                    n_threads=2,                # CPU threads for generation
                    n_threads_batch=2,          # CPU threads for prompt evaluation
                    use_mlock=False,            # do not pin model pages in RAM
                    use_mmap=True,              # memory-map the GGUF file instead of copying it
                    vocab_only=False,
                    logits_all=False,
                    verbose=False
                )
                # The GGUF file can be deleted once the model is constructed: the memory
                # mapping remains valid until the Llama instance is released.
                os.remove(temp_path)
self.models[model_name] = {
"instance": llm_instance,
"loaded": True,
"config": model_config
}
logging.info(f"✅ Modelo {model_name} cargado")
except Exception as e:
logging.error(f"❌ Error cargando modelo {model_config['name']}: {e}")
self.models[model_config["name"]] = {
"instance": None,
"loaded": False,
"config": model_config,
"error": str(e)
}
def _download_model(self, model_url):
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".gguf")
temp_path = temp_file.name
temp_file.close()
logging.info("📥 Descargando modelo…")
response = self.session.get(model_url, stream=True, timeout=300)
response.raise_for_status()
downloaded = 0
with open(temp_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=32768):
if chunk:
f.write(chunk)
downloaded += len(chunk)
return temp_path
def get_model(self, model_name):
return self.models.get(model_name)
def chat_completion(self, model_name, messages, **kwargs):
if not self.generation_lock.acquire(blocking=False):
return {"error": "Servidor ocupado - Generación en progreso"}
try:
model_data = self.get_model(model_name)
if not model_data or not model_data["loaded"]:
error_msg = f"Modelo {model_name} no cargado"
if model_data and "error" in model_data:
error_msg += f": {model_data['error']}"
return {"error": error_msg}
result = [None]
exception = [None]
def generate():
try:
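                    # llama.cpp expects repeat_penalty, so map the OpenAI-style name to it.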
if 'repetition_penalty' in kwargs:
kwargs['repeat_penalty'] = kwargs.pop('repetition_penalty')
result[0] = model_data["instance"].create_chat_completion(
messages=messages,
**kwargs
)
except Exception as e:
exception[0] = e
gen_thread = threading.Thread(target=generate, daemon=True)
gen_thread.start()
gen_thread.join(timeout=120)
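            # If the worker is still alive here, generation has timed out; the daemon thread
            # keeps running in the background until it finishes on its own.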
if gen_thread.is_alive():
return {"error": "Timeout en generación (120 segundos)"}
if exception[0]:
raise exception[0]
result[0]["provider"] = "telechars-ai"
result[0]["model"] = model_name
return result[0]
finally:
self.generation_lock.release()
gc.collect()
def get_loaded_models(self):
loaded = []
for name, data in self.models.items():
if data["loaded"]:
loaded.append(name)
return loaded
def get_all_models_status(self):
status = {}
for name, data in self.models.items():
status[name] = {
"loaded": data["loaded"],
"url": data["config"]["url"]
}
if "error" in data:
status[name]["error"] = data["error"]
return status
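# All models are downloaded and loaded into RAM at import time, so the server only starts
# serving requests once every entry in engines.json has been processed.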
llm_manager = LLMManager(MODELS)
@app.route('/')
def home():
loaded_models = llm_manager.get_loaded_models()
status_html = "
"
for model_name, model_data in llm_manager.models.items():
status = "✅" if model_data["loaded"] else "❌"
status_html += f"- {model_name}: {status}
"
status_html += "
"
return f'''
TeleChars AI API
TeleChars AI API
⚙️ Configuración
Max Context Tokens: {MAX_CONTEXT_TOKENS}
Max Generation Tokens: {MAX_GENERATION_TOKENS}
📦 Modelos cargados:
{status_html}
Total modelos: {len(loaded_models)}/{len(MODELS)}
🔗 Endpoints disponibles:
GET /generate/<mensaje>[?params]
Devuelve solo el texto generado. Parámetros opcionales:
• system= (instrucciones del sistema)
• temperature= (0.0-2.0)
• top_p= (0.0-1.0)
• top_k= (0-100)
• model= (nombre del modelo)
• max_tokens= (máximo tokens a generar, default: {MAX_GENERATION_TOKENS})
• repetition_penalty= (penalización de repetición)
• presence_penalty= (penalización de presencia)
• frequency_penalty= (penalización de frecuencia)
POST /v1/chat/completions
Compatible con OpenAI API
GET /health
Estado del servicio
GET /models
Lista todos los modelos disponibles
'''
@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
try:
data = request.get_json()
messages = data.get('messages', [])
model_name = data.get('model', MODELS[0]["name"])
if model_name not in llm_manager.models:
return jsonify({"error": f"Modelo '{model_name}' no encontrado. Modelos disponibles: {list(llm_manager.models.keys())}"}), 400
kwargs = {}
for key in data.keys():
if key not in ['messages', 'model']:
kwargs[key] = data[key]
if 'max_tokens' not in kwargs:
kwargs['max_tokens'] = MAX_GENERATION_TOKENS
else:
if kwargs['max_tokens'] > MAX_GENERATION_TOKENS:
kwargs['max_tokens'] = MAX_GENERATION_TOKENS
result = llm_manager.chat_completion(model_name, messages, **kwargs)
if "error" in result:
return jsonify(result), 500
return jsonify(result), 200
except Exception as e:
return jsonify({"error": str(e)}), 500
@app.route('/generate/<path:user_message>', methods=['GET'])
def generate_endpoint(user_message):
try:
system_instruction = request.args.get('system', '')
temperature = float(request.args.get('temperature', 0.7))
top_p = float(request.args.get('top_p', 0.95))
top_k = int(request.args.get('top_k', 0))
model_name = request.args.get('model', MODELS[0]["name"])
max_tokens = int(request.args.get('max_tokens', MAX_GENERATION_TOKENS))
repetition_penalty = request.args.get('repetition_penalty')
presence_penalty = request.args.get('presence_penalty')
frequency_penalty = request.args.get('frequency_penalty')
if not 0 <= temperature <= 2:
return Response(
f"Error: El parámetro 'temperature' debe estar entre 0 y 2",
status=400,
mimetype='text/plain'
)
if not 0 <= top_p <= 1:
return Response(
f"Error: El parámetro 'top_p' debe estar entre 0 y 1",
status=400,
mimetype='text/plain'
)
if not 0 <= top_k <= 100:
return Response(
f"Error: El parámetro 'top_k' debe estar entre 0 y 100",
status=400,
mimetype='text/plain'
)
if repetition_penalty:
try:
repetition_penalty = float(repetition_penalty)
except ValueError:
return Response(
"Error: repetition_penalty debe ser número válido",
status=400,
mimetype='text/plain'
)
if presence_penalty:
try:
presence_penalty = float(presence_penalty)
except ValueError:
return Response(
"Error: presence_penalty debe ser número válido",
status=400,
mimetype='text/plain'
)
if frequency_penalty:
try:
frequency_penalty = float(frequency_penalty)
except ValueError:
return Response(
"Error: frequency_penalty debe ser número válido",
status=400,
mimetype='text/plain'
)
if max_tokens > MAX_GENERATION_TOKENS:
max_tokens = MAX_GENERATION_TOKENS
if model_name not in llm_manager.models:
return Response(
f"Error: Modelo '{model_name}' no encontrado. Modelos disponibles: {', '.join(llm_manager.models.keys())}",
status=400,
mimetype='text/plain'
)
messages = [
{"role": "system", "content": system_instruction},
{"role": "user", "content": user_message}
]
kwargs = {
"temperature": temperature,
"top_p": top_p,
"max_tokens": max_tokens,
"stream": False
}
if top_k:
kwargs["top_k"] = int(top_k)
if repetition_penalty:
kwargs["repetition_penalty"] = repetition_penalty
if presence_penalty:
kwargs["presence_penalty"] = presence_penalty
if frequency_penalty:
kwargs["frequency_penalty"] = frequency_penalty
result = llm_manager.chat_completion(model_name, messages, **kwargs)
if "error" in result:
return Response(
f"Error: {result['error']}",
status=500,
mimetype='text/plain'
)
response_text = result.get("choices", [{}])[0].get("message", {}).get("content", "")
if not response_text:
response_text = "No se generó respuesta"
return Response(
response_text,
status=200,
mimetype='text/plain'
)
except ValueError as e:
return Response(
f"Error: Parámetros inválidos - {str(e)}. Asegúrate de que temperature, top_p y max_tokens sean números válidos.",
status=400,
mimetype='text/plain'
)
except Exception as e:
return Response(
f"Error: {str(e)}",
status=500,
mimetype='text/plain'
)
@app.route('/health', methods=['GET'])
def health():
loaded_models = llm_manager.get_loaded_models()
return jsonify({
"status": "healthy" if len(loaded_models) > 0 else "error",
"loaded_models": loaded_models,
"total_models": len(MODELS),
"config": {
"max_context_tokens": MAX_CONTEXT_TOKENS,
"max_generation_tokens": MAX_GENERATION_TOKENS
}
})
@app.route('/models', methods=['GET'])
def list_models():
return jsonify({
"available_models": MODELS,
"status": llm_manager.get_all_models_status(),
"config": {
"max_context_tokens": MAX_CONTEXT_TOKENS,
"max_generation_tokens": MAX_GENERATION_TOKENS
}
})
@app.route('/models/<model_name>', methods=['GET'])
def get_model_status(model_name):
model_data = llm_manager.get_model(model_name)
if not model_data:
return jsonify({"error": f"Modelo '{model_name}' no encontrado"}), 404
return jsonify({
"model": model_name,
"loaded": model_data["loaded"],
"url": model_data["config"]["url"],
"error": model_data.get("error"),
"config": {
"max_context_tokens": MAX_CONTEXT_TOKENS,
"max_generation_tokens": MAX_GENERATION_TOKENS
}
})
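# Example requests (assuming the server is reachable at http://localhost:7860 and a model
# named "my-model" is listed in engines.json -- both illustrative values):
#
#   curl "http://localhost:7860/health"
#   curl "http://localhost:7860/generate/Hello%20there?temperature=0.7&max_tokens=256"
#   curl -X POST "http://localhost:7860/v1/chat/completions" \
#        -H "Content-Type: application/json" \
#        -d '{"model": "my-model", "messages": [{"role": "user", "content": "Hello"}]}'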
if __name__ == '__main__':
app.run(host='0.0.0.0', port=7860, debug=False)