Spaces:

TomatitoToho
/

zelin-bot

Paused

App Files Files Community

zelin-bot / src /ai.js

TomatitoToho's picture

v11: Fix thinking leak - ALWAYS cleanThinkingArtifacts, more aggressive detection, expanded patterns

2bac7f7 verified 14 days ago

history blame contribute delete

57 kB

	/**
	* ai.js — Router de IA v10.0 — TORNEO DE MODELOS
	* ================================================
	* CAMBIOS v10.0 (basado en torneo de 179 modelos):
	* - Qwen3-32B (Groq) como proveedor PRIMARIO — mejor español+velocidad+thinking
	* - Qwen3-235B-A22B (Cerebras) como premium — ultra-rápido, excelente español
	* - Llama-4-Scout-17B (Groq) como secundario — oficial español
	* - DeepSeek R1 (OpenRouter) para razonamiento complejo
	* - GLM mantenido pero degradado en prioridad
	* - Modelo local (Gemma 4 E4B) solo como fallback final
	* - Thinking mode optimizado para Qwen3
	* - Cascadas reordenadas por puntaje del torneo
	*/

	import { readConfig } from './utils.js';
	import { semanticCache } from './semantic-cache.js';
	import { emergencyFallback, isLocalAIReady } from './local-ai.js';
	import { callHuggingFace, callHuggingFaceCascade, isHFAvailable, discoverHFModels, getHFStats } from './hf-provider.js';
	import { hiveGenerate, initialize as hiveInit, getHiveStatus, getHiveStats, benchmarkHive, warmUpWorkers } from './hive.js';

	// Read the configuration once at module load via readConfig() (imported from ./utils.js).
	// Each section falls back to an empty object when it is null/undefined, so the
	// optional-chained lookups further down (e.g. ai.groq?.apiKey) never throw.
	const config = readConfig();
	// Per-provider settings; the code below reads apiKey/baseUrl style fields from it.
	const ai = config.ai ?? {};
	// Feature flags section — not read anywhere in the visible part of this file.
	const feats = config.features ?? {};
	// GLM credentials/endpoint (apiKey, optional baseUrl) used by the glm* providers.
	const glm = config.glm ?? {};

	// ── Thinking Mode v3.0 — REMOVED injectThinking() ──────────────────────────
	// v3.0 CRITICAL FIX: injectThinking() was causing the #1 bug — chain-of-thought
	// leaking into Discord responses. Models would output their thinking process as
	// plain text despite instructions not to. The fix: DON'T inject thinking prompts.
	// Models already think internally. Adding "PIENSA PROFUNDAMENTE" causes them to
	// output that thinking. Less is more — just give good system prompts.
	//
	// For models with native thinking (GLM-5.1 reasoning_content, Qwen3 /think):
	// - We let the API handle thinking natively (separate from content)
	// - We NEVER inject /think or thinking instructions manually
	// - We ALWAYS strip any thinking that leaks into content via cleanThinkingArtifacts()

	// injectThinking is now a no-op — returns messages unchanged
	/**
	 * Legacy hook kept so existing call sites keep working: it performs no
	 * transformation and hands back the exact array it was given.
	 *
	 * @param {Array<object>} messages - Chat messages about to be sent to a provider.
	 * @param {string} [_providerName=''] - Ignored.
	 * @returns {Array<object>} The same `messages` reference, unmodified.
	 */
	function injectThinking(messages, _providerName = '') {
		const untouched = messages;
		return untouched;
	}

	// Limpiar artefactos de thinking + patrones AI — v11 ANTI-LEAK
	// v11: Major overhaul — handles ALL known thinking leak patterns
	// Root cause: models output their chain-of-thought as plain text
	// v11 FIX: More aggressive detection — lower thresholds, more patterns
	/**
	 * Removes leaked chain-of-thought and assistant-style boilerplate from a model reply.
	 *
	 * Passes, in order:
	 *  1. strip reasoning tags (<think>, <thinking>, <reasoning>, <scratchpad>, <internal>, CDATA);
	 *  2. multi-line heuristic: English/analytical first half + Spanish second half → keep second half;
	 *  3. single-paragraph heuristic: English opener followed by Spanish sentences;
	 *  4. numbered recitation of the system prompt, English preamble, "Zelin would say:" prefixes;
	 *  5. recited "user: … zelin: …" examples, /think and /no_think markers, "Respuesta:" prefixes;
	 *  6. broken tool placeholders, formal refusals, name typos, trailing "¿en qué puedo ayudar?".
	 *
	 * Fixes relative to the previous revision:
	 *  - every `\|` was an escaped pipe (a syntax error in code, a literal "|" in regexes);
	 *  - the CDATA pattern was an unescaped character class;
	 *  - `[\s\S]?` / trailing `\s` had lost their `*`, so those passes could not match trimmed input;
	 *  - Spanish word lists had no end boundary ("ok" matched "Okay", "no" matched "now"), which
	 *    cancelled out the thinking score, and ASCII `\b` never fires next to "ó"/"í";
	 *  - English openers matched Spanish prefixes ("so" in "soy", "since" in "sincero").
	 *
	 * @param {*} text - Raw model output.
	 * @returns {*} The cleaned, trimmed string; falsy or non-string input is returned unchanged.
	 */
	function cleanThinkingArtifacts(text) {
		if (!text || typeof text !== 'string') return text;
		let t = text;

		// JS `\b` only understands ASCII word characters, so it cannot delimit "órale" or "sí".
		// These helpers build regexes with Unicode-aware boundaries instead.
		const NO_WORD_BEFORE = '(?<![\\p{L}\\p{N}_])';
		const NO_WORD_AFTER = '(?![\\p{L}\\p{N}_])';
		const wholeWord = (alternatives) =>
			new RegExp(`${NO_WORD_BEFORE}(?:${alternatives})${NO_WORD_AFTER}`, 'iu');
		const leadingPhrase = (alternatives) =>
			new RegExp(`^(?:${alternatives})${NO_WORD_AFTER}`, 'iu');

		// ── 1. Reasoning tags (paired first, then orphaned closers) ──
		t = t.replace(/<think[\s\S]*?<\/think>/gi, '');
		t = t.replace(/<thinking[\s\S]*?<\/thinking>/gi, '');
		t = t.replace(/<reasoning[\s\S]*?<\/reasoning>/gi, '');
		t = t.replace(/<scratchpad[\s\S]*?<\/scratchpad>/gi, '');
		t = t.replace(/<internal[\s\S]*?<\/internal>/gi, '');
		t = t.replace(/<!\[CDATA\[[\s\S]*?\]\]>/gi, '');
		t = t.replace(/<\/think>/gi, '');
		t = t.replace(/<\/thinking>/gi, '');
		t = t.replace(/<\/reasoning>/gi, '');

		// ── 2. Multi-line leak: analytical first half, natural Spanish second half ──
		if (t.includes('\n') && t.length > 150) {
			const lines = t.split('\n').filter((l) => l.trim().length > 0);
			if (lines.length >= 3) {
				const half = Math.ceil(lines.length / 2);
				const firstLines = lines.slice(0, half);
				const lastLines = lines.slice(half);

				const thinkingWords = leadingPhrase(
					"okay|alright|let me|i should|the user|first,|based on|since|so,|well,|now,|also,|however,|but,|actually,|hmm+|let's|i need|i think|i'll|going to|in order|therefore|because|this means|that means|it seems|it appears|looking at|considering|analyzing|understanding|evaluating|to respond|to answer|the message|the question|as zelin|respond as|in character|staying in|my role|my persona"
				);
				const spanishWords = wholeWord(
					'wey|neta|chido|chale|órale|híjole|zelin|morra|güey|onda|chingón|bueno|claro|sí|no|nah|oye|mira|ósea|pues|nada|simón|sale|va|ok|(?:ja){2,}j?|qu[eé]|c[oó]mo|d[oó]nde|cu[aá]ndo|por qu[eé]|much[oa]s?|tambi[eé]n|aqu[ií]|all[iá]|este|esta|eso|esa|s[ií]|nope|yup|sip|nop|dale|holi|ola|bro|crack|xd|gg|ns|ni idea|ni modo|ya ves|ya mero|a poco|qué onda|no mames|no manches|est[aá] ca[nñ]ón|padre|madre'
				);

				let firstHalfThinking = 0;
				let secondHalfSpanish = 0;
				for (const line of firstLines) {
					if (thinkingWords.test(line.trim())) firstHalfThinking++;
					if (spanishWords.test(line)) firstHalfThinking--; // Spanish line: not thinking
				}
				for (const line of lastLines) {
					if (spanishWords.test(line)) secondHalfSpanish++;
					if (thinkingWords.test(line.trim())) secondHalfSpanish--; // thinking line: not the reply
				}

				// A single thinking indicator is already considered suspicious.
				if (firstHalfThinking >= 1 && secondHalfSpanish >= 1) {
					const response = lastLines.join('\n').trim();
					if (response.length > 5) t = response;
				}
			}
		}

		// ── 3. Single paragraph: "Okay, I should respond as ... wey no sé" ──
		if (t.length > 100) {
			const sentences = t.split(/(?<=[.!?])\s+/);
			const opensWithThinking = leadingPhrase(
				"okay|alright|let me|i should|the user|first|based on|since|so|well|now|also|however|actually|hmm+|let's|i need|i think"
			).test(sentences[0]);
			if (sentences.length >= 2 && opensWithThinking) {
				const strongSpanish = wholeWord('wey|neta|chido|chale|órale|híjole|morra|güey|onda|chingón|ósea|pues|simón');
				const weakSpanish = wholeWord('s[ií]|no|nah|oye|mira|bueno|claro|dale|ns');
				const englishOpener = leadingPhrase('okay|alright|let me|i should|the user');
				let lastSpanishStart = -1;
				// Index 0 is the English opener itself, so start at 1.
				for (let i = 1; i < sentences.length; i++) {
					const s = sentences[i];
					const looksSpanish =
						strongSpanish.test(s) ||
						/[¿¡]/.test(s) ||
						(weakSpanish.test(s) && !englishOpener.test(s));
					if (looksSpanish) lastSpanishStart = i;
				}
				if (lastSpanishStart >= 0) {
					const response = sentences.slice(lastSpanishStart).join(' ').trim();
					if (response.length > 5) t = response;
				}
			}
		}

		// ── 4a. Numbered recitation of the system prompt: the reply follows the last numbered line ──
		const thinkingRecitationPattern = /^[\d][.)]\s*(?:¿Cu[aá]l|¿Hay|¿Qu[eé]|¿C[oó]mo|Auto-eval|Responde|PIENSA|Pienso|Pensamiento)/m;
		if (thinkingRecitationPattern.test(t)) {
			const lines = t.split('\n');
			let lastNumberedLine = -1;
			for (let i = 0; i < lines.length; i++) {
				if (/^\d[.)]\s/.test(lines[i].trim())) lastNumberedLine = i;
			}
			if (lastNumberedLine >= 0 && lastNumberedLine < lines.length - 1) {
				const afterThinking = lines.slice(lastNumberedLine + 1).join('\n').trim();
				if (afterThinking.length > 2) t = afterThinking;
			}
		}

		// ── 4b. English preamble: keep everything from the first Spanish-looking line onwards ──
		const englishThinkingPattern = leadingPhrase('Okay|Alright|Let me|I should|The user|First,|Based on|Since');
		if (englishThinkingPattern.test(t) && t.length > 150) {
			const spanishMarker = wholeWord('wey|neta|chido|chale|órale|zelin|híjole|morra');
			const spanishLines = [];
			let foundSpanish = false;
			for (const line of t.split('\n')) {
				if (/[¿¡ñáéíóú]/i.test(line) || spanishMarker.test(line)) {
					foundSpanish = true;
					spanishLines.push(line);
				} else if (foundSpanish) {
					spanishLines.push(line);
				}
			}
			const kept = spanishLines.join('\n').trim();
			if (spanishLines.length > 0 && kept.length > 2) t = kept;
		}

		// ── 4c. "Zelin would say:" / "Mi respuesta:" style lead-ins (only when not at position 0) ──
		const promptPatterns = [
			/Zelin (?:would|should|might) (?:say|respond|reply|answer)[:\s]*\n?/i,
			/(?:Mi respuesta|My response|My answer|Response)[:\s]*\n?/i,
			/(?:Así respondería|Here's how|Here is what)[:\s]*\n?/i,
		];
		for (const p of promptPatterns) {
			const match = t.match(p);
			if (match && match.index > 0) {
				const after = t.slice(match.index + match[0].length).trim();
				if (after.length > 2) t = after;
			}
		}

		// ── 5a. Recited "user: ... zelin: ..." prompt examples ──
		if (/\buser:/i.test(t)) {
			const beforeUser = t.split(/\buser:/i)[0].trim();
			// Nothing usable before "user:" → fall back to the short placeholder reply.
			t = beforeUser.length > 2 ? beforeUser : 'ns';
		}
		if (/\bzelin:\s*/i.test(t) && !/^zelin:/i.test(t)) {
			const beforeZelin = t.split(/\bzelin:\s*/i)[0].trim();
			if (beforeZelin.length > 2) t = beforeZelin;
		}
		t = t.replace(/^zelin:\s*/i, '');

		// ── 5b. Qwen3 /think and /no_think markers, "Pienso ... Respuesta:" scaffolding ──
		const noThinkMarker = '/no_think';
		const noThinkIdx = t.indexOf(noThinkMarker);
		if (noThinkIdx !== -1) {
			const afterNoThink = t.slice(noThinkIdx + noThinkMarker.length).trim();
			if (afterNoThink.length > 2) t = afterNoThink;
		}
		t = t.replace(/^\/think\s*/i, '');
		t = t.replace(/^\/no_think\s*/i, '');
		t = t.replace(/^Pienso[\s\S]*?\nRespuesta:\s*/i, '');
		t = t.replace(/^Pensamiento:[\s\S]*?\nRespuesta:\s*/i, '');
		t = t.replace(/^An[aá]lisis:[\s\S]*?(?=Ahora|Bien|Ok|Sí|No|Nah|Wey|Oye|hmm|bueno)/i, '');
		t = t.replace(/^Razonamiento:[\s\S]*?(?=Ahora|Bien|Ok|Sí|No|Nah|Wey|Oye|hm|bueno)/i, '');
		t = t.replace(/^Respuesta final:\s*/i, '');
		t = t.replace(/^Final answer:\s*/i, '');
		t = t.replace(/^Respuesta:\s*/i, '');

		// ── 6a. Broken tool-call placeholders ──
		t = t.replace(/\[(?:mc_status|mc_player|mc_wiki|hora actual|usar\s+\w+\s+para\s+dato\s+real)\]/gi, '');
		t = t.replace(/c_status\]/g, '');
		t = t.replace(/ora actual\]/g, '');
		t = t.replace(/\w+_(?:status|player|wiki|info)\]/g, '');

		// ── 6b. Formal refusals → casual wording ──
		t = t.replace(/no puedo cumplir (esa|este|aquella) (solicitud|request|orden|instrucci[oó]n)/gi, 'eso no va');
		t = t.replace(/no puedo procesar esa solicitud/gi, 'eso no va');
		t = t.replace(/lo siento,? pero no puedo/gi, 'nah');
		t = t.replace(/disculpa,? pero no puedo/gi, 'nah');
		t = t.replace(/lamentablemente no puedo/gi, 'nah');
		t = t.replace(/^nop,\s*soy\s+/gi, 'no, soy ');

		// ── 6c. Name typos ──
		t = t.replace(/\bzel[eé]n\b/gi, 'zelin');
		t = t.replace(/\brezin\b/gi, 'zelin');
		t = t.replace(/\bzelen\b/gi, 'zelin');
		t = t.replace(/\btomatitoo\b/gi, 'tomatito');

		// ── 6d. Trailing assistant phrases ("¿en qué puedo ayudar?") ──
		t = t.replace(/[,\s]*¿?en qu[eé]\s+(te\s+)?puedo\s+ayud[aeo]r?[¿?]?\.?\s*$/gi, '');
		t = t.replace(/[,\s]*¿?(algo\s+)?m[aá]s\s+en\s+(lo\s+que\s+)?pued[ao]\s+ayud[aeo]r?[¿?]?\.?\s*$/gi, '');
		t = t.replace(/[,\s]*¿?necesitas\s+(algo\s+)?m[aá]s[¿?]?\.?\s*$/gi, '');

		// ── 6e. Known garbled outputs and over-long identity recitation ──
		t = t.replace(/\besti con vos/gi, 'suerte con eso');
		t = t.replace(/\bwienes/gi, 'bien, y tú');
		if (/^soy zelin,? (la morra |la )?del server minecraft? tomatesmp\.?$/i.test(t)) {
			t = 'soy zelin del server';
		}
		return t.trim();
	}

	// ── Modelos descubiertos dinámicamente por proveedor ─────────────────────────
	// Se actualizan al arrancar y cada 6h para que no queden obsoletos
	const _discoveredModels = {};

	/**
	 * Queries one provider's model-listing endpoint and stores the resulting model
	 * ids in `_discoveredModels[providerName]`.
	 *
	 * Best-effort by design: a missing API key, a non-2xx status or an unknown
	 * provider name leaves the previous list untouched. Network/parse errors are
	 * logged as a warning (they used to be swallowed silently) and never thrown.
	 *
	 * @param {string} providerName - 'groq', 'mistral' or 'pollinations'.
	 * @returns {Promise<void>}
	 */
	async function discoverModels(providerName) {
		// GET a JSON document with a 5 s budget; resolves to null on a non-2xx status.
		const getJson = async (url, headers) => {
			const init = { signal: AbortSignal.timeout(5000) };
			if (headers) init.headers = headers;
			const res = await fetch(url, init);
			return res.ok ? res.json() : null;
		};

		try {
			switch (providerName) {
				case 'groq': {
					if (!ai.groq?.apiKey) return;
					const data = await getJson('https://api.groq.com/openai/v1/models', {
						Authorization: `Bearer ${ai.groq.apiKey}`,
					});
					if (!data) return;
					// Keep every model that is not explicitly flagged inactive.
					const models = (data.data ?? []).filter((m) => m.active !== false).map((m) => m.id);
					_discoveredModels.groq = models;
					console.log(`[AI] Groq models: ${models.length} (${models.slice(0,3).join(', ')}...)`);
					break;
				}
				case 'mistral': {
					if (!ai.mistral?.apiKey) return;
					const data = await getJson('https://api.mistral.ai/v1/models', {
						Authorization: `Bearer ${ai.mistral.apiKey}`,
					});
					if (!data) return;
					const models = (data.data ?? []).map((m) => m.id);
					_discoveredModels.mistral = models;
					console.log(`[AI] Mistral models: ${models.length}`);
					break;
				}
				case 'pollinations': {
					const data = await getJson('https://text.pollinations.ai/models');
					if (!data) return;
					// Entries may be objects with a `name` or plain strings.
					_discoveredModels.pollinations = Array.isArray(data) ? data.map((m) => m.name ?? m) : [];
					console.log(`[AI] Pollinations models: ${_discoveredModels.pollinations.length}`);
					break;
				}
			}
		} catch (err) {
			// Discovery is optional: callers fall back to hard-coded model ids.
			console.warn(`[AI] Model discovery failed for ${providerName}: ${err?.message ?? err}`);
		}
	}

	// Seleccionar el mejor modelo disponible de un proveedor
	/**
	 * Picks a chat-capable model id for a provider from the discovered model list.
	 *
	 * Fix: the three patterns below had every `|` escaped as `\|`, which turns the
	 * alternation into one literal string, so nothing was ever excluded or selected.
	 *
	 * @param {string} providerName - Key into `_discoveredModels`.
	 * @param {string} fallback - Model id returned when nothing suitable is known.
	 * @param {'large'|'fast'|string} [preference='large'] - Size class wanted; any other value yields `fallback`.
	 * @param {string[]} [models] - Candidate ids; defaults to the discovered list for `providerName`.
	 * @returns {string} The first matching model id, or `fallback`.
	 */
	function getBestModel(providerName, fallback, preference = 'large', models = _discoveredModels[providerName]) {
		if (!models?.length) return fallback;

		// Drop models that are not text/chat models (speech, embeddings, vision, rerankers, guards...).
		const EXCLUDE = /whisper|tts|speech|audio|embed|vision|image|dall|stable|rerank|guard|code-gecko|text-bison/i;
		const textModels = models.filter((m) => !EXCLUDE.test(m));
		if (!textModels.length) return fallback;

		if (preference === 'fast') {
			return textModels.find((m) => /8b|7b|fast|instant|flash|mini/i.test(m)) ?? fallback;
		}
		if (preference === 'large') {
			return textModels.find((m) => /70b|72b|large|versatile|plus|pro/i.test(m)) ?? fallback;
		}
		return fallback;
	}

	// Descubrir en background al arrancar
	/**
	 * Kicks off model discovery for every supported provider (plus HuggingFace)
	 * without awaiting it, then repeats the same sweep every six hours.
	 * Rejections from individual discoveries are deliberately ignored.
	 */
	export function startModelDiscovery() {
		const providers = ['groq', 'mistral', 'pollinations'];
		const SIX_HOURS_MS = 6 * 60 * 60 * 1000;
		const ignore = () => {};

		const sweep = () => {
			for (const provider of providers) {
				discoverModels(provider).catch(ignore);
			}
			discoverHFModels().catch(ignore);
		};

		sweep();
		setInterval(sweep, SIX_HOURS_MS);
	}

	// ── Tiers and cascades ────────────────────────────────────────────────────────
	// TIERS groups provider names by role. In the visible part of this file it is only
	// used to build ALL_PROVIDERS below.
	const TIERS = {
	fast : ['glmFlash', 'groqFast', 'cerebras', 'pollinations', 'hfFast'],
	smart : ['glm51', 'pollinations', 'groq', 'hfSmart', 'mistral', 'gemma4', 'cloudflare'],
	fallback: ['glmAir', 'openrouter', 'groqKimi', 'openrouterR1', 'mistralCode', 'hfFallback'],
	};

	// CASCADES maps a task type to an ordered list of provider names.
	// NOTE(review): presumably tried left to right until one succeeds — the consumer is
	// not visible here, confirm against the routing code.
	const CASCADES = {
	// v11: HIVE — RigoChat-7B cluster as PRIMARY (uses ALL workers, not Promise.any)
	// Hive: consensus + speculative decoding + parallel batch = maximum power
	chat : ['hive', 'groqQwen3', 'cerebras', 'groq', 'glm51', 'groqFast', 'pollinations', 'hfSmart', 'mistral', 'gemma4', 'cloudflare', 'openrouter', 'local'],
	spanish : ['hive', 'groqQwen3', 'cerebras', 'groq', 'glm51', 'hfSpanish', 'pollinations', 'mistral', 'gemma4', 'cloudflare', 'openrouter', 'groqKimi', 'local'],
	fast : ['hive', 'groqFast', 'cerebras', 'glmFlash', 'pollinations', 'hfFast', 'groq', 'mistral', 'cloudflare', 'openrouter', 'local'],
	reasoning : ['hive', 'groqQwen3', 'cerebras', 'groqKimi', 'hfReasoning', 'openrouterR1', 'glm51', 'groq', 'mistral', 'pollinations', 'gemma4', 'cloudflare', 'local'],
	// 'code' is the only cascade that lists neither 'hive' nor 'local'.
	code : ['groqQwen3', 'glm51', 'mistralCode', 'hfCode', 'groq', 'pollinations', 'mistral', 'openrouter'],
	volume : ['hive', 'groqFast', 'glmFlash', 'pollinations', 'cerebras', 'hfFast', 'gemma4', 'cloudflare', 'groqQwen3', 'local'],
	background: ['hive', 'glmFlash', 'mistral', 'groqFast', 'groq', 'hfFast', 'pollinations', 'cloudflare', 'gemma4', 'openrouter', 'local'],
	};

	// Provider-name groups. ALL_PROVIDERS is the de-duplicated union of every tier plus
	// the providers that appear only in cascades; HF_PROVIDERS lists every hf* name,
	// including hfSpanish/hfCode/hfReasoning which are not part of any tier.
	const GLM_PROVIDERS = ['glm51', 'glmAir', 'glmFlash'];
	const ALL_PROVIDERS = [...new Set([...TIERS.fast, ...TIERS.smart, ...TIERS.fallback, 'groqQwen3', 'cerebras', 'local', 'hive'])];
	const HF_PROVIDERS = ['hfFast', 'hfSmart', 'hfSpanish', 'hfCode', 'hfReasoning', 'hfFallback'];

	// ── Daily limits ──────────────────────────────────────────────────────────────
	// Maximum requests per day for each provider name. The quota helpers in this file
	// compare the per-provider request counter against these values; a provider with no
	// entry here is treated as unlimited by those helpers.
	const DAILY_LIMITS = {
	// Local: no limits — our own model
	local : 99999,
	// Qwen3-32B (Groq) — TOURNAMENT: #1 primary model
	groqQwen3 : 1000, // Qwen3-32B via Groq — 60 RPM, 1000 RPD
	// Cerebras — TOURNAMENT: #2 ultra-fast reasoning
	cerebras : 14400, // 30 RPM, 1M TPD
	// GLM: free/unlimited
	glm51 : 99999,
	glmAir : 99999,
	glmFlash : 99999,
	// Other providers
	pollinations : 99999,
	groq : 1000,
	groqFast : 14400,
	mistral : 99999,
	mistralCode : 99999,
	gemma4 : 86400,
	openrouter : 1000,
	cloudflare : 10000,
	groqKimi : 1000,
	openrouterR1 : 1000,
	hfFast : 5000,
	hfSmart : 2000,
	hfSpanish : 3000,
	hfCode : 2000,
	hfReasoning : 1000,
	hfFallback : 500,
	hive : 99999, // HIVE — no limits, it is our own cluster
	};

	// ── Circuit Breaker ───────────────────────────────────────────────────────────
	/**
	 * Per-provider circuit breaker with three states:
	 *  - CLOSED: requests flow normally;
	 *  - OPEN: requests are refused until an exponentially growing cooldown has elapsed;
	 *  - HALF_OPEN: the cooldown elapsed, probes are allowed; any failure re-opens.
	 *
	 * Fixes relative to the previous revision:
	 *  - `\|\|` (escaped pipes) were syntax errors;
	 *  - timeouts were never recorded, so their 0.3 weight was dead code — they now count
	 *    as 0.3 of a hard failure within the 60 s window;
	 *  - failure timestamps older than the window are pruned instead of accumulating.
	 */
	class CircuitBreaker {
		/** @param {string} name - Provider name this breaker guards. */
		constructor(name) {
			this.name = name;
			this.state = 'CLOSED';
			this.failTimes = [];     // timestamps (ms) of recent hard failures
			this.timeoutTimes = [];  // timestamps (ms) of recent timeouts (weighted 0.3)
			this.openCount = 0;      // consecutive openings; drives the cooldown growth
			this.lastFail = 0;
			this.threshold = 6;      // weighted failures within 60 s that open the circuit
			this.cooldown = 30000;   // base cooldown in ms
			this.maxCooldown = 600000;
		}

		/**
		 * @returns {boolean} true when a request may be attempted. Moves OPEN → HALF_OPEN
		 * once `cooldown * 2^(openCount-1)` (capped at `maxCooldown`) has passed since `lastFail`.
		 */
		canRequest() {
			if (this.state === 'CLOSED') return true;
			const wait = Math.min(this.cooldown * 2 ** (this.openCount - 1), this.maxCooldown);
			if (Date.now() - this.lastFail > wait) {
				this.state = 'HALF_OPEN';
				return true;
			}
			return false;
		}

		/** Clears all failure history and closes the circuit. */
		recordSuccess() {
			this.failTimes = [];
			this.timeoutTimes = [];
			this.openCount = 0;
			this.state = 'CLOSED';
		}

		/**
		 * Registers a failed request.
		 * @param {number} code - HTTP-like status; 401/403 open the circuit for about a day.
		 * @param {boolean} [isTimeout=false] - Timeouts weigh 0.3 of a hard failure.
		 */
		recordFailure(code, isTimeout = false) {
			const now = Date.now();
			this.lastFail = now;
			(isTimeout ? this.timeoutTimes : this.failTimes).push(now);

			if (code === 401 || code === 403) {
				// Credentials problem: retrying soon cannot help, so push lastFail a day ahead.
				this.state = 'OPEN';
				this.openCount = 99;
				this.lastFail = now + 86_400_000;
				return;
			}

			const isRecent = (at) => now - at < 60000;
			this.failTimes = this.failTimes.filter(isRecent);
			this.timeoutTimes = this.timeoutTimes.filter(isRecent);

			const score = this.failTimes.length + this.timeoutTimes.length * 0.3;
			if (score >= this.threshold || this.state === 'HALF_OPEN') {
				this.state = 'OPEN';
				this.openCount++;
			}
		}
	}

	// One breaker per known provider name (names listed in several groups are simply
	// re-created). The hive breaker is then replaced by a more tolerant one, since
	// its workers can be slow: it needs 10 failures instead of the default before opening.
	const breakers = {};
	[...ALL_PROVIDERS, ...HF_PROVIDERS, ...GLM_PROVIDERS].forEach((providerName) => {
		breakers[providerName] = new CircuitBreaker(providerName);
	});
	breakers['hive'] = Object.assign(new CircuitBreaker('hive'), { threshold: 10 });

	// ── Estadísticas por proveedor ─────────────────────────────────────────────────
	// Per-provider counters, keyed by provider name.
	const _stats = {};

	/**
	 * Returns the counter record for `name`, creating a zeroed one on first use.
	 * @param {string} name - Provider name.
	 * @returns {{req: number, err: number, totalMs: number, calls: number}}
	 */
	function getCounter(name) {
		const existing = _stats[name];
		if (existing) return existing;
		const fresh = { req: 0, err: 0, totalMs: 0, calls: 0 };
		_stats[name] = fresh;
		return fresh;
	}

	/** Counts one request issued to `name`. */
	function recordReq(name) {
		getCounter(name).req += 1;
	}

	/** Counts one failed request for `name`. */
	function recordError(name) {
		getCounter(name).err += 1;
	}

	/** Adds one latency sample (in ms) for `name`. */
	function recordLatency(name, ms) {
		const counter = getCounter(name);
		counter.totalMs += ms;
		counter.calls += 1;
	}

	// ── Daily counter reset ────────────────────────────────────────────────────────
	// UTC day-of-month on which the counters were last reset.
	let _lastResetDay = new Date().getUTCDate();

	/**
	 * Zeroes the request and error counters of every provider when the UTC
	 * day-of-month differs from the one seen at the previous reset.
	 * Latency totals (`totalMs`, `calls`) are left untouched.
	 */
	function checkDailyReset() {
		const currentDay = new Date().getUTCDate();
		if (currentDay === _lastResetDay) return;

		_lastResetDay = currentDay;
		Object.values(_stats).forEach((counter) => {
			counter.req = 0;
			counter.err = 0;
		});
		console.log('[AI] 🔄 Daily counters reset');
	}

	// ── Health Score ──────────────────────────────────────────────────────────────
	/**
	 * Scores a provider; higher is better. The score blends three terms:
	 *  - success rate (weight 0.5), 1 when there is no traffic yet;
	 *  - speed, as 1 / average latency in seconds (weight 0.3), assuming 2000 ms without samples;
	 *  - remaining daily quota fraction (weight 0.2), 1 when the provider has no limit.
	 *
	 * Fixes: successes used to be floored at 1, so a provider with 1 request and 1 error
	 * scored a 100% success rate; and a 0 ms average produced an infinite score.
	 *
	 * @param {string} name - Provider name.
	 * @returns {number}
	 */
	function healthScore(name) {
		const s = getCounter(name);
		const successes = Math.max(0, s.req - s.err);
		const rate = s.req > 0 ? successes / s.req : 1;
		// Clamp to 1 ms so the speed term stays finite.
		const avg = s.calls > 0 ? Math.max(s.totalMs / s.calls, 1) : 2000;
		const limit = DAILY_LIMITS[name];
		const quota = limit ? 1 - (s.req / limit) : 1;
		return rate * 0.5 + (1 / (avg / 1000)) * 0.3 + quota * 0.2;
	}

	// ── Quota ─────────────────────────────────────────────────────────────────────
	/**
	 * Tells whether a provider still has daily quota left.
	 * Fix: the condition used `\|\|` (escaped pipes), which is a syntax error.
	 *
	 * @param {string} name - Provider name.
	 * @returns {boolean} true when the provider has no configured limit, or when its
	 * request counter is below 95% of that limit.
	 */
	function isQuotaOk(name) {
		const lim = DAILY_LIMITS[name];
		if (!lim) return true;
		return getCounter(name).req < lim * 0.95;
	}

	/**
	 * Linearly projects today's request count to the end of the day and reports
	 * whether it would exceed 85% of the provider's daily limit.
	 *
	 * Fix: the elapsed-hours figure now uses the UTC hour. The counters are reset
	 * when the UTC day changes, so the local hour skewed the projection by the
	 * server's timezone offset.
	 *
	 * @param {string} name - Provider name.
	 * @returns {boolean} false when the provider has no configured limit.
	 */
	function predictExhaustion(name) {
		const lim = DAILY_LIMITS[name];
		if (!lim) return false;
		const c = getCounter(name);
		const hour = new Date().getUTCHours();
		// Treat the first hour as one full hour to avoid dividing by zero.
		const perHour = c.req / Math.max(hour, 1);
		const proj = c.req + perHour * (24 - hour);
		return proj > lim * 0.85;
	}

	// ── Adaptive tokens ───────────────────────────────────────────────────────────
	/**
	 * Chooses a max-token budget from the task type and the message length.
	 * Fix: the first condition used `\|\|` (escaped pipes), which is a syntax error.
	 *
	 * @param {string|null|undefined} msg - User message; only its length is used.
	 * @param {string} task - Task type ('fast', 'code', 'reasoning', ...).
	 * @param {number} [req] - Explicit budget; any truthy value is returned as-is.
	 * @returns {number} Token budget.
	 */
	function adaptiveTokens(msg, task, req) {
		if (req) return req;
		const len = (msg ?? '').length;
		if (task === 'fast' || len < 50) return 150;
		if (task === 'code') return 1024;
		if (task === 'reasoning') return 600;
		if (len < 100) return 200;
		if (len < 300) return 400;
		return 512;
	}

	// ── SingleFlight ──────────────────────────────────────────────────────────────
	// Promises currently running, keyed by request key.
	const _inflight = new Map();

	/**
	 * Coalesces concurrent calls sharing a key: while a call for `key` is pending,
	 * later callers receive that same promise instead of starting `fn` again.
	 * The entry is dropped as soon as the promise settles.
	 *
	 * @param {string} key - Identity of the operation.
	 * @param {() => Promise<*>} fn - Starts the operation; must return a promise.
	 * @returns {Promise<*>} The shared in-flight promise.
	 */
	function singleFlight(key, fn) {
		if (!_inflight.has(key)) {
			const forget = () => _inflight.delete(key);
			const started = fn().finally(forget);
			_inflight.set(key, started);
			return started;
		}
		return _inflight.get(key);
	}

	// ── TTL Cache ─────────────────────────────────────────────────────────────────
	/**
	 * In-memory response cache with a per-entry expiry and a 1000-entry cap.
	 *
	 * Fix: every `|` had been escaped as `\|`. That broke the two TTL regexes (the
	 * alternation became one literal string) and made the `| 0` in the hash a syntax error.
	 */
	class TTLCache {
		constructor() {
			// key → { value, exp } where `exp` is an absolute timestamp in ms.
			this.store = new Map();
		}

		/**
		 * Picks a time-to-live from keywords in the message.
		 * @param {string} [msg=''] - User message.
		 * @returns {number} 24 h for rules/server-info topics, 30 s for live-status
		 * topics (online players, TPS, lag), 1 h otherwise.
		 */
		_ttl(msg = '') {
			const m = msg.toLowerCase();
			if (/regla|norma|plugin|info|servidor/.test(m)) return 86_400_000;
			if (/online|jugador|tps|lag/.test(m)) return 30_000;
			return 3_600_000;
		}

		/**
		 * Builds a cache key from the conversation and task using a djb2-style 32-bit hash.
		 * Only the first 100 characters of each message contribute to the key.
		 * @param {Array<{role: string, content?: string}>} msgs
		 * @param {string} task
		 * @returns {string} `${task}:${unsigned 32-bit hash}`
		 */
		key(msgs, task) {
			const text = msgs.map((m) => m.role + ':' + (m.content ?? '').slice(0, 100)).join('|') + task;
			let h = 5381;
			for (let i = 0; i < text.length; i++) {
				// h * 33 + charCode, truncated to a signed 32-bit integer.
				h = ((h << 5) + h + text.charCodeAt(i)) | 0;
			}
			return `${task}:${h >>> 0}`;
		}

		/**
		 * @param {string} k - Cache key.
		 * @returns {*} The cached value, or null when absent or expired (expired entries are removed).
		 */
		get(k) {
			const e = this.store.get(k);
			if (!e) return null;
			if (Date.now() > e.exp) {
				this.store.delete(k);
				return null;
			}
			return e.value;
		}

		/**
		 * Stores a value; once the cache exceeds 1000 entries the oldest inserted key is evicted.
		 * @param {string} k - Cache key.
		 * @param {*} v - Value to cache.
		 * @param {string} [msg=''] - Message used to choose the TTL.
		 */
		set(k, v, msg = '') {
			this.store.set(k, { value: v, exp: Date.now() + this._ttl(msg) });
			if (this.store.size > 1000) this.store.delete(this.store.keys().next().value);
		}

		/** Drops every entry. */
		clear() {
			this.store.clear();
			console.log('[AI] Cache limpiada');
		}
	}
	export const _cache = new TTLCache();

	// ── Pools de concurrencia ─────────────────────────────────────────────────────
	// Concurrency pools: at most 4 user-facing and 2 background jobs at once.
	const _userQueue = { active: 0, max: 4 };
	const _backgroundQueue = { active: 0, max: 2 };

	/**
	 * Runs `fn` once the pool has a free slot, polling every 50 ms while it is full.
	 * The slot is released whether `fn` resolves or rejects.
	 *
	 * @param {{active: number, max: number}} queue - Pool whose `active` count is updated.
	 * @param {() => Promise<*>|*} fn - Job to run.
	 * @returns {Promise<*>} Whatever `fn` resolves to; rejections propagate.
	 */
	async function withQueue(queue, fn) {
		for (;;) {
			if (!(queue.active >= queue.max)) break;
			await new Promise((resolve) => { setTimeout(resolve, 50); });
		}
		queue.active += 1;
		try {
			const outcome = await fn();
			return outcome;
		} finally {
			queue.active -= 1;
		}
	}

	// ── Intent classifier (0 tokens) — SIN REGEX, keyword-based ──────────────────
	// Replaces regex-based INTENT_PATTERNS with a cleaner keyword matching system
	// Ordered keyword rules: the first rule with a matching keyword wins.
	// `startsWith` rules match a leading word, `contains` rules match a substring (stems allowed).
	const INTENT_RULES = [
		{ keywords: ['hola', 'hey', 'hi', 'buenas', 'qué tal', 'ola', 'saludos', 'wenas'], matchMode: 'startsWith', type: 'fast', intent: 'greeting' },
		{ keywords: ['regla', 'norma', 'prohibi', 'permit'], matchMode: 'contains', type: 'volume', intent: 'rules' },
		{ keywords: ['```', '.yml', '.json', '.java', 'config'], matchMode: 'contains', type: 'code', intent: 'code' },
		{ keywords: ['ban', 'sancion', 'report', 'trampa', 'hack', 'cheat'], matchMode: 'contains', type: 'reasoning', intent: 'moderation' },
		{ keywords: ['analiza', 'explica', 'compara', 'argumenta'], matchMode: 'contains', type: 'reasoning', intent: 'complex' },
		{ keywords: ['cómo', 'qué', 'cuál', 'dónde', 'cuándo', 'por qué'], matchMode: 'contains', type: 'spanish', intent: 'question' },
	];

	/**
	 * Classifies a message into an intent and a routing type without calling a model.
	 *
	 * Messages of 1-20 characters are always `short`/`fast`. Longer ones go through
	 * INTENT_RULES in order; anything unmatched (including empty input) is `general`/`spanish`.
	 *
	 * Fix: `startsWith` keywords now have to be a whole leading word. Previously "hi"
	 * matched "historia ..." or "hice ..." and classified them as greetings. Elongated
	 * greetings such as "holaaa" or "heyyy" are still accepted.
	 *
	 * @param {string|null|undefined} message - Raw user message.
	 * @returns {{intent: string, type: string}}
	 */
	export function classifyIntent(message) {
		const m = (message ?? '').trim().toLowerCase();

		if (m.length > 0 && m.length <= 20) {
			return { intent: 'short', type: 'fast' };
		}

		// Letters are the characters that have distinct lower/upper case forms.
		const isLetter = (ch) => ch !== undefined && ch.toLowerCase() !== ch.toUpperCase();
		const startsWithWord = (kw) => {
			if (!m.startsWith(kw)) return false;
			let end = kw.length;
			const lastChar = kw[kw.length - 1];
			while (m[end] === lastChar) end++; // tolerate "holaaa", "heyyy"
			return !isLetter(m[end]);
		};
		const contains = (kw) => m.includes(kw);

		for (const rule of INTENT_RULES) {
			let matcher = null;
			if (rule.matchMode === 'startsWith') matcher = startsWithWord;
			else if (rule.matchMode === 'contains') matcher = contains;
			if (matcher && rule.keywords.some(matcher)) {
				return { intent: rule.intent, type: rule.type };
			}
		}
		return { intent: 'general', type: 'spanish' };
	}

	// ── Gemini Rotator ────────────────────────────────────────────────────────────
	/**
	 * Round-robin selector over the configured Gemini API keys.
	 *  - `get()` prefers keys that are not marked failed and were last handed out more
	 *    than 65 s ago; when none qualifies it clears the failed set and returns the next
	 *    key regardless of cooldown (undefined when no key is configured).
	 *  - `fail(key)` marks a key as failed until the next such reset.
	 */
	const geminiRotator = (() => {
		const REUSE_DELAY_MS = 65000;
		const keys = ai.gemini?.keys ?? [ai.gemini?.apiKey].filter(Boolean);
		const failed = new Set();
		const last = new Map(); // key → timestamp (ms) it was last handed out
		let cursor = 0;

		const isUsable = (candidate) => {
			if (failed.has(candidate)) return false;
			const lastUsed = last.get(candidate) ?? 0;
			return Date.now() - lastUsed > REUSE_DELAY_MS;
		};

		function get() {
			const available = keys.filter(isUsable);
			const slot = cursor;
			cursor += 1;
			if (available.length === 0) {
				failed.clear();
				return keys[slot % keys.length];
			}
			const chosen = available[slot % available.length];
			last.set(chosen, Date.now());
			return chosen;
		}

		function fail(k) {
			failed.add(k);
		}

		return { get, fail };
	})();

	// ── Función base OpenAI-compatible ───────────────────────────────────────────
	// v9: Handles GLM-5.1 reasoning_content (separate from content)
	/**
	 * Sends one non-streaming chat-completion request to an OpenAI-compatible endpoint
	 * and returns the assistant text with thinking artifacts removed.
	 *
	 * @param {string} url - Full chat-completions endpoint.
	 * @param {string} key - Bearer token.
	 * @param {string} model - Model id sent in the request body.
	 * @param {Array<{role: string, content: string}>} messages - Conversation.
	 * @param {number} maxTokens - Sent as `max_tokens`.
	 * @param {string} [systemOverride] - When truthy, replaces every system message
	 * with a single system message carrying this text.
	 * @returns {Promise<string>} Cleaned reply; '' when the response has no message.
	 * @throws {Error} On a non-2xx status, with `status` and `code` set to the HTTP status.
	 * The request is aborted after 15 s.
	 */
	async function oai(url, key, model, messages, maxTokens, systemOverride) {
		let outgoing = messages;
		if (systemOverride) {
			const nonSystem = messages.filter((m) => m.role !== 'system');
			outgoing = [{ role: 'system', content: systemOverride }, ...nonSystem];
		}

		const init = {
			method: 'POST',
			headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${key}` },
			body: JSON.stringify({ model, messages: outgoing, max_tokens: maxTokens, stream: false }),
			signal: AbortSignal.timeout(15000),
		};
		const res = await fetch(url, init);

		if (!res.ok) {
			// Include the start of the error body for diagnostics; tolerate an unreadable body.
			const body = await res.text().catch(() => '');
			const failure = new Error(`${model} ${res.status}: ${body.slice(0, 100)}`);
			throw Object.assign(failure, { status: res.status, code: res.status });
		}

		const data = await res.json();
		const msg = data.choices?.[0]?.message;
		if (!msg) return '';

		const rawContent = msg.content?.trim() ?? '';

		// Some models return their reasoning in a separate field; it is only logged.
		if (msg.reasoning_content) {
			console.log(`[AI] ${model}: reasoning_content present (${msg.reasoning_content.length} chars thinking, ${rawContent.length} chars content)`);
		}

		// Always clean: thinking can leak into `content` whether or not
		// `reasoning_content` is present.
		return cleanThinkingArtifacts(rawContent);
	}

	/**
	 * Tells whether an error represents a timeout or an aborted request.
	 * Fix: the conditions were joined with `\|\|` (escaped pipes), a syntax error.
	 *
	 * @param {*} e - Error-like value; null/undefined are accepted.
	 * @returns {boolean} true for `TimeoutError`/`AbortError` names or a message containing "timeout".
	 */
	function isTO(e) {
		if (e?.name === 'TimeoutError' || e?.name === 'AbortError') return true;
		return /timeout/i.test(e?.message ?? '');
	}

	// ── callDirect: implementación de cada proveedor ─────────────────────────────
	async function callDirect(name, messages, maxTokens) {
	const PERM = [401, 402, 403, 404, 422];

	switch (name) {
	// ── GLM (ZhipuAI) — proveedor primario, gratis/ilimitado ──────────────
	case 'glm51': {
	if (!glm.apiKey) throw new Error('glm51: sin key GLM');
	return oai(glm.baseUrl ?? 'https://open.bigmodel.cn/api/paas/v4/chat/completions',
	glm.apiKey, 'glm-5.1', messages, maxTokens);
	}

	case 'glmAir': {
	if (!glm.apiKey) throw new Error('glmAir: sin key GLM');
	return oai(glm.baseUrl ?? 'https://open.bigmodel.cn/api/paas/v4/chat/completions',
	glm.apiKey, 'glm-4-air', messages, maxTokens);
	}

	case 'glmFlash': {
	if (!glm.apiKey) throw new Error('glmFlash: sin key GLM');
	return oai(glm.baseUrl ?? 'https://open.bigmodel.cn/api/paas/v4/chat/completions',
	glm.apiKey, 'glm-4-flash', messages, maxTokens);
	}

	case 'pollinations': {
	// Pollinations — free, unlimited, updated API endpoints
	const last = messages.filter(m => m.role === 'user').pop()?.content ?? '';
	const sys = messages.find(m => m.role === 'system')?.content ?? '';

	// FIX: Try models in PARALLEL with Promise.any instead of sequentially
	const POLL_MODELS = ['openai', 'mistral', 'llama', 'qwen'];
	try {
	const result = await Promise.any(
	POLL_MODELS.map(model =>
	fetch('https://text.pollinations.ai/openai/chat/completions', {
	method : 'POST',
	headers: { 'Content-Type': 'application/json' },
	body : JSON.stringify({
	model,
	messages,
	max_tokens: maxTokens,
	stream : false,
	seed : Math.floor(Math.random() * 9999),
	}),
	signal: AbortSignal.timeout(12000),
	}).then(async res => {
	if (!res.ok) throw new Error(`${model} ${res.status}`);
	const data = await res.json();
	const text = data.choices?.[0]?.message?.content?.trim();
	if (!text \|\| text.length <= 2) throw new Error(`${model} empty`);
	return text;
	})
	)
	);
	if (result) return result;
	} catch { /* all parallel attempts failed */ }

	// Fallback: GET endpoint (anonymous, always works)
	try {
	const shortMsg = last.slice(0, 800);
	const shortSys = sys.slice(0, 400);
	const url = 'https://text.pollinations.ai/' +
	encodeURIComponent(shortMsg) +
	'?seed=' + Math.floor(Math.random() * 9999) +
	(shortSys ? '&system=' + encodeURIComponent(shortSys) : '');
	const r2 = await fetch(url, {
	headers: { 'User-Agent': 'Mozilla/5.0' },
	signal : AbortSignal.timeout(12000),
	});
	if (r2.ok) {
	const t = await r2.text();
	if (t?.trim() && t.trim().length > 2) return t.trim();
	}
	} catch {}

	throw new Error('pollinations all endpoints failed');
	}

	case 'cerebras':
	if (!ai.cerebras?.apiKey) throw new Error('cerebras: sin key');
	// TORNEO #2: Qwen3-235B-A22B — ultra-fast reasoning, excellent Spanish
	// Cerebras runs at ~2600 tokens/sec, fastest inference available
	return oai(ai.cerebras.baseUrl, ai.cerebras.apiKey,
	'qwen3-235b-a22b', messages, maxTokens);

	// TORNEO #1: Qwen3-32B — best Spanish + speed + thinking
	case 'groqQwen3':
	if (!ai.groq?.apiKey) throw new Error('groqQwen3: sin key');
	return oai(ai.groq.baseUrl, ai.groq.apiKey,
	'qwen/qwen3-32b', messages, maxTokens);

	case 'groq':
	if (!ai.groq?.apiKey) throw new Error('groq: sin key');
	// TORNEO #5: Llama-4-Scout — fast multilingual, official Spanish
	return oai(ai.groq.baseUrl, ai.groq.apiKey,
	getBestModel('groq', 'llama-4-scout-17b-16e-instruct', 'large'), messages, maxTokens);

	case 'groqFast':
	if (!ai.groq?.apiKey) throw new Error('groqFast: sin key');
	return oai(ai.groq.baseUrl, ai.groq.apiKey,
	getBestModel('groq', 'llama-3.1-8b-instant', 'fast'), messages, maxTokens);

	case 'groqKimi':
	if (!ai.groq?.apiKey) throw new Error('groqKimi: sin key');
	return oai(ai.groq.baseUrl, ai.groq.apiKey, 'moonshotai/kimi-k2-instruct', messages, maxTokens);

	case 'mistral':
	if (!ai.mistral?.apiKey) throw new Error('mistral: sin key');
	return oai(ai.mistral.baseUrl, ai.mistral.apiKey,
	getBestModel('mistral', 'mistral-large-latest', 'large'), messages, maxTokens);

	case 'mistralCode':
	if (!ai.mistral?.apiKey) throw new Error('mistralCode: sin key');
	return oai(ai.mistral.baseUrl, ai.mistral.apiKey, 'codestral-latest', messages, maxTokens);

	case 'gemma4': {
	const entry = geminiRotator.get();
	if (!entry) throw new Error('gemma4: sin key Gemini');

	// Extraer system prompt — Gemini lo necesita como systemInstruction separado
	const sysMsgs = messages.filter(m => m.role === 'system');
	const sysText = sysMsgs.map(m => m.content ?? '').join('\n\n');
	const chatMsgs = messages.filter(m => m.role !== 'system');

	// Convertir a formato Gemini (assistant → model)
	const contents = chatMsgs.map(m => ({
	role : m.role === 'assistant' ? 'model' : 'user',
	parts: [{ text: m.content ?? '' }],
	}));

	// Si no hay mensajes de chat, añadir uno vacío para que no falle
	if (!contents.length) contents.push({ role: 'user', parts: [{ text: '.' }] });

	const body = {
	contents,
	generationConfig: {
	maxOutputTokens: maxTokens,
	temperature: 0.8,
	topP: 0.95,
	},
	};
	// Inyectar system prompt como systemInstruction (soportado por Gemini API)
	if (sysText) {
	body.systemInstruction = { parts: [{ text: sysText }] };
	}

	// Gemma 4 31B IT — mejor modelo Gemma para chat en español, via Gemini API
	const res = await fetch(
	`https://generativelanguage.googleapis.com/v1beta/models/gemma-4-31b-it:generateContent?key=${entry}`,
	{
	method : 'POST',
	headers: { 'Content-Type': 'application/json' },
	body : JSON.stringify(body),
	signal : AbortSignal.timeout(15000),
	}
	);
	if (!res.ok) { geminiRotator.fail(entry); throw Object.assign(new Error(`gemma4 ${res.status}`), { status: res.status, code: res.status }); }
	const data = await res.json();
	return data.candidates?.[0]?.content?.parts?.[0]?.text?.trim() ?? '';
	}

	case 'openrouter': {
	if (!ai.openrouter?.apiKey) throw new Error('openrouter: sin key');
	const model = 'meta-llama/llama-3.3-70b-instruct:free';
	const res = await fetch(ai.openrouter.baseUrl, {
	method : 'POST',
	headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${ai.openrouter.apiKey}`, 'HTTP-Referer': 'https://tomatesmp.pw', 'X-Title': 'TomateSMP' },
	body : JSON.stringify({ model, messages, max_tokens: maxTokens }),
	signal : AbortSignal.timeout(20000),
	});
	if (!res.ok) throw Object.assign(new Error(`openrouter ${res.status}`), { status: res.status, code: res.status });
	const data = await res.json();
	if (data.error) throw new Error(`openrouter: ${data.error.message ?? data.error}`);
	return data.choices?.[0]?.message?.content?.trim() ?? '';
	}

	case 'openrouterR1': {
	if (!ai.openrouter?.apiKey) throw new Error('openrouterR1: sin key');
	const res = await fetch(ai.openrouter.baseUrl, {
	method : 'POST',
	headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${ai.openrouter.apiKey}`, 'HTTP-Referer': 'https://tomatesmp.pw' },
	body : JSON.stringify({ model: 'deepseek/deepseek-r1:free', messages, max_tokens: maxTokens }),
	signal : AbortSignal.timeout(30000),
	});
	if (!res.ok) throw Object.assign(new Error(`openrouterR1 ${res.status}`), { status: res.status, code: res.status });
	const data = await res.json();
	return data.choices?.[0]?.message?.content?.trim() ?? '';
	}

	case 'cloudflare': {
	if (!ai.cloudflare?.accountId \|\| !ai.cloudflare?.apiToken) throw new Error('cloudflare: sin config');
	const res = await fetch(
	`https://api.cloudflare.com/client/v4/accounts/${ai.cloudflare.accountId}/ai/run/@cf/meta/llama-3.3-70b-instruct-fp8-fast`,
	{
	method : 'POST',
	headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${ai.cloudflare.apiToken}` },
	body : JSON.stringify({ messages, max_tokens: maxTokens }),
	signal : AbortSignal.timeout(20000),
	}
	);
	if (!res.ok) throw Object.assign(new Error(`cloudflare ${res.status}`), { status: res.status, code: res.status });
	const data = await res.json();
	// Cloudflare AI devuelve distintos formatos según el modelo
	const r = data.result;
	if (typeof r?.response === 'string') return r.response.trim();
	if (Array.isArray(r) && r[0]?.response) return String(r[0].response).trim();
	if (r?.choices?.[0]?.message?.content) return r.choices[0].message.content.trim();
	if (typeof r === 'string') return r.trim();
	return '';
	}

	// ── HuggingFace Inference API providers ───────────────────────────────
	case 'hfFast':
	if (!isHFAvailable()) throw new Error('hfFast: HF unavailable');
	return callHuggingFace(messages, 'fast', maxTokens);

	case 'hfSmart':
	if (!isHFAvailable()) throw new Error('hfSmart: HF unavailable');
	return callHuggingFaceCascade(messages, 'smart', maxTokens);

	case 'hfSpanish':
	if (!isHFAvailable()) throw new Error('hfSpanish: HF unavailable');
	return callHuggingFaceCascade(messages, 'spanish', maxTokens);

	case 'hfCode':
	if (!isHFAvailable()) throw new Error('hfCode: HF unavailable');
	return callHuggingFace(messages, 'code', maxTokens);

	case 'hfReasoning':
	if (!isHFAvailable()) throw new Error('hfReasoning: HF unavailable');
	return callHuggingFaceCascade(messages, 'reasoning', maxTokens);

	case 'hfFallback':
	if (!isHFAvailable()) throw new Error('hfFallback: HF unavailable');
	return callHuggingFace(messages, 'fast', maxTokens);

	// ── HIVE — RigoChat-7B Cluster (ALL workers combined) ────────────────
	case 'hive': {
	try {
	const userMsg = messages.filter(m => m.role === 'user').pop()?.content ?? '';
	const result = await hiveGenerate(messages, maxTokens, userMsg, 'hybrid');
	if (result?.text && result.text.trim().length > 2) {
	return result.text;
	}
	throw new Error('hive: empty response');
	} catch (err) {
	throw Object.assign(new Error(`hive: ${err.message}`), { status: 503, code: 503 });
	}
	}

	// ── Local model — primary local inference ────────────────────────────
	case 'local': {
	if (!isLocalAIReady()) throw new Error('local: model not ready');
	const { localChatPrimary } = await import('./local-ai.js');
	return localChatPrimary(messages, maxTokens, 0.8);
	}

	default:
	throw new Error(`Proveedor desconocido: ${name}`);
	}
	}

	// ── callProvider: CB + backoff + stats ────────────────────────────────────────
	// Status codes that disable a provider for 24h instead of counting as an
	// ordinary circuit-breaker failure.
	const PERM_ERRORS = [401, 402, 403, 404, 422];

	/**
	 * Calls a single provider through its circuit breaker, with retry on rate
	 * limits, a one-shot context shrink on "too long" errors, and stats recording.
	 *
	 * @param {string} name - Provider key; indexes `breakers` and is passed to `callDirect`.
	 * @param {Array<{role: string, content: *}>} messages - Chat messages (never mutated here).
	 * @param {number} maxTokens - Token budget forwarded to `callDirect`.
	 * @param {{v: boolean}} [failedFlag] - Shared flag: the first non-permanent failure
	 *   sets `v` and is recorded on the breaker; later failures seen with `v` already
	 *   set are not recorded on the breaker.
	 * @returns {Promise<*>} Whatever `callDirect` resolves with.
	 * @throws {Error} When the breaker refuses the request, or the last error from `callDirect`.
	 */
	async function callProvider(name, messages, maxTokens, failedFlag) {
		checkDailyReset();
		const cb = breakers[name];
		if (!cb?.canRequest()) throw new Error(`${name}: circuit OPEN`);

		const start = Date.now();
		recordReq(name);

		try {
			let result;
			// Work on a local reference so the caller's array/parameter is left untouched.
			let payload = messages;
			for (let attempt = 0; attempt < 3; attempt++) {
				try {
					result = await callDirect(name, payload, maxTokens);
					break;
				} catch (e) {
					// 429: wait 1.5–4.5 s (jittered) and retry, up to two retries.
					if (e.status === 429 && attempt < 2) {
						await new Promise(r => setTimeout(r, 1500 + Math.random() * 3000));
						continue;
					}
					// 400 whose message hints at a length problem: shrink the context and retry ONCE.
					// The alternation must use bare `|`; an escaped `\|` would only match a literal pipe.
					if (e.status === 400 && attempt === 0 &&
						/length|too long|reduce|token/i.test(e.message ?? '')) {
						console.warn('[AI] ' + name + ': mensaje muy largo, comprimiendo...');
						// Keep the system messages (trimmed to 1500 chars each) plus the last 2 other messages.
						const shortSys = payload
							.filter(m => m.role === 'system')
							.map(m => (typeof m.content === 'string'
								? { ...m, content: m.content.slice(0, 1500) }
								: m));
						const rest = payload.filter(m => m.role !== 'system').slice(-2);
						payload = [...shortSys, ...rest];
						continue;
					}
					throw e;
				}
			}
			cb.recordSuccess();
			recordLatency(name, Date.now() - start);
			return result;
		} catch (err) {
			const code = err.status ?? err.code ?? null;
			const isPermanent = PERM_ERRORS.includes(code);
			if (isPermanent) {
				// Force the breaker open and push lastFail 24h into the future.
				cb.state = 'OPEN'; cb.openCount = 99; cb.lastFail = Date.now() + 86_400_000;
				console.warn(`[AI] ⛔ ${name} deshabilitado 24h (${code})`);
			} else if (!failedFlag?.v) {
				if (failedFlag) failedFlag.v = true;
				cb.recordFailure(code, isTO(err));
			}
			recordError(name);
			if (!isPermanent) {
				console.warn(`[AI] ❌ ${name}: ${err.message?.slice(0, 80)}`);
			}
			throw err;
		}
	}

	// ── Response validation — deterministic string checks ────────────────────────
	// v10: replaces the old BAD_PATTERNS regex list with deterministic checks
	// (prefix and set lookups, plus two small regexes for punctuation stripping and
	// word splitting), intended to be more precise and easier to maintain.

	// AI self-disclosure phrases; a valid reply must never START with one of these.
	// Entries are lowercase because they are compared against the lowercased text.
	const AI_DISCLAIMER_PREFIXES = [
		'as an ai',
		'as a language model',
		'como una ia',
		'como un modelo',
		"i'm an ai",
		'i am an ai',
		"i'm a language model",
		'i am a language model',
		'como inteligencia artificial',
		'como modelo de lenguaje',
		'as an assistant',
		'como asistente',
	];

	// Whole-reply junk values that indicate the model failed to produce real text.
	const GARBAGE_RESPONSES = new Set([
		'error', 'null', 'undefined', 'true', 'false', '{}', '[]',
		'nan', 'none', 'nil', 'void',
	]);

	/**
	 * Deterministic sanity check for a model reply.
	 *
	 * Rejects: non-strings, replies shorter than 2 chars after trimming, replies made
	 * only of whitespace/punctuation, known junk literals, replies longer than 2000
	 * chars, replies starting with an AI disclaimer, and replies where one word
	 * longer than 2 chars appears more than 10 times.
	 *
	 * @param {*} text - Candidate reply.
	 * @returns {boolean} `true` when the reply passes every check.
	 */
	function isValidResponse(text) {
		if (!text || typeof text !== 'string') return false;
		const t = text.trim();

		// Empty or effectively empty.
		if (t.length < 2) return false;

		// Nothing left once whitespace and common punctuation are removed.
		const withoutPunct = t.replace(/[\s.,!?_\-:;'"()]/g, '');
		if (withoutPunct.length === 0) return false;

		const tLower = t.toLowerCase();

		// The whole reply is a single junk literal.
		if (GARBAGE_RESPONSES.has(tLower)) return false;

		// Too long: treated as prompt recitation.
		if (t.length > 2000) return false;

		// Starts with an AI disclaimer.
		if (AI_DISCLAIMER_PREFIXES.some(prefix => tLower.startsWith(prefix))) return false;

		// Repetition loop: the same word (longer than 2 chars) more than 10 times.
		// A Map is used so words such as "constructor" or "__proto__" are counted like
		// any other word instead of colliding with Object.prototype members.
		const words = tLower.split(/\s+/);
		if (words.length > 10) {
			const wordCounts = new Map();
			for (const w of words) {
				const count = (wordCounts.get(w) ?? 0) + 1;
				wordCounts.set(w, count);
				if (count > 10 && w.length > 2) return false;
			}
		}

		return true;
	}

	/**
	 * Substring check for AI-disclosure / refusal phrases anywhere in a reply.
	 *
	 * Despite the name, no model is called: this is a plain case-insensitive
	 * `includes` scan. It is `async` only so callers can `await` it.
	 *
	 * @param {*} text - Candidate reply; non-strings and strings shorter than 10 chars are accepted as-is.
	 * @param {string} [userMsg] - Currently unused.
	 * @returns {Promise<{valid: boolean, reason?: string, replacement?: string}>}
	 *   `{ valid: true }`, or `{ valid: false, reason: 'ai_disclaimer', replacement: 'eso no va' }`
	 *   when a disclosure phrase is found.
	 */
	async function aiValidateResponse(text, userMsg) {
		// Too short to bother (also covers null/undefined and non-string input,
		// which would otherwise crash on toLowerCase()).
		if (typeof text !== 'string' || text.length < 10) return { valid: true };

		const tLower = text.toLowerCase().trim();

		// Disclosure phrases that may appear mid-reply, not just as a prefix.
		const aiPatterns = [
			'como ia,',
			'como inteligencia artificial,',
			'as an ai,',
			'as a language model,',
			'i cannot fulfill',
			'no puedo cumplir esa solicitud',
			'no puedo procesar esa solicitud',
		];
		if (aiPatterns.some(p => tLower.includes(p))) {
			return { valid: false, reason: 'ai_disclaimer', replacement: 'eso no va' };
		}

		return { valid: true };
	}

	// ── Router principal ──────────────────────────────────────────────────────────
	/**
	 * Safety valve for the router: if the breaker of every known provider
	 * (ALL + HF + GLM lists) reports state 'OPEN', force them all back to
	 * 'CLOSED' and zero their `openCount` so routing can try again.
	 */
	function ensureAvailability() {
		const providers = [...ALL_PROVIDERS, ...HF_PROVIDERS, ...GLM_PROVIDERS];

		const tripped = [];
		for (const id of providers) {
			if (breakers[id]?.state === 'OPEN') tripped.push(id);
		}

		// At least one provider is not OPEN: nothing to do.
		if (tripped.length !== providers.length) return;

		console.warn('[AI] Todos los proveedores en OPEN — reseteando');
		for (const id of tripped) {
			breakers[id].state = 'CLOSED';
			breakers[id].openCount = 0;
		}
	}

	/**
	 * Main router: tries the providers of a cascade one at a time, best first.
	 *
	 * The cascade for `taskType` (falling back to `CASCADES.spanish`) is sorted by
	 * `healthScore` descending, with 0.3 subtracted for providers flagged by
	 * `predictExhaustion`, then filtered to those whose breaker accepts requests
	 * and whose quota is OK. The first reply that passes `isValidResponse` wins.
	 * Providers are tried sequentially, never in parallel. If nothing works, the
	 * local model is used through `emergencyFallback` when it is ready.
	 *
	 * @param {Array<object>} messages - Chat messages sent to each provider.
	 * @param {string} taskType - Key into `CASCADES`.
	 * @param {number} maxTokens - Token budget per provider call.
	 * @param {string} userMsg - Forwarded to `emergencyFallback`.
	 * @returns {Promise<*>} The first valid provider reply, or a truthy fallback reply.
	 * @throws {Error} When no provider is available or every attempt fails.
	 */
	async function _route(messages, taskType, maxTokens, userMsg) {
		ensureAvailability();

		const ranked = [...(CASCADES[taskType] ?? CASCADES.spanish)];
		ranked.sort((a, b) => {
			const penaltyA = predictExhaustion(a) ? -0.3 : 0;
			const penaltyB = predictExhaustion(b) ? -0.3 : 0;
			return (healthScore(b) + penaltyB) - (healthScore(a) + penaltyA);
		});

		const usable = ranked.filter(n => breakers[n]?.canRequest() && isQuotaOk(n));

		if (usable.length === 0) {
			// Nothing to call remotely: go straight to the local model.
			if (isLocalAIReady()) {
				const localReply = await emergencyFallback(messages, userMsg).catch(() => null);
				if (localReply) return localReply;
			}
			throw new Error('[AI] Sin proveedores disponibles');
		}

		// One flag shared by the whole cascade; callProvider reads and sets it.
		const failedFlag = { v: false };

		for (const provider of usable) {
			try {
				const reply = await callProvider(provider, messages, maxTokens, failedFlag);
				if (isValidResponse(reply)) return reply;
			} catch {
				continue;
			}
		}

		// Last resort: local model.
		if (!isLocalAIReady()) {
			console.warn('[AI] Modelo local no disponible para fallback (isLocalAIReady=false)');
		} else {
			console.log('[AI] 🏠 Usando modelo local como fallback (todos los proveedores fallaron)');
			const localReply = await emergencyFallback(messages, userMsg).catch(e => {
				console.warn('[AI] Modelo local también falló:', e.message);
				return null;
			});
			if (localReply) return localReply;
		}

		throw new Error('[AI] Todos los proveedores fallaron');
	}

	/**
	 * Router variant used in owner mode: same cascade walk as `_route`, but sorted
	 * by plain `healthScore` and accepting any non-blank reply (no `isValidResponse`).
	 *
	 * @param {Array<object>} messages - Chat messages sent to each provider.
	 * @param {string} taskType - Key into `CASCADES` (falls back to `CASCADES.spanish`).
	 * @param {number} maxTokens - Token budget per provider call.
	 * @returns {Promise<*>} The first non-blank provider reply, or a truthy local-model reply.
	 * @throws {Error} 'Sin proveedores' when no provider is available and the local
	 *   model yields nothing; '[AI] Sin respuesta' when every attempt fails.
	 */
	async function _routeOwner(messages, taskType, maxTokens) {
		ensureAvailability();
		const cascade = [...(CASCADES[taskType] ?? CASCADES.spanish)];
		cascade.sort((a, b) => healthScore(b) - healthScore(a));
		const available = cascade.filter(n => breakers[n]?.canRequest() && isQuotaOk(n));

		if (!available.length && isLocalAIReady()) {
			// emergencyFallback returns a promise, so `?? Promise.reject(...)` could never
			// trigger; await the value to actually detect an empty fallback.
			const local = await emergencyFallback(messages, '');
			if (local) return local;
			throw new Error('Sin proveedores');
		}

		for (const name of available) {
			try {
				const r = await callProvider(name, messages, maxTokens, { v: false });
				if (r && r.trim().length > 0) return r;
			} catch { continue; }
		}

		if (isLocalAIReady()) {
			const local = await emergencyFallback(messages, '');
			// An empty local reply falls through to the error below instead of being
			// handed to the caller as a "successful" null.
			if (local) return local;
		}
		throw new Error('[AI] Sin respuesta');
	}

	// ── callAI ────────────────────────────────────────────────────────────────────
	/**
	 * Public entry point for user-facing completions.
	 *
	 * Flow: derive the user prompt (last 'user' message when `userMsg` is empty),
	 * size the token budget, pass the messages through `injectThinking` and
	 * `compressContext`, then either
	 *  - owner mode: route via `_routeOwner` on the user queue, skipping both caches
	 *    and single-flight, or
	 *  - normal mode: exact cache → semantic cache (unless `feats.semanticCache` is
	 *    `false`) → `_route` on the user queue, de-duplicated by `singleFlight` on
	 *    the cache key; the cleaned result is written to both caches.
	 * Every routed result goes through `cleanThinkingArtifacts`; cache hits are
	 * returned as stored.
	 *
	 * NOTE(review): `injectThinking` is defined outside this section and the file
	 * header says thinking injection was removed — confirm what it does now.
	 *
	 * @param {Array<object>} messages - Chat messages.
	 * @param {string} [taskType='spanish'] - Cascade / cache key selector.
	 * @param {?number} [maxTokens=null] - Forwarded to `adaptiveTokens`.
	 * @param {string} [userMsg=''] - User prompt used for token sizing and caches.
	 * @param {boolean} [ownerMode=false] - Use the owner router and bypass caches.
	 * @returns {Promise<*>} The reply (cached or freshly routed).
	 */
	export async function callAI(messages, taskType = 'spanish', maxTokens = null, userMsg = '', ownerMode = false) {
		const prompt = userMsg || (messages.filter(m => m.role === 'user').pop()?.content ?? '');
		const tokenBudget = adaptiveTokens(prompt, taskType, maxTokens);
		const prepared = await compressContext(injectThinking(messages));
		// Computed before the owner branch so it is always defined.
		const cacheKey = _cache.key(prepared, taskType);

		if (ownerMode) {
			const ownerJob = async () => {
				const raw = await _routeOwner(prepared, taskType, tokenBudget);
				return cleanThinkingArtifacts(raw);
			};
			return withQueue(_userQueue, ownerJob);
		}

		// Exact cache.
		const exactHit = _cache.get(cacheKey);
		if (exactHit) {
			console.log('[AI] 💾 cache hit');
			return exactHit;
		}

		// Semantic cache (best effort: lookup errors are ignored).
		if (feats.semanticCache !== false) {
			try {
				const semanticHit = await semanticCache.get(prompt);
				if (semanticHit) {
					console.log('[AI] 💾 semantic cache hit');
					return semanticHit;
				}
			} catch {}
		}

		const routedJob = async () => {
			const raw = await _route(prepared, taskType, tokenBudget, prompt);
			const reply = cleanThinkingArtifacts(raw);
			_cache.set(cacheKey, reply, prompt);
			// Fire-and-forget: a failed semantic write must not fail the request.
			semanticCache.set(prompt, reply, semanticCache.getTTL(prompt)).catch(() => {});
			return reply;
		};
		return singleFlight(cacheKey, () => withQueue(_userQueue, routedJob));
	}

	// ── callAIBackground ─────────────────────────────────────────────────────────
	/**
	 * Completion entry point for background work: no caching, runs on the
	 * background queue, and always routes with the 'background' cascade.
	 * `taskType` is only used to size the token budget via `adaptiveTokens`.
	 *
	 * @param {Array<object>} messages - Chat messages.
	 * @param {string} [taskType='spanish'] - Passed to `adaptiveTokens` only.
	 * @param {?number} [maxTokens=null] - Forwarded to `adaptiveTokens`.
	 * @param {string} [userMsg=''] - User prompt; defaults to the last 'user' message.
	 * @returns {Promise<*>} The routed reply after `cleanThinkingArtifacts`.
	 */
	export async function callAIBackground(messages, taskType = 'spanish', maxTokens = null, userMsg = '') {
		const prompt = userMsg || (messages.filter(m => m.role === 'user').pop()?.content ?? '');
		const tokenBudget = adaptiveTokens(prompt, taskType, maxTokens);
		const prepared = await compressContext(injectThinking(messages));

		const job = async () => {
			const raw = await _route(prepared, 'background', tokenBudget, prompt);
			return cleanThinkingArtifacts(raw);
		};
		return withQueue(_backgroundQueue, job);
	}

	// ── Mixture of Agents ─────────────────────────────────────────────────────────
	/**
	 * Mixture of Agents: asks three providers in parallel (300 tokens each), keeps
	 * the fulfilled answers longer than 10 characters, and has 'glm51' synthesize
	 * them into one reply. Falls back to a plain `callAI` when `feats.moa` is off
	 * or no proposal survives.
	 *
	 * @param {Array<object>} messages - Chat messages sent to each proposer.
	 * @param {number} [maxTokens=600] - Budget for the synthesis (or fallback) call.
	 * @returns {Promise<*>} The synthesized reply, or the `callAI` fallback reply.
	 */
	export async function callAIMoA(messages, maxTokens = 600) {
		const plainCall = () => callAI(messages, 'spanish', maxTokens);
		if (!feats.moa) return plainCall();

		const proposers = ['glm51', 'groqFast', 'pollinations'];
		const outcomes = await Promise.allSettled(
			proposers.map(provider => callProvider(provider, messages, 300, { v: false }))
		);

		const accepted = [];
		for (const outcome of outcomes) {
			if (outcome.status === 'fulfilled' && outcome.value?.length > 10) {
				accepted.push(outcome.value);
			}
		}
		// Numbering follows the surviving answers, not the original provider order.
		const proposals = accepted
			.map((text, idx) => `Respuesta ${idx + 1}: ${text}`)
			.join('\n\n');
		if (!proposals) return plainCall();

		const synthesisPrompt = [
			{ role: 'system', content: 'Sintetiza la mejor respuesta de las siguientes opciones. Sin texto extra.' },
			{ role: 'user', content: `${proposals}\n\nSíntesis:` },
		];
		return callProvider('glm51', synthesisPrompt, maxTokens, { v: false });
	}

	// ── Self-healing ──────────────────────────────────────────────────────────────
	/**
	 * Starts a 5-minute timer that watches global quota usage. No-op unless
	 * `feats.selfHealing` is set.
	 *
	 * Each tick averages `requests / dailyLimit` over every known provider
	 * (providers without a limit count as 0) and only logs a warning when the
	 * average exceeds 0.85; it does not change any routing state itself.
	 */
	export function startSelfHealing() {
		if (!feats.selfHealing) return;

		const checkQuota = () => {
			const providers = [...ALL_PROVIDERS, ...HF_PROVIDERS, ...GLM_PROVIDERS];

			let totalReq = 0;
			for (const n of providers) totalReq += getCounter(n).req;
			// Nothing has been requested yet: skip the check.
			if (totalReq === 0) return;

			let usageSum = 0;
			for (const n of providers) {
				const lim = DAILY_LIMITS[n];
				usageSum += (lim ? getCounter(n).req / lim : 0);
			}
			const avgQuota = usageSum / providers.length;
			if (avgQuota > 0.85) {
				console.warn('[SelfHeal] Cuota global alta — priorizar proveedores con más quota');
			}
		};

		setInterval(checkQuota, 5 * 60 * 1000);
		console.log('[HEAL] Self-healing v7 iniciado');
	}

	// ── Warmup ────────────────────────────────────────────────────────────────────
	/**
	 * Sends a tiny 5-token probe to a fixed list of providers in parallel and
	 * logs the latency or the error of each. No-op unless `feats.warmup` is set.
	 * Individual failures are logged and never propagate.
	 *
	 * @returns {Promise<void>}
	 */
	export async function warmupProviders() {
		if (!feats.warmup) return;

		const probe = [{ role: 'user', content: 'ok' }];
		const targets = ['glm51', 'glmFlash', 'groqFast', 'cerebras', 'pollinations', 'mistral'];
		console.log('[WARMUP] Verificando proveedores...');

		const ping = async (name) => {
			const start = Date.now();
			try {
				await callProvider(name, probe, 5, { v: false });
				console.log(`[WARMUP] ✅ ${name} ${Date.now() - start}ms`);
			} catch (e) {
				console.log(`[WARMUP] ❌ ${name}: ${e.message?.slice(0, 40)}`);
			}
		};

		await Promise.allSettled(targets.map(name => ping(name)));
	}

	// ── Comprimir contexto ───────────────────────────────────────────────────────
	/**
	 * Shrinks a message list before it is sent to a provider.
	 *
	 * - System messages come first in the result. A string system prompt is
	 *   compressed to its first 3500 chars + a marker + its last 1000 chars, but
	 *   only when that is actually shorter than the prompt. (Cutting any prompt
	 *   over 4000 chars made prompts just above that size LONGER, with the
	 *   overlapping head and tail duplicated.)
	 * - Non-string content (null, arrays, ...) is passed through untouched.
	 * - Only the 4 most recent non-system messages are kept.
	 *
	 * Input messages are never mutated; compressed ones are shallow copies.
	 *
	 * @param {Array<{role: string, content: *}>} messages - Chat messages.
	 * @returns {Promise<Array<{role: string, content: *}>>} System messages followed by recent chat.
	 */
	async function compressContext(messages) {
		const HEAD_CHARS = 3500; // start of the system prompt kept verbatim
		const TAIL_CHARS = 1000; // end of the system prompt kept verbatim
		const MARKER = '\n...(contexto comprimido)\n';
		const MAX_CHAT_MESSAGES = 4;

		const sys = messages.filter(m => m.role === 'system');
		const chat = messages.filter(m => m.role !== 'system');

		const compressedSys = sys.map(m => {
			const content = m.content;
			if (typeof content !== 'string') return m;
			// Skip when head + marker + tail would not be shorter than the original.
			if (content.length <= HEAD_CHARS + MARKER.length + TAIL_CHARS) return m;
			return {
				...m,
				content: content.slice(0, HEAD_CHARS) + MARKER + content.slice(-TAIL_CHARS),
			};
		});

		return [...compressedSys, ...chat.slice(-MAX_CHAT_MESSAGES)];
	}

	// ── Stats ─────────────────────────────────────────────────────────────────────
	/**
	 * Builds a per-provider usage report keyed by provider name.
	 *
	 * Each entry holds the request count, the daily limit (99999 when none is
	 * configured), the usage percentage as a string with one decimal, the breaker
	 * state ('?' when there is no breaker) and the health score with two decimals.
	 * HuggingFace-specific stats are attached under `_hf` when available.
	 *
	 * @returns {Object<string, object>} The stats report.
	 */
	export function getDailyStats() {
		const report = {};
		const providers = [...ALL_PROVIDERS, ...HF_PROVIDERS, ...GLM_PROVIDERS];

		for (const provider of providers) {
			const counter = getCounter(provider);
			const limit = DAILY_LIMITS[provider] ?? 99999;
			report[provider] = {
				requests: counter.req,
				limit,
				pct: `${((counter.req / limit) * 100).toFixed(1)}%`,
				state: breakers[provider]?.state ?? '?',
				score: healthScore(provider).toFixed(2),
			};
		}

		// HF stats are optional: a failure here must not break the report.
		try {
			report._hf = getHFStats();
		} catch {}

		return report;
	}

	/**
	 * Lists breaker state and health score for the ALL + GLM provider lists
	 * (HF providers are not included here).
	 *
	 * @returns {Array<{name: string, state: string, score: string}>} One entry per
	 *   provider; `state` is 'UNKNOWN' when the provider has no breaker.
	 */
	export function getProviderStatus() {
		const status = [];
		for (const name of [...ALL_PROVIDERS, ...GLM_PROVIDERS]) {
			status.push({
				name,
				state: breakers[name]?.state ?? 'UNKNOWN',
				score: healthScore(name).toFixed(2),
			});
		}
		return status;
	}

	/** Empties the exact-match response cache (`_cache`). */
	export function clearCache() {
		_cache.clear();
	}

	// ── HIVE exports ────────────────────────────────────────────────────────────
	export { getHiveStatus, getHiveStats, benchmarkHive, warmUpWorkers, hiveInit as initHive };

	/**
	 * Optional quality gate for a reply. Always resolves, never rejects.
	 *
	 * @param {*} response - Candidate reply.
	 * @param {string} [userMessage] - Currently unused.
	 * @returns {Promise<{pass: boolean, reason?: string}>}
	 *   `{ pass: true }` when `feats.qualityGate` is off;
	 *   `{ pass: false, reason: 'empty' }` for non-strings or replies shorter than
	 *   10 chars after trimming; otherwise `{ pass: isValidResponse(response) }`.
	 */
	export function runQualityGate(response, userMessage) {
		if (!feats.qualityGate) return Promise.resolve({ pass: true });
		// The typeof guard also covers null/undefined/'' and keeps a truthy
		// non-string from throwing synchronously on .trim().
		if (typeof response !== 'string' || response.trim().length < 10) {
			return Promise.resolve({ pass: false, reason: 'empty' });
		}
		return Promise.resolve({ pass: isValidResponse(response) });
	}