Spaces:

akra35567
/

akira

Running

App Files Files Community

akira / local_llm.py

akra35567

Upload 2 files

2ed1b2f verified about 1 month ago

raw

history blame contribute delete

29.1 kB

	# type: ignore
	"""
	modules/local_llm.py
	================================================================================
	FALLBACK LOCAL LLM - ÚLTIMA HIPÓTESE
	================================================================================
	Este módulo é usado SOMENTE quando TODAS as APIs externas falharem.
	Implementa um modelo local leve (TinyLlama ou equivalente) para respostas
	básicas em modo de emergência.

	Features:
	- Fallback final do sistema
	- Modelo pequeno (~1.5B parâmetros)
	- Respostas básicas em português/angolano
	- Não requer GPU
	================================================================================
	"""

	import os
	import re
	import time
	from typing import Optional, List, Dict, Any
	from datetime import datetime
	from .config import SYSTEM_PROMPT

	# Imports opcionais com fallbacks
	try:
	import torch # type: ignore
	TORCH_AVAILABLE = True
	except Exception:
	TORCH_AVAILABLE = False
	torch = None # type: ignore

	import requests # type: ignore
	try:
	from huggingface_hub import hf_hub_download, InferenceClient # type: ignore
	HUGGINGFACE_HUB_AVAILABLE = True
	except Exception:
	HUGGINGFACE_HUB_AVAILABLE = False
	hf_hub_download = None
	InferenceClient = None

	try:
	from llama_cpp import Llama # type: ignore
	LLAMA_CPP_AVAILABLE = True
	except Exception:
	LLAMA_CPP_AVAILABLE = False
	Llama = None # type: ignore

	try:
	    from loguru import logger  # type: ignore
	    LOGURU_AVAILABLE = True
	except Exception:
	    LOGURU_AVAILABLE = False

	    class DummyLogger:
	        """No-op stand-in used when loguru cannot be imported.

	        Exposes the logging methods this module calls and silently accepts
	        any positional and keyword arguments, mirroring how those methods
	        are invoked on the real logger.
	        """

	        def _noop(self, *args, **kwargs):
	            """Discard the log call."""
	            return None

	        # Every level shares the same do-nothing implementation.
	        info = success = warning = error = debug = _noop

	    logger = DummyLogger()  # type: ignore

	try:
	from cachetools import TTLCache # type: ignore
	CACHETOOLS_AVAILABLE = True
	except Exception:
	CACHETOOLS_AVAILABLE = False
	# Implementação simples de cache fallback
	class TTLCache(dict):
	def __init__(self, maxsize=10, ttl=300, **kwargs):
	super().__init__(**kwargs)
	self.maxsize = maxsize
	self.ttl = ttl
	self._timestamps = {}

	def __setitem__(self, key, value):
	super().__setitem__(key, value)
	self._timestamps[key] = time.time()
	# Limpa itens antigos se necessário
	if len(self) > self.maxsize:
	oldest_key = min(self._timestamps.keys(), key=lambda k: self._timestamps[k])
	self.pop(oldest_key, None)
	self._timestamps.pop(oldest_key, None)

	def get(self, key, default=None):
	# Verifica se expirou
	if key in self._timestamps:
	if time.time() - self._timestamps[key] > self.ttl:
	self.pop(key, None)
	self._timestamps.pop(key, None)
	return default
	return super().get(key, default)

	# Prompt cache shared by every generation call in this module.
	# TTLCache is always defined at this point: either the cachetools class or
	# the dict-based substitute declared when cachetools is missing. The cache
	# is therefore built unconditionally, so the substitute is actually used.
	_prompt_cache: Any = None
	try:
	    _prompt_cache = TTLCache(maxsize=10, ttl=300)
	except Exception:
	    # Last resort: an unbounded plain dict still supports get / item assignment.
	    _prompt_cache = {}

	# ============================================================
	# 🎯 CONFIGURAÇÕES DO FALLBACK LOCAL (GGUF via llama.cpp)
	# ============================================================

	# Supported local models, listed from lightest to heaviest (GGUF builds).
	# Each entry names a Hugging Face repository and the file inside it.
	# NOTE(review): nothing visible in this file reads this list — confirm it is still needed.
	LOCAL_LLM_MODELS = [
	{
	"repo": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
	"file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf" # ~680MB
	},
	{
	"repo": "TheBloke/phi-2-GGUF",
	"file": "phi-2.Q4_K_M.gguf" # ~1.7GB
	}
	]

	# O prompt agora é importado de .config (SYSTEM_PROMPT)


	# ============================================================
	# 🏗️ CLASSE PRINCIPAL - LOCAL LLM FALLBACK
	# ============================================================

	class LocalLLMFallback:
	    """Singleton emergency text generator used after external APIs have failed.

	    ``generate`` has two execution paths:

	    * Hugging Face router mode, enabled by ``_detect_and_load_model`` when an
	      ``HF_TOKEN`` is found in the environment or in ``modules.config``.
	      Chat-completion requests are posted to the Hugging Face router, trying
	      a prompt-dependent list of models across several providers.
	    * Local mode, which calls ``self._model`` as a completion callable.
	      Nothing in this class assigns ``self._model`` a value other than
	      ``None``, so this path only runs if a model is attached externally.
	    """

	    _instance = None
	    _model_lock = None

	    _ROUTER_URL = "https://router.huggingface.co/v1/chat/completions"

	    # Providers tried, in order, for every candidate model.
	    _PROVIDERS = ("featherless-ai", "hyperbolic", "sambanova", "cerebras", "nebius", "novita")

	    # Substrings that route a prompt to the "heavy" model ordering.
	    _HEAVY_KEYWORDS = (
	        "debate", "discuti", "ideia", "explica", "acadêmica", "religião", "científica", "ciência", "filosófico", "filosofia",
	        "ética", "etica", "moral", "radical", "polêmica", "polemica", "país", "analise", "fale sobre", "codigo", "programar",
	        "política", "politi", "guerra", "mundo",
	    )

	    # Substrings (slang / informal register) that route a prompt to the Luna ordering.
	    _LUNA_KEYWORDS = (
	        "kkk", "mn", "informal", "foda", "lá ele", "puto", "mano", "porra", "fxe", "cassule",
	    )

	    # Optional per-model parameters forwarded to the router when configured.
	    _OPTIONAL_PARAMS = ("top_k", "repetition_penalty", "frequency_penalty", "presence_penalty")

	    # Leading labels that models tend to prepend to their answer.
	    _RESPONSE_PREFIXES = (
	        r'^### Akira ### Resposta:?\s*',
	        r'^### Akira ###:?\s*',
	        r'^### Resposta:?\s*',
	        r'^Akira:?\s*',
	        r'^🤖 AKIRA:?\s*',
	        r'^Resposta:?\s*',
	        r'^Assistant:?\s*',
	    )

	    def __new__(cls):
	        """Return the single shared instance, creating it on first use."""
	        if cls._instance is None:
	            import threading as _threading
	            instance = super().__new__(cls)
	            instance._initialized = False
	            instance._model_lock = _threading.Lock()
	            cls._instance = instance
	        return cls._instance

	    def __init__(self):
	        """Initialise state once; later constructions of the singleton are no-ops."""
	        if self._initialized:
	            return
	        self._initialized = True

	        # Model components
	        self._model = None  # type: ignore
	        self._model_path: Optional[str] = None
	        self._deepseek_model: Optional[str] = None
	        self._mistral_model: Optional[str] = None
	        self._lexi_model: Optional[str] = None
	        self._luna_model: Optional[str] = None
	        self._multilingual_beast: Optional[str] = None
	        self._is_loaded = False
	        self._tokenizer = None  # type: ignore
	        self._pipeline = None  # type: ignore

	        # Default sampling settings
	        self._max_tokens = 1024
	        self._temperature = 0.85
	        self._top_p = 0.9
	        self._repetition_penalty = 1.15
	        self._ctx_size = 4096

	        self._max_consecutive_failures = 3
	        self._consecutive_failures = 0
	        self._is_hf_inference_mode = False
	        self._hf_client = None

	        # Usage statistics
	        self._stats: Dict[str, Any] = {
	            "total_calls": 0,
	            "successful_calls": 0,
	            "failed_calls": 0,
	            "last_used": None,
	            "model_loaded": False
	        }

	        self._detect_and_load_model()

	    @staticmethod
	    def _load_config_module():
	        """Return the ``modules.config`` module, or ``None`` if it cannot be imported."""
	        try:
	            import importlib
	            return importlib.import_module('modules.config')
	        except Exception:
	            return None

	    def _detect_and_load_model(self) -> bool:
	        """Configure Hugging Face router mode when a token is available.

	        Returns:
	            True when a token was found and the model names were set,
	            False otherwise.
	        """
	        logger.info("Local LLM: Configurando fallback exclusivo via HuggingFace Cloud API.")

	        config = self._load_config_module()
	        # getattr on None simply yields the default, so a missing config is harmless.
	        hf_token: Optional[str] = os.getenv("HF_TOKEN") or getattr(config, 'HF_TOKEN', None)

	        if not hf_token:
	            logger.error("❌ Fallback Local/Cloud indisponível: HF_TOKEN não encontrado.")
	            return False

	        self._is_hf_inference_mode = True
	        self._is_loaded = True

	        # Model names come from config when present, otherwise from these defaults.
	        self._deepseek_model = getattr(config, 'DEEPSEEK_MODEL', "deepseek-ai/DeepSeek-V3")
	        self._mistral_model = getattr(config, 'MISTRAL_MODEL_HF', "mistralai/Mistral-7B-Instruct-v0.3")
	        self._lexi_model = "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2"
	        self._luna_model = "rhaymison/Mistral-8x7b-Quantized-portuguese-luana"
	        self._multilingual_beast = "Qwen/Qwen2.5-72B-Instruct"

	        self._model_path = self._deepseek_model
	        self._stats["model_loaded"] = True

	        if InferenceClient:
	            try:
	                self._hf_client = InferenceClient(token=hf_token)
	                logger.success("✅ Fallback Cloud HF Inference configurado com sucesso.")
	            except Exception as e:
	                logger.warning(f"Erro ao inicializar InferenceClient: {e}")

	        return True

	    def is_available(self) -> bool:
	        """Return whether the fallback finished configuring itself."""
	        return self._is_loaded

	    def is_operational(self) -> bool:
	        """Return whether ``generate`` can currently attempt a response."""
	        if getattr(self, '_is_hf_inference_mode', False):
	            return self._is_loaded
	        return self._is_loaded and self._model is not None

	    @staticmethod
	    def _make_cache_key(prompt: str, system_prompt: Optional[str], context_history: Optional[List[dict]]) -> str:
	        """Build a cache key covering the whole prompt, system prompt and history.

	        Hashing the full inputs avoids collisions between prompts that merely
	        share a common prefix or differ only in conversation history.
	        """
	        import hashlib
	        material = "\x1f".join((
	            str(prompt),
	            system_prompt or "default",
	            repr(list(context_history or [])),
	        ))
	        return hashlib.sha256(material.encode("utf-8", "surrogatepass")).hexdigest()

	    def _record_failure(self) -> None:
	        """Count one failed generation attempt."""
	        self._consecutive_failures += 1
	        self._stats["failed_calls"] += 1

	    def generate(
	        self,
	        prompt: str,
	        system_prompt: Optional[str] = None,
	        context_history: Optional[List[dict]] = None,
	        max_tokens: Optional[int] = None,
	        temperature: Optional[float] = None
	    ) -> Optional[str]:
	        """Generate a reply through the Hugging Face router or the local model.

	        Args:
	            prompt: User message.
	            system_prompt: System prompt; the module-level ``SYSTEM_PROMPT``
	                is used when omitted or empty.
	            context_history: Earlier turns as dicts with ``role`` and
	                ``content`` keys, inserted between the system and user
	                messages in router mode.
	            max_tokens: Generation limit; falls back to per-model
	                configuration and then to the instance default.
	            temperature: Sampling temperature; an explicit ``0.0`` is honoured.

	        Returns:
	            The cleaned response text, or ``None`` when the engine is not
	            operational or every attempt failed.
	        """
	        self._stats["total_calls"] += 1

	        if not self.is_operational():
	            self._stats["failed_calls"] += 1
	            return None

	        history = list(context_history or [])
	        cache_key = self._make_cache_key(prompt, system_prompt, history)
	        if _prompt_cache is not None:
	            cached = _prompt_cache.get(cache_key)
	            if cached:
	                logger.debug("Resposta encontrada em cache local")
	                return cached

	        try:
	            sys_prompt = system_prompt or SYSTEM_PROMPT

	            if getattr(self, '_is_hf_inference_mode', False):
	                response = self._generate_via_hf_router(
	                    prompt, sys_prompt, history, max_tokens, temperature, cache_key
	                )
	                if response is None:
	                    logger.error("❌ Todos os métodos HF falharam")
	            else:
	                if not self._model:
	                    return None
	                response = self._generate_local(
	                    prompt, sys_prompt, max_tokens, temperature, cache_key
	                )
	        except Exception as e:
	            logger.error(f"❌ Erro em fallback de emergência: {e}")
	            response = None

	        if response is None:
	            self._record_failure()
	        return response

	    def _select_candidate_models(self, prompt: str) -> List[str]:
	        """Return the ordered, de-duplicated model names to try for ``prompt``."""
	        prompt_lower = prompt.lower()
	        long_prompt = prompt.count('\n') >= 4 or len(prompt) > 800

	        needs_heavy = long_prompt or any(kw in prompt_lower for kw in self._HEAVY_KEYWORDS)
	        needs_luna = any(kw in prompt_lower for kw in self._LUNA_KEYWORDS)

	        # Very short prompts (five words or fewer) never use the heavy ordering
	        # unless a heavy keyword is present.
	        is_very_short = len(prompt.split()) <= 5 and not needs_heavy

	        if needs_heavy and not is_very_short:
	            ordered = [self._deepseek_model, self._mistral_model, self._lexi_model]
	        elif needs_luna and not is_very_short:
	            ordered = [self._luna_model, self._mistral_model, self._lexi_model]
	        elif "humano" in prompt_lower or "conversa" in prompt_lower:
	            ordered = [self._mistral_model, self._deepseek_model, self._lexi_model]
	        elif is_very_short:
	            ordered = [self._lexi_model, self._mistral_model]
	        else:
	            ordered = [self._deepseek_model, self._mistral_model, self._lexi_model, self._luna_model]

	        # dict.fromkeys keeps first occurrences in order; unset names are skipped.
	        return [name for name in dict.fromkeys(ordered) if name]

	    @staticmethod
	    def _messages_for_model(model: str, messages: List[dict], sys_prompt: str, prompt: str) -> List[dict]:
	        """Return the chat messages to send to ``model``.

	        Models whose name contains "luana" receive a single instruction-style
	        user message instead of the system/history/user list.
	        """
	        if "luana" in str(model or "").lower():
	            instruction = f"Abaixo está uma instrução que descreve uma tarefa, juntamente com uma entrada que fornece mais contexto.\nEscreva uma resposta que complete adequadamente o pedido.\n### instrução: {sys_prompt}\n### entrada: {prompt}"
	            return [{"role": "user", "content": instruction}]
	        return list(messages)

	    def _post_to_router(self, routed_model: str, provider: str, headers: dict, payload: dict) -> Optional[str]:
	        """Send one chat-completion request; return non-empty content or ``None``."""
	        try:
	            logger.debug(f"🔁 Tentando HF Router: {routed_model}")
	            resp = requests.post(self._ROUTER_URL, headers=headers, json=payload, timeout=25)
	        except Exception as exc:
	            # Network errors are expected here; move on to the next provider.
	            logger.debug(f"HF Router falhou para {routed_model}: {exc}")
	            return None

	        if resp.status_code == 200:
	            try:
	                choices = resp.json().get("choices") or [{}]
	                content = (choices[0].get("message") or {}).get("content") or ""
	            except (ValueError, AttributeError, IndexError, TypeError) as exc:
	                logger.debug(f"HF Router resposta inválida de {routed_model}: {exc}")
	                return None
	            if isinstance(content, str) and content.strip():
	                return content
	            return None

	        if resp.status_code == 400:
	            try:
	                detail = resp.json()
	            except ValueError:
	                detail = resp.text[:200]
	            # "not supported" just means this provider does not serve the model.
	            if "not supported" not in str(detail).lower():
	                logger.error(f"⚠️ Router '{provider}' HTTP 400: {detail}")
	            return None

	        logger.debug(f"HF Router {routed_model} retornou HTTP {resp.status_code}")
	        return None

	    def _generate_via_hf_router(
	        self,
	        prompt: str,
	        sys_prompt: str,
	        history: List[dict],
	        max_tokens: Optional[int],
	        temperature: Optional[float],
	        cache_key: str,
	    ) -> Optional[str]:
	        """Try each candidate model on each provider; return the first reply or ``None``."""
	        # Config is loaded once per call rather than once per model/provider pair.
	        config = self._load_config_module()
	        hf_token = os.getenv("HF_TOKEN") or getattr(config, 'HF_TOKEN', None)

	        headers = {"Content-Type": "application/json"}
	        if hf_token:
	            headers["Authorization"] = f"Bearer {hf_token}"
	        else:
	            # Proceed unauthenticated instead of sending a bogus "Bearer None".
	            logger.error("❌ Token HF não encontrado para a requisição de inferência HF")

	        messages: List[dict] = []
	        if sys_prompt:
	            messages.append({"role": "system", "content": sys_prompt})
	        for turn in history:
	            messages.append({
	                "role": turn.get("role", "user"),
	                "content": turn.get("content", "")
	            })
	        messages.append({"role": "user", "content": prompt})

	        all_params = getattr(config, 'MODEL_PARAMETERS', {}) or {}

	        for current_model in self._select_candidate_models(prompt):
	            model_params: Dict[str, Any] = dict(all_params.get(current_model, {}))
	            base_payload: Dict[str, Any] = {
	                "messages": self._messages_for_model(current_model, messages, sys_prompt, prompt),
	                "max_tokens": max_tokens or model_params.get("max_tokens", self._max_tokens),
	                # "is not None" keeps an explicit temperature of 0.0.
	                "temperature": (
	                    temperature if temperature is not None
	                    else model_params.get("temperature", self._temperature)
	                ),
	                "top_p": model_params.get("top_p", self._top_p),
	            }
	            for opt_param in self._OPTIONAL_PARAMS:
	                if opt_param in model_params:
	                    base_payload[opt_param] = model_params[opt_param]

	            for provider in self._PROVIDERS:
	                model_with_provider = f"{current_model}:{provider}"
	                payload = {"model": model_with_provider, **base_payload}
	                content = self._post_to_router(model_with_provider, provider, headers, payload)
	                if content:
	                    logger.success(f"✅ Sucesso via HF Router ({model_with_provider})")
	                    self._stats["last_model_used"] = current_model
	                    return self._process_successful_response(content, prompt, cache_key)

	        return None

	    def _generate_local(
	        self,
	        prompt: str,
	        sys_prompt: str,
	        max_tokens: Optional[int],
	        temperature: Optional[float],
	        cache_key: str,
	    ) -> Optional[str]:
	        """Run the attached local model; return cleaned text or ``None``."""
	        formatted = f"<|system|>\n{sys_prompt}</s>\n<|user|>\n{prompt}</s>\n<|assistant|>\n"

	        start_time = time.time()
	        outputs = self._model(
	            prompt=formatted,
	            max_tokens=max_tokens or self._max_tokens,
	            temperature=temperature if temperature is not None else self._temperature,
	            top_p=0.9,
	            repeat_penalty=1.1,
	            echo=False  # do not include the prompt in the returned text
	        )
	        exec_time = time.time() - start_time
	        logger.debug(f"[LLAMA CPP] Inferência CPU local GGUF completada em {exec_time:.2f}s")

	        if not (outputs and "choices" in outputs and len(outputs["choices"]) > 0):
	            return None

	        generated = outputs["choices"][0].get("text", "")
	        response_text = self._clean_response(self._extract_response(generated, formatted))
	        if not response_text:
	            return None

	        if _prompt_cache is not None:
	            try:
	                _prompt_cache[cache_key] = response_text
	            except Exception:
	                pass  # caching is best-effort

	        self._stats["successful_calls"] += 1
	        self._stats["last_used"] = datetime.now().isoformat()
	        self._stats["last_model_used"] = "llama_local_gguf"
	        self._consecutive_failures = 0
	        return response_text

	    def _process_successful_response(self, text: str, prompt: str, cache_key: str) -> str:
	        """Clean a successful reply, cache it, and update the success statistics."""
	        res_text = self._clean_response(self._extract_response(text, prompt))
	        if _prompt_cache is not None:
	            try:
	                _prompt_cache[cache_key] = res_text
	            except Exception:
	                pass  # caching is best-effort
	        self._stats["successful_calls"] += 1
	        self._stats["last_used"] = datetime.now().isoformat()
	        self._consecutive_failures = 0
	        return res_text

	    def _extract_response(self, generated: str, prompt: str) -> str:
	        """Strip leaked chat markers, answer labels and an echoed prompt from ``generated``."""
	        if not generated:
	            return ""

	        response = generated

	        # 1. Keep only what follows the last leaked chat-template marker.
	        if "<|assistant|>" in response:
	            response = response.split("<|assistant|>")[-1]
	        elif "[/INST]" in response:
	            response = response.split("[/INST]")[-1]
	        elif "assistant\n" in response.lower():
	            response = re.split(r'(?i)assistant\n', response)[-1]

	        # 2. Remove answer labels at the start of any line.
	        for pattern in self._RESPONSE_PREFIXES:
	            response = re.sub(pattern, '', response, flags=re.IGNORECASE | re.MULTILINE)

	        # 3. Drop the user prompt when the model echoes it near the start.
	        echoed = prompt.strip()
	        if echoed and echoed in response[:len(prompt) + 20]:
	            response = response.replace(echoed, '', 1)

	        return response.strip()

	    def _clean_response(self, text: str) -> str:
	        """Remove leaked system-prompt text, tags and markup, then cap the length.

	        The cap is ``self._max_tokens * 4`` characters. Text is cut at a
	        sentence boundary when possible; if even the first sentence is too
	        long, it is cut at the character limit instead of being emptied.
	        """
	        if not text:
	            return ""

	        # On a leak of the system prompt, keep only what follows the closing marker.
	        if "SYSTEM STRICTOVERRIDES:" in text and "</|assistant|>" in text:
	            text = text.split("</|assistant|>")[-1]

	        # Cut any configuration sections that are still present.
	        text = re.sub(r'SYSTEM STRICTOVERRIDES:.*?Conversa privada\.', '', text, flags=re.DOTALL)
	        text = re.sub(r'## 🎭 PERFIL: AKIRA.*?REGRAS PRINCIPAIS \d+\.', '', text, flags=re.DOTALL)

	        # Remove chat tags and markup characters.
	        text = re.sub(r'<\|[^|]+\|>', '', text)
	        text = re.sub(r'</?s>', '', text)
	        text = re.sub(r'[\*\_\`\[\]\"]', '', text)

	        # Collapse whitespace runs.
	        text = re.sub(r'\s+', ' ', text).strip()

	        max_chars = self._max_tokens * 4
	        if len(text) > max_chars:
	            kept = ""
	            for fragment in text.split("."):
	                fragment = fragment.strip()
	                if not fragment:
	                    continue
	                sentence = fragment + "."
	                if len(kept + sentence) > max_chars:
	                    break
	                kept += sentence + " "
	            # Hard cut when no complete sentence fits within the limit.
	            text = kept.strip() or text[:max_chars].rstrip()

	        return text

	    def get_status(self) -> Dict[str, Any]:
	        """Return a snapshot of availability, failure counters and statistics."""
	        return {
	            "available": self.is_available(),
	            "operational": self.is_operational(),
	            "model_path": self._model_path,
	            "model_loaded": self._is_loaded,
	            "consecutive_failures": self._consecutive_failures,
	            "max_failures_allowed": self._max_consecutive_failures,
	            "stats": self._stats.copy()
	        }

	    def reset_failures(self):
	        """Reset the consecutive-failure counter to zero."""
	        self._consecutive_failures = 0

	    def should_use_fallback(self, api_failures: int = 0) -> bool:
	        """Decide whether the fallback should handle the request.

	        Args:
	            api_failures: Number of consecutive external-API failures.

	        Returns:
	            True when the engine is operational and either at least one API
	            failure occurred or ``USE_LOCAL_FALLBACK`` is set to "true".
	        """
	        return (
	            self.is_operational() and
	            (api_failures > 0 or os.getenv("USE_LOCAL_FALLBACK", "").lower() == "true")
	        )


	# ============================================================
	# 🎯 FUNÇÃO PRINCIPAL DE FALLBACK
	# ============================================================

	def get_local_fallback() -> LocalLLMFallback:
	    """Return the shared ``LocalLLMFallback`` instance.

	    The class hands back the same object on every construction, so calling
	    this repeatedly never creates a second engine.
	    """
	    shared_engine = LocalLLMFallback()
	    return shared_engine


	def generate_fallback_response(
	    prompt: str,
	    system_prompt: Optional[str] = None,
	    api_failures: int = 0
	) -> Optional[str]:
	    """Produce a fallback reply when the fallback engine agrees to handle it.

	    Args:
	        prompt: User prompt.
	        system_prompt: Optional system prompt forwarded to the engine.
	        api_failures: Number of external-API failures reported by the caller.

	    Returns:
	        The generated text, or ``None`` when the fallback is not used or
	        produced nothing.
	    """
	    engine = get_local_fallback()

	    # Guard clause: leave early when the engine declines the request.
	    if not engine.should_use_fallback(api_failures):
	        return None

	    logger.info(f"🔴 Usando fallback local (API failures: {api_failures})")
	    return engine.generate(prompt, system_prompt)


	# ============================================================
	# 🧪 MOCK PARA TESTES
	# ============================================================

	class MockLocalLLM:
	    """Test double that always reports itself as unavailable.

	    ``generate`` ignores its input and returns a fixed emergency message.
	    """

	    _EMERGENCY_MESSAGE = "🤖 Modo de emergência: Todas as APIs falharam. Tente novamente mais tarde."

	    def is_available(self) -> bool:
	        """Always report unavailable."""
	        return False

	    def is_operational(self) -> bool:
	        """Always report non-operational."""
	        return False

	    def generate(self, prompt: str, **kwargs) -> str:
	        """Return the fixed emergency message, whatever the prompt or options."""
	        return self._EMERGENCY_MESSAGE

	    def get_status(self) -> Dict[str, Any]:
	        """Return a status dict flagged as a mock."""
	        status: Dict[str, Any] = {}
	        status["available"] = False
	        status["mock"] = True
	        return status


	# ============================================================
	# 📤 EXPORTS
	# ============================================================

	# Public API of this module. Every name listed must exist at module level,
	# otherwise "from <module> import *" raises AttributeError. The previous
	# "FALLBACK_SYSTEM_PROMPT" entry was dropped because this file defines no
	# such name (the system prompt is imported from .config as SYSTEM_PROMPT).
	__all__ = [
	    "LocalLLMFallback",
	    "get_local_fallback",
	    "generate_fallback_response",
	    "MockLocalLLM",
	]