""" CORE LLM MEJORADO - SAMBANOVA ENTERPRISE - Timeouts configurados - Retry automático - Streaming optimizado - Métricas de calidad """ import os import time import json from typing import Dict, List, Optional, Generator, Any, Union from dataclasses import dataclass from enum import Enum import asyncio import aiohttp from openai import OpenAI, AsyncOpenAI from openai.types.chat import ChatCompletionMessageParam # ===================== CONFIGURACIÓN ===================== SAMBANOVA_BASE = "https://api.sambanova.ai/v1" DEFAULT_TIMEOUT = 60.0 MAX_RETRIES = 3 RETRY_DELAY = 1.0 # ===================== MODELOS ENTERPRISE ===================== ENTERPRISE_MODELS = { "Scout": { "id": "Llama-4-Maverick-17B-128E-Instruct", "context_window": 131072, "vision": False, "strengths": ["general", "reasoning", "code"], "optimal_temperature": 0.1 }, "Heavy": { "id": "Meta-Llama-3.3-70B-Instruct", "context_window": 131072, "vision": False, "strengths": ["complex reasoning", "analysis", "strategy"], "optimal_temperature": 0.05 }, "Coder": { "id": "Qwen2.5-Coder-32B-Instruct", "context_window": 32768, "vision": False, "strengths": ["programming", "debugging", "architecture"], "optimal_temperature": 0.01 }, "Vision": { "id": "Meta-Llama-3.2-11B-Vision-Instruct", "context_window": 131072, "vision": True, "strengths": ["image analysis", "visual reasoning"], "optimal_temperature": 0.1 }, "Reasoner": { "id": "DeepSeek-R1", "context_window": 128000, "vision": False, "strengths": ["step-by-step reasoning", "mathematics", "logic"], "optimal_temperature": 0.3 }, "Fast": { "id": "Llama-4-Scout-17B-16E-Instruct", "context_window": 131072, "vision": False, "strengths": ["quick responses", "general purpose"], "optimal_temperature": 0.2 } } # ===================== CLIENTE MEJORADO ===================== class SambanovaClient: def __init__(self, api_key: str): self.api_key = api_key self.sync_client = OpenAI( base_url=SAMBANOVA_BASE, api_key=api_key, timeout=DEFAULT_TIMEOUT, max_retries=MAX_RETRIES ) self.async_client = AsyncOpenAI( base_url=SAMBANOVA_BASE, api_key=api_key, timeout=DEFAULT_TIMEOUT, max_retries=MAX_RETRIES ) self.request_count = 0 self.error_count = 0 self.avg_latency = 0.0 def get_model_info(self, model_key: str) -> Optional[Dict]: """Obtiene información detallada del modelo""" if model_key in ENTERPRISE_MODELS: return ENTERPRISE_MODELS[model_key] # Buscar por ID si no coincide la clave for key, info in ENTERPRISE_MODELS.items(): if info["id"] == model_key: return info return None def chat_completion( self, model: str, messages: List[ChatCompletionMessageParam], temperature: float = 0.1, top_p: float = 0.9, max_tokens: int = 4096, stream: bool = True, **kwargs ): """Completación de chat con métricas""" start_time = time.time() self.request_count += 1 try: # Ajustar parámetros según modelo model_info = self.get_model_info(model) if model_info: temperature = kwargs.get('temperature', model_info.get('optimal_temperature', temperature)) response = self.sync_client.chat.completions.create( model=model if model in [v["id"] for v in ENTERPRISE_MODELS.values()] else self.get_model_info(model)["id"], messages=messages, temperature=temperature, top_p=top_p, max_tokens=max_tokens, stream=stream, **{k: v for k, v in kwargs.items() if k not in ['model', 'messages', 'stream']} ) latency = time.time() - start_time self.avg_latency = (self.avg_latency * (self.request_count - 1) + latency) / self.request_count return response except Exception as e: self.error_count += 1 latency = time.time() - start_time raise async def async_chat_completion( self, model: str, messages: List[ChatCompletionMessageParam], temperature: float = 0.1, top_p: float = 0.9, max_tokens: int = 4096, stream: bool = True, **kwargs ): """Versión asíncrona""" start_time = time.time() self.request_count += 1 try: model_info = self.get_model_info(model) if model_info: temperature = kwargs.get('temperature', model_info.get('optimal_temperature', temperature)) response = await self.async_client.chat.completions.create( model=model if model in [v["id"] for v in ENTERPRISE_MODELS.values()] else self.get_model_info(model)["id"], messages=messages, temperature=temperature, top_p=top_p, max_tokens=max_tokens, stream=stream, **{k: v for k, v in kwargs.items() if k not in ['model', 'messages', 'stream']} ) latency = time.time() - start_time self.avg_latency = (self.avg_latency * (self.request_count - 1) + latency) / self.request_count return response except Exception as e: self.error_count += 1 latency = time.time() - start_time raise def get_metrics(self) -> Dict[str, Any]: """Obtiene métricas del cliente""" return { "request_count": self.request_count, "error_count": self.error_count, "success_rate": 1.0 - (self.error_count / max(self.request_count, 1)), "average_latency": self.avg_latency, "models_available": len(ENTERPRISE_MODELS) } # ===================== FUNCIÓN PRINCIPAL MEJORADA ===================== def create_sambanova_client(api_key: Optional[str] = None) -> SambanovaClient: """Crea y retorna cliente Sambanova""" key = api_key or os.getenv("SAMBANOVA_API_KEY_2") if not key: raise RuntimeError( "❌ Falta SAMBANOVA_API_KEY_2. Configura la variable de entorno o pasa la clave directamente." ) return SambanovaClient(key) def chat_sambanova_optimized( api_key: str, model: str, messages: List[ChatCompletionMessageParam], temperature: float = 0.1, top_p: float = 0.9, max_tokens: int = 4096, stream: bool = True, retry_on_failure: bool = True, **kwargs ): """ Función optimizada con manejo de errores y retry """ client = create_sambanova_client(api_key) for attempt in range(MAX_RETRIES if retry_on_failure else 1): try: return client.chat_completion( model=model, messages=messages, temperature=temperature, top_p=top_p, max_tokens=max_tokens, stream=stream, **kwargs ) except Exception as e: if attempt == MAX_RETRIES - 1 or not retry_on_failure: raise time.sleep(RETRY_DELAY * (attempt + 1)) continue # ===================== STREAMING OPTIMIZADO ===================== def stream_response_generator( api_key: str, model: str, messages: List[ChatCompletionMessageParam], **kwargs ) -> Generator[str, None, Dict]: """ Generador optimizado para streaming con métricas """ start_time = time.time() full_response = "" try: stream = chat_sambanova_optimized( api_key=api_key, model=model, messages=messages, stream=True, **kwargs ) for chunk in stream: if chunk.choices and chunk.choices[0].delta.content: content = chunk.choices[0].delta.content full_response += content yield content metrics = { "success": True, "total_time": time.time() - start_time, "response_length": len(full_response), "model_used": model, "final_response": full_response } yield metrics except Exception as e: error_metrics = { "success": False, "error": str(e), "total_time": time.time() - start_time, "response_length": len(full_response), "model_attempted": model } yield error_metrics