# HindiRAG / src / llm_manager.py
# Commit 20b1ba2 (hardkpentium101): "Add swap space for CPU model loading and optimize startup"
"""
LLM Manager module with Sarvam-1 model support for the Hindi RAG system
Optimized for CPU-only environments like HF Spaces free tier
"""
from typing import Optional, Dict, Any
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import os
import warnings
import logging
# Configure module-level logging (INFO so model-load progress shows in HF Spaces logs)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Silence UserWarnings to keep startup output readable during model loading
warnings.filterwarnings("ignore", category=UserWarning)
class LLMManager:
    """
    Singleton manager for the Sarvam-1 LLM used by the Hindi RAG system.

    Caches a single LangChain-wrapped text-generation pipeline and records
    any initialization failure so repeated calls fail fast instead of
    retrying an expensive (multi-GB) model load. Optimized for CPU-only
    environments such as the HF Spaces free tier.
    """
    _instance = None               # the one LLMManager object (singleton)
    _llm_instance = None           # cached HuggingFacePipeline once loaded
    _initialization_error = None   # str describing a failed load, else None

    def __new__(cls):
        # Classic singleton: every LLMManager() call returns the same object.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def get_llm(self, provider: str = "huggingface", model_kwargs: Optional[Dict[str, Any]] = None):
        """
        Return the cached LLM instance, initializing it on first use.

        Args:
            provider: Accepted for API compatibility; only "huggingface"
                (Sarvam-1) is currently implemented.
            model_kwargs: Optional overrides merged into the pipeline's
                model_kwargs on first initialization. Ignored once the
                model is already loaded or a previous load failed.

        Returns:
            The LangChain LLM instance, or None if initialization failed
            (check get_initialization_error() for the reason).
        """
        # A previous failed load is sticky: don't retry the expensive load.
        if self._initialization_error is not None:
            logger.error("LLM initialization failed: %s", self._initialization_error)
            return None
        if self._llm_instance is not None:
            return self._llm_instance
        # Initialize Sarvam-1 model on first use
        self._llm_instance = self._get_sarvam_llm(model_kwargs)
        if self._llm_instance is None:
            logger.error("Failed to initialize Sarvam-1 LLM")
            self._initialization_error = "Sarvam-1 initialization failed"
        return self._llm_instance

    def _get_sarvam_llm(self, model_kwargs: Optional[Dict[str, Any]] = None):
        """
        Initialize Sarvam-1 model for Hindi text generation.

        Uses the simple transformers pipeline approach for CPU
        compatibility. Caller-supplied ``model_kwargs`` are merged over the
        CPU-safe defaults (previously they were silently ignored).

        Returns:
            A HuggingFacePipeline wrapping the loaded model, or None on
            failure (the error text is stored in _initialization_error).
        """
        model_id = "sarvamai/sarvam-1"
        try:
            logger.info("Initializing Sarvam-1 model: %s", model_id)
            logger.info("Sarvam-1: 2B parameters, optimized for 10 Indic languages")
            # Use simple pipeline approach - handles device placement automatically
            # This is the recommended approach from Sarvam AI
            logger.info("Loading model with CPU-first approach...")
            # CPU-safe defaults; caller overrides take precedence.
            merged_kwargs: Dict[str, Any] = {
                "torch_dtype": torch.float32,  # Float32 for CPU compatibility
                "low_cpu_mem_usage": False,    # Avoid meta tensor issues
            }
            if model_kwargs:
                merged_kwargs.update(model_kwargs)
            pipe = pipeline(
                "text-generation",
                model=model_id,
                model_kwargs=merged_kwargs,
                device_map="cpu"  # Force CPU for HF Spaces
            )
            logger.info("✓ Sarvam-1 pipeline initialized successfully on CPU")
            # Wrap pipeline for LangChain compatibility
            llm = HuggingFacePipeline(pipeline=pipe)
            return llm
        except Exception as e:
            logger.error("Failed to initialize Sarvam-1 model: %s", e)
            logger.error("Error type: %s", type(e).__name__)
            # Provide helpful error message for the common OOM failure mode
            if "meta tensor" in str(e).lower():
                logger.error("Meta tensor error: Insufficient RAM for model loading")
                logger.error("HF Spaces CPU tier has ~13GB RAM, Sarvam-1 needs ~8GB")
                logger.error("Try: Upgrading to GPU Space or using smaller model")
            self._initialization_error = str(e)
            return None

    def is_available(self) -> bool:
        """
        Check if LLM is available and initialized (loaded with no recorded error).
        """
        return self._llm_instance is not None and self._initialization_error is None

    def get_initialization_error(self) -> Optional[str]:
        """
        Get the initialization error message if any, else None.
        """
        return self._initialization_error
def get_llm(provider: str = "huggingface", model_kwargs: Optional[Dict[str, Any]] = None):
    """
    Module-level convenience wrapper: fetch the shared LLM instance.

    Delegates to the LLMManager singleton's get_llm().
    """
    return LLMManager().get_llm(provider, model_kwargs)
def get_llm_with_provider(provider: str = "huggingface", model_kwargs: Optional[Dict[str, Any]] = None):
    """
    Obtain the LLM for a specific provider with optional model kwargs.

    Thin wrapper over the LLMManager singleton.
    """
    manager = LLMManager()
    llm = manager.get_llm(provider, model_kwargs)
    return llm