# indicRAG/backend/src/llm_manager.py
# Author: hardkpentium101
# Commit 3862877: Switch to Qwen-1.5-1.8B-Chat - verified multilingual model
# with good Indic support
"""
LLM Manager module with AI4Bharat IndicLLM support
Optimized for 11 Indic languages on CPU
"""
from typing import Optional, Dict, Any
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, GenerationConfig
from huggingface_hub import login
import torch
import os
import warnings
import logging
# Module-level logger at INFO so model-download/loading progress is visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Silence expected, noisy warnings emitted by transformers during generation.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", message=".*max_new_tokens.*max_length.*")
# Route the HuggingFace cache to a writable directory; respects a pre-set
# HF_HOME (default /tmp — presumably for containers with restricted writes).
os.environ["HF_HOME"] = os.getenv("HF_HOME", "/tmp/huggingface_cache")
class LLMManager:
    """Singleton manager that lazily builds and caches one LLM pipeline.

    The first successful get_llm() call downloads/loads Qwen-1.5-1.8B-Chat
    and wraps it in a LangChain HuggingFacePipeline; later calls return the
    cached instance. A failed load is remembered in _initialization_error so
    the expensive download is not retried on every request.
    """

    _instance = None              # the one shared LLMManager object
    _llm_instance = None          # cached HuggingFacePipeline, or None
    _initialization_error = None  # message from a failed load, or None

    def __new__(cls):
        # Classic singleton: every LLMManager() call yields the same object.
        if cls._instance is None:
            cls._instance = super(LLMManager, cls).__new__(cls)
        return cls._instance

    def get_llm(self, provider: str = "huggingface", model_kwargs: Optional[Dict[str, Any]] = None):
        """Return the cached LLM instance, initializing it on first use.

        Args:
            provider: kept for API compatibility; only the HuggingFace path
                is implemented.
            model_kwargs: forwarded to the loader (currently unused there).

        Returns:
            The HuggingFacePipeline, or None when initialization failed.
        """
        if self._initialization_error is not None:
            # A previous attempt already failed; do not retry the load.
            logger.error(f"LLM initialization failed: {self._initialization_error}")
            return None
        if self._llm_instance is not None:
            return self._llm_instance
        self._llm_instance = self._get_indic_llm(model_kwargs)
        if self._llm_instance is None:
            logger.error("Failed to initialize IndicLLM")
            self._initialization_error = "IndicLLM initialization failed"
        return self._llm_instance

    def _get_indic_llm(self, model_kwargs: Optional[Dict[str, Any]] = None):
        """Build a text-generation pipeline around Qwen/Qwen1.5-1.8B-Chat.

        Returns:
            HuggingFacePipeline on success; None on any failure (the error
            text is stored in _initialization_error).
        """
        model_id = "Qwen/Qwen1.5-1.8B-Chat"
        try:
            # Authenticate with HuggingFace if token is provided
            hf_token = os.getenv("HF_TOKEN")
            if hf_token:
                logger.info("Authenticating with HuggingFace...")
                login(token=hf_token)
            else:
                logger.warning("No HF_TOKEN provided. Downloads may be slower.")
            logger.info(f"Initializing Qwen-1.5-1.8B-Chat: {model_id}")
            logger.info("Qwen: 1.8B parameters, supports 100+ languages including Hindi, Bengali, Tamil, Telugu, etc.")
            logger.info("Loading model...")
            # from_pretrained accepts token=None, so the original's duplicated
            # if/else tokenizer load collapses to a single call.
            tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
            model = self._load_model(model_id, hf_token)

            # Some chat tokenizers define no pad token; fall back to EOS so
            # generation never receives pad_token_id=None.
            pad_token_id = tokenizer.pad_token_id
            if pad_token_id is None:
                pad_token_id = tokenizer.eos_token_id

            # Sampling parameters are env-tunable with sensible defaults.
            gen_config = GenerationConfig(
                temperature=float(os.getenv("TEMPERATURE", 0.9)),
                top_p=float(os.getenv("TOP_P", 0.92)),
                top_k=int(os.getenv("TOP_K", 50)),
                repetition_penalty=float(os.getenv("REPETITION_PENALTY", 1.15)),
                max_new_tokens=int(os.getenv("MAX_NEW_TOKENS", 400)),
                do_sample=True,
                pad_token_id=pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
            # Attach to the model so the pipeline picks it up automatically.
            model.generation_config = gen_config

            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                truncation=True,
                return_full_text=False,  # emit only the generated continuation
                clean_up_tokenization_spaces=True,
            )
            logger.info("Qwen pipeline initialized successfully")
            return HuggingFacePipeline(pipeline=pipe)
        except Exception as e:
            # Broad catch is deliberate: any load failure (network, OOM,
            # missing deps) must be recorded rather than crash the caller.
            logger.error(f"Failed to load Qwen: {e}")
            self._initialization_error = str(e)
            return None

    @staticmethod
    def _load_model(model_id: str, hf_token: Optional[str]):
        """Load the causal LM: 4-bit NF4 quantized on GPU, fp16 on CPU."""
        if torch.cuda.is_available():
            logger.info("GPU detected - using 4-bit quantization")
            from transformers import BitsAndBytesConfig
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
            )
            return AutoModelForCausalLM.from_pretrained(
                model_id,
                quantization_config=quantization_config,
                device_map="auto",
                trust_remote_code=True,
                token=hf_token,
            )
        logger.info("CPU only - loading with memory optimizations")
        # NOTE(review): float16 halves memory, but some CPU ops lack Half
        # kernels on older torch builds — confirm fp16 inference works on the
        # deployment CPU, otherwise switch to torch.float32.
        return AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            token=hf_token,
        )

    def is_available(self) -> bool:
        """True when an LLM is cached and no initialization error occurred."""
        return self._llm_instance is not None and self._initialization_error is None

    def get_initialization_error(self) -> Optional[str]:
        """Return the stored initialization error message, if any."""
        return self._initialization_error
def get_llm(provider: str = "huggingface", model_kwargs: Optional[Dict[str, Any]] = None):
    """Module-level convenience wrapper around the LLMManager singleton."""
    return LLMManager().get_llm(provider, model_kwargs)
def get_llm_with_provider(provider: str = "huggingface", model_kwargs: Optional[Dict[str, Any]] = None):
    """Fetch an LLM from the singleton manager for the given provider and kwargs."""
    return LLMManager().get_llm(provider, model_kwargs)