# Spaces: Sleeping — Hugging Face Spaces status banner captured by the page
# scraper; not part of the module source.
"""
LLM Manager module with AI4Bharat IndicLLM support
Optimized for 11 Indic languages on CPU
"""
from typing import Optional, Dict, Any
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, GenerationConfig
from huggingface_hub import login
import torch
import os
import warnings
import logging

# Module-wide logging: INFO level, logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Silence known-noisy warnings from the transformers stack; the second filter
# targets the "max_new_tokens overrides max_length" generation warning.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", message=".*max_new_tokens.*max_length.*")

# Set cache directory: HF_HOME controls where HuggingFace stores downloaded
# models/tokenizers; default to /tmp if the environment does not set it.
os.environ["HF_HOME"] = os.getenv("HF_HOME", "/tmp/huggingface_cache")
class LLMManager:
    """Singleton that lazily builds and caches one HuggingFace LLM pipeline.

    State lives on the class / sole instance:
      _instance             -- the single LLMManager object
      _llm_instance         -- cached HuggingFacePipeline wrapper, or None
      _initialization_error -- message from a failed init, or None; once set,
                               later ``get_llm`` calls fail fast without
                               retrying the (expensive) model load.
    """

    _instance = None
    _llm_instance = None
    _initialization_error = None
    # Same object as the module-level ``logger`` (getLogger is idempotent per
    # name); held on the class so the methods are self-contained.
    _logger = logging.getLogger(__name__)

    def __new__(cls):
        # Classic singleton: every LLMManager() call yields the same object.
        if cls._instance is None:
            cls._instance = super(LLMManager, cls).__new__(cls)
        return cls._instance

    def get_llm(self, provider: str = "huggingface",
                model_kwargs: Optional[Dict[str, Any]] = None):
        """Return the cached LLM instance, initializing it on first use.

        Returns None if a previous initialization failed (fail-fast) or if
        initialization fails now; the error is recorded and retrievable via
        ``get_initialization_error``.
        NOTE(review): ``provider`` is currently ignored and ``model_kwargs``
        is forwarded but unused downstream — kept for API compatibility.
        """
        if self._initialization_error is not None:
            self._logger.error(f"LLM initialization failed: {self._initialization_error}")
            return None
        if self._llm_instance is not None:
            return self._llm_instance
        self._llm_instance = self._get_indic_llm(model_kwargs)
        if self._llm_instance is None:
            self._logger.error("Failed to initialize IndicLLM")
            self._initialization_error = "IndicLLM initialization failed"
        return self._llm_instance

    def _get_indic_llm(self, model_kwargs: Optional[Dict[str, Any]] = None):
        """Initialize Qwen-1.5-1.8B-Chat for multilingual (including 11 Indic languages).

        Returns a langchain ``HuggingFacePipeline`` on success, or None on any
        failure (the error message is stored in ``_initialization_error``).
        NOTE(review): ``model_kwargs`` is accepted but not currently applied.
        """
        model_id = "Qwen/Qwen1.5-1.8B-Chat"
        try:
            # Authenticate with HuggingFace if token is provided
            hf_token = os.getenv("HF_TOKEN")
            if hf_token:
                self._logger.info("Authenticating with HuggingFace...")
                login(token=hf_token)
            else:
                self._logger.warning("No HF_TOKEN provided. Downloads may be slower.")
            self._logger.info(f"Initializing Qwen-1.5-1.8B-Chat: {model_id}")
            self._logger.info("Qwen: 1.8B parameters, supports 100+ languages including Hindi, Bengali, Tamil, Telugu, etc.")
            self._logger.info("Loading model...")
            # Load tokenizer, passing the token only when we actually have one.
            if hf_token:
                tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_id)
            # Check if CUDA is available for quantization
            if torch.cuda.is_available():
                self._logger.info("GPU detected - using 4-bit quantization")
                from transformers import BitsAndBytesConfig
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_use_double_quant=True,
                )
                model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    quantization_config=quantization_config,
                    device_map="auto",
                    trust_remote_code=True,
                    token=hf_token,
                )
            else:
                self._logger.info("CPU only - loading with memory optimizations")
                # FIX: the original used torch.float16 here, but half precision
                # on CPU is very slow and many CPU kernels don't support it —
                # float32 is the correct dtype for CPU inference.
                model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    torch_dtype=torch.float32,
                    low_cpu_mem_usage=True,
                    trust_remote_code=True,
                    token=hf_token,
                )
            # FIX: some chat tokenizers ship without a pad token, leaving
            # pad_token_id == None; fall back to EOS so generation never
            # chokes on padding.
            pad_id = tokenizer.pad_token_id
            if pad_id is None:
                pad_id = tokenizer.eos_token_id
            # Create generation config (sampling knobs are env-overridable)
            # and attach it to the model so the pipeline picks it up.
            gen_config = GenerationConfig(
                temperature=float(os.getenv("TEMPERATURE", 0.9)),
                top_p=float(os.getenv("TOP_P", 0.92)),
                top_k=int(os.getenv("TOP_K", 50)),
                repetition_penalty=float(os.getenv("REPETITION_PENALTY", 1.15)),
                max_new_tokens=int(os.getenv("MAX_NEW_TOKENS", 400)),
                do_sample=True,
                pad_token_id=pad_id,
                eos_token_id=tokenizer.eos_token_id,
            )
            model.generation_config = gen_config
            # return_full_text=False: emit only the generated continuation,
            # not the prompt echoed back.
            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                truncation=True,
                return_full_text=False,
                clean_up_tokenization_spaces=True,
            )
            self._logger.info("Qwen pipeline initialized successfully")
            llm = HuggingFacePipeline(pipeline=pipe)
            return llm
        except Exception as e:
            # Broad catch is deliberate: any load failure (network, auth,
            # missing deps) is recorded and surfaced as an unavailable LLM.
            self._logger.error(f"Failed to load Qwen: {e}")
            self._initialization_error = str(e)
            return None

    def is_available(self) -> bool:
        """True only when the LLM loaded successfully and no error is recorded."""
        return self._llm_instance is not None and self._initialization_error is None

    def get_initialization_error(self) -> Optional[str]:
        """Return the recorded initialization error message, or None."""
        return self._initialization_error
def get_llm(provider: str = "huggingface", model_kwargs: Optional[Dict[str, Any]] = None):
    """Module-level convenience wrapper: resolve the singleton manager and
    return its LLM instance (None if initialization failed)."""
    return LLMManager().get_llm(provider, model_kwargs)
def get_llm_with_provider(provider: str = "huggingface", model_kwargs: Optional[Dict[str, Any]] = None):
    """Fetch an LLM for the given provider/model kwargs via the singleton
    LLMManager (None if initialization failed)."""
    return LLMManager().get_llm(provider, model_kwargs)