| | from transformers import AutoTokenizer, AutoModelForCausalLM |
| | from sentence_transformers import SentenceTransformer |
| | import torch |
| | import logging |
| | from config.config import settings |
| |
|
| | logger = logging.getLogger(__name__) |
| |
|
class ModelService:
    """Singleton holder for the tokenizer, causal language model and embedder.

    Every call to ``ModelService()`` returns the same instance. The models
    named in ``settings`` are loaded the first time the instance is
    constructed; later constructions reuse the already-loaded objects.
    """

    # The one shared instance, created lazily by ``__new__``.
    _instance = None

    def __new__(cls):
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    def __init__(self) -> None:
        """Load the models unless a previous construction already did.

        Raises:
            RuntimeError: If any of the models fails to load.
        """
        if self._initialized:
            return

        self.tokenizer = None
        self.model = None
        self.embedder = None
        self._load_models()
        # Mark the singleton as initialized only after a successful load, so
        # that a failed attempt is retried by the next ``ModelService()`` call
        # instead of leaving a permanently half-built instance behind.
        self._initialized = True

    def _load_models(self) -> None:
        """Load the tokenizer, the causal LM and the sentence embedder.

        Model names and the target device are read from ``settings``
        (``MODEL_NAME``, ``EMBEDDER_MODEL``, ``DEVICE``).

        Raises:
            RuntimeError: Wraps any exception raised while loading; the
                original exception is attached as ``__cause__``.
        """
        try:
            logger.info("Loading models...")

            self.tokenizer = AutoTokenizer.from_pretrained(
                settings.MODEL_NAME, use_fast=False
            )
            # The EOS token doubles as the padding token.
            self.tokenizer.pad_token = self.tokenizer.eos_token
            logger.info(f"Tokenizer for {settings.MODEL_NAME} loaded successfully.")

            # NOTE: no quantization config is passed to ``from_pretrained``
            # below; only dtype and device placement depend on the device.
            device = settings.DEVICE
            on_cuda = device == "cuda"

            self.model = AutoModelForCausalLM.from_pretrained(
                settings.MODEL_NAME,
                # Half precision on GPU, full precision otherwise.
                torch_dtype=torch.float16 if on_cuda else torch.float32,
                device_map="auto" if on_cuda else None,
                # SECURITY: this allows code shipped with the model repository
                # to run locally; only use with model sources you trust.
                trust_remote_code=True,
            )
            logger.info(f"Model {settings.MODEL_NAME} loaded successfully on {device}.")

            # NOTE(review): the embedder device follows CUDA availability
            # rather than settings.DEVICE -- confirm this is intentional.
            self.embedder = SentenceTransformer(
                settings.EMBEDDER_MODEL,
                device='cuda' if torch.cuda.is_available() else 'cpu',
            )
            logger.info(f"Embedder {settings.EMBEDDER_MODEL} loaded successfully.")

        except Exception as e:
            # ``exception`` logs the same message plus the traceback.
            logger.exception("Error loading models: %s", e)
            raise RuntimeError(f"Failed to initialize ModelService: {str(e)}") from e

    def get_models(self):
        """Return the tokenizer, language model, and sentence embedder instances.

        Returns:
            A ``(tokenizer, model, embedder)`` tuple.

        Raises:
            RuntimeError: If any of the three has not been loaded.
        """
        # Compare against None explicitly: these objects may define __len__
        # or __bool__, so a loaded one is not guaranteed to be truthy.
        if self.tokenizer is None or self.model is None or self.embedder is None:
            raise RuntimeError("Models are not fully loaded.")
        return self.tokenizer, self.model, self.embedder
| |
|