File size: 3,089 Bytes
76d540d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""

AI Research Paper Helper - Model Manager

Lightweight version: Only handles local embeddings.

LLM inference goes through API (Groq/OpenRouter).

"""

import asyncio
import logging
from typing import Optional

from sentence_transformers import SentenceTransformer

from config import settings

logger = logging.getLogger(__name__)


class ModelManager:
    """Singleton manager for ML models with lazy loading and caching.

    Only loads the lightweight embedding model locally; all LLM inference
    is delegated to external APIs.
    """

    _instance: Optional['ModelManager'] = None
    _lock = asyncio.Lock()  # guards first-time model load (double-checked below)

    def __init__(self):
        # Embeddings are light enough for CPU; no GPU detection needed.
        self.device = "cpu"
        logger.info(f"ModelManager initialized (embeddings only, device: {self.device})")

        # Lazily-loaded embedding model (lightweight, ~80MB).
        # Stays None until load_embedding_model() is first awaited.
        self._embedding_model: Optional[SentenceTransformer] = None

    @classmethod
    def get_instance(cls) -> 'ModelManager':
        """Get the singleton instance, creating it on first call.

        No lock needed here: there is no await between the check and the
        assignment, so asyncio's single-threaded scheduling cannot interleave.
        """
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    @property
    def embedding_model(self) -> Optional[SentenceTransformer]:
        """The currently loaded embedding model, or None if not yet loaded."""
        return self._embedding_model

    async def load_embedding_model(self) -> SentenceTransformer:
        """Load (and cache) the embedding model without blocking the event loop.

        Uses double-checked locking so that concurrent callers trigger only
        one load; subsequent calls return the cached instance immediately.

        Returns:
            The loaded SentenceTransformer instance.
        """
        if self._embedding_model is not None:
            return self._embedding_model

        async with self._lock:
            # Re-check: another coroutine may have finished loading while we
            # were waiting for the lock.
            if self._embedding_model is not None:
                return self._embedding_model

            logger.info(f"Loading embedding model: {settings.embedding_model}")

            # Model download/initialization is blocking work; run it in a
            # worker thread. asyncio.to_thread replaces the deprecated
            # get_event_loop()/run_in_executor(None, ...) pattern.
            self._embedding_model = await asyncio.to_thread(
                SentenceTransformer,
                settings.embedding_model,
                cache_folder=str(settings.model_cache_dir),
                device=self.device,
            )

            logger.info("Embedding model loaded successfully")
            return self._embedding_model

    async def get_embeddings(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for a list of texts.

        Loads the model on first use; encoding runs in a worker thread so
        the event loop stays responsive.

        Args:
            texts: The strings to embed.

        Returns:
            One embedding vector (list of floats) per input text.
        """
        model = await self.load_embedding_model()

        embeddings = await asyncio.to_thread(
            model.encode, texts, convert_to_tensor=False, show_progress_bar=False
        )

        # encode() returns a numpy array; convert to plain lists for callers.
        return embeddings.tolist()

    async def cleanup(self):
        """Release the cached model so its memory can be reclaimed."""
        logger.info("Cleaning up model resources...")

        if self._embedding_model is not None:
            # Drop the cached reference; the model is re-loadable on demand.
            del self._embedding_model
            self._embedding_model = None

        logger.info("Model cleanup complete")