File size: 2,308 Bytes
1182571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import fasttext
import os
import logging
from src.config import Config

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

class LanguageDetectorService:
    """Language detection backed by a FastText model that is loaded on demand.

    Constructing the service does not load anything; ``initialize`` must be
    called successfully before ``detect_language`` can be used.
    """

    # Prefix that is stripped from the predicted label (e.g. '__label__en').
    _LABEL_PREFIX = "__label__"

    def __init__(self):
        # Loaded FastText model; stays None until initialize() succeeds.
        self.model = None
        # Flipped to True only after the model has been loaded successfully.
        self._initialized = False

    def initialize(self):
        """Load the FastText model from the configured path.

        Calling this again after a successful load is a no-op.

        Raises:
            RuntimeError: If the model file does not exist, or if loading it
                fails. In the latter case the original exception is attached
                as the cause.
        """
        if self._initialized:
            return

        model_path = Config.get_model_path(Config.FASTTEXT_FILENAME)
        if not os.path.exists(model_path):
            logger.error("FastText model not found at %s", model_path)
            # If model is missing, we can't function properly.
            # In production, this should probably prevent startup or fail gracefully.
            raise RuntimeError(f"FastText model not found at {model_path}")

        try:
            logger.info("Loading FastText model from %s...", model_path)
            # Suppress fasttext warning on load if possible, but it usually
            # prints to C++ stdout.
            self.model = fasttext.load_model(model_path)
            self._initialized = True
            logger.info("LanguageDetectorService initialized successfully.")
        except Exception as e:
            logger.error("Failed to load FastText model: %s", e)
            # Chain explicitly so the underlying load error stays attached
            # to the RuntimeError as its cause.
            raise RuntimeError(f"Failed to load FastText model: {e}") from e

    def detect_language(self, text: str) -> str:
        """Detect the language of ``text``.

        Args:
            text: The text to classify. Newlines are replaced with spaces
                before prediction.

        Returns:
            The first predicted label with the ``__label__`` prefix removed
            (e.g. 'en', 'fr'), or ``"unknown"`` when the text is empty or
            whitespace-only, when no label is predicted, or when prediction
            raises.

        Raises:
            RuntimeError: If the service has not been initialized.
        """
        if not self._initialized or self.model is None:
            raise RuntimeError("LanguageDetectorService not initialized")

        if not text or not text.strip():
            return "unknown"

        # fasttext expects a single line for prediction
        clean_text = text.replace("\n", " ")

        try:
            # predict returns a tuple: (['__label__en'], array([0.98...]))
            labels, _ = self.model.predict(clean_text)
            if labels:
                # Extract language code from '__label__en'
                return labels[0].replace(self._LABEL_PREFIX, "")
            return "unknown"
        except Exception as e:
            # Best-effort: detection failures must not propagate to callers,
            # but keep the traceback in the log since the error is swallowed.
            logger.exception("Error detecting language: %s", e)
            return "unknown"

# Global instance, created at import time. Construction does not load the
# model: initialize() must be called on it before detect_language() is used,
# otherwise detect_language() raises RuntimeError.
language_detector_service = LanguageDetectorService()