import logging
import os

import fasttext

from src.config import Config

logger = logging.getLogger(__name__)


class LanguageDetectorService:
    """Detect the language of a piece of text with a FastText model.

    The model is not loaded at construction time; call :meth:`initialize`
    once before calling :meth:`detect_language`.
    """

    def __init__(self) -> None:
        # Set by initialize(); remains None until the model has been loaded.
        self.model = None
        self._initialized = False

    def initialize(self) -> None:
        """Load the FastText model, doing nothing if it is already loaded.

        The model path is resolved through
        ``Config.get_model_path(Config.FASTTEXT_FILENAME)``.

        Raises:
            RuntimeError: If the model file does not exist, or if
                ``fasttext.load_model`` fails for any reason (the original
                error is attached as ``__cause__``).
        """
        if self._initialized:
            return

        model_path = Config.get_model_path(Config.FASTTEXT_FILENAME)

        if not os.path.exists(model_path):
            # The service cannot work without the model, so fail loudly
            # instead of continuing in a half-initialized state.
            logger.error("FastText model not found at %s", model_path)
            raise RuntimeError(f"FastText model not found at {model_path}")

        logger.info("Loading FastText model from %s...", model_path)
        try:
            # NOTE: fasttext may print a warning while loading. According to
            # the original author it is emitted from the C++ side, so no
            # attempt is made to suppress it here.
            model = fasttext.load_model(model_path)
        except Exception as e:
            # logger.exception keeps the traceback; "from e" keeps the cause
            # visible to whoever handles the RuntimeError.
            logger.exception("Failed to load FastText model: %s", e)
            raise RuntimeError(f"Failed to load FastText model: {e}") from e

        self.model = model
        self._initialized = True
        logger.info("LanguageDetectorService initialized successfully.")

    def detect_language(self, text: str) -> str:
        """Return the predicted language label for ``text``.

        Args:
            text: The text to classify. Newlines are replaced with spaces
                before prediction.

        Returns:
            The top predicted label with its ``__label__`` prefix removed
            (e.g. ``'en'``, ``'fr'`` -- presumably ISO 639-1 codes; this
            depends on the loaded model). Returns ``"unknown"`` when the
            text is empty or whitespace-only, when the model yields no
            label, or when prediction raises.

        Raises:
            RuntimeError: If :meth:`initialize` has not completed
                successfully.
        """
        if not self._initialized or self.model is None:
            raise RuntimeError("LanguageDetectorService not initialized")

        if not text or not text.strip():
            return "unknown"

        # fasttext expects a single line for prediction.
        clean_text = text.replace("\n", " ")

        try:
            # predict returns a tuple: (['__label__en'], array([0.98...]))
            labels, _ = self.model.predict(clean_text)
            if labels:
                # Extract the language code from e.g. '__label__en'.
                return labels[0].replace("__label__", "")
            return "unknown"
        except Exception as e:
            # Deliberate best-effort: detection failures must not crash the
            # caller, but the traceback is logged so they are diagnosable.
            logger.exception("Error detecting language: %s", e)
            return "unknown"


# Global instance
language_detector_service = LanguageDetectorService()