Spaces:
Sleeping
Sleeping
| import fasttext | |
| import os | |
| import logging | |
| from src.config import Config | |
| logger = logging.getLogger(__name__) | |
class LanguageDetectorService:
    """Language identification backed by a FastText model.

    A new instance holds no model. ``initialize()`` must be called once to
    load it before ``detect_language()`` can be used.
    """

    # Marker FastText puts in front of each predicted label.
    _LABEL_PREFIX = "__label__"
    # Sentinel returned whenever no language can be determined.
    _UNKNOWN = "unknown"

    def __init__(self):
        # Loaded FastText model; stays None until initialize() succeeds.
        self.model = None
        # Set once the model is loaded so repeated initialize() calls are no-ops.
        self._initialized = False

    def initialize(self):
        """Load the FastText model from the configured path.

        Does nothing if the model has already been loaded.

        Raises:
            RuntimeError: If the model file does not exist, or if loading it
                fails (the underlying exception is attached as ``__cause__``).
        """
        if self._initialized:
            return

        model_path = Config.get_model_path(Config.FASTTEXT_FILENAME)

        if not os.path.exists(model_path):
            # Without the model the service cannot function, so fail loudly
            # instead of continuing half-initialized.
            logger.error(f"FastText model not found at {model_path}")
            raise RuntimeError(f"FastText model not found at {model_path}")

        try:
            logger.info(f"Loading FastText model from {model_path}...")
            # NOTE: fasttext may print a warning while loading; it reportedly
            # comes from the native layer and is not suppressed here.
            self.model = fasttext.load_model(model_path)
        except Exception as e:
            logger.error(f"Failed to load FastText model: {e}")
            # Chain the original error so its traceback is kept for callers.
            raise RuntimeError(f"Failed to load FastText model: {e}") from e

        self._initialized = True
        logger.info("LanguageDetectorService initialized successfully.")

    def detect_language(self, text: str) -> str:
        """Detect the language of the provided text.

        Args:
            text: Text to classify. Newlines are replaced with spaces because
                the model is given a single line per prediction.

        Returns:
            The top predicted label with its ``__label__`` prefix removed
            (e.g. ``'en'``, ``'fr'``), or ``"unknown"`` when the text is empty
            or whitespace-only, the model returns no label, or prediction
            fails.

        Raises:
            RuntimeError: If the service has not been initialized.
        """
        if not self._initialized or self.model is None:
            raise RuntimeError("LanguageDetectorService not initialized")

        if not text or not text.strip():
            return self._UNKNOWN

        # fasttext expects a single line for prediction.
        single_line = text.replace("\n", " ")

        try:
            # predict returns a tuple: (['__label__en'], array([0.98...]))
            labels, _ = self.model.predict(single_line)
            if not labels:
                return self._UNKNOWN

            top_label = labels[0]
            # Strip the marker only where it appears as a prefix.
            if top_label.startswith(self._LABEL_PREFIX):
                return top_label[len(self._LABEL_PREFIX):]
            return top_label
        except Exception as e:
            # Detection is best-effort: report "unknown" rather than crash the
            # caller, but log the traceback so the failure can be diagnosed.
            logger.exception("Error detecting language: %s", e)
            return self._UNKNOWN
| # Global instance shared by importers of this module. Constructing it does | |
| # not load the model: initialize() must be called before detect_language(). | |
| language_detector_service = LanguageDetectorService() | |