"""
Vectorizer module.

Responsibility:
- Load the trained TF-IDF vectorizer
- Convert raw Sinhala sentences into numerical feature vectors

This module is intentionally isolated so that:
- Feature extraction logic is reusable
- Vectorizer is loaded only once (lazy loading)
- Inference code remains clean and readable
"""
|
|
| import joblib |
| import os |
|
|
| |
| |
| |
| |
| |
# Absolute path of the directory that contains this module file.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Location of the serialized TF-IDF vectorizer, resolved relative to this
# module's own directory rather than the current working directory.
VECTORIZER_PATH = os.path.join(BASE_DIR, "models", "tfidf_vectorizer.pkl")

# Module-level cache slot; stays None until load_vectorizer() fills it.
_vectorizer = None
|
|
|
|
def load_vectorizer():
    """
    Return the shared TF-IDF vectorizer, reading it from disk on first use.

    The object deserialized from VECTORIZER_PATH is kept in the module-level
    ``_vectorizer`` cache, so subsequent calls return the cached object
    without repeating the disk read.

    Returns:
        The object produced by ``joblib.load(VECTORIZER_PATH)``.
    """
    global _vectorizer

    # Fast path: a previous call already populated the cache.
    if _vectorizer is not None:
        return _vectorizer

    _vectorizer = joblib.load(VECTORIZER_PATH)
    return _vectorizer
|
|
|
|
def vectorize_sentence(sentence: str):
    """
    Transform one Sinhala sentence into its TF-IDF representation.

    Args:
        sentence (str): Raw Sinhala sentence

    Returns:
        The result of calling ``transform`` on a one-element batch holding
        ``sentence`` (expected to be a scipy sparse matrix).
    """
    # transform() is given a list of documents, so wrap the single sentence.
    batch = [sentence]
    return load_vectorizer().transform(batch)
|
|