"""
Vectorizer module.

Responsibility:
- Load the trained TF-IDF vectorizer
- Convert raw Sinhala sentences into numerical feature vectors

This module is intentionally isolated so that:
- Feature extraction logic is reusable
- Vectorizer is loaded only once (lazy loading)
- Inference code remains clean and readable
"""
import joblib
import os
# ------------------------------------------------------------
# Locate model artifacts relative to this file
# ------------------------------------------------------------
# BASE_DIR is the absolute directory containing this module (the original
# notes identify it as binary_dyslexia_detector/). Anchoring on __file__
# keeps the paths below independent of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Absolute location of the serialized TF-IDF vectorizer.
VECTORIZER_PATH = os.path.join(BASE_DIR, "models", "tfidf_vectorizer.pkl")

# Module-level cache slot; stays None until the vectorizer is first loaded.
_vectorizer = None
def load_vectorizer():
    """
    Return the shared vectorizer, reading it from disk on first use.

    The first call deserializes the object stored at VECTORIZER_PATH and
    keeps it in the module-level ``_vectorizer`` variable. Later calls
    return that cached object directly, so the file is not read again.

    Returns:
        The object loaded from VECTORIZER_PATH (expected to be the
        trained TF-IDF vectorizer).
    """
    global _vectorizer

    # Fast path: already loaded by an earlier call.
    if _vectorizer is not None:
        return _vectorizer

    # joblib.load unpickles the file, so VECTORIZER_PATH must point to a
    # trusted artifact.
    _vectorizer = joblib.load(VECTORIZER_PATH)
    return _vectorizer
def vectorize_sentence(sentence: str):
    """
    Turn one Sinhala sentence into its TF-IDF feature representation.

    Args:
        sentence (str): Raw Sinhala sentence.

    Returns:
        The result of the cached vectorizer's ``transform`` call for a
        one-item batch (documented by the original author as a scipy
        sparse matrix).
    """
    # transform() is given a collection of documents, so the single
    # sentence is wrapped in a one-element list.
    single_item_batch = [sentence]
    return load_vectorizer().transform(single_item_batch)
|