File size: 1,535 Bytes
30e9fd8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""
Vectorizer module.

Responsibility:
- Load the trained TF-IDF vectorizer
- Convert raw Sinhala sentences into numerical feature vectors

This module is intentionally isolated so that:
- Feature extraction logic is reusable
- Vectorizer is loaded only once (lazy loading)
- Inference code remains clean and readable
"""

import joblib
import os

# ------------------------------------------------------------
# Filesystem locations
# ------------------------------------------------------------
# BASE_DIR is the directory that contains this module file. Anchoring
# paths here (instead of the current working directory) means they
# resolve the same way no matter where the app is launched from.
# NOTE(review): earlier notes describe this directory as
# binary_dyslexia_detector/ -- confirm against the repository layout.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Absolute location of the serialized TF-IDF vectorizer
VECTORIZER_PATH = os.path.join(BASE_DIR, "models", "tfidf_vectorizer.pkl")

# Module-level cache: remains None until the vectorizer is first loaded
_vectorizer = None


def load_vectorizer():
    """
    Return the shared TF-IDF vectorizer, reading it from disk on first use.

    On the first call the object stored at ``VECTORIZER_PATH`` is
    deserialized with ``joblib.load`` and kept in the module-level
    ``_vectorizer`` cache. Later calls return that cached object, so the
    file is not read again.

    Returns:
        The object loaded from the vectorizer file (expected to be the
        trained TF-IDF vectorizer).
    """
    global _vectorizer

    # Cache hit: an earlier call already loaded the vectorizer.
    if _vectorizer is not None:
        return _vectorizer

    # Cache miss: read from disk once and remember the result.
    _vectorizer = joblib.load(VECTORIZER_PATH)
    return _vectorizer


def vectorize_sentence(sentence: str):
    """
    Transform one Sinhala sentence into its TF-IDF representation.

    The vectorizer is obtained through ``load_vectorizer()``, and the
    sentence is passed to its ``transform`` method wrapped in a
    one-element list.

    Args:
        sentence (str): Raw Sinhala sentence

    Returns:
        scipy sparse matrix: Result of ``transform`` for the given sentence
    """
    # transform() is given a list of documents, hence the single-item list.
    return load_vectorizer().transform([sentence])