Spaces:

tinykavi
/

sinhala-dyslexia-binary-demo

Sleeping

Update vectorizer.py

30e9fd8 verified about 1 month ago

1.54 kB

	"""
	Vectorizer module.

	Responsibility:
	- Load the trained TF-IDF vectorizer
	- Convert raw Sinhala sentences into numerical feature vectors

	This module is intentionally isolated so that:
	- Feature extraction logic is reusable
	- Vectorizer is loaded only once (lazy loading)
	- Inference code remains clean and readable
	"""

	import joblib
	import os

	# ------------------------------------------------------------
	# Filesystem locations
	# ------------------------------------------------------------
	# BASE_DIR is the absolute path of the directory that contains this file,
	# so the model path below does not depend on the current working directory.
	BASE_DIR = os.path.split(os.path.abspath(__file__))[0]

	# Absolute path to the saved TF-IDF vectorizer:
	#   <BASE_DIR>/models/tfidf_vectorizer.pkl
	VECTORIZER_PATH = os.path.join(BASE_DIR, "models", "tfidf_vectorizer.pkl")

	# Module-level cache for the vectorizer; None means "not loaded yet".
	_vectorizer = None


	def load_vectorizer():
	    """
	    Return the TF-IDF vectorizer, loading it from disk on first use.

	    The first call reads VECTORIZER_PATH with joblib.load and stores the
	    result in the module-level `_vectorizer` cache. Later calls return
	    the cached object, which avoids repeated disk I/O during inference.

	    Returns:
	        The object deserialized from VECTORIZER_PATH.

	    Raises:
	        Any exception raised by joblib.load (for example when the file
	        is missing or cannot be deserialized) is propagated unchanged.
	    """
	    global _vectorizer
	    if _vectorizer is None:
	        # NOTE(review): joblib.load unpickles the file, so the model
	        # artifact must come from a trusted source.
	        _vectorizer = joblib.load(VECTORIZER_PATH)
	    return _vectorizer


	def vectorize_sentence(sentence: str):
	    """
	    Convert a single Sinhala sentence into a TF-IDF feature vector.

	    The sentence is wrapped in a one-element list and passed to the
	    `transform` method of the vectorizer returned by load_vectorizer().

	    Args:
	        sentence (str): Raw Sinhala sentence.

	    Returns:
	        The result of `vectorizer.transform([sentence])`. Presumably a
	        single-row scipy sparse matrix, assuming the saved object is a
	        scikit-learn TfidfVectorizer -- confirm against the model file.
	    """
	    vectorizer = load_vectorizer()
	    # transform expects an iterable of documents, hence the list wrapper.
	    return vectorizer.transform([sentence])