| """ | |
| Model classes untuk prediksi kata dengan Fuzzy Logic | |
| Load dari brain_data_processor.pkl | |
| """ | |
| import re | |
| import numpy as np | |
| from typing import List, Tuple | |
| from collections import Counter | |


class DataProcessorWrapper:
    """
    Wrapper class for the data processor - needed for unpickling brain_data_processor.pkl.
    """
    def __init__(self, unigram_freq, bigram_freq, trigram_freq, vocabulary, slang_dict):
        self.unigram_freq = unigram_freq
        # Convert to plain dicts (the pickled counts may be defaultdicts)
        self.bigram_freq = dict(bigram_freq)
        self.trigram_freq = dict(trigram_freq)
        self.vocabulary = vocabulary
        self.slang_dict = slang_dict
        self.vocab_size = len(vocabulary)
        self.total_words = sum(unigram_freq.values())


def preprocess_text(text: str, slang_dict: dict) -> Tuple[List[str], List[str]]:
    """
    Preprocess text in this order: regex cleaning -> slang normalization.
    Stopwords are NOT removed (the keyboard needs to predict them).
    Returns:
        List[str]: list of processed words
        List[str]: transformation log for the X-Ray view
    """
    # Step 1: Regex cleaning - remove all non-alphabetic, non-space characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Step 2: Lowercase and tokenize
    words = text.lower().split()
    # Step 3: Slang normalization with tracking
    normalized_words = []
    transformations = []
    for w in words:
        if w in slang_dict:
            normalized = slang_dict[w]
            transformations.append(f"'{w}' → '{normalized}'")
            normalized_words.append(normalized)
        else:
            normalized_words.append(w)
    return normalized_words, transformations
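
# A minimal usage sketch (the slang entry below is hypothetical, for illustration):
#
#   words, log = preprocess_text("Gmn kabarmu? 123", {"gmn": "gimana"})
#   # words -> ['gimana', 'kabarmu']
#   # log   -> ["'gmn' → 'gimana'"]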


class BaseNGramModel:
    """
    Pure probabilistic N-Gram model with a backoff mechanism.
    """
    def __init__(self, data_processor):
        self.unigram_freq = data_processor.unigram_freq
        self.bigram_freq = data_processor.bigram_freq
        self.trigram_freq = data_processor.trigram_freq
        self.vocabulary = data_processor.vocabulary
        self.vocab_size = data_processor.vocab_size
        self.total_words = data_processor.total_words

    def predict(self, context: List[str], top_k: int = 3) -> List[Tuple[str, float]]:
        """
        Predict the next word given the context.
        Returns: [(word, probability), ...]
        """
        scores = {}
        if len(context) >= 2:
            # Try the trigram first
            key = (context[-2], context[-1])
            if key in self.trigram_freq:
                candidates = self.trigram_freq[key]
                total = sum(candidates.values())
                for word, count in candidates.items():
                    # Probability with Laplace smoothing
                    scores[word] = (count + 1) / (total + self.vocab_size)
        if len(scores) == 0 and len(context) >= 1:
            # Back off to the bigram
            key = context[-1]
            if key in self.bigram_freq:
                candidates = self.bigram_freq[key]
                total = sum(candidates.values())
                for word, count in candidates.items():
                    scores[word] = (count + 1) / (total + self.vocab_size)
        if len(scores) == 0:
            # Back off to the unigram (most frequent words)
            for word, count in Counter(self.unigram_freq).most_common(100):
                scores[word] = count / self.total_words
        # Sort by probability and return the top_k
        sorted_predictions = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return sorted_predictions[:top_k]
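
# A hedged usage sketch (assumes a processor unpickled from brain_data_processor.pkl;
# the context words and scores are illustrative only):
#
#   model = BaseNGramModel(processor)
#   model.predict(["aku", "mau"], top_k=3)
#   # -> e.g. [('makan', 0.12), ('pergi', 0.09), ('tidur', 0.05)]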


class FuzzyManualModel:
    """
    Fuzzy Logic model with manually chosen parameters.
    """
    def __init__(self, data_processor):
        self.unigram_freq = data_processor.unigram_freq
        self.bigram_freq = data_processor.bigram_freq
        self.trigram_freq = data_processor.trigram_freq
        self.vocabulary = data_processor.vocabulary
        self.vocab_size = data_processor.vocab_size
        self.total_words = data_processor.total_words
        # Manual parameters for the fuzzy membership functions
        # Probability: [low_peak, medium_peak, high_peak]
        self.prob_params = [0.15, 0.45, 0.85]
        # Popularity: [rare_peak, common_peak, very_common_peak] (log scale)
        self.pop_params = [2.0, 4.5, 7.0]  # log10 values
        # Fuzzy weights
        self.weights = {
            'prob': 0.6,  # 60% weight on probability
            'pop': 0.4    # 40% weight on popularity
        }

    def _get_base_predictions(self, context: List[str], top_k: int = 50) -> List[Tuple[str, float]]:
        """Get base predictions using the n-gram model (same backoff as BaseNGramModel.predict)"""
        scores = {}
        if len(context) >= 2:
            key = (context[-2], context[-1])
            if key in self.trigram_freq:
                candidates = self.trigram_freq[key]
                total = sum(candidates.values())
                for word, count in candidates.items():
                    scores[word] = (count + 1) / (total + self.vocab_size)
        if len(scores) == 0 and len(context) >= 1:
            key = context[-1]
            if key in self.bigram_freq:
                candidates = self.bigram_freq[key]
                total = sum(candidates.values())
                for word, count in candidates.items():
                    scores[word] = (count + 1) / (total + self.vocab_size)
        if len(scores) == 0:
            for word, count in Counter(self.unigram_freq).most_common(100):
                scores[word] = count / self.total_words
        sorted_predictions = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return sorted_predictions[:top_k]

    def fuzzify_prob(self, prob):
        """Fuzzify the probability score (triangular memberships, zero at distance 0.3 from each peak)"""
        low = max(0, 1 - abs(prob - self.prob_params[0]) / 0.3)
        med = max(0, 1 - abs(prob - self.prob_params[1]) / 0.3)
        high = max(0, 1 - abs(prob - self.prob_params[2]) / 0.3)
        return {'low': low, 'medium': med, 'high': high}

    def fuzzify_pop(self, count):
        """Fuzzify the popularity score (log scale, triangular memberships with half-width 2.5)"""
        log_count = np.log10(max(1, count))
        rare = max(0, 1 - abs(log_count - self.pop_params[0]) / 2.5)
        common = max(0, 1 - abs(log_count - self.pop_params[1]) / 2.5)
        very_common = max(0, 1 - abs(log_count - self.pop_params[2]) / 2.5)
        return {'rare': rare, 'common': common, 'very_common': very_common}

    def fuzzy_inference(self, prob_fuzzy, pop_fuzzy):
        """Apply the fuzzy rules and aggregate"""
        # Rule 1: High prob AND Very Common pop -> Excellent (0.9)
        rule1 = min(prob_fuzzy['high'], pop_fuzzy['very_common']) * 0.9
        # Rule 2: Medium prob AND Common pop -> Good (0.6)
        rule2 = min(prob_fuzzy['medium'], pop_fuzzy['common']) * 0.6
        # Rule 3: Low prob BUT Very Common pop -> Fair (0.45)
        rule3 = min(prob_fuzzy['low'], pop_fuzzy['very_common']) * 0.45
        # Rule 4: Any other combination -> Poor (weighted average of the low memberships)
        rule4 = (prob_fuzzy['low'] * 0.2 + pop_fuzzy['rare'] * 0.1) / 2
        # Defuzzification: take the strongest rule activation (max)
        return max(rule1, rule2, rule3, rule4)
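
    # A worked example with the manual parameters (input numbers are illustrative):
    # prob = 0.45 fuzzifies to {'low': 0.0, 'medium': 1.0, 'high': 0.0}, and a
    # count of ~31,623 (log10 ≈ 4.5) fuzzifies to {'rare': 0.0, 'common': 1.0,
    # 'very_common': 0.0}. Only Rule 2 fires: min(1.0, 1.0) * 0.6 = 0.6, so
    # fuzzy_inference returns 0.6.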

    def predict(self, context: List[str], top_k: int = 3) -> List[Tuple[str, float]]:
        """Predict with fuzzy scoring"""
        # Get the base predictions
        base_preds = self._get_base_predictions(context, top_k=50)
        fuzzy_scores = {}
        for word, prob in base_preds:
            # Get the popularity
            pop_count = self.unigram_freq.get(word, 1)
            # Fuzzify
            prob_fuzzy = self.fuzzify_prob(prob)
            pop_fuzzy = self.fuzzify_pop(pop_count)
            # Inference
            fuzzy_score = self.fuzzy_inference(prob_fuzzy, pop_fuzzy)
            # Combine with the weights
            final_score = (self.weights['prob'] * prob +
                           self.weights['pop'] * fuzzy_score)
            fuzzy_scores[word] = final_score
        # Sort and return
        sorted_predictions = sorted(fuzzy_scores.items(), key=lambda x: x[1], reverse=True)
        return sorted_predictions[:top_k]
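
# Design note: predict() blends the raw n-gram probability (weight 0.6) with the
# defuzzified quality score (weight 0.4), so a word still needs a decent
# conditional probability to rank highly even if it is very common. A hedged
# illustration with made-up numbers: prob = 0.45 and fuzzy_score = 0.6 give
# final_score = 0.6 * 0.45 + 0.4 * 0.6 = 0.51.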


class FuzzyGAModel(FuzzyManualModel):
    """
    Fuzzy Logic model with parameters tuned by a Genetic Algorithm.
    """
    def __init__(self, data_processor, ga_params):
        super().__init__(data_processor)
        # Override with the GA parameters
        self.prob_params = ga_params[:3]
        self.pop_params = ga_params[3:6]


class FuzzyPSOModel(FuzzyManualModel):
    """
    Fuzzy Logic model with parameters tuned by Particle Swarm Optimization.
    """
    def __init__(self, data_processor, pso_params):
        super().__init__(data_processor)
        # Override with the PSO parameters
        self.prob_params = pso_params[:3]
        self.pop_params = pso_params[3:6]
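

if __name__ == "__main__":
    # A minimal end-to-end sketch (hedged: assumes brain_data_processor.pkl sits
    # next to this module and was pickled with DataProcessorWrapper importable
    # under the same path; the input sentence is illustrative only).
    import pickle

    with open("brain_data_processor.pkl", "rb") as f:
        processor = pickle.load(f)

    words, xray_log = preprocess_text("aku mau makan", processor.slang_dict)
    print("Tokens:", words, "| X-Ray:", xray_log)

    for name, model in [
        ("n-gram", BaseNGramModel(processor)),
        ("fuzzy (manual)", FuzzyManualModel(processor)),
    ]:
        print(name, "->", model.predict(words, top_k=3))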