Spaces:

elzaff
/

keyboard-recommendation

Sleeping

File size: 8,992 Bytes

f98879b

"""
Model classes untuk prediksi kata dengan Fuzzy Logic
Load dari brain_data_processor.pkl
"""
import re
import numpy as np
from typing import List, Tuple
from collections import Counter


class DataProcessorWrapper:
    """
    Plain container for the n-gram statistics used by the prediction models.

    The class must exist under this name so that brain_data_processor.pkl
    can be unpickled.

    Attributes:
        unigram_freq: word -> count mapping, stored by reference.
        bigram_freq: plain-dict shallow copy of the bigram table.
        trigram_freq: plain-dict shallow copy of the trigram table.
        vocabulary: collection of known words, stored by reference.
        slang_dict: slang -> normalized word mapping, stored by reference.
        vocab_size: ``len(vocabulary)`` at construction time.
        total_words: sum of all unigram counts at construction time.
    """
    def __init__(self, unigram_freq, bigram_freq, trigram_freq, vocabulary, slang_dict):
        self.unigram_freq = unigram_freq
        # Re-wrap the n-gram tables so they are always ordinary dicts,
        # whatever mapping type the caller passed in.
        self.bigram_freq, self.trigram_freq = dict(bigram_freq), dict(trigram_freq)
        self.vocabulary, self.slang_dict = vocabulary, slang_dict
        # Derived totals are computed once here, not on every prediction.
        self.vocab_size = len(vocabulary)
        self.total_words = sum(unigram_freq.values())


def preprocess_text(text: str, slang_dict: dict) -> Tuple[List[str], List[str]]:
    """
    Clean raw text and normalize slang tokens.

    Processing order: regex cleaning -> lowercase + whitespace tokenization
    -> slang normalization. Stopwords are NOT removed, because the keyboard
    has to be able to predict them.

    Args:
        text: raw input text.
        slang_dict: mapping from a slang token to its normalized replacement.

    Returns:
        A ``(words, transformations)`` tuple where ``words`` is the list of
        processed tokens and ``transformations`` is a list of human-readable
        "'slang' → 'normalized'" entries (one per replaced token) for the
        X-Ray view.
    """
    # Step 1: delete every character that is neither an ASCII letter nor
    # whitespace. Characters are removed, not replaced by a space, so
    # "a,b" collapses to "ab".
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text)

    # Step 2: lowercase and split on whitespace.
    tokens = cleaned.lower().split()

    # Step 3: slang normalization, recording every replacement that happened.
    normalized_words = []
    transformations = []
    for token in tokens:
        if token in slang_dict:
            replacement = slang_dict[token]
            transformations.append(f"'{token}' → '{replacement}'")
            normalized_words.append(replacement)
        else:
            normalized_words.append(token)

    return normalized_words, transformations


class BaseNGramModel:
    """
    Purely probabilistic n-gram predictor with trigram -> bigram -> unigram backoff.

    All statistics are taken from ``data_processor`` when the model is built.
    """
    def __init__(self, data_processor):
        source = data_processor
        self.unigram_freq = source.unigram_freq
        self.bigram_freq = source.bigram_freq
        self.trigram_freq = source.trigram_freq
        self.vocabulary = source.vocabulary
        self.vocab_size = source.vocab_size
        self.total_words = source.total_words

    def _smoothed_followers(self, table, key):
        """
        Return Laplace-smoothed scores for the words stored under ``key``.

        Yields an empty dict when ``key`` is not present in ``table``.
        """
        if key not in table:
            return {}
        followers = table[key]
        observed = sum(followers.values())
        # Add-one smoothing: (count + 1) / (total + vocabulary size).
        return {
            word: (count + 1) / (observed + self.vocab_size)
            for word, count in followers.items()
        }

    def predict(self, context: List[str], top_k: int = 3) -> List[Tuple[str, float]]:
        """
        Predict the next word from the trailing words of ``context``.

        Returns: [(word, probability), ...] sorted by descending probability,
        at most ``top_k`` entries.
        """
        pool = {}

        # Highest order first: the last two words as a trigram prefix.
        if len(context) >= 2:
            pool = self._smoothed_followers(self.trigram_freq, (context[-2], context[-1]))

        # Back off to the last word alone when the trigram gave nothing.
        if not pool and len(context) >= 1:
            pool = self._smoothed_followers(self.bigram_freq, context[-1])

        # Final fallback: relative frequency of the 100 most common words.
        if not pool:
            frequent = Counter(self.unigram_freq).most_common(100)
            pool = {word: count / self.total_words for word, count in frequent}

        ranking = sorted(pool.items(), key=lambda pair: pair[1], reverse=True)
        return ranking[:top_k]


class FuzzyManualModel:
    """
    Next-word predictor that re-scores n-gram candidates with fuzzy rules.

    The membership-function peaks and blending weights are set by hand in
    ``__init__``.
    """
    def __init__(self, data_processor):
        source = data_processor
        self.unigram_freq = source.unigram_freq
        self.bigram_freq = source.bigram_freq
        self.trigram_freq = source.trigram_freq
        self.vocabulary = source.vocabulary
        self.vocab_size = source.vocab_size
        self.total_words = source.total_words

        # Peaks of the probability membership functions: [low, medium, high].
        self.prob_params = [0.15, 0.45, 0.85]

        # Peaks of the popularity membership functions on a log10 scale:
        # [rare, common, very_common].
        self.pop_params = [2.0, 4.5, 7.0]

        # Blend used in predict(): 'prob' scales the raw n-gram probability,
        # 'pop' scales the fuzzy rule score.
        self.weights = {'prob': 0.6, 'pop': 0.4}

    def _smoothed_followers(self, table, key):
        """Return Laplace-smoothed scores for words under ``key`` ({} if absent)."""
        if key not in table:
            return {}
        followers = table[key]
        observed = sum(followers.values())
        return {
            word: (count + 1) / (observed + self.vocab_size)
            for word, count in followers.items()
        }

    def _get_base_predictions(self, context: List[str], top_k: int = 50) -> List[Tuple[str, float]]:
        """Rank candidates with trigram -> bigram -> unigram backoff."""
        pool = {}

        if len(context) >= 2:
            pool = self._smoothed_followers(self.trigram_freq, (context[-2], context[-1]))

        if not pool and len(context) >= 1:
            pool = self._smoothed_followers(self.bigram_freq, context[-1])

        if not pool:
            # Nothing matched: use the 100 most frequent words.
            frequent = Counter(self.unigram_freq).most_common(100)
            pool = {word: count / self.total_words for word, count in frequent}

        ranking = sorted(pool.items(), key=lambda pair: pair[1], reverse=True)
        return ranking[:top_k]

    @staticmethod
    def _triangular(value, peak, half_width):
        """Triangular membership: 1 at ``peak``, falling to 0 at ``half_width`` away."""
        return max(0, 1 - abs(value - peak) / half_width)

    def fuzzify_prob(self, prob):
        """Map a probability to 'low' / 'medium' / 'high' membership degrees."""
        peaks = self.prob_params
        return {
            'low': self._triangular(prob, peaks[0], 0.3),
            'medium': self._triangular(prob, peaks[1], 0.3),
            'high': self._triangular(prob, peaks[2], 0.3),
        }

    def fuzzify_pop(self, count):
        """Map a raw count to 'rare' / 'common' / 'very_common' degrees (log10 scale)."""
        # Counts below 1 are clamped to 1 so the logarithm is never negative.
        magnitude = np.log10(max(1, count))
        peaks = self.pop_params
        return {
            'rare': self._triangular(magnitude, peaks[0], 2.5),
            'common': self._triangular(magnitude, peaks[1], 2.5),
            'very_common': self._triangular(magnitude, peaks[2], 2.5),
        }

    def fuzzy_inference(self, prob_fuzzy, pop_fuzzy):
        """
        Evaluate the four fuzzy rules and return the strongest output.

        The result is the maximum of the rule outputs, not an average.
        """
        # High probability AND very common word -> excellent (0.9).
        excellent = min(prob_fuzzy['high'], pop_fuzzy['very_common']) * 0.9

        # Medium probability AND common word -> good (0.6).
        good = min(prob_fuzzy['medium'], pop_fuzzy['common']) * 0.6

        # Low probability BUT very common word -> fair (0.45).
        fair = min(prob_fuzzy['low'], pop_fuzzy['very_common']) * 0.45

        # Fallback: small score built from the 'low' and 'rare' degrees.
        poor = (prob_fuzzy['low'] * 0.2 + pop_fuzzy['rare'] * 0.1) / 2

        return max(excellent, good, fair, poor)

    def _score_candidate(self, word, prob):
        """Blend the n-gram probability of ``word`` with its fuzzy rule score."""
        # Words missing from the unigram table are treated as seen once.
        popularity = self.unigram_freq.get(word, 1)
        rule_score = self.fuzzy_inference(
            self.fuzzify_prob(prob),
            self.fuzzify_pop(popularity),
        )
        return self.weights['prob'] * prob + self.weights['pop'] * rule_score

    def predict(self, context: List[str], top_k: int = 3) -> List[Tuple[str, float]]:
        """Return the ``top_k`` candidates ranked by blended fuzzy score."""
        # The fuzzy re-ranking works on the 50 best n-gram candidates.
        candidates = self._get_base_predictions(context, top_k=50)
        blended = {word: self._score_candidate(word, prob) for word, prob in candidates}
        ranking = sorted(blended.items(), key=lambda pair: pair[1], reverse=True)
        return ranking[:top_k]


class FuzzyGAModel(FuzzyManualModel):
    """
    Fuzzy logic model whose membership peaks come from a Genetic Algorithm.

    ``ga_params`` is sliced positionally: items 0-2 replace the probability
    peaks and items 3-5 replace the popularity peaks.
    """
    def __init__(self, data_processor, ga_params):
        super().__init__(data_processor)
        # Replace the hand-tuned peaks set by the parent constructor.
        self.prob_params, self.pop_params = ga_params[:3], ga_params[3:6]


class FuzzyPSOModel(FuzzyManualModel):
    """
    Fuzzy logic model whose membership peaks come from Particle Swarm Optimization.

    ``pso_params`` is sliced positionally: items 0-2 replace the probability
    peaks and items 3-5 replace the popularity peaks.
    """
    def __init__(self, data_processor, pso_params):
        super().__init__(data_processor)
        # Replace the hand-tuned peaks set by the parent constructor.
        self.prob_params, self.pop_params = pso_params[:3], pso_params[3:6]