File size: 8,992 Bytes
f98879b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
"""
Model classes untuk prediksi kata dengan Fuzzy Logic
Load dari brain_data_processor.pkl
"""
import re
import numpy as np
from typing import List, Tuple
from collections import Counter


class DataProcessorWrapper:
    """
    Container for the n-gram statistics consumed by the prediction models.

    The class must exist under this name so that brain_data_processor.pkl
    can be unpickled.
    """
    def __init__(self, unigram_freq, bigram_freq, trigram_freq, vocabulary, slang_dict):
        # Unigram counts are stored by reference.
        self.unigram_freq = unigram_freq
        # Bigram/trigram tables are shallow-copied into plain dicts.
        self.bigram_freq, self.trigram_freq = dict(bigram_freq), dict(trigram_freq)
        # Vocabulary and slang table are stored by reference.
        self.vocabulary, self.slang_dict = vocabulary, slang_dict
        # Derived statistics: vocabulary size and total unigram count.
        self.vocab_size, self.total_words = len(vocabulary), sum(unigram_freq.values())


def preprocess_text(text: str, slang_dict: dict) -> Tuple[List[str], List[str]]:
    """
    Clean, tokenize and slang-normalize raw text.

    Processing order: regex cleaning -> lowercase/tokenize -> slang normalization.
    Stopwords are NOT removed, because the keyboard needs to predict them.

    Args:
        text: Raw input text.
        slang_dict: Mapping from a slang token to its normalized replacement.

    Returns:
        A ``(words, transformations)`` tuple:
            words: the processed tokens, with slang tokens replaced.
            transformations: one "'slang' → 'normalized'" entry per replaced
                token, in order of appearance (used for the X-Ray view).
    """
    # Step 1: regex cleaning - drop every character that is neither an ASCII
    # letter nor whitespace (digits and punctuation are removed, not spaced).
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Step 2: lowercase and tokenize on whitespace.
    words = text.lower().split()

    # Step 3: slang normalization, recording every replacement that happens.
    normalized_words = []
    transformations = []

    for w in words:
        if w in slang_dict:
            normalized = slang_dict[w]
            transformations.append(f"'{w}' → '{normalized}'")
            normalized_words.append(normalized)
        else:
            normalized_words.append(w)

    return normalized_words, transformations


class BaseNGramModel:
    """
    Purely probabilistic n-gram model with a trigram -> bigram -> unigram backoff.
    """
    def __init__(self, data_processor):
        # Take over the frequency tables and corpus statistics from the data processor.
        for attr in ('unigram_freq', 'bigram_freq', 'trigram_freq',
                     'vocabulary', 'vocab_size', 'total_words'):
            setattr(self, attr, getattr(data_processor, attr))

    def predict(self, context: List[str], top_k: int = 3) -> List[Tuple[str, float]]:
        """
        Predict the next word for the given context.

        Args:
            context: Preceding words; only the last two are consulted.
            top_k: Maximum number of predictions to return.

        Returns:
            [(word, probability), ...] sorted by descending probability.
        """
        scores = {}

        # Level 1: trigram lookup keyed by the last two context words.
        if len(context) >= 2:
            history = (context[-2], context[-1])
            if history in self.trigram_freq:
                followers = self.trigram_freq[history]
                seen = sum(followers.values())
                # Laplace (add-one) smoothing over the vocabulary size.
                scores = {
                    nxt: (freq + 1) / (seen + self.vocab_size)
                    for nxt, freq in followers.items()
                }

        # Level 2: back off to the bigram table keyed by the last word only.
        if not scores and len(context) >= 1:
            previous = context[-1]
            if previous in self.bigram_freq:
                followers = self.bigram_freq[previous]
                seen = sum(followers.values())
                scores = {
                    nxt: (freq + 1) / (seen + self.vocab_size)
                    for nxt, freq in followers.items()
                }

        # Level 3: back off to the 100 most frequent unigrams (unsmoothed).
        if not scores:
            scores = {
                nxt: freq / self.total_words
                for nxt, freq in Counter(self.unigram_freq).most_common(100)
            }

        # Rank by probability (stable sort keeps insertion order on ties).
        ranked = list(scores.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:top_k]


class FuzzyManualModel:
    """
    Fuzzy-logic next-word model whose membership parameters are set by hand.
    """
    def __init__(self, data_processor):
        # Frequency tables and corpus statistics come from the data processor.
        dp = data_processor
        self.unigram_freq = dp.unigram_freq
        self.bigram_freq = dp.bigram_freq
        self.trigram_freq = dp.trigram_freq
        self.vocabulary = dp.vocabulary
        self.vocab_size = dp.vocab_size
        self.total_words = dp.total_words

        # Peaks of the probability membership functions: [low, medium, high].
        self.prob_params = [0.15, 0.45, 0.85]

        # Peaks of the popularity membership functions on a log10 scale:
        # [rare, common, very common].
        self.pop_params = [2.0, 4.5, 7.0]

        # Mixing weights used in predict(): 'prob' scales the raw n-gram
        # probability, 'pop' scales the fuzzy rule score.
        self.weights = dict(prob=0.6, pop=0.4)

    def _get_base_predictions(self, context: List[str], top_k: int = 50) -> List[Tuple[str, float]]:
        """
        Produce candidate words with n-gram probabilities.

        Tries the trigram table first, then the bigram table, then the 100
        most frequent unigrams. Returns the top_k (word, probability) pairs,
        highest probability first.
        """
        scores = {}

        # Trigram level: keyed by the last two context words, Laplace-smoothed.
        if len(context) >= 2:
            history = (context[-2], context[-1])
            if history in self.trigram_freq:
                followers = self.trigram_freq[history]
                seen = sum(followers.values())
                scores = {
                    nxt: (freq + 1) / (seen + self.vocab_size)
                    for nxt, freq in followers.items()
                }

        # Bigram level: keyed by the last context word, Laplace-smoothed.
        if not scores and len(context) >= 1:
            previous = context[-1]
            if previous in self.bigram_freq:
                followers = self.bigram_freq[previous]
                seen = sum(followers.values())
                scores = {
                    nxt: (freq + 1) / (seen + self.vocab_size)
                    for nxt, freq in followers.items()
                }

        # Unigram level: relative frequency of the 100 most common words.
        if not scores:
            scores = {
                nxt: freq / self.total_words
                for nxt, freq in Counter(self.unigram_freq).most_common(100)
            }

        ranked = list(scores.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:top_k]

    def fuzzify_prob(self, prob):
        """
        Map a probability to membership degrees in 'low', 'medium' and 'high'.

        Each degree falls off linearly with the distance from the matching
        peak in self.prob_params and is clipped at 0 (reached at distance 0.3).
        """
        peaks = self.prob_params

        def degree(peak):
            return max(0, 1 - abs(prob - peak) / 0.3)

        return {
            'low': degree(peaks[0]),
            'medium': degree(peaks[1]),
            'high': degree(peaks[2]),
        }

    def fuzzify_pop(self, count):
        """
        Map a raw count to membership degrees in 'rare', 'common' and 'very_common'.

        The count is floored at 1 and converted to log10; each degree falls
        off linearly with the distance from the matching peak in
        self.pop_params and is clipped at 0 (reached at distance 2.5).
        """
        log_count = np.log10(max(1, count))
        peaks = self.pop_params

        def degree(peak):
            return max(0, 1 - abs(log_count - peak) / 2.5)

        return {
            'rare': degree(peaks[0]),
            'common': degree(peaks[1]),
            'very_common': degree(peaks[2]),
        }

    def fuzzy_inference(self, prob_fuzzy, pop_fuzzy):
        """
        Evaluate the fuzzy rules and return the strongest activation.

        The result is the maximum over the four rule outputs below.
        """
        activations = (
            # Rule 1: high probability AND very common -> excellent (0.9)
            min(prob_fuzzy['high'], pop_fuzzy['very_common']) * 0.9,
            # Rule 2: medium probability AND common -> good (0.6)
            min(prob_fuzzy['medium'], pop_fuzzy['common']) * 0.6,
            # Rule 3: low probability BUT very common -> fair (0.45)
            min(prob_fuzzy['low'], pop_fuzzy['very_common']) * 0.45,
            # Rule 4: fallback -> poor, a blend of 'low' and 'rare' memberships
            (prob_fuzzy['low'] * 0.2 + pop_fuzzy['rare'] * 0.1) / 2,
        )
        return max(activations)

    def predict(self, context: List[str], top_k: int = 3) -> List[Tuple[str, float]]:
        """
        Predict the next word using fuzzy scoring.

        Takes up to 50 n-gram candidates, blends each candidate's probability
        with its fuzzy rule score using self.weights, and returns the top_k
        (word, score) pairs, highest score first.
        """
        blended = {}
        for word, prob in self._get_base_predictions(context, top_k=50):
            # Popularity is the unigram count; unseen words count as 1.
            popularity = self.unigram_freq.get(word, 1)

            rule_score = self.fuzzy_inference(
                self.fuzzify_prob(prob),
                self.fuzzify_pop(popularity),
            )

            blended[word] = (self.weights['prob'] * prob +
                             self.weights['pop'] * rule_score)

        ranked = list(blended.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:top_k]


class FuzzyGAModel(FuzzyManualModel):
    """
    Fuzzy-logic model whose membership parameters come from a Genetic Algorithm.
    """
    def __init__(self, data_processor, ga_params):
        super().__init__(data_processor)
        # Replace the manual peaks: the first three values are the probability
        # peaks, the next three are the popularity peaks.
        self.prob_params, self.pop_params = ga_params[:3], ga_params[3:6]


class FuzzyPSOModel(FuzzyManualModel):
    """
    Fuzzy-logic model whose membership parameters come from Particle Swarm Optimization.
    """
    def __init__(self, data_processor, pso_params):
        super().__init__(data_processor)
        # Replace the manual peaks: the first three values are the probability
        # peaks, the next three are the popularity peaks.
        self.prob_params, self.pop_params = pso_params[:3], pso_params[3:6]