| """ | |
| Model classes untuk prediksi kata dengan Fuzzy Logic | |
| Load dari brain_data_processor.pkl | |
| """ | |
| import re | |
| import numpy as np | |
| from typing import List, Tuple | |
| from collections import Counter | |


class DataProcessorWrapper:
    """
    Wrapper class for the data processor - needed for unpickling brain_data_processor.pkl.
    """
    def __init__(self, unigram_freq, bigram_freq, trigram_freq, vocabulary, slang_dict):
        self.unigram_freq = unigram_freq
        # Convert to plain dicts (the pickled counts may be defaultdicts)
        self.bigram_freq = dict(bigram_freq)
        self.trigram_freq = dict(trigram_freq)
        self.vocabulary = vocabulary
        self.slang_dict = slang_dict
        self.vocab_size = len(vocabulary)
        self.total_words = sum(unigram_freq.values())


def preprocess_text(text: str, slang_dict: dict) -> Tuple[List[str], List[str]]:
    """
    Preprocess text in this order: regex cleaning -> slang normalization.
    Stopwords are NOT removed (the keyboard needs to predict them).
    Returns:
        List[str]: list of processed words
        List[str]: transformation log for the X-Ray view
    """
    # Step 1: Regex cleaning - remove all non-alphabetic, non-space characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Step 2: Lowercase and tokenize
    words = text.lower().split()
    # Step 3: Slang normalization with tracking
    normalized_words = []
    transformations = []
    for w in words:
        if w in slang_dict:
            normalized = slang_dict[w]
            transformations.append(f"'{w}' → '{normalized}'")
            normalized_words.append(normalized)
        else:
            normalized_words.append(w)
    return normalized_words, transformations
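
# A minimal usage sketch (the slang entry below is hypothetical, for illustration):
#
#   words, log = preprocess_text("Gmn kabarmu? 123", {"gmn": "gimana"})
#   # words -> ['gimana', 'kabarmu']
#   # log   -> ["'gmn' → 'gimana'"]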


class BaseNGramModel:
    """
    Pure probabilistic N-Gram model with a backoff mechanism.
    """
    def __init__(self, data_processor):
        self.unigram_freq = data_processor.unigram_freq
        self.bigram_freq = data_processor.bigram_freq
        self.trigram_freq = data_processor.trigram_freq
        self.vocabulary = data_processor.vocabulary
        self.vocab_size = data_processor.vocab_size
        self.total_words = data_processor.total_words

    def predict(self, context: List[str], top_k: int = 3) -> List[Tuple[str, float]]:
        """
        Predict the next word given the context.
        Returns: [(word, probability), ...]
        """
        scores = {}
        if len(context) >= 2:
            # Try the trigram first
            key = (context[-2], context[-1])
            if key in self.trigram_freq:
                candidates = self.trigram_freq[key]
                total = sum(candidates.values())
                for word, count in candidates.items():
                    # Probability with Laplace smoothing
                    scores[word] = (count + 1) / (total + self.vocab_size)
        if len(scores) == 0 and len(context) >= 1:
            # Back off to the bigram
            key = context[-1]
            if key in self.bigram_freq:
                candidates = self.bigram_freq[key]
                total = sum(candidates.values())
                for word, count in candidates.items():
                    scores[word] = (count + 1) / (total + self.vocab_size)
        if len(scores) == 0:
            # Back off to the unigram (most frequent words)
            for word, count in Counter(self.unigram_freq).most_common(100):
                scores[word] = count / self.total_words
        # Sort by probability and return the top_k
        sorted_predictions = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return sorted_predictions[:top_k]
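
# A hedged usage sketch (assumes a processor unpickled from brain_data_processor.pkl;
# the context words and scores are illustrative only):
#
#   model = BaseNGramModel(processor)
#   model.predict(["aku", "mau"], top_k=3)
#   # -> e.g. [('makan', 0.12), ('pergi', 0.09), ('tidur', 0.05)]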


class FuzzyManualModel:
    """
    Fuzzy Logic model with manually chosen parameters.
    """
    def __init__(self, data_processor):
        self.unigram_freq = data_processor.unigram_freq
        self.bigram_freq = data_processor.bigram_freq
        self.trigram_freq = data_processor.trigram_freq
        self.vocabulary = data_processor.vocabulary
        self.vocab_size = data_processor.vocab_size
        self.total_words = data_processor.total_words
        # Manual parameters for the fuzzy membership functions
        # Probability: [low_peak, medium_peak, high_peak]
        self.prob_params = [0.15, 0.45, 0.85]
        # Popularity: [rare_peak, common_peak, very_common_peak] (log scale)
        self.pop_params = [2.0, 4.5, 7.0]  # log10 values
        # Fuzzy weights
        self.weights = {
            'prob': 0.6,  # 60% weight on probability
            'pop': 0.4    # 40% weight on popularity
        }

    def _get_base_predictions(self, context: List[str], top_k: int = 50) -> List[Tuple[str, float]]:
        """Get base predictions using the n-gram model (same backoff as BaseNGramModel.predict)"""
        scores = {}
        if len(context) >= 2:
            key = (context[-2], context[-1])
            if key in self.trigram_freq:
                candidates = self.trigram_freq[key]
                total = sum(candidates.values())
                for word, count in candidates.items():
                    scores[word] = (count + 1) / (total + self.vocab_size)
        if len(scores) == 0 and len(context) >= 1:
            key = context[-1]
            if key in self.bigram_freq:
                candidates = self.bigram_freq[key]
                total = sum(candidates.values())
                for word, count in candidates.items():
                    scores[word] = (count + 1) / (total + self.vocab_size)
        if len(scores) == 0:
            for word, count in Counter(self.unigram_freq).most_common(100):
                scores[word] = count / self.total_words
        sorted_predictions = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return sorted_predictions[:top_k]

    def fuzzify_prob(self, prob):
        """Fuzzify the probability score (triangular memberships, zero at distance 0.3 from each peak)"""
        low = max(0, 1 - abs(prob - self.prob_params[0]) / 0.3)
        med = max(0, 1 - abs(prob - self.prob_params[1]) / 0.3)
        high = max(0, 1 - abs(prob - self.prob_params[2]) / 0.3)
        return {'low': low, 'medium': med, 'high': high}

    def fuzzify_pop(self, count):
        """Fuzzify the popularity score (log scale, triangular memberships with half-width 2.5)"""
        log_count = np.log10(max(1, count))
        rare = max(0, 1 - abs(log_count - self.pop_params[0]) / 2.5)
        common = max(0, 1 - abs(log_count - self.pop_params[1]) / 2.5)
        very_common = max(0, 1 - abs(log_count - self.pop_params[2]) / 2.5)
        return {'rare': rare, 'common': common, 'very_common': very_common}

    def fuzzy_inference(self, prob_fuzzy, pop_fuzzy):
        """Apply the fuzzy rules and aggregate"""
        # Rule 1: High prob AND Very Common pop -> Excellent (0.9)
        rule1 = min(prob_fuzzy['high'], pop_fuzzy['very_common']) * 0.9
        # Rule 2: Medium prob AND Common pop -> Good (0.6)
        rule2 = min(prob_fuzzy['medium'], pop_fuzzy['common']) * 0.6
        # Rule 3: Low prob BUT Very Common pop -> Fair (0.45)
        rule3 = min(prob_fuzzy['low'], pop_fuzzy['very_common']) * 0.45
        # Rule 4: Any other combination -> Poor (weighted average of the low memberships)
        rule4 = (prob_fuzzy['low'] * 0.2 + pop_fuzzy['rare'] * 0.1) / 2
        # Defuzzification: take the strongest rule activation (max)
        return max(rule1, rule2, rule3, rule4)
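
    # A worked example with the manual parameters (input numbers are illustrative):
    # prob = 0.45 fuzzifies to {'low': 0.0, 'medium': 1.0, 'high': 0.0}, and a
    # count of ~31,623 (log10 ≈ 4.5) fuzzifies to {'rare': 0.0, 'common': 1.0,
    # 'very_common': 0.0}. Only Rule 2 fires: min(1.0, 1.0) * 0.6 = 0.6, so
    # fuzzy_inference returns 0.6.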

    def predict(self, context: List[str], top_k: int = 3) -> List[Tuple[str, float]]:
        """Predict with fuzzy scoring"""
        # Get the base predictions
        base_preds = self._get_base_predictions(context, top_k=50)
        fuzzy_scores = {}
        for word, prob in base_preds:
            # Get the popularity
            pop_count = self.unigram_freq.get(word, 1)
            # Fuzzify
            prob_fuzzy = self.fuzzify_prob(prob)
            pop_fuzzy = self.fuzzify_pop(pop_count)
            # Inference
            fuzzy_score = self.fuzzy_inference(prob_fuzzy, pop_fuzzy)
            # Combine with the weights
            final_score = (self.weights['prob'] * prob +
                           self.weights['pop'] * fuzzy_score)
            fuzzy_scores[word] = final_score
        # Sort and return
        sorted_predictions = sorted(fuzzy_scores.items(), key=lambda x: x[1], reverse=True)
        return sorted_predictions[:top_k]
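
# Design note: predict() blends the raw n-gram probability (weight 0.6) with the
# defuzzified quality score (weight 0.4), so a word still needs a decent
# conditional probability to rank highly even if it is very common. A hedged
# illustration with made-up numbers: prob = 0.45 and fuzzy_score = 0.6 give
# final_score = 0.6 * 0.45 + 0.4 * 0.6 = 0.51.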


class FuzzyGAModel(FuzzyManualModel):
    """
    Fuzzy Logic model with parameters tuned by a Genetic Algorithm.
    """
    def __init__(self, data_processor, ga_params):
        super().__init__(data_processor)
        # Override with the GA parameters
        self.prob_params = ga_params[:3]
        self.pop_params = ga_params[3:6]


class FuzzyPSOModel(FuzzyManualModel):
    """
    Fuzzy Logic model with parameters tuned by Particle Swarm Optimization.
    """
    def __init__(self, data_processor, pso_params):
        super().__init__(data_processor)
        # Override with the PSO parameters
        self.prob_params = pso_params[:3]
        self.pop_params = pso_params[3:6]
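

if __name__ == "__main__":
    # A minimal end-to-end sketch (hedged: assumes brain_data_processor.pkl sits
    # next to this module and was pickled with DataProcessorWrapper importable
    # under the same path; the input sentence is illustrative only).
    import pickle

    with open("brain_data_processor.pkl", "rb") as f:
        processor = pickle.load(f)

    words, xray_log = preprocess_text("aku mau makan", processor.slang_dict)
    print("Tokens:", words, "| X-Ray:", xray_log)

    for name, model in [
        ("n-gram", BaseNGramModel(processor)),
        ("fuzzy (manual)", FuzzyManualModel(processor)),
    ]:
        print(name, "->", model.predict(words, top_k=3))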