Fazle Mawla Wahyuhanda committed on
Commit
f98879b
·
1 Parent(s): 126bd14

Add utils module and brain_params.json

Browse files
src/brain_params.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ga_params": [
3
+ 0.11470089804145127,
4
+ 0.19400274988046418,
5
+ 0.8812051256574979,
6
+ 1.0656220407276922,
7
+ 4.643527895439729,
8
+ 7.693048699833912
9
+ ],
10
+ "pso_params": [
11
+ 0.26044028650438855,
12
+ 0.3716727120094174,
13
+ 0.39161891527984233,
14
+ 1.5091777104339394,
15
+ 3.529365593941532,
16
+ 4.633741518442697
17
+ ]
18
+ }
src/streamlit_app.py CHANGED
@@ -12,7 +12,11 @@ import sys
12
  import requests
13
 
14
  # Add src directory to path for imports
15
- sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 
 
 
 
16
 
17
  # ============================================
18
  # GOOGLE DRIVE DOWNLOAD HELPER
@@ -42,14 +46,36 @@ def download_from_gdrive(file_id, destination):
42
  # Google Drive File ID
43
  GDRIVE_FILE_ID = "1jetjbzPB4hLVHNmGpETpz4ifd0CX70Qm"
44
 
45
- from utils.models import (
46
- preprocess_text,
47
- BaseNGramModel,
48
- FuzzyManualModel,
49
- FuzzyGAModel,
50
- FuzzyPSOModel,
51
- DataProcessorWrapper
52
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  # Register DataProcessorWrapper in __main__ for unpickling
55
  import __main__
 
12
  import requests
13
 
14
  # Add src directory to path for imports
15
+ current_dir = os.path.dirname(os.path.abspath(__file__))
16
+ sys.path.insert(0, current_dir)
17
+ # Also add parent directory in case running from different locations
18
+ parent_dir = os.path.dirname(current_dir)
19
+ sys.path.insert(0, parent_dir)
20
 
21
  # ============================================
22
  # GOOGLE DRIVE DOWNLOAD HELPER
 
46
  # Google Drive File ID
47
  GDRIVE_FILE_ID = "1jetjbzPB4hLVHNmGpETpz4ifd0CX70Qm"
48
 
49
+ # Try multiple import paths for different deployment environments
50
+ try:
51
+ from utils.models import (
52
+ preprocess_text,
53
+ BaseNGramModel,
54
+ FuzzyManualModel,
55
+ FuzzyGAModel,
56
+ FuzzyPSOModel,
57
+ DataProcessorWrapper
58
+ )
59
+ except ModuleNotFoundError:
60
+ try:
61
+ from src.utils.models import (
62
+ preprocess_text,
63
+ BaseNGramModel,
64
+ FuzzyManualModel,
65
+ FuzzyGAModel,
66
+ FuzzyPSOModel,
67
+ DataProcessorWrapper
68
+ )
69
+ except ModuleNotFoundError:
70
+ # Direct import if models.py is in same directory structure
71
+ from models import (
72
+ preprocess_text,
73
+ BaseNGramModel,
74
+ FuzzyManualModel,
75
+ FuzzyGAModel,
76
+ FuzzyPSOModel,
77
+ DataProcessorWrapper
78
+ )
79
 
80
  # Register DataProcessorWrapper in __main__ for unpickling
81
  import __main__
src/utils/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utils package
2
+ from .models import (
3
+ DataProcessorWrapper,
4
+ preprocess_text,
5
+ BaseNGramModel,
6
+ FuzzyManualModel,
7
+ FuzzyGAModel,
8
+ FuzzyPSOModel
9
+ )
10
+
11
+ __all__ = [
12
+ 'DataProcessorWrapper',
13
+ 'preprocess_text',
14
+ 'BaseNGramModel',
15
+ 'FuzzyManualModel',
16
+ 'FuzzyGAModel',
17
+ 'FuzzyPSOModel'
18
+ ]
src/utils/models.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model classes for word prediction with fuzzy logic
3
+ Data is loaded from brain_data_processor.pkl
4
+ """
5
+ import re
6
+ import numpy as np
7
+ from typing import List, Tuple
8
+ from collections import Counter
9
+
10
+
11
class DataProcessorWrapper:
    """Container for the n-gram statistics consumed by the prediction models.

    The class has to exist under this name so that
    ``brain_data_processor.pkl`` can be unpickled.
    """

    def __init__(self, unigram_freq, bigram_freq, trigram_freq, vocabulary, slang_dict):
        """Store the corpus statistics and derive two summary numbers.

        Args:
            unigram_freq: Mapping whose values are summed into ``total_words``;
                stored by reference.
            bigram_freq: Mapping (or iterable of pairs) copied into a plain dict.
            trigram_freq: Mapping (or iterable of pairs) copied into a plain dict.
            vocabulary: Sized collection; its length becomes ``vocab_size``.
            slang_dict: Stored as given.
        """
        self.unigram_freq = unigram_freq
        # Copy into plain dicts so the stored tables are ordinary ``dict``
        # objects regardless of the mapping type passed in.
        self.bigram_freq, self.trigram_freq = dict(bigram_freq), dict(trigram_freq)
        self.vocabulary = vocabulary
        self.slang_dict = slang_dict
        # Derived sizes, computed once at construction time.
        self.vocab_size, self.total_words = len(vocabulary), sum(unigram_freq.values())
23
+
24
+
25
def preprocess_text(text: str, slang_dict: dict) -> Tuple[List[str], List[str]]:
    """Clean, tokenize and slang-normalize *text*.

    Processing order: regex cleaning -> lowercase + whitespace split ->
    slang normalization. Stopwords are NOT removed, because the keyboard
    has to be able to predict them as well.

    Args:
        text: Raw input text.
        slang_dict: Mapping of slang token -> replacement. Only whole,
            lowercased tokens are looked up.

    Returns:
        A ``(words, transformations)`` tuple:
            words: the processed tokens, in input order.
            transformations: one ``"'slang' → 'replacement'"`` entry per
                replaced token (a transformation log; the original
                docstring mentions an "X-Ray view" as its consumer).
    """
    # Step 1: drop every character that is neither an ASCII letter nor
    # whitespace (digits, punctuation and non-ASCII letters are removed).
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text)

    # Step 2: lowercase and split on whitespace.
    tokens = cleaned.lower().split()

    # Step 3: slang normalization, recording every replacement made.
    # NOTE(review): a multi-word replacement stays a single list element;
    # confirm that is what the n-gram lookups expect.
    normalized_words = []
    transformations = []
    for token in tokens:
        if token in slang_dict:
            replacement = slang_dict[token]
            transformations.append(f"'{token}' → '{replacement}'")
            normalized_words.append(replacement)
        else:
            normalized_words.append(token)

    return normalized_words, transformations
55
+
56
+
57
class BaseNGramModel:
    """Purely probabilistic n-gram model with a backoff mechanism."""

    def __init__(self, data_processor):
        """Copy the frequency tables and size statistics from *data_processor*."""
        self.unigram_freq = data_processor.unigram_freq
        self.bigram_freq = data_processor.bigram_freq
        self.trigram_freq = data_processor.trigram_freq
        self.vocabulary = data_processor.vocabulary
        self.vocab_size = data_processor.vocab_size
        self.total_words = data_processor.total_words

    def _smoothed_scores(self, table, key):
        """Return add-one smoothed scores for the followers stored under *key*.

        Yields an empty dict when *key* is absent from *table*.
        """
        if key not in table:
            return {}
        followers = table[key]
        denominator = sum(followers.values()) + self.vocab_size
        return {word: (count + 1) / denominator for word, count in followers.items()}

    def predict(self, context: List[str], top_k: int = 3) -> List[Tuple[str, float]]:
        """Predict the next word for *context*.

        Tries the trigram table (last two words), backs off to the bigram
        table (last word), and finally to the 100 most frequent unigrams.

        Returns:
            ``[(word, probability), ...]`` sorted by descending probability,
            at most *top_k* entries.
        """
        scores = {}

        # Trigram lookup needs at least two words of context.
        if len(context) >= 2:
            scores = self._smoothed_scores(self.trigram_freq, (context[-2], context[-1]))

        # Back off to the bigram table when the trigram gave nothing.
        if not scores and len(context) >= 1:
            scores = self._smoothed_scores(self.bigram_freq, context[-1])

        # Last resort: relative frequency of the most common words.
        if not scores:
            scores = {
                word: count / self.total_words
                for word, count in Counter(self.unigram_freq).most_common(100)
            }

        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return ranked[:top_k]
103
+
104
+
105
class FuzzyManualModel:
    """Fuzzy-logic model with manually chosen parameters.

    N-gram candidates are re-scored by blending their probability with the
    output of a small fuzzy rule base over probability and word popularity.
    """

    def __init__(self, data_processor):
        """Copy the n-gram statistics and set the hand-tuned fuzzy parameters."""
        self.unigram_freq = data_processor.unigram_freq
        self.bigram_freq = data_processor.bigram_freq
        self.trigram_freq = data_processor.trigram_freq
        self.vocabulary = data_processor.vocabulary
        self.vocab_size = data_processor.vocab_size
        self.total_words = data_processor.total_words

        # Peaks of the probability membership functions: [low, medium, high].
        self.prob_params = [0.15, 0.45, 0.85]

        # Peaks of the popularity membership functions, on a log10 scale:
        # [rare, common, very common].
        self.pop_params = [2.0, 4.5, 7.0]  # log10 values

        # Blend weights used in predict(): 60% raw probability,
        # 40% fuzzy rule score.
        self.weights = {
            'prob': 0.6,
            'pop': 0.4
        }

    def _smoothed_scores(self, table, key):
        """Return add-one smoothed scores for the followers stored under *key*.

        Yields an empty dict when *key* is absent from *table*.
        """
        if key not in table:
            return {}
        followers = table[key]
        denominator = sum(followers.values()) + self.vocab_size
        return {word: (count + 1) / denominator for word, count in followers.items()}

    def _get_base_predictions(self, context: List[str], top_k: int = 50) -> List[Tuple[str, float]]:
        """Get base predictions from the n-gram tables (trigram -> bigram -> unigram)."""
        scores = {}

        if len(context) >= 2:
            scores = self._smoothed_scores(self.trigram_freq, (context[-2], context[-1]))

        if not scores and len(context) >= 1:
            scores = self._smoothed_scores(self.bigram_freq, context[-1])

        if not scores:
            scores = {
                word: count / self.total_words
                for word, count in Counter(self.unigram_freq).most_common(100)
            }

        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return ranked[:top_k]

    def fuzzify_prob(self, prob):
        """Fuzzify a probability score into 'low' / 'medium' / 'high' degrees.

        Each degree falls off linearly from 1 at its peak to 0 at a
        distance of 0.3 from the peak.
        """
        return {
            label: max(0, 1 - abs(prob - self.prob_params[idx]) / 0.3)
            for idx, label in enumerate(('low', 'medium', 'high'))
        }

    def fuzzify_pop(self, count):
        """Fuzzify a popularity count (log10 scale) into three degrees.

        Counts below 1 are treated as 1 before taking the logarithm; each
        degree falls off linearly to 0 at a distance of 2.5 from its peak.
        """
        log_count = np.log10(max(1, count))
        return {
            label: max(0, 1 - abs(log_count - self.pop_params[idx]) / 2.5)
            for idx, label in enumerate(('rare', 'common', 'very_common'))
        }

    def fuzzy_inference(self, prob_fuzzy, pop_fuzzy):
        """Apply the fuzzy rules and reduce them to a single score.

        The result is the strongest (maximum) of the four weighted rule
        activations.
        """
        # Rule 1: high prob AND very common pop -> Excellent (0.9)
        excellent = min(prob_fuzzy['high'], pop_fuzzy['very_common']) * 0.9

        # Rule 2: medium prob AND common pop -> Good (0.6)
        good = min(prob_fuzzy['medium'], pop_fuzzy['common']) * 0.6

        # Rule 3: low prob BUT very common pop -> Fair (0.45)
        fair = min(prob_fuzzy['low'], pop_fuzzy['very_common']) * 0.45

        # Rule 4: fallback "Poor" score, the mean of the weighted 'low' and
        # 'rare' degrees.
        poor = (prob_fuzzy['low'] * 0.2 + pop_fuzzy['rare'] * 0.1) / 2

        return max(excellent, good, fair, poor)

    def predict(self, context: List[str], top_k: int = 3) -> List[Tuple[str, float]]:
        """Predict the next word, ranking candidates by their fuzzy-blended score."""
        rescored = {}
        for word, prob in self._get_base_predictions(context, top_k=50):
            # Popularity is the word's unigram count (1 when unknown).
            popularity = self.unigram_freq.get(word, 1)

            prob_degrees = self.fuzzify_prob(prob)
            pop_degrees = self.fuzzify_pop(popularity)
            rule_score = self.fuzzy_inference(prob_degrees, pop_degrees)

            # Blend raw probability with the rule score.
            rescored[word] = (self.weights['prob'] * prob +
                              self.weights['pop'] * rule_score)

        ranked = sorted(rescored.items(), key=lambda item: item[1], reverse=True)
        return ranked[:top_k]
215
+
216
+
217
class FuzzyGAModel(FuzzyManualModel):
    """Fuzzy-logic model whose parameters come from a Genetic Algorithm."""

    def __init__(self, data_processor, ga_params):
        """Initialise like the manual model, then override the membership peaks.

        Args:
            data_processor: Passed unchanged to the parent constructor.
            ga_params: Sliceable sequence; items 0-2 replace ``prob_params``
                and items 3-5 replace ``pop_params``.
        """
        super().__init__(data_processor)
        # Split the flat GA parameter vector into its two halves.
        prob_peaks, pop_peaks = ga_params[:3], ga_params[3:6]
        self.prob_params = prob_peaks
        self.pop_params = pop_peaks
226
+
227
+
228
class FuzzyPSOModel(FuzzyManualModel):
    """Fuzzy-logic model whose parameters come from Particle Swarm Optimization."""

    def __init__(self, data_processor, pso_params):
        """Initialise like the manual model, then override the membership peaks.

        Args:
            data_processor: Passed unchanged to the parent constructor.
            pso_params: Sliceable sequence; items 0-2 replace ``prob_params``
                and items 3-5 replace ``pop_params``.
        """
        super().__init__(data_processor)
        # Split the flat PSO parameter vector into its two halves.
        prob_peaks, pop_peaks = pso_params[:3], pso_params[3:6]
        self.prob_params = prob_peaks
        self.pop_params = pop_peaks