anggars committed on
Commit
a63526c
·
verified ·
1 Parent(s): f3ff343

Sync from GitHub Actions: 642491bc500cc4efa8f7631f60e5f79da784fdcf

Browse files
Files changed (2) hide show
  1. api/core/nlp_handler.py +67 -253
  2. api/requirements.txt +0 -2
api/core/nlp_handler.py CHANGED
@@ -1,23 +1,15 @@
1
- import joblib
2
  import os
3
  import re
4
  import requests
5
- import numpy as np
6
  import html
7
  from deep_translator import GoogleTranslator
8
  from youtube_transcript_api import YouTubeTranscriptApi
9
 
10
- import time
11
-
12
- # --- CONFIG PATH ---
13
  BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
14
- MBTI_PATH = os.path.join(BASE_DIR, 'data', 'model_mbti.pkl')
15
- EMOTION_PATH = os.path.join(BASE_DIR, 'data', 'model_emotion.pkl')
16
 
17
- _model_mbti = None
18
- _classifier_mbti_transformer = None
19
- _classifier_roberta = None
20
- _classifier_distilbert = None
21
 
22
  EMOTION_TRANSLATIONS = {
23
  'admiration': 'Kagum', 'amusement': 'Terhibur', 'anger': 'Marah',
@@ -75,138 +67,24 @@ class NLPHandler:
75
 
76
  @staticmethod
77
  def load_models():
78
- global _model_mbti, _classifier_mbti_transformer, _classifier_roberta, _classifier_distilbert
79
- print(f"Loading models from: {BASE_DIR}")
80
 
81
- if _model_mbti is None and os.path.exists(MBTI_PATH):
82
- try:
83
- print(f"Loading MBTI Model (SVM) from: {MBTI_PATH}")
84
- _model_mbti = joblib.load(MBTI_PATH)
85
- except Exception as e: print(f"MBTI Load Error: {e}")
86
-
87
- if _classifier_mbti_transformer is None:
88
  try:
89
- print(f"Loading MBTI Model (Transformer): parka735/mbti-classifier")
90
  from transformers import pipeline
91
- _classifier_mbti_transformer = pipeline("text-classification", model="parka735/mbti-classifier", top_k=1)
92
- except Exception as e: print(f"MBTI Transformer Load Error: {e}")
93
-
94
- if _classifier_roberta is None:
95
- try:
96
- print("Loading Emotion Model 1: SamLowe/roberta-base-go_emotions")
97
- from transformers import pipeline
98
- _classifier_roberta = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)
99
- except Exception as e: print(f"Emotion 1 Load Error: {e}")
100
 
101
- if _classifier_distilbert is None:
102
  try:
103
- print("Loading Emotion Model 2: joeddav/distilbert-base-uncased-go-emotions-student")
104
  from transformers import pipeline
105
- _classifier_distilbert = pipeline("text-classification", model="joeddav/distilbert-base-uncased-go-emotions-student", top_k=None)
106
- except Exception as e: print(f"Emotion 2 Load Error: {e}")
107
-
108
- # --- GEMINI VALIDATOR SETUP ---
109
- _gemini_client = None
110
-
111
- @staticmethod
112
- def _init_gemini():
113
- """Initialize Gemini Client for validation (lazy loading)"""
114
- if NLPHandler._gemini_client is None:
115
- api_key = os.getenv("GEMINI_API_KEY")
116
- if api_key:
117
- try:
118
- from google import genai
119
- NLPHandler._gemini_client = genai.Client(api_key=api_key)
120
- print("Gemini Validator Ready (google-genai SDK)")
121
- except Exception as e:
122
- print(f"Gemini Init Failed: {e}")
123
- return NLPHandler._gemini_client is not None
124
-
125
- @staticmethod
126
- def _validate_with_gemini(text, ml_prediction):
127
- """
128
- Use Gemini to validate ML prediction.
129
- Returns: (validated_mbti, confidence, reasoning)
130
- """
131
- if not NLPHandler._init_gemini():
132
- return ml_prediction, 0.6, "ML only (Gemini unavailable)"
133
-
134
-
135
-
136
- prompt = f"""You are an MBTI expert. Analyze this text and determine the MOST LIKELY MBTI type based ONLY on the content.
137
-
138
- TEXT TO ANALYZE:
139
- "{text}"
140
 
141
- ANALYSIS FRAMEWORK:
142
- 1. I/E (Introversion/Extraversion):
143
- - E indicators: Mentions of social events, leading teams, networking, group activities, energized by people
144
- - I indicators: Preference for solitude, reflection, working alone, drained by social interaction
145
 
146
- 2. N/S (Intuition/Sensing):
147
- - N indicators: Abstract thinking, future-focused, big picture, patterns, possibilities, theory
148
- - S indicators: Concrete details, present-focused, practical, facts, reality, hands-on
149
-
150
- 3. T/F (Thinking/Feeling):
151
- - T indicators: Logic, efficiency, objectivity, direct communication, "facts over feelings"
152
- - F indicators: Empathy, harmony, values, subjective decisions, people-focused
153
-
154
- 4. J/P (Judging/Perceiving):
155
- - J indicators: Planning, structure, deadlines, organization, schedules, decisive
156
- - P indicators: Spontaneous, flexible, adaptable, open-ended, exploratory
157
-
158
- CRITICAL INSTRUCTIONS:
159
- - Analyze INDEPENDENTLY - ignore any preconceptions
160
- - Look for EXPLICIT behavioral indicators in the text
161
- - Weight E/I heavily on social energy language (not just content topic)
162
- - If text mentions "leading", "networking", "team meetings" → strong E signal
163
- - If text emphasizes "planning", "deadlines", "structure" → strong J signal
164
-
165
- Respond in this EXACT format:
166
- MBTI: [4-letter type]
167
- CONFIDENCE: [0.0-1.0]
168
- REASON: [One sentence citing specific text evidence]
169
-
170
- Example:
171
- MBTI: ENTJ
172
- CONFIDENCE: 0.88
173
- REASON: Explicit mentions of networking, leading teams, and structured planning indicate ENTJ.
174
- """
175
-
176
- try:
177
- response = NLPHandler._gemini_client.models.generate_content(
178
- model='gemini-2.0-flash',
179
- contents=prompt
180
- )
181
- result_text = response.text.strip()
182
-
183
- # Parse response
184
- lines = result_text.split('\n')
185
- validated_mbti = ml_prediction
186
- confidence = 0.7
187
- reason = "Gemini validation"
188
-
189
- for line in lines:
190
- if line.startswith('MBTI:'):
191
- validated_mbti = line.split(':', 1)[1].strip().upper()
192
- elif line.startswith('CONFIDENCE:'):
193
- try:
194
- confidence = float(line.split(':', 1)[1].strip())
195
- except:
196
- confidence = 0.7
197
- elif line.startswith('REASON:'):
198
- reason = line.split(':', 1)[1].strip()
199
-
200
- # Validate MBTI format (must be 4 chars)
201
- if len(validated_mbti) != 4 or not all(c in 'IENTFSJP' for c in validated_mbti):
202
- print(f"Invalid Gemini MBTI: {validated_mbti}, using ML: {ml_prediction}")
203
- return ml_prediction, 0.6, "Invalid Gemini response - using ML"
204
-
205
- return validated_mbti, confidence, reason
206
-
207
- except Exception as e:
208
- print(f"Gemini Validation Error: {e}")
209
- return ml_prediction, 0.6, f"Gemini error - using ML"
210
 
211
  @staticmethod
212
  def translate_to_english(text):
@@ -237,150 +115,86 @@ REASON: Explicit mentions of networking, leading teams, and structured planning
237
  NLPHandler.load_models()
238
  processed_text = NLPHandler.translate_to_english(raw_text)
239
 
240
- # --- MBTI PREDICTION WITH GEMINI VALIDATION ---
241
  mbti_result = "UNKNOWN"
242
  mbti_confidence = 0.0
243
- mbti_reasoning = ""
244
 
245
- if _model_mbti and _classifier_mbti_transformer:
246
  try:
247
- # 1. SVM Prediction (Keyword/Structure)
248
- svm_pred = _model_mbti.predict([processed_text])[0]
249
 
250
- # 2. Transformer Prediction
251
- trans_input = processed_text[:2000]
252
- trans_output = _classifier_mbti_transformer(trans_input)
253
-
254
- # Handle nested list output (common in batched pipelines)
255
- # Output can be [{'label': 'A'}] OR [[{'label': 'A'}]]
256
- if isinstance(trans_output, list) and isinstance(trans_output[0], list):
257
- trans_res = trans_output[0][0]
258
- elif isinstance(trans_output, list):
259
- trans_res = trans_output[0]
260
  else:
261
- trans_res = trans_output
262
-
263
- trans_pred = trans_res['label'].upper()
264
- trans_conf = trans_res['score']
265
-
266
- print(f"[Voting] SVM='{svm_pred}' vs Transformer='{trans_pred}' ({trans_conf:.2%})")
267
 
268
- # 3. Consensus Logic
269
- if svm_pred == trans_pred:
270
- # Both agree! High confidence.
271
- print("[Check] Models AGREE! Auto-approving.")
272
- mbti_result = svm_pred
273
- mbti_confidence = 0.95
274
- mbti_reasoning = f"Both AI models agreed strictly on {mbti_result}."
275
-
276
- # Optional: Lightweight Gemini check just for reasoning text, IF enabled.
277
- # validation is skipped for speed since we have consensus.
278
- else:
279
- # Disagreement! Gemini is the Tie-Breaker.
280
- print("[Warning] Models DISAGREE! Summoning Gemini Judge...")
281
-
282
- # Prepare context for Gemini
283
- validation_context = f"Model A (Keyword) detected {svm_pred}. Model B (Context) detected {trans_pred}."
284
-
285
- validated_mbti, confidence, reason = NLPHandler._validate_with_gemini(
286
- processed_text, validation_context
287
- )
288
-
289
- mbti_result = validated_mbti
290
- mbti_confidence = confidence
291
- mbti_reasoning = reason
292
- print(f"[Gemini] Verdict: {mbti_result} (Confidence: {confidence})")
293
 
294
  except Exception as e:
295
- print(f"[Error] Hybrid MBTI Error: {e}")
296
- # Fallback to SVM if everything explodes
297
- try:
298
- mbti_result = _model_mbti.predict([processed_text])[0]
299
- mbti_confidence = 0.4
300
- except:
301
- mbti_result = "INTJ"
302
- mbti_reasoning = "System fallback due to hybrid error."
303
 
304
- # --- EMOTION PREDICTION (HYBRID TRANSFORMER) ---
305
  emotion_data = {"id": "Netral", "en": "Neutral", "raw": "neutral", "list": []}
306
  confidence_score = 0.0
307
 
308
- try:
309
- # Load pipelines (Ensured in load_models)
310
- global _classifier_roberta, _classifier_distilbert
311
-
312
- # Truncate for safety
313
- emo_input = processed_text[:1500]
314
-
315
- combined_scores = {}
316
-
317
- def add_scores(results):
318
- if isinstance(results, list) and isinstance(results[0], list):
319
- results = results[0]
320
- for item in results:
321
- label = item['label']
322
- score = item['score']
323
- combined_scores[label] = combined_scores.get(label, 0) + score
324
-
325
- if _classifier_roberta:
326
- add_scores(_classifier_roberta(emo_input))
327
- if _classifier_distilbert:
328
- add_scores(_classifier_distilbert(emo_input))
329
-
330
- # Normalize and filter
331
- if 'neutral' in combined_scores:
332
- del combined_scores['neutral'] # Remove neutral preference
333
-
334
- sorted_emotions = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
335
-
336
- top_3_list = []
337
- if sorted_emotions:
338
- # Top 1 for legacy compatibility
339
- best_label, total_score = sorted_emotions[0]
340
- confidence_score = (total_score / 2.0)
341
 
342
- indo_label = EMOTION_TRANSLATIONS.get(best_label, best_label.capitalize())
343
- emotion_data = {
344
- "id": indo_label,
345
- "en": best_label.capitalize(),
346
- "raw": best_label,
347
- "list": [] # Will populate below
348
- }
349
 
350
- # Populate Top 3 List
351
- for label, score in sorted_emotions[:3]:
352
- norm_score = score / 2.0
353
- top_3_list.append({
354
- "en": label.capitalize(),
355
- "id": EMOTION_TRANSLATIONS.get(label, label.capitalize()),
356
- "score": norm_score
357
- })
358
 
359
- emotion_data["list"] = top_3_list
360
- print(f"Emotion Hybrid Top 1: {emotion_data['en']} ({confidence_score:.2%})")
361
- else:
362
- print("Emotion Hybrid: No clear emotion found (Neutral)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
 
364
- except Exception as e:
365
- print(f"Emotion Prediction Error: {e}")
366
 
367
  # --- REASONING GENERATION ---
368
  mbti_desc = MBTI_EXPLANATIONS.get(mbti_result, {
369
  'en': "Complex personality type.",
370
  'id': "Kepribadian yang cukup kompleks."
371
  })
372
-
373
- # Add Gemini reasoning to MBTI description
374
- if mbti_reasoning:
375
- mbti_desc['validation'] = mbti_reasoning
376
- mbti_desc['confidence'] = mbti_confidence
377
 
378
  # Emotion Reasoning
379
- conf_percent = int(confidence_score * 100)
380
-
381
- # Generate dynamic reasoning for Top 3
382
  em_list_str = ""
383
- if 'list' in emotion_data and emotion_data['list']:
384
  labels = [f"{item['en']} ({int(item['score']*100)}%)" for item in emotion_data['list']]
385
  em_list_str = ", ".join(labels)
386
 
 
 
1
  import os
2
  import re
3
  import requests
 
4
  import html
5
  from deep_translator import GoogleTranslator
6
  from youtube_transcript_api import YouTubeTranscriptApi
7
 
8
+ # --- CONFIG ---
 
 
9
  BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 
 
10
 
11
+ _classifier_mbti = None
12
+ _classifier_emotion = None
 
 
13
 
14
  EMOTION_TRANSLATIONS = {
15
  'admiration': 'Kagum', 'amusement': 'Terhibur', 'anger': 'Marah',
 
67
 
68
  @staticmethod
69
  def load_models():
70
+ global _classifier_mbti, _classifier_emotion
71
+ print(f"Loading models from HuggingFace Hub...")
72
 
73
+ if _classifier_mbti is None:
 
 
 
 
 
 
74
  try:
75
+ print("Loading MBTI Model: anggars/xlm-mbti")
76
  from transformers import pipeline
77
+ _classifier_mbti = pipeline("text-classification", model="anggars/xlm-mbti", top_k=1)
78
+ except Exception as e: print(f"MBTI Load Error: {e}")
 
 
 
 
 
 
 
79
 
80
+ if _classifier_emotion is None:
81
  try:
82
+ print("Loading Emotion Model: anggars/xlm-emotion")
83
  from transformers import pipeline
84
+ _classifier_emotion = pipeline("text-classification", model="anggars/xlm-emotion", top_k=None)
85
+ except Exception as e: print(f"Emotion Load Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
 
 
 
 
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  @staticmethod
90
  def translate_to_english(text):
 
115
  NLPHandler.load_models()
116
  processed_text = NLPHandler.translate_to_english(raw_text)
117
 
118
+ # --- MBTI PREDICTION (anggars/xlm-mbti) ---
119
  mbti_result = "UNKNOWN"
120
  mbti_confidence = 0.0
 
121
 
122
+ if _classifier_mbti:
123
  try:
124
+ mbti_input = processed_text[:2000]
125
+ mbti_output = _classifier_mbti(mbti_input)
126
 
127
+ # Handle nested list output
128
+ if isinstance(mbti_output, list) and isinstance(mbti_output[0], list):
129
+ mbti_res = mbti_output[0][0]
130
+ elif isinstance(mbti_output, list):
131
+ mbti_res = mbti_output[0]
 
 
 
 
 
132
  else:
133
+ mbti_res = mbti_output
 
 
 
 
 
134
 
135
+ mbti_result = mbti_res['label'].upper()
136
+ mbti_confidence = mbti_res['score']
137
+ print(f"[MBTI] Predicted: {mbti_result} ({mbti_confidence:.2%})")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  except Exception as e:
140
+ print(f"[Error] MBTI Prediction Error: {e}")
141
+ mbti_result = "INTJ"
142
+ mbti_confidence = 0.0
 
 
 
 
 
143
 
144
+ # --- EMOTION PREDICTION (anggars/xlm-emotion) ---
145
  emotion_data = {"id": "Netral", "en": "Neutral", "raw": "neutral", "list": []}
146
  confidence_score = 0.0
147
 
148
+ if _classifier_emotion:
149
+ try:
150
+ emo_input = processed_text[:1500]
151
+ emo_output = _classifier_emotion(emo_input)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
+ # Handle nested list output
154
+ if isinstance(emo_output, list) and isinstance(emo_output[0], list):
155
+ emo_output = emo_output[0]
 
 
 
 
156
 
157
+ # Filter out neutral and sort by score
158
+ scores = {item['label']: item['score'] for item in emo_output if item['label'] != 'neutral'}
159
+ sorted_emotions = sorted(scores.items(), key=lambda x: x[1], reverse=True)
 
 
 
 
 
160
 
161
+ if sorted_emotions:
162
+ best_label, best_score = sorted_emotions[0]
163
+ confidence_score = best_score
164
+
165
+ indo_label = EMOTION_TRANSLATIONS.get(best_label, best_label.capitalize())
166
+ emotion_data = {
167
+ "id": indo_label,
168
+ "en": best_label.capitalize(),
169
+ "raw": best_label,
170
+ "list": []
171
+ }
172
+
173
+ # Top 3 list
174
+ for label, score in sorted_emotions[:3]:
175
+ emotion_data["list"].append({
176
+ "en": label.capitalize(),
177
+ "id": EMOTION_TRANSLATIONS.get(label, label.capitalize()),
178
+ "score": score
179
+ })
180
+
181
+ print(f"[Emotion] Top 1: {emotion_data['en']} ({confidence_score:.2%})")
182
+ else:
183
+ print("[Emotion] No clear emotion found (Neutral)")
184
 
185
+ except Exception as e:
186
+ print(f"[Error] Emotion Prediction Error: {e}")
187
 
188
  # --- REASONING GENERATION ---
189
  mbti_desc = MBTI_EXPLANATIONS.get(mbti_result, {
190
  'en': "Complex personality type.",
191
  'id': "Kepribadian yang cukup kompleks."
192
  })
193
+ mbti_desc['confidence'] = mbti_confidence
 
 
 
 
194
 
195
  # Emotion Reasoning
 
 
 
196
  em_list_str = ""
197
+ if emotion_data['list']:
198
  labels = [f"{item['en']} ({int(item['score']*100)}%)" for item in emotion_data['list']]
199
  em_list_str = ", ".join(labels)
200
 
api/requirements.txt CHANGED
@@ -3,8 +3,6 @@ uvicorn
3
  python-dotenv
4
  pydantic
5
  numpy
6
- scikit-learn==1.8.0
7
- joblib
8
  deep-translator
9
  requests
10
  youtube-transcript-api
 
3
  python-dotenv
4
  pydantic
5
  numpy
 
 
6
  deep-translator
7
  requests
8
  youtube-transcript-api