"""Hybrid NLP utilities: MBTI / emotion prediction and YouTube text retrieval.

The module keeps lazily-loaded model singletons at module level and exposes
all functionality through the static methods of :class:`NLPHandler`.
"""

import html
import os
import re
import time
from collections import Counter

import joblib
import numpy as np
import requests
from deep_translator import GoogleTranslator
from youtube_transcript_api import YouTubeTranscriptApi

# --- CONFIG PATH ---
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
MBTI_PATH = os.path.join(BASE_DIR, 'data', 'model_mbti.pkl')
EMOTION_PATH = os.path.join(BASE_DIR, 'data', 'model_emotion.pkl')

# Lazily-populated model singletons (filled in by NLPHandler.load_models).
_model_mbti = None
_classifier_mbti_transformer = None
_classifier_roberta = None
_classifier_distilbert = None

# A well-formed MBTI code has exactly one letter from each dichotomy, in order.
_MBTI_PATTERN = re.compile(r'\b[IE][NS][TF][JP]\b')

# Words ignored by NLPHandler.extract_keywords (English + Indonesian).
_STOPWORDS = frozenset([
    "the", "and", "is", "to", "in", "it", "of", "for", "with", "on",
    "that", "this", "my", "was", "as", "are", "have", "you", "but", "so",
    "ini", "itu", "dan", "yang", "di", "ke",
])

_YOUTUBE_API_BASE = "https://www.googleapis.com/youtube/v3"

EMOTION_TRANSLATIONS = {
    'admiration': 'Kagum',
    'amusement': 'Terhibur',
    'anger': 'Marah',
    'annoyance': 'Kesal',
    'approval': 'Setuju',
    'caring': 'Peduli',
    'confusion': 'Bingung',
    'curiosity': 'Penasaran',
    'desire': 'Keinginan',
    'disappointment': 'Kecewa',
    'disapproval': 'Tidak Setuju',
    'disgust': 'Jijik',
    'embarrassment': 'Malu',
    'excitement': 'Semangat',
    'fear': 'Takut',
    'gratitude': 'Bersyukur',
    'grief': 'Berduka',
    'joy': 'Gembira',
    'love': 'Cinta',
    'nervousness': 'Gugup',
    'optimism': 'Optimis',
    'pride': 'Bangga',
    'realization': 'Sadar',
    'relief': 'Lega',
    'remorse': 'Menyesal',
    'sadness': 'Sedih',
    'surprise': 'Terkejut',
    'neutral': 'Netral',
}

MBTI_EXPLANATIONS = {
    'ISTJ': {
        'en': "The Logistician. Practical and fact-minded individuals, whose reliability cannot be doubted.",
        'id': "Si Organisator. Lo orangnya logis, praktis, dan bisa diandelin banget. Anti ribet-ribet club.",
    },
    'ISFJ': {
        'en': "The Defender. Very dedicated and warm protectors, always ready to defend their loved ones.",
        'id': "Si Pelindung. Hati lo lembut, setia, dan care banget sama orang terdekat. Temen curhat terbaik.",
    },
    'INFJ': {
        'en': "The Advocate. Quiet and mystical, yet very inspiring and tireless idealists.",
        'id': "Si Visioner Misterius. Lo peka, idealis, dan suka mikirin makna hidup mendalam. Langka nih!",
    },
    'INTJ': {
        'en': "The Architect. Imaginative and strategic thinkers, with a plan for everything.",
        'id': "Si Strategis. Otak lo jalan terus, visioner, dan selalu punya rencana cadangan buat segala hal.",
    },
    'ISTP': {
        'en': "The Virtuoso. Bold and practical experimenters, masters of all kinds of tools.",
        'id': "Si Pengrajin. Lo cool, santuy, tapi jago banget mecahin masalah teknis secara praktis.",
    },
    'ISFP': {
        'en': "The Adventurer. Flexible and charming artists, always ready to explore and experience something new.",
        'id': "Si Seniman Bebas. Lo estetik, santai, dan suka banget nge-explore hal baru tanpa banyak drama.",
    },
    'INFP': {
        'en': "The Mediator. Poetic, kind and altruistic people, always eager to help a good cause.",
        'id': "Si Paling Perasa. Hati lo kayak kapas, puitis, idealis banget, dan selalu mau bikin dunia lebih baik.",
    },
    'INTP': {
        'en': "The Logician. Innovative inventors with an unquenchable thirst for knowledge.",
        'id': "Si Pemikir Kritis. Lo kepoan parah, logis abis, dan suka banget debat teori sampe pagi.",
    },
    'ESTP': {
        'en': "The Entrepreneur. Smart, energetic and very perceptive people, who truly enjoy living on the edge.",
        'id': "Si Pemberani. Lo enerjik, spontan, dan jago banget ngambil peluang dalam situasi mepet.",
    },
    'ESFP': {
        'en': "The Entertainer. Spontaneous, energetic and enthusiastic people - life is never boring around them.",
        'id': "Si Penghibur. Lo asik parah, spontan, dan selalu jadi pusat perhatian di tongkrongan.",
    },
    'ENFP': {
        'en': "The Campaigner. Enthusiastic, creative and sociable free spirits, who can always find a reason to smile.",
        'id': "Si Semangat 45. Lo kreatif, ramah, dan punya energi positif yang nular ke semua orang.",
    },
    'ENTP': {
        'en': "The Debater. Smart and curious thinkers who cannot resist an intellectual challenge.",
        'id': "Si Pendebat Ulung. Lo pinter, kritis, dan iseng banget suka mancing debat cuma buat seru-seruan.",
    },
    'ESTJ': {
        'en': "The Executive. Excellent administrators, unsurpassed at managing things - or people.",
        'id': "Si Bos Tegas. Lo jago ngatur, disiplin, dan gak suka liat ada yang lelet atau berantakan.",
    },
    'ESFJ': {
        'en': "The Consul. Extraordinarily caring, social and popular people, always eager to help.",
        'id': "Si Paling Gaul. Lo ramah, suka nolong, dan care banget sama harmoni di sirkel pertemanan.",
    },
    'ENFJ': {
        'en': "The Protagonist. Charismatic and inspiring leaders, able to mesmerize their listeners.",
        'id': "Si Pemimpin Karismatik. Lo jago banget ngomong, inspiratif, dan bisa bikin orang lain nurut sama lo.",
    },
    'ENTJ': {
        'en': "The Commander. Bold, imaginative and strong-willed leaders, always finding a way - or making one.",
        'id': "Si Jenderal. Lo ambisius, tegas, dan punya bakat alami buat mimpin dan naklukin tantangan.",
    },
}

# Prompt sent to Gemini when the two MBTI models disagree. The reply parser in
# NLPHandler._parse_gemini_reply relies on the line-oriented answer format
# requested at the bottom of this template.
_GEMINI_PROMPT_TEMPLATE = """You are an MBTI expert. Analyze this text and determine the MOST LIKELY MBTI type based ONLY on the content.

TEXT TO ANALYZE:
"{text}"

ANALYSIS FRAMEWORK:
1. I/E (Introversion/Extraversion):
   - E indicators: Mentions of social events, leading teams, networking, group activities, energized by people
   - I indicators: Preference for solitude, reflection, working alone, drained by social interaction

2. N/S (Intuition/Sensing):
   - N indicators: Abstract thinking, future-focused, big picture, patterns, possibilities, theory
   - S indicators: Concrete details, present-focused, practical, facts, reality, hands-on

3. T/F (Thinking/Feeling):
   - T indicators: Logic, efficiency, objectivity, direct communication, "facts over feelings"
   - F indicators: Empathy, harmony, values, subjective decisions, people-focused

4. J/P (Judging/Perceiving):
   - J indicators: Planning, structure, deadlines, organization, schedules, decisive
   - P indicators: Spontaneous, flexible, adaptable, open-ended, exploratory

CRITICAL INSTRUCTIONS:
- Analyze INDEPENDENTLY - ignore any preconceptions
- Look for EXPLICIT behavioral indicators in the text
- Weight E/I heavily on social energy language (not just content topic)
- If text mentions "leading", "networking", "team meetings" → strong E signal
- If text emphasizes "planning", "deadlines", "structure" → strong J signal

Respond in this EXACT format:
MBTI: [4-letter type]
CONFIDENCE: [0.0-1.0]
REASON: [One sentence citing specific text evidence]

Example:
MBTI: ENTJ
CONFIDENCE: 0.88
REASON: Explicit mentions of networking, leading teams, and structured planning indicate ENTJ.
"""


class NLPHandler:
    """Namespace of static helpers for personality/emotion analysis of text.

    All methods are static; model objects are cached in module-level globals
    and the optional Gemini client is cached on the class.
    """

    # --- GEMINI VALIDATOR SETUP ---
    # Cached google-genai client; stays None until _init_gemini succeeds.
    _gemini_client = None

    # ------------------------------------------------------------------
    # Model loading
    # ------------------------------------------------------------------
    @staticmethod
    def _build_classifier(model_name, top_k, announce, error_label):
        """Create a Hugging Face text-classification pipeline.

        Args:
            model_name: Hub identifier passed to ``pipeline(model=...)``.
            top_k: Forwarded to ``pipeline``; ``None`` requests all labels.
            announce: Message printed before loading starts.
            error_label: Prefix of the message printed when loading fails.

        Returns:
            The pipeline object, or ``None`` when import or loading fails.
        """
        try:
            print(announce)
            # Imported lazily so the module can be imported without transformers.
            from transformers import pipeline
            return pipeline("text-classification", model=model_name, top_k=top_k)
        except Exception as e:
            print(f"{error_label}: {e}")
            return None

    @staticmethod
    def load_models():
        """Load every model that has not been loaded yet.

        Each model is loaded at most once per process; a model that failed to
        load stays ``None`` and is retried on the next call. Failures are
        printed, never raised.
        """
        global _model_mbti, _classifier_mbti_transformer
        global _classifier_roberta, _classifier_distilbert

        print(f"Loading models from: {BASE_DIR}")

        if _model_mbti is None and os.path.exists(MBTI_PATH):
            try:
                print(f"Loading MBTI Model (SVM) from: {MBTI_PATH}")
                # NOTE(security): joblib.load unpickles the file and can run
                # arbitrary code; MBTI_PATH must only ever point at a trusted,
                # locally produced artifact.
                _model_mbti = joblib.load(MBTI_PATH)
            except Exception as e:
                print(f"MBTI Load Error: {e}")

        if _classifier_mbti_transformer is None:
            _classifier_mbti_transformer = NLPHandler._build_classifier(
                "parka735/mbti-classifier",
                1,
                "Loading MBTI Model (Transformer): parka735/mbti-classifier",
                "MBTI Transformer Load Error",
            )

        if _classifier_roberta is None:
            _classifier_roberta = NLPHandler._build_classifier(
                "SamLowe/roberta-base-go_emotions",
                None,
                "Loading Emotion Model 1: SamLowe/roberta-base-go_emotions",
                "Emotion 1 Load Error",
            )

        if _classifier_distilbert is None:
            _classifier_distilbert = NLPHandler._build_classifier(
                "joeddav/distilbert-base-uncased-go-emotions-student",
                None,
                "Loading Emotion Model 2: joeddav/distilbert-base-uncased-go-emotions-student",
                "Emotion 2 Load Error",
            )

    # ------------------------------------------------------------------
    # Gemini validation
    # ------------------------------------------------------------------
    @staticmethod
    def _init_gemini():
        """Initialize the Gemini client for validation (lazy loading).

        Reads the ``GEMINI_API_KEY`` environment variable.

        Returns:
            ``True`` when a client is available, ``False`` otherwise.
        """
        if NLPHandler._gemini_client is None:
            api_key = os.getenv("GEMINI_API_KEY")
            if api_key:
                try:
                    from google import genai
                    NLPHandler._gemini_client = genai.Client(api_key=api_key)
                    print("Gemini Validator Ready (google-genai SDK)")
                except Exception as e:
                    print(f"Gemini Init Failed: {e}")
        return NLPHandler._gemini_client is not None

    @staticmethod
    def _parse_gemini_reply(result_text):
        """Extract ``(mbti, confidence, reason)`` from a Gemini reply.

        The reply is expected to contain ``MBTI:``, ``CONFIDENCE:`` and
        ``REASON:`` lines. Leading whitespace and markdown markers are
        tolerated. ``mbti`` is ``None`` when no well-formed four-letter type
        is present; confidence defaults to 0.7 and is clamped to [0, 1].
        """
        mbti = None
        confidence = 0.7
        reason = "Gemini validation"

        for raw_line in result_text.splitlines():
            key, sep, value = raw_line.strip().lstrip('*#- ').partition(':')
            if not sep:
                continue
            key = key.strip().upper()
            value = value.strip().strip('*').strip()
            if key == 'MBTI':
                match = _MBTI_PATTERN.search(value.upper())
                mbti = match.group(0) if match else None
            elif key == 'CONFIDENCE':
                try:
                    confidence = min(1.0, max(0.0, float(value)))
                except ValueError:
                    confidence = 0.7
            elif key == 'REASON':
                reason = value

        return mbti, confidence, reason

    @staticmethod
    def _validate_with_gemini(text, ml_prediction):
        """Ask Gemini for an independent MBTI verdict on ``text``.

        Args:
            text: English text to analyze.
            ml_prediction: MBTI label returned unchanged whenever Gemini is
                unavailable, errors out, or answers in an unusable format.

        Returns:
            Tuple ``(validated_mbti, confidence, reasoning)``.
        """
        if not NLPHandler._init_gemini():
            return ml_prediction, 0.6, "ML only (Gemini unavailable)"

        prompt = _GEMINI_PROMPT_TEMPLATE.format(text=text)

        try:
            response = NLPHandler._gemini_client.models.generate_content(
                model='gemini-2.0-flash',
                contents=prompt,
            )
            # response.text may be empty/None (e.g. blocked output).
            result_text = (response.text or "").strip()
            validated_mbti, confidence, reason = NLPHandler._parse_gemini_reply(result_text)

            if validated_mbti is None:
                print(f"Invalid Gemini MBTI: {validated_mbti}, using ML: {ml_prediction}")
                return ml_prediction, 0.6, "Invalid Gemini response - using ML"

            return validated_mbti, confidence, reason
        except Exception as e:
            print(f"Gemini Validation Error: {e}")
            return ml_prediction, 0.6, "Gemini error - using ML"

    # ------------------------------------------------------------------
    # Text helpers
    # ------------------------------------------------------------------
    @staticmethod
    def translate_to_english(text):
        """Translate ``text`` to English, best effort.

        Only the first 4500 characters are translated. On any failure (or an
        empty translation result) the possibly truncated input is returned.
        """
        if not text:
            return text
        if len(text) > 4500:
            text = text[:4500]
        try:
            translated = GoogleTranslator(source='auto', target='en').translate(text)
        except Exception as e:
            print(f"Translation Error: {e}")
            return text
        return translated or text

    @staticmethod
    def extract_keywords(text):
        """Return the five most frequent non-stopword tokens of ``text``.

        Tokens are lowercase ``\\w+`` runs longer than three characters.

        Returns:
            ``{"en": [...], "id": [...]}`` where ``id`` holds the Indonesian
            translations, or a copy of the English list if translation fails.
        """
        tokens = re.findall(r'\w+', (text or "").lower())
        counts = Counter(t for t in tokens if len(t) > 3 and t not in _STOPWORDS)
        # most_common keeps first-seen order among equal counts.
        keywords_en = [word for word, _ in counts.most_common(5)]

        keywords_id = []
        try:
            translator = GoogleTranslator(source='auto', target='id')
            for keyword in keywords_en:
                keywords_id.append(translator.translate(keyword) or keyword)
        except Exception as e:
            print(f"Keyword Translation Error: {e}")
            keywords_id = list(keywords_en)

        return {"en": keywords_en, "id": keywords_id}

    # ------------------------------------------------------------------
    # Prediction
    # ------------------------------------------------------------------
    @staticmethod
    def _run_mbti_transformer(processed_text):
        """Run the transformer MBTI classifier; return ``(label, score)``."""
        output = _classifier_mbti_transformer(processed_text[:2000])
        # Output can be [{'label': 'A'}] or [[{'label': 'A'}]] (batched form).
        if isinstance(output, list) and output and isinstance(output[0], list):
            top = output[0][0]
        elif isinstance(output, list):
            top = output[0]
        else:
            top = output
        return top['label'].upper(), float(top['score'])

    @staticmethod
    def _run_mbti_svm(processed_text):
        """Run the SVM MBTI model; return its label as an upper-case str."""
        return str(_model_mbti.predict([processed_text])[0]).strip().upper()

    @staticmethod
    def _predict_mbti(processed_text):
        """Predict the MBTI type using SVM + transformer with Gemini tie-break.

        Returns:
            Tuple ``(mbti_result, confidence, reasoning)``. ``mbti_result`` is
            ``"UNKNOWN"`` only when neither model is loaded.
        """
        svm_ready = _model_mbti is not None
        transformer_ready = _classifier_mbti_transformer is not None
        if not svm_ready and not transformer_ready:
            return "UNKNOWN", 0.0, ""

        try:
            # Degrade gracefully when only one of the two models loaded.
            if not transformer_ready:
                return (
                    NLPHandler._run_mbti_svm(processed_text),
                    0.4,
                    "Transformer model unavailable - SVM prediction only.",
                )
            trans_pred, trans_conf = NLPHandler._run_mbti_transformer(processed_text)
            if not svm_ready:
                return (
                    trans_pred,
                    trans_conf,
                    "SVM model unavailable - transformer prediction only.",
                )

            svm_pred = NLPHandler._run_mbti_svm(processed_text)
            print(f"[Voting] SVM='{svm_pred}' vs Transformer='{trans_pred}' ({trans_conf:.2%})")

            if svm_pred == trans_pred:
                # Consensus: skip the Gemini round-trip for speed.
                print("[Check] Models AGREE! Auto-approving.")
                return svm_pred, 0.95, f"Both AI models agreed strictly on {svm_pred}."

            # Disagreement: Gemini is the tie-breaker. The SVM label is the
            # fallback returned if Gemini cannot produce a usable verdict.
            print("[Warning] Models DISAGREE! Summoning Gemini Judge...")
            mbti_result, confidence, reason = NLPHandler._validate_with_gemini(
                processed_text, svm_pred
            )
            print(f"[Gemini] Verdict: {mbti_result} (Confidence: {confidence})")
            return mbti_result, confidence, reason
        except Exception as e:
            print(f"[Error] Hybrid MBTI Error: {e}")
            # Fall back to the SVM alone; INTJ is the last-resort default.
            fallback_reason = "System fallback due to hybrid error."
            try:
                return NLPHandler._run_mbti_svm(processed_text), 0.4, fallback_reason
            except Exception:
                return "INTJ", 0.0, fallback_reason

    @staticmethod
    def _predict_emotion(processed_text):
        """Predict emotions by averaging the loaded GoEmotions classifiers.

        The ``neutral`` label is dropped before ranking.

        Returns:
            Tuple ``(emotion_data, confidence_score)`` where ``emotion_data``
            has keys ``id``, ``en``, ``raw`` and ``list`` (top three entries
            with ``en``, ``id`` and ``score``). Defaults to a neutral result
            when no classifier is available or prediction fails.
        """
        emotion_data = {"id": "Netral", "en": "Neutral", "raw": "neutral", "list": []}
        confidence_score = 0.0

        try:
            emo_input = processed_text[:1500]  # truncate for model safety
            combined_scores = {}
            models_used = 0

            for classifier in (_classifier_roberta, _classifier_distilbert):
                if classifier is None:
                    continue
                results = classifier(emo_input)
                if results and isinstance(results[0], list):
                    results = results[0]
                for item in results:
                    label = item['label']
                    combined_scores[label] = combined_scores.get(label, 0) + item['score']
                models_used += 1

            combined_scores.pop('neutral', None)  # remove neutral preference
            ranked = sorted(combined_scores.items(), key=lambda pair: pair[1], reverse=True)

            if ranked:
                # Average over the models that actually contributed scores.
                divisor = float(models_used)
                best_label, total_score = ranked[0]
                confidence_score = total_score / divisor
                emotion_data = {
                    "id": EMOTION_TRANSLATIONS.get(best_label, best_label.capitalize()),
                    "en": best_label.capitalize(),
                    "raw": best_label,
                    "list": [
                        {
                            "en": label.capitalize(),
                            "id": EMOTION_TRANSLATIONS.get(label, label.capitalize()),
                            "score": score / divisor,
                        }
                        for label, score in ranked[:3]
                    ],
                }
                print(f"Emotion Hybrid Top 1: {emotion_data['en']} ({confidence_score:.2%})")
            else:
                print("Emotion Hybrid: No clear emotion found (Neutral)")
        except Exception as e:
            print(f"Emotion Prediction Error: {e}")

        return emotion_data, confidence_score

    @staticmethod
    def predict_all(raw_text):
        """Run the full analysis (MBTI, emotion, keywords) on ``raw_text``.

        The text is translated to English first; models are loaded on demand.

        Returns:
            Dict with keys ``mbti`` (str), ``emotion`` (dict), ``keywords``
            (dict) and ``reasoning`` (dict with ``mbti``, ``emotion`` and
            ``keywords`` explanations).
        """
        NLPHandler.load_models()
        processed_text = NLPHandler.translate_to_english(raw_text or "")

        mbti_result, mbti_confidence, mbti_reasoning = NLPHandler._predict_mbti(processed_text)
        emotion_data, _ = NLPHandler._predict_emotion(processed_text)

        # Copy the explanation so the shared MBTI_EXPLANATIONS table is never
        # mutated by the per-request fields added below.
        mbti_desc = dict(MBTI_EXPLANATIONS.get(mbti_result, {
            'en': "Complex personality type.",
            'id': "Kepribadian yang cukup kompleks.",
        }))
        if mbti_reasoning:
            mbti_desc['validation'] = mbti_reasoning
            mbti_desc['confidence'] = mbti_confidence

        em_list_str = ", ".join(
            f"{item['en']} ({int(item['score'] * 100)}%)"
            for item in emotion_data.get('list', [])
        )
        emotion_reasoning = {
            'en': f"Dominant emotion is '{emotion_data['en']}'. Mix: {em_list_str}.",
            'id': f"Emosi dominan '{emotion_data['id']}'. Campuran: {em_list_str}.",
        }

        keywords_reasoning = {
            'en': "These words appeared most frequently and define the main topic.",
            'id': "Kata-kata ini paling sering muncul dan jadi inti topik lo.",
        }

        return {
            "mbti": mbti_result,
            "emotion": emotion_data,
            "keywords": NLPHandler.extract_keywords(processed_text),
            "reasoning": {
                "mbti": mbti_desc,
                "emotion": emotion_reasoning,
                "keywords": keywords_reasoning,
            },
        }

    # ------------------------------------------------------------------
    # Official path: YouTube Data API
    # ------------------------------------------------------------------
    @staticmethod
    def _fetch_official_api(video_id, api_key):
        """Fetch video metadata and top comments via the YouTube Data API.

        Args:
            video_id: YouTube video identifier.
            api_key: YouTube Data API key.

        Returns:
            Dict with keys ``video`` (metadata dict or ``None``), ``comments``
            (list of dicts) and ``text_for_analysis`` (title, description and
            comments joined by spaces); ``None`` when nothing could be fetched
            or an error occurred.
        """
        print(f"Using Official API Key for {video_id}...")

        result = {"video": None, "comments": [], "text_for_analysis": ""}
        text_parts = []

        try:
            # 1. Fetch video metadata. Query values go through `params` so
            #    they are URL-encoded instead of spliced into the URL.
            res_meta = requests.get(
                f"{_YOUTUBE_API_BASE}/videos",
                params={"part": "snippet,statistics", "id": video_id, "key": api_key},
                timeout=5,
            )
            if res_meta.status_code == 200:
                items = res_meta.json().get("items") or []
                if items:
                    item = items[0]
                    snippet = item["snippet"]
                    stats = item.get("statistics", {})

                    # Unescape HTML entities
                    title = html.unescape(snippet['title'])
                    desc = html.unescape(snippet['description'])

                    # Pick the best available thumbnail
                    thumbnails = snippet.get('thumbnails', {})
                    thumbnail = (
                        thumbnails.get('maxres')
                        or thumbnails.get('high')
                        or thumbnails.get('medium')
                        or thumbnails.get('default', {})
                    ).get('url', '')

                    result["video"] = {
                        "title": title,
                        "description": desc,
                        "thumbnail": thumbnail,
                        "channel": snippet.get('channelTitle', 'Unknown Channel'),
                        "publishedAt": snippet.get('publishedAt', ''),
                        "viewCount": stats.get('viewCount', '0'),
                        "likeCount": stats.get('likeCount', '0'),
                        "commentCount": stats.get('commentCount', '0'),
                    }
                    text_parts.append(title)
                    text_parts.append(desc)
            else:
                print(f"Official API metadata request failed: HTTP {res_meta.status_code}")

            # 2. Fetch comments with details
            res_comm = requests.get(
                f"{_YOUTUBE_API_BASE}/commentThreads",
                params={
                    "part": "snippet",
                    "videoId": video_id,
                    "maxResults": 20,
                    "order": "relevance",
                    "key": api_key,
                },
                timeout=5,
            )
            if res_comm.status_code == 200:
                for item in res_comm.json().get("items", []):
                    comment_snippet = item["snippet"]["topLevelComment"]["snippet"]
                    raw_text = comment_snippet.get("textDisplay", "")
                    # textDisplay is HTML: drop tags, then decode entities.
                    clean_text = html.unescape(re.sub(r'<[^>]+>', '', raw_text))

                    result["comments"].append({
                        "text": clean_text,
                        "author": comment_snippet.get("authorDisplayName", "Anonymous"),
                        "authorImage": comment_snippet.get("authorProfileImageUrl", ""),
                        "likeCount": comment_snippet.get("likeCount", 0),
                        "publishedAt": comment_snippet.get("publishedAt", ""),
                        "replyCount": item["snippet"].get("totalReplyCount", 0),
                    })
                    text_parts.append(clean_text)
            else:
                print(f"Official API comments request failed: HTTP {res_comm.status_code}")

            if not text_parts:
                return None

            result["text_for_analysis"] = " ".join(text_parts)
            return result
        except Exception as e:
            # requests exceptions embed the request URL, which contains the
            # API key; redact it before logging.
            message = str(e)
            if api_key:
                message = message.replace(api_key, "***")
            print(f"Official API Error: {message}")
            return None

    @staticmethod
    def fetch_youtube_transcript(video_id):
        """Fetch analyzable text for a YouTube video.

        Tries the official Data API first (when ``YOUTUBE_API_KEY`` is set),
        then falls back to the transcript scraper.

        Returns:
            The dict produced by :meth:`_fetch_official_api` when the official
            path succeeds, otherwise the cleaned transcript as a ``str``, or
            ``None`` when both paths fail.
        """
        # 1. First priority: official API (requires an API key)
        api_key = os.getenv("YOUTUBE_API_KEY")
        if api_key:
            official_data = NLPHandler._fetch_official_api(video_id, api_key)
            if official_data:
                return official_data

        # 2. Second priority: transcript scraping fallback
        print(f"Fetching transcript (fallback) for: {video_id}")
        languages = ['id', 'en', 'en-US']
        try:
            if hasattr(YouTubeTranscriptApi, 'get_transcript'):
                segments = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
            else:
                # Newer youtube-transcript-api releases dropped the static
                # helper in favor of an instance-based fetch().
                segments = YouTubeTranscriptApi().fetch(
                    video_id, languages=languages
                ).to_raw_data()

            full_text = " ".join(item['text'] for item in segments)
            # Strip bracketed/parenthesized annotations such as [Music].
            clean_text = re.sub(r'\[.*?\]|\(.*?\)', '', full_text).strip()
            # Scraped text may also contain HTML entities.
            return html.unescape(clean_text)
        except Exception as e:
            print(f"Transcript Fetch Error: {e}")

        return None