Sync from GitHub Actions: e47275f6fa73e62abbc2b5982f8d3a30ddeec898
Browse files
- api/core/nlp_handler.py +20 -2
- api/requirements.txt +1 -0
api/core/nlp_handler.py
CHANGED
|
@@ -4,6 +4,10 @@ import requests
|
|
| 4 |
import html
|
| 5 |
from deep_translator import GoogleTranslator
|
| 6 |
from youtube_transcript_api import YouTubeTranscriptApi
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# --- CONFIG ---
|
| 9 |
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
@@ -86,8 +90,21 @@ class NLPHandler:
|
|
| 86 |
|
| 87 |
|
| 88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
@staticmethod
|
| 90 |
def translate_to_english(text):
|
|
|
|
| 91 |
try:
|
| 92 |
if len(text) > 4500: text = text[:4500]
|
| 93 |
return GoogleTranslator(source='auto', target='en').translate(text)
|
|
@@ -113,7 +130,8 @@ class NLPHandler:
|
|
| 113 |
@staticmethod
|
| 114 |
def predict_all(raw_text):
|
| 115 |
NLPHandler.load_models()
|
| 116 |
-
processed_text = NLPHandler.
|
|
|
|
| 117 |
|
| 118 |
# --- MBTI PREDICTION (anggars/xlm-mbti) ---
|
| 119 |
mbti_result = "UNKNOWN"
|
|
@@ -212,7 +230,7 @@ class NLPHandler:
|
|
| 212 |
return {
|
| 213 |
"mbti": mbti_result,
|
| 214 |
"emotion": emotion_data,
|
| 215 |
-
"keywords": NLPHandler.extract_keywords(
|
| 216 |
"reasoning": {
|
| 217 |
"mbti": mbti_desc,
|
| 218 |
"emotion": emotion_reasoning,
|
|
|
|
| 4 |
import html
|
| 5 |
from deep_translator import GoogleTranslator
|
| 6 |
from youtube_transcript_api import YouTubeTranscriptApi
|
| 7 |
+
from langdetect import detect, DetectorFactory
|
| 8 |
+
|
| 9 |
+
# Seed langdetect so repeated detection of the same text gives the same result (it is non-deterministic by default)
|
| 10 |
+
DetectorFactory.seed = 0
|
| 11 |
|
| 12 |
# --- CONFIG ---
|
| 13 |
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
| 90 |
|
| 91 |
|
| 92 |
|
| 93 |
+
@staticmethod
|
| 94 |
+
def prepare_text(text):
|
| 95 |
+
"""Only translate if language is not Indonesian or English.
|
| 96 |
+
XLM-RoBERTa handles id/en natively, no translation needed."""
|
| 97 |
+
try:
|
| 98 |
+
if len(text) > 4500: text = text[:4500]
|
| 99 |
+
lang = detect(text)
|
| 100 |
+
if lang not in ['id', 'en']:
|
| 101 |
+
return GoogleTranslator(source='auto', target='en').translate(text)
|
| 102 |
+
return text
|
| 103 |
+
except: return text
|
| 104 |
+
|
| 105 |
@staticmethod
|
| 106 |
def translate_to_english(text):
|
| 107 |
+
"""Force translate to English (used for keywords extraction)."""
|
| 108 |
try:
|
| 109 |
if len(text) > 4500: text = text[:4500]
|
| 110 |
return GoogleTranslator(source='auto', target='en').translate(text)
|
|
|
|
| 130 |
@staticmethod
|
| 131 |
def predict_all(raw_text):
|
| 132 |
NLPHandler.load_models()
|
| 133 |
+
processed_text = NLPHandler.prepare_text(raw_text)
|
| 134 |
+
english_text = NLPHandler.translate_to_english(raw_text) # For keywords only
|
| 135 |
|
| 136 |
# --- MBTI PREDICTION (anggars/xlm-mbti) ---
|
| 137 |
mbti_result = "UNKNOWN"
|
|
|
|
| 230 |
return {
|
| 231 |
"mbti": mbti_result,
|
| 232 |
"emotion": emotion_data,
|
| 233 |
+
"keywords": NLPHandler.extract_keywords(english_text),
|
| 234 |
"reasoning": {
|
| 235 |
"mbti": mbti_desc,
|
| 236 |
"emotion": emotion_reasoning,
|
api/requirements.txt
CHANGED
|
@@ -4,6 +4,7 @@ python-dotenv
|
|
| 4 |
pydantic
|
| 5 |
numpy
|
| 6 |
deep-translator
|
|
|
|
| 7 |
requests
|
| 8 |
youtube-transcript-api
|
| 9 |
google-genai
|
|
|
|
| 4 |
pydantic
|
| 5 |
numpy
|
| 6 |
deep-translator
|
| 7 |
+
langdetect
|
| 8 |
requests
|
| 9 |
youtube-transcript-api
|
| 10 |
google-genai
|