communiKate / backend /services /classifier.py
ashfortune
pytorch 로 변경
1f613d7
import re
import numpy as np
import os
import torch # [Changed] Use PyTorch instead of TensorFlow
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForSequenceClassification # [Changed] General-purpose Auto classes (TF-specific variants removed)
from huggingface_hub import snapshot_download
# Activate the .env file so os.getenv() below can see values defined there
load_dotenv()
class MBTIClassifier:
    """Predict an MBTI type from text with one binary classifier per axis.

    Four sequence-classification models are loaded, one for each axis
    (``ie``, ``ns``, ``tf``, ``jp``).  ``predict`` runs all four on the same
    tokenized input and combines their outputs into a four-letter type and a
    probability for each of the 16 types.
    """

    # Matches "http..." tokens and "www." tokens.  The dot after "www" is
    # escaped so that ordinary text such as "awww so cute" is left intact.
    _URL_PATTERN = re.compile(r'http\S+|www\.\S+')

    def __init__(self):
        """Resolve the model directory, then load the tokenizer and models.

        The ``MODEL_PATH`` environment variable may name a local directory or
        a Hugging Face Hub repository id; it defaults to
        ``ashfortune/communiKate``.
        """
        self.repo_id = os.getenv("MODEL_PATH", "ashfortune/communiKate")
        self.version_folder = "bert_mbti_ver2"

        # 1. Locate the folder that contains the per-axis model directories.
        if not os.path.isdir(self.repo_id):
            # Not a local directory: treat the value as a Hub repo id and
            # download a snapshot of it.
            print(f"DEBUG: [HUGGINGFACE HUB] 모델 파일을 서버로 강제 다운로드 합니다... ({self.repo_id})")
            local_download_path = snapshot_download(repo_id=self.repo_id, repo_type="model")
            self.base_path = os.path.join(local_download_path, self.version_folder)
        else:
            print("DEBUG: [LOCAL] 내 컴퓨터의 모델을 사용합니다.")
            # The local path may already point at the version folder itself.
            if self.version_folder in self.repo_id:
                self.base_path = self.repo_id
            else:
                self.base_path = os.path.join(self.repo_id, self.version_folder)
        print(f"DEBUG: 최종 로드 경로 -> {self.base_path}")

        # Axis name -> sub-folder holding that axis' model.  The key order is
        # also the letter order of the predicted MBTI string.
        self.axis_map = {
            'ie': 'mbti_model_ie',
            'ns': 'mbti_model_ns',
            'tf': 'mbti_model_tf',
            'jp': 'mbti_model_jp',
        }
        self.model_names = list(self.axis_map.keys())
        self.models = {}

        # 2. Load the tokenizer from the first axis' folder; the same
        #    tokenizer is used for every axis model.
        first_sub = self.axis_map[self.model_names[0]]
        tokenizer_path = os.path.join(self.base_path, first_sub)
        print(f"DEBUG: 토크나이저 로딩 중... ({tokenizer_path})")
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

        # 3. Load the four independent axis models as PyTorch models.
        for name, subfolder in self.axis_map.items():
            model_full_path = os.path.join(self.base_path, subfolder)
            print(f"DEBUG: '{name.upper()}' 전문 모델 로딩 중... ({model_full_path})")
            model = AutoModelForSequenceClassification.from_pretrained(
                model_full_path,
                use_safetensors=True,
            )
            # Inference only: evaluation mode turns off dropout so repeated
            # predictions are deterministic.
            model.eval()
            self.models[name] = model

        # Class index -> letter for each axis (index 0 first).
        self.labels = {
            'ie': ['E', 'I'],
            'ns': ['N', 'S'],
            'tf': ['F', 'T'],
            'jp': ['J', 'P'],
        }
        self.all_types = [
            'ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
            'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP',
        ]

    def _clean_text(self, text: str) -> str:
        """Return ``text`` lower-cased with URL-like tokens removed."""
        return self._URL_PATTERN.sub('', text.lower())

    def predict(self, text: str) -> dict:
        """Classify ``text`` and return the predicted type with probabilities.

        Returns a dict with:
            ``"mbti"``: the four-letter type built from each axis' most
                probable letter.
            ``"confidence"``: the joint probability of that type.
            ``"probabilities"``: joint probability for each type in
                ``self.all_types``, computed as the product of the four
                per-axis probabilities.
        """
        cleaned = self._clean_text(text)
        inputs = self.tokenizer(
            [cleaned],
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors="pt",
        )
        # Built once for all four models.  token_type_ids are dropped before
        # the forward pass -- presumably the models do not accept them
        # (TODO confirm against the model architecture).
        model_inputs = {k: v for k, v in inputs.items() if k != 'token_type_ids'}

        axis_probs = {}
        letters = []
        with torch.no_grad():  # no gradients needed for inference
            for name in self.model_names:
                outputs = self.models[name](**model_inputs)
                # Single-item batch: take row 0 of the softmax output.
                probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].numpy()
                axis_probs[name] = probs
                letters.append(self.labels[name][int(np.argmax(probs))])
        result_mbti = "".join(letters)

        full_probabilities = {}
        for mbti in self.all_types:
            joint = None
            # Look each letter's class index up in self.labels so the label
            # order is defined in exactly one place.
            for name, letter in zip(self.model_names, mbti):
                p = axis_probs[name][self.labels[name].index(letter)]
                joint = p if joint is None else joint * p
            full_probabilities[mbti] = float(joint)

        return {
            "mbti": result_mbti,
            "confidence": full_probabilities[result_mbti],
            "probabilities": full_probabilities,
        }