# NOTE: this file was recovered from a Hugging Face Spaces file view (4,493 bytes);
# the page chrome (space status, commit-hash column, line-number gutter) was stripped.
import re
import numpy as np
import os
import torch  # [Fix] Use PyTorch instead of TensorFlow
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForSequenceClassification  # [Fix] General-purpose Auto classes (TF-specific variants removed)
from huggingface_hub import snapshot_download
# Activate the .env file so its entries are available through os.getenv below
load_dotenv()
class MBTIClassifier:
    """Predict an MBTI type from text using four independent per-axis classifiers.

    One sequence-classification model is loaded per axis (I/E, N/S, T/F, J/P).
    Each model yields a two-way softmax; the predicted type is the
    concatenation of the most likely letter on every axis, and the probability
    of any full type is the product of its four per-axis probabilities.
    """

    # Matches "http..." links and "www." links. The dot is escaped so that
    # ordinary words which merely start with "www" are not deleted.
    _URL_PATTERN = re.compile(r'http\S+|www\.\S+')

    # Axis key for each letter position of a four-letter MBTI string.
    _AXIS_ORDER = ('ie', 'ns', 'tf', 'jp')

    def __init__(self):
        """Resolve the model directory, then load the tokenizer and the four models.

        The ``MODEL_PATH`` environment variable (default
        ``"ashfortune/communiKate"``) is used as a local directory when one
        exists at that path; otherwise it is treated as a Hugging Face Hub
        model repo id and downloaded with ``snapshot_download``.
        """
        self.repo_id = os.getenv("MODEL_PATH", "ashfortune/communiKate")
        self.version_folder = "bert_mbti_ver2"

        # 1. Resolve the directory that holds the per-axis model folders.
        #    Anything that is not an existing local directory is fetched from
        #    the Hub, so a wrong local path cannot silently be used.
        if not os.path.isdir(self.repo_id):
            print(f"DEBUG: [HUGGINGFACE HUB] 모델 파일을 서버로 강제 다운로드 합니다... ({self.repo_id})")
            local_download_path = snapshot_download(repo_id=self.repo_id, repo_type="model")
            self.base_path = os.path.join(local_download_path, self.version_folder)
        else:
            print("DEBUG: [LOCAL] 내 컴퓨터의 모델을 사용합니다.")
            # Accept MODEL_PATH pointing either at the repo root or directly
            # at the version folder.
            if self.version_folder in self.repo_id:
                self.base_path = self.repo_id
            else:
                self.base_path = os.path.join(self.repo_id, self.version_folder)
        print(f"DEBUG: 최종 로드 경로 -> {self.base_path}")

        self.axis_map = {
            'ie': 'mbti_model_ie',
            'ns': 'mbti_model_ns',
            'tf': 'mbti_model_tf',
            'jp': 'mbti_model_jp',
        }
        self.model_names = list(self.axis_map.keys())
        self.models = {}

        # 2. Load the tokenizer from the first axis folder; the same tokenizer
        #    is then used for every axis model.
        first_sub = self.axis_map[self.model_names[0]]
        tokenizer_path = os.path.join(self.base_path, first_sub)
        print(f"DEBUG: 토크나이저 로딩 중... ({tokenizer_path})")
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

        # 3. Load the four independent models with the PyTorch Auto class.
        for name, subfolder in self.axis_map.items():
            model_full_path = os.path.join(self.base_path, subfolder)
            print(f"DEBUG: '{name.upper()}' 전문 모델 로딩 중... ({model_full_path})")
            # No TF class and no from_pt=True: the checkpoints are loaded in
            # their native PyTorch (safetensors) format.
            self.models[name] = AutoModelForSequenceClassification.from_pretrained(
                model_full_path,
                use_safetensors=True
            )
            # Put the model in evaluation mode for inference.
            self.models[name].eval()

        # Letter for class index 0 / 1 on each axis.
        # NOTE(review): this order must match the label ids the models were
        # trained with - confirm against the training code.
        self.labels = {'ie': ['E', 'I'], 'ns': ['N', 'S'], 'tf': ['F', 'T'], 'jp': ['J', 'P']}
        self.all_types = [
            'ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
            'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP'
        ]

    def _clean_text(self, text):
        """Return ``text`` lower-cased with http/www links removed."""
        return self._URL_PATTERN.sub('', text.lower())

    def _joint_probabilities(self, axis_probs):
        """Return ``{mbti_type: probability}`` for every type in ``self.all_types``.

        ``axis_probs`` maps each axis key ('ie', 'ns', 'tf', 'jp') to its
        two-element probability sequence, indexed like ``self.labels[axis]``.
        A type's probability is the product of the probabilities of its four
        letters.
        """
        joint = {}
        for mbti in self.all_types:
            p_ie, p_ns, p_tf, p_jp = (
                axis_probs[axis][self.labels[axis].index(letter)]
                for axis, letter in zip(self._AXIS_ORDER, mbti)
            )
            joint[mbti] = float(p_ie * p_ns * p_tf * p_jp)
        return joint

    def predict(self, text):
        """Classify ``text`` and return the predicted MBTI type with probabilities.

        Returns a dict with:
            "mbti": the four-letter type built from the most likely letter of
                each axis,
            "confidence": the joint probability of that type,
            "probabilities": the joint probability of every type in
                ``self.all_types``.
        """
        cleaned = self._clean_text(text)
        inputs = self.tokenizer(
            [cleaned],
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors="pt"
        )
        # token_type_ids is dropped before calling the models (presumably the
        # checkpoints do not accept it - verify). The filter does not depend
        # on the axis, so it is built once instead of once per model.
        filtered_inputs = {k: v for k, v in inputs.items() if k != 'token_type_ids'}

        axis_probs = {}
        letters = []
        for name in self.model_names:
            # Inference only: skip gradient tracking.
            with torch.no_grad():
                outputs = self.models[name](**filtered_inputs)
                probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].numpy()
            axis_probs[name] = probs
            letters.append(self.labels[name][int(np.argmax(probs))])
        result_mbti = "".join(letters)

        full_probabilities = self._joint_probabilities(axis_probs)
        confidence = full_probabilities[result_mbti]
        return {
            "mbti": result_mbti,
            "confidence": confidence,
            "probabilities": full_probabilities
        }