Spaces:
Sleeping
Sleeping
| import re | |
| import numpy as np | |
| import os | |
| import torch # [Changed] Use PyTorch instead of TensorFlow | |
| from dotenv import load_dotenv | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification # [Changed] Framework-agnostic Auto class with the TF variant removed | |
| from huggingface_hub import snapshot_download | |
| # Load environment variables from the .env file | |
| load_dotenv() | |
class MBTIClassifier:
    """Predict an MBTI type from text with four binary axis classifiers.

    One sequence-classification model is loaded per axis (``ie``, ``ns``,
    ``tf``, ``jp``) from sub-folders of a shared base directory. A single
    tokenizer, taken from the first axis folder, is used for all models.

    NOTE(review): the class-index order in ``self.labels`` (index 0 = E, N,
    F, J) is assumed to match the order the models were trained with --
    confirm against the training configuration.
    """

    # URL-like tokens removed before tokenization. The dot after "www" is
    # escaped so that only a literal "www." prefix matches; unescaped, it
    # would also delete ordinary words that merely start with "www".
    _URL_PATTERN = re.compile(r'http\S+|www\.\S+')

    def __init__(self):
        """Resolve the model directory, then load the tokenizer and axis models.

        The model location is read from the ``MODEL_PATH`` environment
        variable and defaults to the ``ashfortune/communiKate`` repository.
        """
        self.repo_id = os.getenv("MODEL_PATH", "ashfortune/communiKate")
        self.version_folder = "bert_mbti_ver2"

        # 1. Locate the model files (download them when no local copy exists).
        self.base_path = self._resolve_base_path()
        print(f"DEBUG: μ΅μ’ λ‘λ κ²½λ‘ -> {self.base_path}")

        self.axis_map = {'ie': 'mbti_model_ie', 'ns': 'mbti_model_ns', 'tf': 'mbti_model_tf', 'jp': 'mbti_model_jp'}
        self.model_names = list(self.axis_map)
        self.models = {}

        # 2. Load the tokenizer from the first axis model's folder.
        first_sub = self.axis_map[self.model_names[0]]
        tokenizer_path = os.path.join(self.base_path, first_sub)
        print(f"DEBUG: ν ν¬λμ΄μ λ‘λ© μ€... ({tokenizer_path})")
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

        # 3. Load the four independent axis models as PyTorch models.
        for name, subfolder in self.axis_map.items():
            model_full_path = os.path.join(self.base_path, subfolder)
            print(f"DEBUG: '{name.upper()}' μ λ¬Έ λͺ¨λΈ λ‘λ© μ€... ({model_full_path})")
            # The TF class and the from_pt=True option were removed: the
            # weights are loaded directly from safetensors files.
            model = AutoModelForSequenceClassification.from_pretrained(
                model_full_path,
                use_safetensors=True
            )
            # Put the model in evaluation mode for inference.
            model.eval()
            self.models[name] = model

        # Letter emitted for each class index of each axis model.
        self.labels = {'ie': ['E', 'I'], 'ns': ['N', 'S'], 'tf': ['F', 'T'], 'jp': ['J', 'P']}
        self.all_types = [
            'ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
            'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP'
        ]

    def _resolve_base_path(self):
        """Return the directory that contains the per-axis model folders.

        If ``self.repo_id`` is not an existing local directory it is treated
        as a Hugging Face Hub repository id and a snapshot is downloaded;
        otherwise the local directory is used, appending
        ``self.version_folder`` unless the path already contains it.
        """
        if not os.path.isdir(self.repo_id):
            print(f"DEBUG: [HUGGINGFACE HUB] λͺ¨λΈ νμΌμ μλ²λ‘ κ°μ λ€μ΄λ‘λ ν©λλ€... ({self.repo_id})")
            local_download_path = snapshot_download(repo_id=self.repo_id, repo_type="model")
            return os.path.join(local_download_path, self.version_folder)

        print(f"DEBUG: [LOCAL] λ΄ μ»΄ν¨ν°μ λͺ¨λΈμ μ¬μ©ν©λλ€.")
        if self.version_folder in self.repo_id:
            return self.repo_id
        return os.path.join(self.repo_id, self.version_folder)

    def _clean_text(self, text):
        """Lower-case ``text`` and strip ``http...`` / ``www.`` URL tokens."""
        return self._URL_PATTERN.sub('', text.lower())

    def predict(self, text):
        """Classify ``text`` and return the predicted type with probabilities.

        Args:
            text: Raw input string.

        Returns:
            A dict with:
              - ``"mbti"``: the four-letter type built from each axis model's
                highest-probability class,
              - ``"confidence"``: the joint probability of that type,
              - ``"probabilities"``: joint probability for every type in
                ``self.all_types``, computed as the product of the four
                per-axis softmax probabilities.
        """
        cleaned = self._clean_text(text)
        # Tensors are requested in PyTorch format ("pt").
        encoded = self.tokenizer(
            [cleaned],
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors="pt"
        )
        # token_type_ids is never passed to the models; filter it once
        # instead of once per axis.
        model_inputs = {k: v for k, v in encoded.items() if k != 'token_type_ids'}

        # Per axis: letter -> softmax probability for the single input text.
        letter_probs = {}
        chosen_letters = []
        with torch.no_grad():  # inference only; no gradients are needed
            for name in self.model_names:
                outputs = self.models[name](**model_inputs)
                probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].cpu().numpy()
                axis_labels = self.labels[name]
                letter_probs[name] = dict(zip(axis_labels, probs))
                chosen_letters.append(axis_labels[int(np.argmax(probs))])
        result_mbti = "".join(chosen_letters)

        # Letter i of a type string belongs to axis self.model_names[i], the
        # same ordering that was used to assemble result_mbti above.
        full_probabilities = {
            mbti: float(np.prod([
                letter_probs[name][letter]
                for name, letter in zip(self.model_names, mbti)
            ]))
            for mbti in self.all_types
        }

        return {
            "mbti": result_mbti,
            "confidence": full_probabilities[result_mbti],
            "probabilities": full_probabilities
        }