File size: 4,493 Bytes
aa53da8
 
 
1f613d7
75332d4
1f613d7
44a3fd2
aa53da8
6d6eec9
 
 
aa53da8
6d6eec9
44a3fd2
75332d4
44a3fd2
1f613d7
44a3fd2
 
 
 
 
 
 
75332d4
44a3fd2
aa53da8
71c66e8
aa53da8
 
 
1f613d7
6d6eec9
44a3fd2
 
 
6d6eec9
1f613d7
6d6eec9
44a3fd2
 
aa53da8
1f613d7
 
44a3fd2
 
 
1f613d7
 
6d6eec9
 
aa53da8
 
 
 
 
 
 
 
 
 
 
 
1f613d7
aa53da8
 
 
 
 
1f613d7
aa53da8
 
 
 
 
 
 
1f613d7
 
 
 
 
 
aa53da8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import re
import numpy as np
import os
import torch  # [์ˆ˜์ •] TensorFlow ๋Œ€์‹  PyTorch ์‚ฌ์šฉ
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForSequenceClassification # [์ˆ˜์ •] TF ์ œ๊ฑฐ๋œ ๋งŒ๋Šฅ ํด๋ž˜์Šค
from huggingface_hub import snapshot_download

# Load environment variables from the .env file
load_dotenv()

class MBTIClassifier:
    """Predict an MBTI type from text using four per-axis sequence classifiers.

    One binary classifier is loaded per axis (``ie``, ``ns``, ``tf``, ``jp``)
    from sub-folders of a common base directory. ``predict`` runs all four on
    the same tokenized input and combines their outputs.

    Attributes set by ``__init__``:
        repo_id: Value of the ``MODEL_PATH`` environment variable, or the
            default Hugging Face repo id when it is unset.
        version_folder: Name of the folder that holds the four axis models.
        base_path: Directory that contains one sub-folder per axis model.
        axis_map: Axis name -> model sub-folder name.
        model_names: Axis names in the order their letters appear in a type.
        models: Axis name -> loaded classification model.
        tokenizer: Tokenizer loaded from the first axis model's folder.
        labels: Axis name -> letters, indexed by the model's output position.
        all_types: The sixteen four-letter MBTI type strings.
    """

    def __init__(self):
        """Resolve the model directory, then load the tokenizer and axis models.

        If ``MODEL_PATH`` is not an existing local directory it is treated as
        a Hugging Face Hub repo id and downloaded with ``snapshot_download``.
        """
        self.repo_id = os.getenv("MODEL_PATH", "ashfortune/communiKate")
        self.version_folder = "bert_mbti_ver2"

        # 1. Resolve the base path: download from the Hub unless repo_id is a
        #    local directory.
        if not os.path.isdir(self.repo_id):
            print(f"DEBUG: [HUGGINGFACE HUB] ๋ชจ๋ธ ํŒŒ์ผ์„ ์„œ๋ฒ„๋กœ ๊ฐ•์ œ ๋‹ค์šด๋กœ๋“œ ํ•ฉ๋‹ˆ๋‹ค... ({self.repo_id})")
            local_download_path = snapshot_download(repo_id=self.repo_id, repo_type="model")
            self.base_path = os.path.join(local_download_path, self.version_folder)
        else:
            print("DEBUG: [LOCAL] ๋‚ด ์ปดํ“จํ„ฐ์˜ ๋ชจ๋ธ์„ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.")
            # Use the path as-is when it already mentions the version folder,
            # otherwise append the version folder to it.
            self.base_path = self.repo_id if self.version_folder in self.repo_id else os.path.join(self.repo_id, self.version_folder)

        print(f"DEBUG: ์ตœ์ข… ๋กœ๋“œ ๊ฒฝ๋กœ -> {self.base_path}")

        self.axis_map = {'ie': 'mbti_model_ie', 'ns': 'mbti_model_ns', 'tf': 'mbti_model_tf', 'jp': 'mbti_model_jp'}
        self.model_names = list(self.axis_map.keys())
        self.models = {}

        # 2. Load the tokenizer from the first axis model's folder; the same
        #    tokenizer is used for all four models.
        first_sub = self.axis_map[self.model_names[0]]
        tokenizer_path = os.path.join(self.base_path, first_sub)
        print(f"DEBUG: ํ† ํฌ๋‚˜์ด์ € ๋กœ๋”ฉ ์ค‘... ({tokenizer_path})")
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

        # 3. Load the four independent axis models as PyTorch models.
        for name, subfolder in self.axis_map.items():
            model_full_path = os.path.join(self.base_path, subfolder)
            print(f"DEBUG: '{name.upper()}' ์ „๋ฌธ ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘... ({model_full_path})")

            # Weights are loaded from safetensors files in the model folder.
            self.models[name] = AutoModelForSequenceClassification.from_pretrained(
                model_full_path,
                use_safetensors=True
            )
            # Put the model in evaluation mode for inference.
            self.models[name].eval()

        # labels[axis][i] is the letter reported when the axis model's
        # highest-scoring output position is i.
        self.labels = {'ie': ['E', 'I'], 'ns': ['N', 'S'], 'tf': ['F', 'T'], 'jp': ['J', 'P']}
        self.all_types = [
            'ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
            'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP'
        ]

    def _clean_text(self, text):
        """Lower-case ``text`` and strip URL-like tokens.

        Removes every run of non-whitespace that starts with ``http`` or with
        a literal ``www.``. Surrounding whitespace is left untouched.

        Args:
            text: Input string.

        Returns:
            The cleaned, lower-cased string.
        """
        text = text.lower()
        # The dot is escaped so that only a literal "www." prefix is treated
        # as a URL; unescaped, it also matched words such as "awwward".
        return re.sub(r'http\S+|www\.\S+', '', text)

    def predict(self, text):
        """Classify ``text`` on all four axes and combine the results.

        Args:
            text: Raw input string; it is cleaned with ``_clean_text`` first.

        Returns:
            A dict with:
              * ``"mbti"``: the four-letter type built from each axis's most
                probable letter;
              * ``"confidence"``: the joint probability of that type;
              * ``"probabilities"``: type -> joint probability for every entry
                of ``self.all_types``, computed as the product of the four
                per-axis probabilities.
        """
        cleaned = self._clean_text(text)
        inputs = self.tokenizer(
            [cleaned],
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors="pt",
        )

        # The models are called without token_type_ids. The filtered inputs
        # are identical for every axis, so build them once.
        model_inputs = {k: v for k, v in inputs.items() if k != 'token_type_ids'}

        axis_probs = {}
        letters = []
        for name in self.model_names:
            # No gradients are needed for inference.
            with torch.no_grad():
                outputs = self.models[name](**model_inputs)
                probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]

            # Move to host memory before converting so this also works when
            # the model lives on a non-CPU device.
            probs = probs.cpu().numpy()
            axis_probs[name] = probs
            letters.append(self.labels[name][int(np.argmax(probs))])

        result_mbti = "".join(letters)

        full_probabilities = {}
        for mbti in self.all_types:
            # Letter i of a type belongs to axis model_names[i]; look its
            # output position up in self.labels rather than hard-coding it.
            p_ie, p_ns, p_tf, p_jp = (
                axis_probs[name][self.labels[name].index(letter)]
                for name, letter in zip(self.model_names, mbti)
            )
            full_probabilities[mbti] = float(p_ie * p_ns * p_tf * p_jp)

        confidence = full_probabilities[result_mbti]

        return {
            "mbti": result_mbti,
            "confidence": confidence,
            "probabilities": full_probabilities
        }