wokogaming committed on
Commit
debc72e
·
verified ·
1 Parent(s): 8ec1e48

Update infer.py

Browse files
Files changed (1) hide show
  1. infer.py +111 -111
infer.py CHANGED
@@ -1,111 +1,111 @@
1
- import sys
2
- from pathlib import Path
3
- import torch
4
- from transformers import AutoTokenizer
5
-
6
- sys.path.append(str(Path(__file__).parent / "model_code"))
7
- from architecture import PhoBERTMultiHeadGRU
8
-
9
- ASPECTS = [
10
- "vệ sinh",
11
- "đồ ăn thức uống",
12
- "khách sạn",
13
- "vị trí",
14
- "phòng ốc",
15
- "dịch vụ",
16
- ]
17
-
18
- LABEL_MAP = {
19
- 0: "Negative",
20
- 1: "Neutral",
21
- 2: "Positive"
22
- }
23
-
24
- def _load_model(checkpoint_path: Path):
25
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
-
27
- # Init tokenizer and model with 'vinai/phobert-base'
28
- phobert_name = "vinai/phobert-base"
29
- tokenizer = AutoTokenizer.from_pretrained(phobert_name)
30
-
31
- model = PhoBERTMultiHeadGRU(
32
- phobert_path=phobert_name,
33
- gru_hidden_dim=256,
34
- num_labels=len(ASPECTS),
35
- num_classes=3
36
- )
37
-
38
- # Load state dict
39
- checkpoint = torch.load(checkpoint_path, map_location=device)
40
- if isinstance(checkpoint, dict) and "model_state_dict" in checkpoint:
41
- model.load_state_dict(checkpoint["model_state_dict"])
42
- else:
43
- model.load_state_dict(checkpoint)
44
-
45
- model.to(device)
46
- model.eval()
47
-
48
- cfg = {
49
- "max_len": 128,
50
- "aspects": ASPECTS,
51
- "label_map": LABEL_MAP
52
- }
53
-
54
- return model, tokenizer, cfg, device
55
-
56
- def _predict_single(model, tokenizer, cfg, device, text: str):
57
- encoding = tokenizer(
58
- text,
59
- add_special_tokens=True,
60
- max_length=cfg["max_len"],
61
- padding="max_length",
62
- truncation=True,
63
- return_attention_mask=True,
64
- return_tensors="pt",
65
- )
66
-
67
- input_ids = encoding["input_ids"].to(device)
68
- attention_mask = encoding["attention_mask"].to(device)
69
-
70
- with torch.no_grad():
71
- logits_list = model(input_ids=input_ids, attention_mask=attention_mask)
72
- # logits_list is a list of tensors [1, num_classes]
73
- preds = [logits.argmax(dim=-1).item() for logits in logits_list]
74
-
75
- results = {}
76
- for i, aspect in enumerate(cfg["aspects"]):
77
- results[aspect] = cfg["label_map"][preds[i]]
78
-
79
- return results
80
-
81
- def _predict_batch(model, tokenizer, cfg, device, texts: list[str], batch_size: int = 32):
82
- results = []
83
-
84
- for i in range(0, len(texts), batch_size):
85
- batch_texts = texts[i:i+batch_size]
86
- encoding = tokenizer(
87
- batch_texts,
88
- add_special_tokens=True,
89
- max_length=cfg["max_len"],
90
- padding="max_length",
91
- truncation=True,
92
- return_attention_mask=True,
93
- return_tensors="pt",
94
- )
95
-
96
- input_ids = encoding["input_ids"].to(device)
97
- attention_mask = encoding["attention_mask"].to(device)
98
-
99
- with torch.no_grad():
100
- logits_list = model(input_ids=input_ids, attention_mask=attention_mask)
101
- # logits_list is a list of num_labels tensors of shape [batch, num_classes]
102
- # We want to stack them to [batch, num_labels]
103
- preds = torch.stack([logits.argmax(dim=-1) for logits in logits_list], dim=1).cpu().numpy()
104
-
105
- for b_idx in range(len(batch_texts)):
106
- res = {}
107
- for a_idx, aspect in enumerate(cfg["aspects"]):
108
- res[aspect] = cfg["label_map"][preds[b_idx, a_idx]]
109
- results.append(res)
110
-
111
- return results
 
1
+ import sys
2
+ from pathlib import Path
3
+ import torch
4
+ from transformers import AutoTokenizer
5
+
6
+ sys.path.append(str(Path(__file__).parent / "model_code"))
7
+ from architecture import PhoBERTMultiHeadGRU
8
+
9
# Review aspects scored by the model. Order matters: the prediction helpers
# pair the i-th entry with the i-th logits tensor returned by the model.
ASPECTS = [
    "vệ sinh", "đồ ăn thức uống", "khách sạn",
    "vị trí", "phòng ốc", "dịch vụ",
]

# Predicted class index (argmax over one head's logits) -> label name.
# NOTE(review): presumably this must match the label encoding used at
# training time — confirm against the training code.
LABEL_MAP = {0: "None", 1: "Positive", 2: "Negative"}
23
+
24
def _load_model(checkpoint_path: Path):
    """Build the PhoBERT multi-head GRU classifier and load its weights.

    Args:
        checkpoint_path: File handed to ``torch.load``. It may contain either
            a bare state dict or a dict holding the state dict under the
            ``"model_state_dict"`` key.

    Returns:
        A ``(model, tokenizer, cfg, device)`` tuple. ``model`` has been moved
        to ``device`` and switched to eval mode; ``cfg`` carries ``max_len``,
        ``aspects`` and ``label_map`` for the prediction helpers.
    """
    backbone = "vinai/phobert-base"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Tokenizer and encoder are both created from the same pretrained name.
    tokenizer = AutoTokenizer.from_pretrained(backbone)
    model = PhoBERTMultiHeadGRU(
        phobert_path=backbone,
        gru_hidden_dim=256,
        num_labels=len(ASPECTS),
        num_classes=3,
    )

    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from a trusted source (consider weights_only=True where the installed
    # torch version supports it).
    loaded = torch.load(checkpoint_path, map_location=device)
    is_wrapped = isinstance(loaded, dict) and "model_state_dict" in loaded
    model.load_state_dict(loaded["model_state_dict"] if is_wrapped else loaded)

    model.to(device)
    model.eval()

    cfg = {"max_len": 128, "aspects": ASPECTS, "label_map": LABEL_MAP}
    return model, tokenizer, cfg, device
55
+
56
+ def _predict_single(model, tokenizer, cfg, device, text: str):
57
+ encoding = tokenizer(
58
+ text,
59
+ add_special_tokens=True,
60
+ max_length=cfg["max_len"],
61
+ padding="max_length",
62
+ truncation=True,
63
+ return_attention_mask=True,
64
+ return_tensors="pt",
65
+ )
66
+
67
+ input_ids = encoding["input_ids"].to(device)
68
+ attention_mask = encoding["attention_mask"].to(device)
69
+
70
+ with torch.no_grad():
71
+ logits_list = model(input_ids=input_ids, attention_mask=attention_mask)
72
+ # logits_list is a list of tensors [1, num_classes]
73
+ preds = [logits.argmax(dim=-1).item() for logits in logits_list]
74
+
75
+ results = {}
76
+ for i, aspect in enumerate(cfg["aspects"]):
77
+ results[aspect] = cfg["label_map"][preds[i]]
78
+
79
+ return results
80
+
81
+ def _predict_batch(model, tokenizer, cfg, device, texts: list[str], batch_size: int = 32):
82
+ results = []
83
+
84
+ for i in range(0, len(texts), batch_size):
85
+ batch_texts = texts[i:i+batch_size]
86
+ encoding = tokenizer(
87
+ batch_texts,
88
+ add_special_tokens=True,
89
+ max_length=cfg["max_len"],
90
+ padding="max_length",
91
+ truncation=True,
92
+ return_attention_mask=True,
93
+ return_tensors="pt",
94
+ )
95
+
96
+ input_ids = encoding["input_ids"].to(device)
97
+ attention_mask = encoding["attention_mask"].to(device)
98
+
99
+ with torch.no_grad():
100
+ logits_list = model(input_ids=input_ids, attention_mask=attention_mask)
101
+ # logits_list is a list of num_labels tensors of shape [batch, num_classes]
102
+ # We want to stack them to [batch, num_labels]
103
+ preds = torch.stack([logits.argmax(dim=-1) for logits in logits_list], dim=1).cpu().numpy()
104
+
105
+ for b_idx in range(len(batch_texts)):
106
+ res = {}
107
+ for a_idx, aspect in enumerate(cfg["aspects"]):
108
+ res[aspect] = cfg["label_map"][preds[b_idx, a_idx]]
109
+ results.append(res)
110
+
111
+ return results