dat201204 committed
Commit b58b88f · verified · 1 Parent(s): 900be44

Add files using upload-large-folder tool
README.md ADDED
---
language:
- vi
license: apache-2.0
library_name: transformers
pipeline_tag: text-classification
tags:
- vietnamese
- phobert
- disaster-response
- emergency-detection
- text-classification
- peft
- lora
base_model: vinai/phobert-base
---

# PhoBERT Vietnamese Cau Cuu Classifier

A PhoBERT-based classifier for Vietnamese Facebook comments that detects **"cầu cứu"** (cry-for-help) comments during natural disasters.

## Labels

- `0`: `khong_cau_cuu` (not a rescue request)
- `1`: `cau_cuu` (rescue request)

## Intended use

This model is designed to prioritize **high recall** for emergency rescue requests in Vietnamese social-media comments, especially when comments may contain distress language, location hints, phone numbers, or SOS markers.

## Training setup

- Base model: `vinai/phobert-base`
- Fine-tuning method: LoRA / PEFT (see the configuration sketch below)
- Evaluation checkpoint source: `/content/phobert-cau-cuu/saved_model/checkpoint-171`
- Decision threshold for deployment: `0.4941`
- Threshold selection policy: `target_recall` with validation target recall `0.88`

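The training script itself is not part of this repository, but `adapter_config.json` records the adapter hyperparameters (`r=16`, `lora_alpha=32`, `lora_dropout=0.05`, LoRA on the attention `query` and `value` projections, with the classification head trained in full). A minimal sketch of the corresponding PEFT setup, assuming standard `peft` and `transformers` APIs:

```python
# Sketch of the LoRA configuration implied by adapter_config.json;
# the actual training script is not included in this repo.
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSequenceClassification

base = AutoModelForSequenceClassification.from_pretrained(
    "vinai/phobert-base", num_labels=2
)
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,                               # adapter rank (from adapter_config.json)
    lora_alpha=32,                      # scaling factor
    lora_dropout=0.05,
    target_modules=["query", "value"],  # attention projections to adapt
    modules_to_save=["classifier"],     # train the classification head fully
)
model = get_peft_model(base, lora_config)
model.print_trainable_parameters()      # only adapters + head are trainable
```
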
## Validation metrics at selected threshold

- Accuracy: `0.8469`
- F1 macro: `0.8380`
- F1 (`cau_cuu`): `0.8000`
- Recall (`cau_cuu`): `0.8955`
- Precision (`cau_cuu`): `0.7229`

## Test metrics

- Accuracy: `0.8520`
- F1 macro: `0.8430`
- F1 (`cau_cuu`): `0.8054`
- Recall (`cau_cuu`): `0.9091`
- Precision (`cau_cuu`): `0.7229`

## Confusion matrix on test set

Rows are true labels, columns are predicted labels (this orientation is consistent with the test recall of `0.9091 = 60/66` and precision of `0.7229 = 60/83` for `cau_cuu`):

```text
                    pred khong_cau_cuu  pred cau_cuu
true khong_cau_cuu                 107            23
true cau_cuu                         6            60
```

## Recommended inference rule

Convert logits to probabilities and classify as `cau_cuu` when:

```python
prob_cau_cuu >= 0.4941
```

This threshold was chosen on the validation set to preserve strong recall while improving `F1(cau_cuu)` and overall accuracy.

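The selection script is not included; what follows is a hypothetical sketch of the `target_recall` policy described under *Training setup*: sweep candidate thresholds and keep the highest one whose validation recall for `cau_cuu` still meets the `0.88` target (higher thresholds trade recall for precision). `val_probs` and `val_labels` are assumed arrays of validation probabilities and gold labels.

```python
# Hypothetical sketch of a target-recall threshold sweep; not the
# repository's actual selection script.
import numpy as np

def select_threshold(val_probs, val_labels, target_recall=0.88):
    """Highest threshold whose recall on the positive class meets the target."""
    probs = np.asarray(val_probs)
    positives = np.asarray(val_labels) == 1
    best = 0.0
    for t in np.arange(0.01, 1.0, 0.0001):
        recall = (probs[positives] >= t).mean()
        if recall >= target_recall:
            best = float(t)  # recall still meets the target at this threshold
        else:
            break            # recall only decreases as the threshold rises
    return best
```
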
## Example loading code

```python
import torch
from peft import AutoPeftModelForSequenceClassification
from transformers import AutoTokenizer

repo_id = "dat201204/phobert-vi-caucu-classifier"
threshold = 0.4941  # validation-selected decision threshold

# PhoBERT ships a slow BPE tokenizer, hence use_fast=False.
tokenizer = AutoTokenizer.from_pretrained(repo_id, use_fast=False)
model = AutoPeftModelForSequenceClassification.from_pretrained(repo_id)
model.eval()

text = "Cuu voi, nha em dang ngap va co nguoi gia bi ket"
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
with torch.no_grad():
    logits = model(**inputs).logits
# Index 1 is the `cau_cuu` class.
prob_cau_cuu = torch.softmax(logits, dim=-1)[0, 1].item()

label = "cau_cuu" if prob_cau_cuu >= threshold else "khong_cau_cuu"
print({"label": label, "prob_cau_cuu": prob_cau_cuu})
```

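For triaging a stream of comments, the single-text example above extends naturally to batches. A hypothetical helper reusing `tokenizer`, `model`, and `threshold` from the snippet above:

```python
def classify_batch(texts, batch_size=32):
    """Classify a list of comments; returns (label, prob_cau_cuu) pairs."""
    results = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(
            batch, return_tensors="pt", padding=True,
            truncation=True, max_length=256,
        )
        with torch.no_grad():
            probs = torch.softmax(model(**inputs).logits, dim=-1)[:, 1]
        results += [
            ("cau_cuu" if p >= threshold else "khong_cau_cuu", p)
            for p in probs.tolist()
        ]
    return results
```
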
## Limitations

- The dataset was weakly supervised in the first labeling stage and may contain residual noise.
- The model is optimized for disaster-response triage, not for general sentiment or topic classification.
- Human verification is still recommended for high-stakes rescue coordination.
adapter_config.json ADDED
{
  "alora_invocation_tokens": null,
  "alpha_pattern": {},
  "arrow_config": null,
  "auto_mapping": null,
  "base_model_name_or_path": "vinai/phobert-base",
  "bias": "none",
  "corda_config": null,
  "ensure_weight_tying": false,
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 32,
  "lora_bias": false,
  "lora_dropout": 0.05,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": [
    "classifier",
    "classifier",
    "score"
  ],
  "peft_type": "LORA",
  "peft_version": "0.18.1",
  "qalora_group_size": 16,
  "r": 16,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "value",
    "query"
  ],
  "target_parameters": null,
  "task_type": "SEQ_CLS",
  "trainable_token_indices": null,
  "use_dora": false,
  "use_qalora": false,
  "use_rslora": false
}
adapter_model.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:cbfeb72d109817a757e709a9ad235bb30988e3eea400ecc1e528ffe33f6ea767
size 4735200
added_tokens.json ADDED
{
  "<mask>": 64000
}
bpe.codes ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
{
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "64000": {
      "content": "<mask>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": null,
  "backend": "custom",
  "bos_token": "<s>",
  "cls_token": "<s>",
  "eos_token": "</s>",
  "is_local": false,
  "mask_token": "<mask>",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "tokenizer_class": "PhobertTokenizer",
  "unk_token": "<unk>"
}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff