WSHAPER committed on
Commit
ef79e5e
·
verified ·
1 Parent(s): 2a5b8ed

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ - de
5
+ - ru
6
+ license: apache-2.0
7
+ library_name: transformers
8
+ tags:
9
+ - dialogue-act-classification
10
+ - distilbert
11
+ - multilingual
12
+ - conversational-ai
13
+ - asr
14
+ base_model: distilbert-base-multilingual-cased
15
+ metrics:
16
+ - accuracy
17
+ - f1
18
+ pipeline_tag: text-classification
19
+ ---
20
+
21
+ # distilbert-multilingual-dialogue-act-classifier
22
+
23
+ Fine-tuned **DistilBERT** (`distilbert-base-multilingual-cased`) for **4-class dialogue act classification** in English, German, and Russian. Trained on conversational dialogue data, optimized for ASR transcripts.
24
+
25
+ ## Labels
26
+
27
+ | Index | Label | Description |
28
+ |-------|-------|-------------|
29
+ | 0 | commissive | Promises, commitments ("I'll handle it.") |
30
+ | 1 | directive | Commands, requests ("Send the report.") |
31
+ | 2 | inform | Statements, facts ("The deadline is Friday.") |
32
+ | 3 | question | Questions, inquiries ("What is the timeline?") |
33
+
34
+ ## Evaluation
35
+
36
+ Per-language performance on held-out test sets:
37
+
38
+ | Language | Test Set | Accuracy | F1 Macro |
39
+ |----------|----------|----------|----------|
40
+ | English | SILICONE dyda_da | 80.8% | 0.725 |
41
+ | English | XDailyDialog | 82.5% | 0.750 |
42
+ | German | XDailyDialog | 81.8% | 0.738 |
43
+ | Russian | xdailydialog-ru | 81.7% | 0.734 |
44
+
45
+ Edge-case test suite (ASR disfluent input, conversational): **77.8%** (35/45)
46
+
47
+ ## Usage
48
+
49
+ ```python
50
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
51
+ import torch
52
+
53
+ model = AutoModelForSequenceClassification.from_pretrained("WSHAPER/distilbert-multilingual-dialogue-act-classifier")
54
+ tokenizer = AutoTokenizer.from_pretrained("WSHAPER/distilbert-multilingual-dialogue-act-classifier")
55
+
56
+ texts = ["What is the timeline?", "Send the report.", "The meeting went well."]
57
+ inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
58
+
59
+ with torch.no_grad():
60
+     logits = model(**inputs).logits
61
+ probs = torch.softmax(logits, dim=-1)
62
+ preds = torch.argmax(probs, dim=-1)
63
+
64
+ labels = ["commissive", "directive", "inform", "question"]
65
+ for text, pred, prob in zip(texts, preds, probs):
66
+     print(f"{text} → {labels[pred]} ({prob[pred]:.2f})")
67
+ ```
68
+
69
+ ## Training Details
70
+
71
+ - **Base model**: `distilbert-base-multilingual-cased` (~135M params)
72
+ - **Training data**:
73
+ - [XDailyDialog](https://github.com/liuzeming01/XDailyDialog) — EN, DE, IT (~249K utterances)
74
+ - [WSHAPER/xdailydialog-ru](https://huggingface.co/datasets/WSHAPER/xdailydialog-ru) — RU (~82K utterances)
75
+ - Total: ~331K utterances across 4 languages
76
+ - **Hyperparameters**: 5 epochs, batch 32, lr 2e-5, warmup 10%
77
+ - **Hardware**: NVIDIA RTX A3000 12GB, ~1.5 hours
78
+
79
+ ## Rust Inference (candle-transformers)
80
+
81
+ This model is compatible with `candle-transformers` for pure Rust inference:
82
+
83
+ ```rust
84
+ // Loads model.safetensors + tokenizer.json directly
85
+ let config = DistilBertConfig::from_file("config.json");
86
+ let bert = BertModel::load(vb.pp("distilbert"), &config)?;
87
+ let classifier = candle_nn::linear(config.hidden_size, 4, vb.pp("classifier"))?;
88
+ ```
89
+
90
+ ## Links
91
+
92
+ - **GitHub**: [WSHAPER/dialogue-act-classifier](https://github.com/WSHAPER/dialogue-act-classifier) — training code, evaluation scripts, export tools
93
+ - **Russian dataset**: [WSHAPER/xdailydialog-ru](https://huggingface.co/datasets/WSHAPER/xdailydialog-ru) — Russian translation of XDailyDialog
94
+
95
+ ## License
96
+
97
+ Apache-2.0
config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation": "gelu",
3
+ "architectures": [
4
+ "DistilBertForSequenceClassification"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "dim": 768,
8
+ "dropout": 0.1,
9
+ "hidden_dim": 3072,
10
+ "id2label": {
11
+ "0": "commissive",
12
+ "1": "directive",
13
+ "2": "inform",
14
+ "3": "question"
15
+ },
16
+ "initializer_range": 0.02,
17
+ "label2id": {
18
+ "commissive": 0,
19
+ "directive": 1,
20
+ "inform": 2,
21
+ "question": 3
22
+ },
23
+ "max_position_embeddings": 512,
24
+ "model_type": "distilbert",
25
+ "n_heads": 12,
26
+ "n_layers": 6,
27
+ "output_past": true,
28
+ "pad_token_id": 0,
29
+ "problem_type": "single_label_classification",
30
+ "qa_dropout": 0.1,
31
+ "seq_classif_dropout": 0.2,
32
+ "sinusoidal_pos_embds": false,
33
+ "tie_weights_": true,
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.53.1",
36
+ "vocab_size": 119547
37
+ }
label_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "0": "commissive",
3
+ "1": "directive",
4
+ "2": "inform",
5
+ "3": "question"
6
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f226db309b0b679faaa4dc3b955f31b6024cbe87a7ea43af2a372b78d0be38b5
3
+ size 541323496
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": false,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "max_length": 128,
50
+ "model_max_length": 512,
51
+ "pad_to_multiple_of": null,
52
+ "pad_token": "[PAD]",
53
+ "pad_token_type_id": 0,
54
+ "padding_side": "right",
55
+ "sep_token": "[SEP]",
56
+ "stride": 0,
57
+ "strip_accents": null,
58
+ "tokenize_chinese_chars": true,
59
+ "tokenizer_class": "DistilBertTokenizer",
60
+ "truncation_side": "right",
61
+ "truncation_strategy": "longest_first",
62
+ "unk_token": "[UNK]"
63
+ }
training_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model": "distilbert-base-multilingual-cased",
3
+ "num_labels": 4,
4
+ "max_seq_length": 128,
5
+ "epochs": 5,
6
+ "batch_size": 32,
7
+ "learning_rate": 2e-05,
8
+ "seed": 42,
9
+ "trained_at": "20260514_193557",
10
+ "languages": [
11
+ "it",
12
+ "de",
13
+ "ru",
14
+ "en"
15
+ ]
16
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff