DanVP committed on
Commit
bb10b4f
·
verified ·
1 Parent(s): c3439a0

v3.0: V2-grammar (g120k) — better grammar/idiom, less repetition (drop-in over v2.2)

Browse files
README.md CHANGED
@@ -30,14 +30,30 @@ for web-novel / xianxia content.
30
 
31
  - Chinese -> Vietnamese web novel / fiction translation (xianxia, modern, cross-domain).
32
  - Fast local or server inference where a small model is preferred.
33
- - This is an experimental release; output should still be reviewed for high-stakes or publication use.
 
 
34
 
35
  ## Model Details
36
 
37
  - Architecture: Marian seq2seq
38
  - Parameters: ~37M
39
  - Tokenizer: SentencePiece source/target tokenizer
40
- - Suggested decoding: `num_beams=1`, `max_length=512`
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  ## Quick Start
43
 
@@ -50,14 +66,13 @@ model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
50
 
51
  text = "他抬头看向远处的山门。"
52
  inputs = tok(text, return_tensors="pt", truncation=True, max_length=512)
53
- out = model.generate(**inputs, max_length=512, num_beams=1)
54
  print(tok.decode(out[0], skip_special_tokens=True))
55
  ```
56
 
57
  ## Fast CPU Runtime
58
 
59
- A CTranslate2 INT8 export is available in `ct2-int8_float32/` for ~3-5x faster
60
- CPU inference vs the HuggingFace transformers runtime.
61
 
62
  ```python
63
  import ctranslate2
@@ -80,13 +95,14 @@ translator = ctranslate2.Translator(
80
  ## Training Data
81
 
82
  Trained from scratch on a curated Chinese-Vietnamese parallel corpus covering
83
- xianxia, modern fiction, historical, and cross-domain web-novel content.
 
84
 
85
  ## Notes
86
 
87
- - This model prioritizes speed and small footprint.
88
  - Known hard cases include rare proper nouns and highly domain-specific OOD terminology.
89
- - For production-style usage, pair with reviewed glossary/guard layers where appropriate.
90
 
91
  ## License
92
 
 
30
 
31
  - Chinese -> Vietnamese web novel / fiction translation (xianxia, modern, cross-domain).
32
  - Fast local or server inference where a small model is preferred.
33
+ - Tuned for xianxia / wuxia / classical-flavoured text; on purely modern non-fiction
34
+ it may occasionally lean toward a slightly classical register.
35
+ - Experimental release; review output for high-stakes or publication use.
36
 
37
  ## Model Details
38
 
39
  - Architecture: Marian seq2seq
40
  - Parameters: ~37M
41
  - Tokenizer: SentencePiece source/target tokenizer
42
+ - Suggested decoding: `num_beams=4`, `max_length=512`
43
+
44
+ ## Versions
45
+
46
+ | Tag | Notes |
47
+ |---|---|
48
+ | **`v3.0`** (current `main`) | Improved classical-grammar + idiom handling; markedly less decoder repetition on short input (no DPO trade-off). |
49
+ | `v2.2` | Previous release. Pin with `revision="v2.2"`. |
50
+
51
+ Pin a specific version:
52
+
53
+ ```python
54
+ from transformers import AutoModelForSeq2SeqLM
55
+ model = AutoModelForSeq2SeqLM.from_pretrained("DanVP/MoxhiMT-30", revision="v2.2")
56
+ ```
57
 
58
  ## Quick Start
59
 
 
66
 
67
  text = "他抬头看向远处的山门。"
68
  inputs = tok(text, return_tensors="pt", truncation=True, max_length=512)
69
+ out = model.generate(**inputs, max_length=512, num_beams=4)
70
  print(tok.decode(out[0], skip_special_tokens=True))
71
  ```
72
 
73
  ## Fast CPU Runtime
74
 
75
+ A CTranslate2 INT8 export is in `ct2-int8_float32/` for ~3-5x faster CPU inference than the HuggingFace transformers runtime.
 
76
 
77
  ```python
78
  import ctranslate2
 
95
  ## Training Data
96
 
97
  Trained from scratch on a curated Chinese-Vietnamese parallel corpus covering
98
+ xianxia, modern fiction, historical, and cross-domain web-novel content, with a
99
+ research-grounded layer for idioms and classical-Chinese grammar constructions.
100
 
101
  ## Notes
102
 
103
+ - Prioritizes speed and small footprint.
104
  - Known hard cases include rare proper nouns and highly domain-specific OOD terminology.
105
+ - For production usage, pair with reviewed glossary/guard layers where appropriate.
106
 
107
  ## License
108
 
config.json CHANGED
@@ -1,36 +1,36 @@
1
- {
2
- "activation_dropout": 0.0,
3
- "activation_function": "swish",
4
- "architectures": [
5
- "MarianMTModel"
6
- ],
7
- "attention_dropout": 0.1,
8
- "bos_token_id": 1,
9
- "d_model": 448,
10
- "decoder_attention_heads": 8,
11
- "decoder_ffn_dim": 1792,
12
- "decoder_layerdrop": 0.0,
13
- "decoder_layers": 2,
14
- "decoder_start_token_id": 0,
15
- "decoder_vocab_size": 24000,
16
- "dropout": 0.1,
17
- "dtype": "float32",
18
- "encoder_attention_heads": 8,
19
- "encoder_ffn_dim": 1792,
20
- "encoder_layerdrop": 0.0,
21
- "encoder_layers": 8,
22
- "eos_token_id": 2,
23
- "forced_eos_token_id": 2,
24
- "init_std": 0.02,
25
- "is_decoder": false,
26
- "is_encoder_decoder": true,
27
- "max_position_embeddings": 512,
28
- "model_type": "marian",
29
- "pad_token_id": 0,
30
- "scale_embedding": true,
31
- "share_encoder_decoder_embeddings": true,
32
- "tie_word_embeddings": true,
33
- "transformers_version": "5.9.0",
34
- "use_cache": true,
35
- "vocab_size": 24000
36
- }
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "swish",
4
+ "architectures": [
5
+ "MarianMTModel"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 1,
9
+ "d_model": 448,
10
+ "decoder_attention_heads": 8,
11
+ "decoder_ffn_dim": 1792,
12
+ "decoder_layerdrop": 0.0,
13
+ "decoder_layers": 2,
14
+ "decoder_start_token_id": 0,
15
+ "decoder_vocab_size": 24000,
16
+ "dropout": 0.1,
17
+ "dtype": "float32",
18
+ "encoder_attention_heads": 8,
19
+ "encoder_ffn_dim": 1792,
20
+ "encoder_layerdrop": 0.0,
21
+ "encoder_layers": 8,
22
+ "eos_token_id": 2,
23
+ "forced_eos_token_id": 2,
24
+ "init_std": 0.02,
25
+ "is_decoder": false,
26
+ "is_encoder_decoder": true,
27
+ "max_position_embeddings": 512,
28
+ "model_type": "marian",
29
+ "pad_token_id": 0,
30
+ "scale_embedding": true,
31
+ "share_encoder_decoder_embeddings": true,
32
+ "tie_word_embeddings": true,
33
+ "transformers_version": "5.9.0",
34
+ "use_cache": true,
35
+ "vocab_size": 24000
36
+ }
ct2-int8_float32/config.json CHANGED
@@ -1,10 +1,10 @@
1
- {
2
- "add_source_bos": false,
3
- "add_source_eos": false,
4
- "bos_token": "<s>",
5
- "decoder_start_token": "</s>",
6
- "eos_token": "</s>",
7
- "layer_norm_epsilon": null,
8
- "multi_query_attention": false,
9
- "unk_token": "<unk>"
10
- }
 
1
+ {
2
+ "add_source_bos": false,
3
+ "add_source_eos": false,
4
+ "bos_token": "<s>",
5
+ "decoder_start_token": "</s>",
6
+ "eos_token": "</s>",
7
+ "layer_norm_epsilon": null,
8
+ "multi_query_attention": false,
9
+ "unk_token": "<unk>"
10
+ }
ct2-int8_float32/model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9dff45323096dec7e62a97888b6dd97ea9f8b979014e9c2f623543e2bc716518
3
  size 37896719
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc5ccad1bf4ee8cd47859130daf2627ae3935bb3c76e017d0c46a563475bc1ef
3
  size 37896719
ct2-int8_float32/shared_vocabulary.json CHANGED
The diff for this file is too large to render. See raw diff
 
ct2-int8_float32/special_tokens_map.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "bos_token": "<s>",
3
  "eos_token": "</s>",
 
4
  "pad_token": "<pad>",
5
- "unk_token": "<unk>"
6
  }
 
1
  {
 
2
  "eos_token": "</s>",
3
+ "unk_token": "<unk>",
4
  "pad_token": "<pad>",
5
+ "bos_token": "<s>"
6
  }
ct2-int8_float32/tokenizer_config.json CHANGED
@@ -1,50 +1,50 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "<pad>",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "<s>",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "</s>",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
- "content": "<unk>",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- }
35
- },
36
- "backend": "custom",
37
- "bos_token": "<s>",
38
- "clean_up_tokenization_spaces": false,
39
- "eos_token": "</s>",
40
- "is_local": true,
41
- "local_files_only": false,
42
- "model_max_length": 512,
43
- "pad_token": "<pad>",
44
- "separate_vocabs": false,
45
- "source_lang": null,
46
- "sp_model_kwargs": {},
47
- "target_lang": null,
48
- "tokenizer_class": "MarianTokenizer",
49
- "unk_token": "<unk>"
50
- }
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "backend": "custom",
37
+ "bos_token": "<s>",
38
+ "clean_up_tokenization_spaces": false,
39
+ "eos_token": "</s>",
40
+ "is_local": true,
41
+ "local_files_only": false,
42
+ "model_max_length": 512,
43
+ "pad_token": "<pad>",
44
+ "separate_vocabs": false,
45
+ "source_lang": null,
46
+ "sp_model_kwargs": {},
47
+ "target_lang": null,
48
+ "tokenizer_class": "MarianTokenizer",
49
+ "unk_token": "<unk>"
50
+ }
ct2-int8_float32/vocab.json CHANGED
The diff for this file is too large to render. See raw diff
 
generation_config.json CHANGED
@@ -1,12 +1,12 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 1,
4
- "decoder_start_token_id": 0,
5
- "eos_token_id": 2,
6
- "forced_eos_token_id": 2,
7
- "output_attentions": false,
8
- "output_hidden_states": false,
9
- "pad_token_id": 0,
10
- "transformers_version": "5.9.0",
11
- "use_cache": true
12
- }
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "decoder_start_token_id": 0,
5
+ "eos_token_id": 2,
6
+ "forced_eos_token_id": 2,
7
+ "output_attentions": false,
8
+ "output_hidden_states": false,
9
+ "pad_token_id": 0,
10
+ "transformers_version": "5.9.0",
11
+ "use_cache": true
12
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:342d4a26dba83c8ee54d79cf93d5363ee7a399fdc83906711d95f941137952ad
3
  size 146139424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a489a6f2a5ab6a6493c05e711bc1bea04804b6c27c5c019a710caca3e35386da
3
  size 146139424
special_tokens_map.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "bos_token": "<s>",
3
  "eos_token": "</s>",
 
4
  "pad_token": "<pad>",
5
- "unk_token": "<unk>"
6
  }
 
1
  {
 
2
  "eos_token": "</s>",
3
+ "unk_token": "<unk>",
4
  "pad_token": "<pad>",
5
+ "bos_token": "<s>"
6
  }
tokenizer_config.json CHANGED
@@ -1,50 +1,50 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "<pad>",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "<s>",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "</s>",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
- "content": "<unk>",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- }
35
- },
36
- "backend": "custom",
37
- "bos_token": "<s>",
38
- "clean_up_tokenization_spaces": false,
39
- "eos_token": "</s>",
40
- "is_local": true,
41
- "local_files_only": false,
42
- "model_max_length": 512,
43
- "pad_token": "<pad>",
44
- "separate_vocabs": false,
45
- "source_lang": null,
46
- "sp_model_kwargs": {},
47
- "target_lang": null,
48
- "tokenizer_class": "MarianTokenizer",
49
- "unk_token": "<unk>"
50
- }
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "backend": "custom",
37
+ "bos_token": "<s>",
38
+ "clean_up_tokenization_spaces": false,
39
+ "eos_token": "</s>",
40
+ "is_local": true,
41
+ "local_files_only": false,
42
+ "model_max_length": 512,
43
+ "pad_token": "<pad>",
44
+ "separate_vocabs": false,
45
+ "source_lang": null,
46
+ "sp_model_kwargs": {},
47
+ "target_lang": null,
48
+ "tokenizer_class": "MarianTokenizer",
49
+ "unk_token": "<unk>"
50
+ }
vocab.json CHANGED
The diff for this file is too large to render. See raw diff