vrashad committed on
Commit
f451933
·
verified ·
1 Parent(s): 4a30186

Upload Azerbaijani text quality classifier

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. README.md +53 -0
  3. config.json +85 -0
  4. model.safetensors +3 -0
  5. tokenizer.json +3 -0
  6. tokenizer_config.json +29 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - az
5
+ base_model: jhu-clsp/mmBERT-base
6
+ pipeline_tag: text-classification
7
+ tags:
8
+ - azerbaijani
9
+ - text-quality
10
+ - data-filtering
11
+ ---
12
+
13
+ # Azerbaijani Text Quality Classifier
14
+
15
+ Regression model that scores the quality of Azerbaijani web text on a
16
+ continuous 0-3 scale. Built to filter a raw web corpus (OSCAR-derived)
17
+ before language-model pretraining.
18
+
19
+ - **Base model:** jhu-clsp/mmBERT-base
20
+ - **Task:** regression, single output (~0..3). Higher = cleaner text.
21
+ - **Max length:** 4096 tokens
22
+
23
+ ## Score scale
24
+
25
+ - **3** — clean, coherent Azerbaijani prose
26
+ - **2** — substantial good prose mixed with junk (menus, footers, ads)
27
+ - **1** — mostly junk, little recoverable prose
28
+ - **0** — pure junk: navigation pages, spam, machine translation, non-Azerbaijani text
29
+
30
+ ## Usage
31
+
32
+ ```python
33
+ import torch
34
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
35
+
36
+ tok = AutoTokenizer.from_pretrained("LocalDoc/azerbaijani-text-quality-classifier")
37
+ model = AutoModelForSequenceClassification.from_pretrained("LocalDoc/azerbaijani-text-quality-classifier")
38
+ model.eval()
39
+
40
+ text = "..."
41
+ enc = tok(text, truncation=True, max_length=4096, return_tensors="pt")
42
+ with torch.no_grad():
43
+     score = model(**enc).logits.squeeze().item()
44
+ print(score)
45
+ ```
46
+
47
+ ## Limitations
48
+
49
+ Training labels were generated by an LLM (Mistral-Small-24B), not by humans.
50
+ Reported validation metrics (val-MSE ~0.14, rounded accuracy ~0.83) measure
51
+ **agreement with the LLM labels**, not agreement with human judgement —
52
+ the latter has not yet been measured against a human-annotated test set.
53
+ Use with this caveat in mind.
config.json ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ModernBertForSequenceClassification"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 2,
8
+ "classifier_activation": "gelu",
9
+ "classifier_bias": false,
10
+ "classifier_dropout": 0.0,
11
+ "classifier_pooling": "mean",
12
+ "cls_token_id": 1,
13
+ "decoder_bias": true,
14
+ "deterministic_flash_attn": false,
15
+ "dtype": "bfloat16",
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 1,
18
+ "global_attn_every_n_layers": 3,
19
+ "gradient_checkpointing": false,
20
+ "hidden_activation": "gelu",
21
+ "hidden_size": 768,
22
+ "id2label": {
23
+ "0": "LABEL_0"
24
+ },
25
+ "initializer_cutoff_factor": 2.0,
26
+ "initializer_range": 0.02,
27
+ "intermediate_size": 1152,
28
+ "label2id": {
29
+ "LABEL_0": 0
30
+ },
31
+ "layer_norm_eps": 1e-05,
32
+ "layer_types": [
33
+ "full_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "full_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "full_attention",
43
+ "sliding_attention",
44
+ "sliding_attention",
45
+ "full_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "full_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "full_attention",
52
+ "sliding_attention",
53
+ "sliding_attention",
54
+ "full_attention"
55
+ ],
56
+ "local_attention": 128,
57
+ "mask_token_id": 4,
58
+ "max_position_embeddings": 8192,
59
+ "mlp_bias": false,
60
+ "mlp_dropout": 0.0,
61
+ "model_type": "modernbert",
62
+ "norm_bias": false,
63
+ "norm_eps": 1e-05,
64
+ "num_attention_heads": 12,
65
+ "num_hidden_layers": 22,
66
+ "pad_token_id": 0,
67
+ "position_embedding_type": "sans_pos",
68
+ "problem_type": "regression",
69
+ "rope_parameters": {
70
+ "full_attention": {
71
+ "rope_theta": 160000,
72
+ "rope_type": "default"
73
+ },
74
+ "sliding_attention": {
75
+ "rope_theta": 160000,
76
+ "rope_type": "default"
77
+ }
78
+ },
79
+ "sep_token_id": 1,
80
+ "sparse_pred_ignore_index": -100,
81
+ "sparse_prediction": false,
82
+ "tie_word_embeddings": true,
83
+ "transformers_version": "5.9.0",
84
+ "vocab_size": 256000
85
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afd03384786de123b52d529530b2ebded179e5a9a78e2fe4f76b8493014974f4
3
+ size 615076330
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a919df3596ecc3da31343106620dc367df9b12fb41c4b96e8ad773faaade7b5b
3
+ size 34363287
tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<bos>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "cls_token": "<bos>",
6
+ "eos_token": "<eos>",
7
+ "extra_special_tokens": [
8
+ "<start_of_turn>",
9
+ "<end_of_turn>"
10
+ ],
11
+ "is_local": true,
12
+ "local_files_only": false,
13
+ "mask_token": "<mask>",
14
+ "max_length": 4096,
15
+ "model_input_names": [
16
+ "input_ids",
17
+ "attention_mask"
18
+ ],
19
+ "model_max_length": 8192,
20
+ "pad_token": "<pad>",
21
+ "padding_side": "right",
22
+ "sep_token": "<eos>",
23
+ "spaces_between_special_tokens": false,
24
+ "stride": 0,
25
+ "tokenizer_class": "TokenizersBackend",
26
+ "truncation_side": "right",
27
+ "truncation_strategy": "longest_first",
28
+ "unk_token": "<unk>"
29
+ }