BharatVLM committed on
Commit
81472f1
·
verified ·
1 Parent(s): 3d930be

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,3 +1,83 @@
1
- ---
2
- license: cc-by-nc-4.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - gpt2
5
+ - assamese
6
+ - language-model
7
+ - text-generation
8
+ - low-resource
9
+ - educational
10
+ - research
11
+ - generated_from_trainer
12
+ metrics:
13
+ - accuracy
14
+ model-index:
15
+ - name: Assamese GPT-2
16
+ results: []
17
+ ---
18
+
19
+ # Assamese GPT-2 Model
20
+
21
+ This is a GPT-2 language model trained from scratch on Assamese monolingual text, using data from **IndicCorpV2** and **OSCAR**. The model is developed for **educational and research purposes** to support natural language understanding and generation tasks in Assamese — a low-resource language.
22
+
23
+ ## 📖 Model Description
24
+
25
+ The Assamese GPT-2 model is based on the standard GPT-2 decoder-only transformer architecture. It is capable of generating grammatically coherent and contextually relevant Assamese text and serves as a foundation for downstream NLP tasks such as:
26
+
27
+ - Language modeling
28
+ - Text completion/generation
29
+ - Fine-tuning for classification or summarization
30
+
31
+ ## ✅ Intended Uses
32
+
33
+ - Academic research on Assamese NLP
34
+ - Training and benchmarking in educational settings
35
+ - Exploration of low-resource language modeling
36
+
37
+ ## 🚫 Limitations
38
+
39
+ - Trained on general-domain monolingual data, so it may not perform well on domain-specific texts (e.g., legal, medical).
40
+ - Might generate biased, incomplete, or hallucinated outputs.
41
+ - Not suitable for production use or deployment in sensitive applications.
42
+
43
+ ## 📚 Training and Evaluation Data
44
+
45
+ The model was trained using Assamese monolingual data collected from:
46
+
47
+ - **IndicCorpV2**: A curated collection of web-crawled and processed data for Indic languages.
48
+ - **OSCAR (Open Super-large Crawled ALMAnaCH coRpus)**: Filtered web-crawled corpus available through Hugging Face datasets.
49
+
50
+ Data preprocessing included:
51
+ - Unicode normalization
52
+ - Removal of noisy characters and malformed tokens
53
+ - Sentence segmentation using Assamese-specific heuristics
54
+
55
+ ## 🧪 Training Procedure
56
+
57
+ ### Hyperparameters
58
+ - Learning rate: 5e-5
59
+ - Epochs: 20
60
+ - Batch size: 64
61
+ - Optimizer: AdamW (β₁=0.9, β₂=0.999, ε=1e-8)
62
+ - Scheduler: Linear
63
+ - Mixed Precision: Native AMP
64
+ - Seed: 42
65
+
66
+ ### Results
67
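+ - Final Evaluation Loss: -29.1890 (as logged; note that a cross-entropy loss cannot be negative and the corresponding perplexity, exp(loss) ≈ 2.1e-13, is below 1, so this value likely reflects an evaluation or logging issue and should be re-verified before being cited)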
68
+ - Accuracy: 0.3452
69
+
70
+ ## 🚀 Example Usage
71
+
72
+ ```python
73
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer
74
+
75
+ model = GPT2LMHeadModel.from_pretrained("your-username/gpt2_assamese_model")
76
+ tokenizer = GPT2Tokenizer.from_pretrained("your-username/gpt2_assamese_model")
77
+
78
+ prompt = "অসমৰ ইতিহাস"
79
+ inputs = tokenizer(prompt, return_tensors="pt")
80
+ outputs = model.generate(**inputs, max_length=50, do_sample=True)
81
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
82
+ ```
83
+
all_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_accuracy": 0.3452154413455673,
4
+ "eval_loss": -29.189016342163086,
5
+ "eval_runtime": 2434.4538,
6
+ "eval_samples": 10618,
7
+ "eval_samples_per_second": 4.362,
8
+ "eval_steps_per_second": 0.068,
9
+ "perplexity": 2.1055776904663675e-13,
10
+ "total_flos": 2.10597197119488e+18,
11
+ "train_loss": 0.33622724912249125,
12
+ "train_runtime": 72693.7372,
13
+ "train_samples": 201496,
14
+ "train_samples_per_second": 55.437,
15
+ "train_steps_per_second": 0.866
16
+ }
config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 0,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 2,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "mask_token_id": 4,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "pad_token_id": 1,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.1,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "torch_dtype": "float32",
31
+ "transformers_version": "4.52.0.dev0",
32
+ "unk_token_id": 3,
33
+ "use_cache": true,
34
+ "vocab_size": 50000
35
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_accuracy": 0.3452154413455673,
4
+ "eval_loss": -29.189016342163086,
5
+ "eval_runtime": 2434.4538,
6
+ "eval_samples": 10618,
7
+ "eval_samples_per_second": 4.362,
8
+ "eval_steps_per_second": 0.068,
9
+ "perplexity": 2.1055776904663675e-13
10
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 1,
6
+ "transformers_version": "4.52.0.dev0"
7
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9ebdd0b85e0ddcbbb57a300bc90ad61342ac8759e4fbdc264220f6fdadc66ea
3
+ size 496984704
runs/May13_18-16-24_uma/events.out.tfevents.1747140395.uma.2750189.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d266216ac4d0715e394e0ce4da673f0d4b664b55e16b640ec2d9aa4cb260117c
3
+ size 139307
runs/May13_18-16-24_uma/events.out.tfevents.1747231217.uma.2750189.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed35378faa2a97f78db4779c9e82d6a734135b9e5babfc6f68318691862eeb90
3
+ size 417
runs/May14_22-41-05_uma/events.out.tfevents.1747242678.uma.3260955.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:991b9019cbd4fccf2b340e5663fae8dd86f4d2b02aefd1a5be9ef37b4f837b81
3
+ size 140858
runs/May14_22-41-05_uma/events.out.tfevents.1747317807.uma.3260955.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:615c02f60eb6d13bcb05b36283e7ac74f556d4967a1a2bc2a1bbc70abfcd57c2
3
+ size 417
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "mask_token": {
17
+ "content": "<mask>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "pad_token": {
24
+ "content": "<pad>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "<unk>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<s>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<pad>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<mask>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ }
45
+ },
46
+ "bos_token": "<s>",
47
+ "clean_up_tokenization_spaces": false,
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "model_max_length": 1000000000000000019884624838656,
53
+ "pad_token": "<pad>",
54
+ "tokenizer_class": "GPT2Tokenizer",
55
+ "unk_token": "<unk>"
56
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "total_flos": 2.10597197119488e+18,
4
+ "train_loss": 0.33622724912249125,
5
+ "train_runtime": 72693.7372,
6
+ "train_samples": 201496,
7
+ "train_samples_per_second": 55.437,
8
+ "train_steps_per_second": 0.866
9
+ }
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c0474f633b98bb6445345154f9ee9b4af4bbac2841b138afd5cbfc3c3f825b4
3
+ size 5304
vocab.json ADDED
The diff for this file is too large to render. See raw diff