PredictiveManish committed on
Commit
45bcb9b
·
verified ·
1 Parent(s): c828a9d

Upload folder using huggingface_hub

Browse files
Files changed (45) hide show
  1. .gitattributes +9 -0
  2. __pycache__/model_config.cpython-310.pyc +0 -0
  3. checkpoints_fast/checkpoint-interrupted/config.json +32 -0
  4. checkpoints_fast/checkpoint-interrupted/generation_config.json +8 -0
  5. checkpoints_fast/checkpoint-interrupted/model.safetensors +3 -0
  6. checkpoints_fast/checkpoint-interrupted/tokenizer/spiece.model +3 -0
  7. checkpoints_tiny/final/config.json +32 -0
  8. checkpoints_tiny/final/generation_config.json +7 -0
  9. checkpoints_tiny/final/model.safetensors +3 -0
  10. checkpoints_tiny/step1000/config.json +32 -0
  11. checkpoints_tiny/step1000/generation_config.json +7 -0
  12. checkpoints_tiny/step1000/model.safetensors +3 -0
  13. checkpoints_tiny/step2000/config.json +32 -0
  14. checkpoints_tiny/step2000/generation_config.json +7 -0
  15. checkpoints_tiny/step2000/model.safetensors +3 -0
  16. checkpoints_tiny/step3000/config.json +32 -0
  17. checkpoints_tiny/step3000/generation_config.json +7 -0
  18. checkpoints_tiny/step3000/model.safetensors +3 -0
  19. checkpoints_tiny/step4000/config.json +32 -0
  20. checkpoints_tiny/step4000/generation_config.json +7 -0
  21. checkpoints_tiny/step4000/model.safetensors +3 -0
  22. checkpoints_tiny/step5000/config.json +32 -0
  23. checkpoints_tiny/step5000/generation_config.json +7 -0
  24. checkpoints_tiny/step5000/model.safetensors +3 -0
  25. data/en-hi.csv +3 -0
  26. data/en-pa.csv +3 -0
  27. data/extracted_sentences/en.txt +3 -0
  28. data/extracted_sentences/en_hi_english.txt +3 -0
  29. data/extracted_sentences/en_pa_english.txt +3 -0
  30. data/extracted_sentences/extraction_summary.txt +13 -0
  31. data/extracted_sentences/hi.txt +3 -0
  32. data/extracted_sentences/pa.txt +3 -0
  33. data/main.py +316 -0
  34. evaluate_model.py +138 -0
  35. final_corpus/multilingual_corpus.txt +3 -0
  36. final_corpus/multilingual_corpus_train.txt +3 -0
  37. final_corpus/multilingual_corpus_val.txt +0 -0
  38. final_corpus/multilingual_spm.model +3 -0
  39. final_corpus/multilingual_spm.vocab +0 -0
  40. model_config.py +64 -0
  41. model_demo.html +67 -0
  42. preprocess.py +267 -0
  43. test_model.py +418 -0
  44. train_model.py +156 -0
  45. web_interface.py +133 -0
.gitattributes CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/en-hi.csv filter=lfs diff=lfs merge=lfs -text
37
+ data/en-pa.csv filter=lfs diff=lfs merge=lfs -text
38
+ data/extracted_sentences/en.txt filter=lfs diff=lfs merge=lfs -text
39
+ data/extracted_sentences/en_hi_english.txt filter=lfs diff=lfs merge=lfs -text
40
+ data/extracted_sentences/en_pa_english.txt filter=lfs diff=lfs merge=lfs -text
41
+ data/extracted_sentences/hi.txt filter=lfs diff=lfs merge=lfs -text
42
+ data/extracted_sentences/pa.txt filter=lfs diff=lfs merge=lfs -text
43
+ final_corpus/multilingual_corpus.txt filter=lfs diff=lfs merge=lfs -text
44
+ final_corpus/multilingual_corpus_train.txt filter=lfs diff=lfs merge=lfs -text
__pycache__/model_config.cpython-310.pyc ADDED
Binary file (2.75 kB). View file
 
checkpoints_fast/checkpoint-interrupted/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 1,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 2,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_embd": 384,
15
+ "n_head": 6,
16
+ "n_inner": 1024,
17
+ "n_layer": 6,
18
+ "n_positions": 128,
19
+ "pad_token_id": 0,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "transformers_version": "4.57.3",
30
+ "use_cache": false,
31
+ "vocab_size": 8000
32
+ }
checkpoints_fast/checkpoint-interrupted/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.57.3",
7
+ "use_cache": false
8
+ }
checkpoints_fast/checkpoint-interrupted/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4440a5199eda3c064840d077c625fa853e0e95c38ce76c705398f9cc31ac907d
3
+ size 45632880
checkpoints_fast/checkpoint-interrupted/tokenizer/spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faf8ae3d54cbc33b749cfff520a86c0e0cbc131ac949b233b8848cb1bf5fe940
3
+ size 166057
checkpoints_tiny/final/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_embd": 256,
15
+ "n_head": 4,
16
+ "n_inner": 512,
17
+ "n_layer": 4,
18
+ "n_positions": 128,
19
+ "pad_token_id": 0,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "transformers_version": "4.57.3",
30
+ "use_cache": true,
31
+ "vocab_size": 8000
32
+ }
checkpoints_tiny/final/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.57.3"
7
+ }
checkpoints_tiny/final/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:585f9b64fb2ff0cc99c4c11d0b12135cd3473d9178fd12598fc7b1d218963678
3
+ size 16763848
checkpoints_tiny/step1000/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_embd": 256,
15
+ "n_head": 4,
16
+ "n_inner": 512,
17
+ "n_layer": 4,
18
+ "n_positions": 128,
19
+ "pad_token_id": 0,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "transformers_version": "4.57.3",
30
+ "use_cache": true,
31
+ "vocab_size": 8000
32
+ }
checkpoints_tiny/step1000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.57.3"
7
+ }
checkpoints_tiny/step1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:146653a8e856d8db69d47936b0c0575f6022372a60e3ef54e3a0128fe59777d5
3
+ size 16763848
checkpoints_tiny/step2000/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_embd": 256,
15
+ "n_head": 4,
16
+ "n_inner": 512,
17
+ "n_layer": 4,
18
+ "n_positions": 128,
19
+ "pad_token_id": 0,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "transformers_version": "4.57.3",
30
+ "use_cache": true,
31
+ "vocab_size": 8000
32
+ }
checkpoints_tiny/step2000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.57.3"
7
+ }
checkpoints_tiny/step2000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ff9197d428cfea94bca9ddf0af4fda7e340ba44e3106f10566db3fea86a31e2
3
+ size 16763848
checkpoints_tiny/step3000/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_embd": 256,
15
+ "n_head": 4,
16
+ "n_inner": 512,
17
+ "n_layer": 4,
18
+ "n_positions": 128,
19
+ "pad_token_id": 0,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "transformers_version": "4.57.3",
30
+ "use_cache": true,
31
+ "vocab_size": 8000
32
+ }
checkpoints_tiny/step3000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.57.3"
7
+ }
checkpoints_tiny/step3000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14a8b4327bc025e10dee73541bccab2124ea65154954da8c1c76ff182520402f
3
+ size 16763848
checkpoints_tiny/step4000/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_embd": 256,
15
+ "n_head": 4,
16
+ "n_inner": 512,
17
+ "n_layer": 4,
18
+ "n_positions": 128,
19
+ "pad_token_id": 0,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "transformers_version": "4.57.3",
30
+ "use_cache": true,
31
+ "vocab_size": 8000
32
+ }
checkpoints_tiny/step4000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.57.3"
7
+ }
checkpoints_tiny/step4000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22620971d56dee4ae04f7fd04bbca4f1809763fd76fcea14ece315033cc7fa5d
3
+ size 16763848
checkpoints_tiny/step5000/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_embd": 256,
15
+ "n_head": 4,
16
+ "n_inner": 512,
17
+ "n_layer": 4,
18
+ "n_positions": 128,
19
+ "pad_token_id": 0,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "transformers_version": "4.57.3",
30
+ "use_cache": true,
31
+ "vocab_size": 8000
32
+ }
checkpoints_tiny/step5000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.57.3"
7
+ }
checkpoints_tiny/step5000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:585f9b64fb2ff0cc99c4c11d0b12135cd3473d9178fd12598fc7b1d218963678
3
+ size 16763848
data/en-hi.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c8f0a4024a9987812636856077835e435ae4c7fbcae541b6e7c84001de02f72
3
+ size 444580427
data/en-pa.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13d3e0a194847b100b9f817bbc767f1db4aba36006067f284917c3b8c4c295ac
3
+ size 431640910
data/extracted_sentences/en.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99f963e335c435f173545a5f6cf6ab0b5008e465f9b7744d4891ab4d637532f7
3
+ size 28936803
data/extracted_sentences/en_hi_english.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:214ee59b38f22bb42db8c9dcde22cee3be97c1ad50973e7e84e80a5b05f324c9
3
+ size 15095632
data/extracted_sentences/en_pa_english.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8fce34736e406f8348179671100e5cf43e91a652905fd18dd8e0501b8b6e2bd
3
+ size 13841171
data/extracted_sentences/extraction_summary.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DATA EXTRACTION SUMMARY
2
+ ==================================================
3
+
4
+ English-Hindi Dataset:
5
+ English sentences: 150,000
6
+ Hindi sentences: 300,000
7
+
8
+ English-Punjabi Dataset:
9
+ English sentences: 150,000
10
+ Punjabi sentences: 300,000
11
+
12
+ Combined English: 100,000
13
+ Total corpus size: 900,000 sentences
data/extracted_sentences/hi.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17658044b073e93c4705679f1d44239446350ddcfe00fdb7a2a8e27643c610df
3
+ size 70698192
data/extracted_sentences/pa.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdd25ab9d3a3f4793d1270d7446ecd835c3564b479a846e5f28ad1149086e824
3
+ size 62188668
data/main.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Script 1: Extract random sentences from EN-HI and EN-PA parallel files
3
+ WITH PROGRESS BAR AND OPTIMIZATIONS
4
+ """
5
+
6
+ import pandas as pd
7
+ import random
8
+ import ftfy
9
+ from langdetect import detect, LangDetectException
10
+ import re
11
+ import numpy as np
12
+ from pathlib import Path
13
+ from tqdm import tqdm
14
+ import time
15
+
16
def clean_text(text):
    """Normalize a raw text value into a cleaned single-line string.

    Non-string values and NaN-like inputs (including the literal string
    'nan' produced by .astype(str)) yield "".
    """
    if not isinstance(text, str):
        return ""

    # Reject stringified NaN and pandas missing values early.
    if text == 'nan' or pd.isna(text):
        return ""

    fixed = ftfy.fix_text(text)                # repair mojibake / broken unicode
    fixed = re.sub(r'\s+', ' ', fixed)         # collapse whitespace runs to one space
    fixed = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', fixed)  # drop control chars
    return fixed.strip()
29
+
30
def is_valid_sentence_fast(text, target_lang):
    """Cheap heuristic sentence filter (no langdetect).

    Checks minimum length, a 5-50 word window, character diversity, and
    that the expected script for *target_lang* is present.
    """
    # Reject empty or very short strings outright.
    if not text or len(text) < 20:
        return False

    # Word-count window: between 5 and 50 words inclusive.
    word_count = len(text.split())
    if not 5 <= word_count <= 50:
        return False

    # Require some character diversity (filters repeated-junk lines).
    if len(set(text)) < 7:
        return False

    # Per-language script requirement; unknown languages get no script check.
    script_patterns = {
        'en': r'[a-zA-Z]',          # Latin
        'hi': r'[\u0900-\u097F]',   # Devanagari
        'pa': r'[\u0A00-\u0A7F]',   # Gurmukhi
    }
    pattern = script_patterns.get(target_lang)
    if pattern is not None and not re.search(pattern, text):
        return False

    return True
60
+
61
def is_valid_sentence_with_lang(text, target_lang, use_fast=True):
    """Full sentence validation, optionally confirmed with langdetect.

    With use_fast=True only the cheap heuristics run.  Otherwise the
    detected language must be compatible with *target_lang*; confusion
    between related Indic languages/scripts is tolerated.
    """
    if not is_valid_sentence_fast(text, target_lang):
        return False

    if use_fast:
        return True

    try:
        detected = detect(text)
    except LangDetectException:
        # Detector could not decide; keep the sentence (best effort).
        return True

    allowed = {'hi': ['hi'], 'pa': ['pa'], 'en': ['en']}
    if target_lang in allowed and detected not in allowed[target_lang]:
        # Tolerate detector confusion with related languages.
        if target_lang == 'en' and detected not in ['hi', 'pa', 'mr', 'gu']:
            return True
        if target_lang in ['hi', 'pa'] and detected not in ['en']:
            return True
        return False

    return True
86
+
87
def _clean_column(series, total, desc):
    """Clean every value of a pandas Series; keep cleaned texts longer than 10 chars."""
    kept = []
    for text in tqdm(series.astype(str), total=total, desc=desc):
        cleaned = clean_text(text)
        if len(cleaned) > 10:
            kept.append(cleaned)
    return kept


def _fast_filter(sentences, lang_code, desc):
    """Apply the cheap per-language heuristics to a list of sentences."""
    return [s for s in tqdm(sentences, desc=desc) if is_valid_sentence_fast(s, lang_code)]


def extract_from_parallel_csv_optimized(input_csv, output_dir, en_samples, other_samples, other_lang_code):
    """
    Extract random sentences from a parallel CSV - OPTIMIZED.

    Reads the CSV, cleans both columns, filters with fast script/length
    heuristics (falling back to langdetect on a bounded subset if the fast
    pass yields too few sentences), randomly samples the requested counts
    and writes one file per language under *output_dir*.

    Returns (sampled_en, sampled_other); ([], []) if the CSV cannot be read.
    """
    print(f"\n{'='*60}")
    print(f"Processing {input_csv}...")
    print(f"Target: {en_samples} EN, {other_samples} {other_lang_code}")
    print('='*60)

    start_time = time.time()

    print("Reading CSV file...")
    try:
        df = pd.read_csv(input_csv, on_bad_lines='skip')
    except Exception as e:
        print(f"Error reading {input_csv}: {e}")
        # Retry with a permissive single-byte encoding before giving up.
        try:
            df = pd.read_csv(input_csv, encoding='latin-1', on_bad_lines='skip')
        except Exception:  # was a bare except; keep fallback but don't mask SystemExit/KeyboardInterrupt
            print(f"Failed to read {input_csv}")
            return [], []

    print(f"Loaded {len(df):,} rows")
    print(f"Columns: {list(df.columns)}")

    # Identify source/target columns by name, else by position.
    src_col = 'src' if 'src' in df.columns else df.columns[1]
    tgt_col = 'tgt' if 'tgt' in df.columns else df.columns[2]
    print(f"Source: {src_col}, Target: {tgt_col}")

    # NOTE: removed an unused `df.copy()` here (it doubled memory for a
    # multi-hundred-MB frame) and unused per-row index bookkeeping.
    print("\nCleaning data...")
    print(f"Processing {src_col} column...")
    valid_src = _clean_column(df[src_col], len(df), "Cleaning English")

    print(f"\nProcessing {tgt_col} column...")
    valid_tgt = _clean_column(df[tgt_col], len(df), f"Cleaning {other_lang_code}")

    print(f"\nAfter cleaning:")
    print(f"  Valid English sentences: {len(valid_src):,}")
    print(f"  Valid {other_lang_code} sentences: {len(valid_tgt):,}")

    # Fast filtering (no langdetect).
    print("\nFast filtering sentences...")
    fast_valid_en = _fast_filter(valid_src, 'en', "Filtering English")
    fast_valid_other = _fast_filter(valid_tgt, other_lang_code, f"Filtering {other_lang_code}")

    print(f"\nAfter fast filtering:")
    print(f"  English: {len(fast_valid_en):,}")
    print(f"  {other_lang_code}: {len(fast_valid_other):,}")

    # Use fast-filtered sentences if plentiful; otherwise confirm language
    # with langdetect on a bounded subset (langdetect is slow).
    if len(fast_valid_en) >= en_samples and len(fast_valid_other) >= other_samples:
        final_en = fast_valid_en
        final_other = fast_valid_other
        print("Using fast-filtered sentences (skipping langdetect)")
    else:
        print("\nApplying language detection on subset...")

        # Cap langdetect work at 100k sentences per language.
        sample_en = fast_valid_en[:100000]
        sample_other = fast_valid_other[:100000]

        final_en = []
        print("Validating English with langdetect...")
        for text in tqdm(sample_en, desc="English langdetect"):
            if is_valid_sentence_with_lang(text, 'en', use_fast=False):
                final_en.append(text)

        final_other = []
        print(f"Validating {other_lang_code} with langdetect...")
        for text in tqdm(sample_other, desc=f"{other_lang_code} langdetect"):
            if is_valid_sentence_with_lang(text, other_lang_code, use_fast=False):
                final_other.append(text)

        print(f"\nAfter langdetect:")
        print(f"  English: {len(final_en):,}")
        print(f"  {other_lang_code}: {len(final_other):,}")

    # Random sampling — never request more than is available.
    en_samples = min(en_samples, len(final_en))
    other_samples = min(other_samples, len(final_other))

    print(f"\nSampling {en_samples:,} English and {other_samples:,} {other_lang_code} sentences...")

    sampled_en = random.sample(final_en, en_samples)
    sampled_other = random.sample(final_other, other_samples)

    # Save to files.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # English sentences for this language pair.
    en_filename = output_dir / f'en_{other_lang_code}_english.txt'
    with open(en_filename, 'w', encoding='utf-8') as f:
        for sentence in sampled_en:
            f.write(f"{sentence}\n")

    # Other-language sentences.
    other_filename = output_dir / f'en_{other_lang_code}_{other_lang_code}.txt'
    with open(other_filename, 'w', encoding='utf-8') as f:
        for sentence in sampled_other:
            f.write(f"{sentence}\n")

    elapsed = time.time() - start_time
    print(f"\n✓ Saved {en_samples:,} English sentences to: {en_filename}")
    print(f"✓ Saved {other_samples:,} {other_lang_code} sentences to: {other_filename}")
    print(f"⏱️  Processing time: {elapsed:.2f} seconds ({elapsed/60:.2f} minutes)")

    return sampled_en, sampled_other
224
+
225
def main():
    """Drive extraction from both parallel corpora, then build the combined
    English file and a plain-text summary."""

    def banner(title, leading_newline=True):
        # Console section separator, matching the script's layout.
        print(("\n" if leading_newline else "") + "=" * 70)
        print(title)
        print("=" * 70)

    # Configuration.
    en_hi_csv = "en-hi.csv"
    en_pa_csv = "en-pa.csv"
    output_dir = "./extracted_sentences"

    # Per-dataset sample targets: (english, other). Kept modest for speed.
    targets = {
        'hi': (150000, 300000),
        'pa': (150000, 300000),
    }

    banner("MULTILINGUAL DATA EXTRACTION TOOL", leading_newline=False)

    # Fixed seeds for reproducible sampling.
    random.seed(42)
    np.random.seed(42)

    banner("EXTRACTING FROM ENGLISH-HINDI DATASET")
    en_hi_en, en_hi_hi = extract_from_parallel_csv_optimized(
        en_hi_csv, output_dir, targets['hi'][0], targets['hi'][1], 'hi'
    )

    banner("EXTRACTING FROM ENGLISH-PUNJABI DATASET")
    en_pa_en, en_pa_pa = extract_from_parallel_csv_optimized(
        en_pa_csv, output_dir, targets['pa'][0], targets['pa'][1], 'pa'
    )

    banner("CREATING COMBINED ENGLISH FILE")
    all_english = en_hi_en + en_pa_en
    random.shuffle(all_english)

    # Write at most 100k combined English sentences.
    combined_filename = Path(output_dir) / "combined_english.txt"
    with open(combined_filename, 'w', encoding='utf-8') as out:
        out.writelines(f"{sentence}\n" for sentence in all_english[:100000])

    combined_count = min(100000, len(all_english))
    print(f"\n✓ Saved {combined_count:,} combined English sentences")

    banner("EXTRACTION COMPLETE - FINAL STATISTICS")
    print(f"Total English sentences: {len(all_english):,}")
    print(f"Total Hindi sentences: {len(en_hi_hi):,}")
    print(f"Total Punjabi sentences: {len(en_pa_pa):,}")

    # Plain-text summary for downstream reference.
    summary_file = Path(output_dir) / "extraction_summary.txt"
    summary_lines = [
        "DATA EXTRACTION SUMMARY\n",
        "=" * 50 + "\n\n",
        "English-Hindi Dataset:\n",
        f"  English sentences: {len(en_hi_en):,}\n",
        f"  Hindi sentences: {len(en_hi_hi):,}\n\n",
        "English-Punjabi Dataset:\n",
        f"  English sentences: {len(en_pa_en):,}\n",
        f"  Punjabi sentences: {len(en_pa_pa):,}\n\n",
        f"Combined English: {combined_count:,}\n",
        f"Total corpus size: {len(all_english) + len(en_hi_hi) + len(en_pa_pa):,} sentences\n",
    ]
    with open(summary_file, 'w', encoding='utf-8') as out:
        out.writelines(summary_lines)

    print(f"\n📊 Summary saved to: {summary_file}")
    print("\n✅ All done! Ready for corpus creation.")
305
+
306
if __name__ == "__main__":
    # Ensure tqdm is available before running (progress bars).
    try:
        from tqdm import tqdm
    except ImportError:
        print("Installing tqdm for progress bars...")
        import subprocess
        import sys
        # Invoke pip via the current interpreter so the install lands in
        # this environment (a bare "pip" may belong to a different Python).
        subprocess.check_call([sys.executable, "-m", "pip", "install", "tqdm"])
        from tqdm import tqdm

    main()
evaluate_model.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Step 5: Evaluate model quality
3
+ """
4
+
5
+ import torch
6
+ from transformers import GPT2LMHeadModel
7
+ import sentencepiece as spm
8
+ import numpy as np
9
+ from pathlib import Path
10
+ import json
11
+
12
def evaluate_multilingual_capabilities(model_path="./checkpoints_tiny/final"):
    """Run a small prompt-completion benchmark over EN/HI/PA/mixed prompts.

    Loads the SentencePiece tokenizer and a GPT-2 checkpoint, greedily
    generates a short continuation for each test prompt, records a loose
    "success" signal plus per-prompt perplexity, prints a report and writes
    evaluation_results.json.
    """
    print("="*60)
    print("MODEL EVALUATION")
    print("="*60)

    # Load tokenizer and model.
    tokenizer_path = "./final_corpus/multilingual_spm.model"
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load(tokenizer_path)

    model = GPT2LMHeadModel.from_pretrained(model_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Per-language accumulators.
    results = {
        "english": {"success": 0, "total": 0, "perplexities": []},
        "hindi": {"success": 0, "total": 0, "perplexities": []},
        "punjabi": {"success": 0, "total": 0, "perplexities": []},
        "mixed": {"success": 0, "total": 0, "perplexities": []},
    }

    # (prompt, expected continuation, language bucket)
    test_cases = [
        # English
        ("[EN] The cat sat on the", "mat", "english"),
        ("[EN] I like to eat", "food", "english"),
        ("[EN] Water is essential for", "life", "english"),
        ("[EN] The sun rises in the", "east", "english"),

        # Hindi
        ("[HI] बिल्ली चटाई पर", "बैठी", "hindi"),
        ("[HI] मुझे खाना खाना", "पसंद है", "hindi"),
        ("[HI] पानी जीवन के लिए", "आवश्यक है", "hindi"),
        ("[HI] सूरज पूर्व में", "उगता है", "hindi"),

        # Punjabi
        ("[PA] ਬਿੱਲੀ ਚੱਟਈ 'ਤੇ", "ਬੈਠੀ", "punjabi"),
        ("[PA] ਮੈਂ ਖਾਣਾ ਖਾਣਾ", "ਪਸੰਦ ਕਰਦਾ ਹਾਂ", "punjabi"),
        ("[PA] ਪਾਣੀ ਜੀਵਨ ਲਈ", "ਜ਼ਰੂਰੀ ਹੈ", "punjabi"),
        ("[PA] ਸੂਰਜ ਪੂਰਬ ਵਿੱਚ", "ਉੱਗਦਾ ਹੈ", "punjabi"),

        # Mixed
        ("[EN] Hello [HI] नमस्ते", "दोस्तों", "mixed"),
        ("[HI] यह है [EN] good", "news", "mixed"),
    ]

    print("\nRunning tests...")

    for prompt, expected_continuation, lang in test_cases:
        # Greedy generation of up to 10 new tokens.
        input_ids = tokenizer.encode(prompt)
        input_tensor = torch.tensor([input_ids], device=device)

        with torch.no_grad():
            output = model.generate(
                input_ids=input_tensor,
                max_length=len(input_ids) + 10,
                do_sample=False,  # greedy decoding; temperature removed (ignored when not sampling)
                pad_token_id=0,
            )

        generated = tokenizer.decode(output[0].tolist())

        # Compare the generated continuation against the expectation.
        generated_continuation = generated[len(prompt):].strip().lower()
        expected_lower = expected_continuation.lower()

        # Loose success criterion: expected word appears, or any non-trivial
        # continuation was produced at all.
        success = expected_lower in generated_continuation or len(generated_continuation) > 3

        # Perplexity of prompt + expected continuation under the model.
        try:
            full_text = prompt + " " + expected_continuation
            text_ids = tokenizer.encode(full_text)
            text_tensor = torch.tensor([text_ids], device=device)

            with torch.no_grad():
                outputs = model(input_ids=text_tensor, labels=text_tensor)
            perplexity = torch.exp(outputs.loss).item()
        except Exception:  # narrowed from a bare except; scoring failure -> inf
            perplexity = float('inf')

        # Update results.
        results[lang]["total"] += 1
        if success:
            results[lang]["success"] += 1
        results[lang]["perplexities"].append(perplexity)

        print(f"\n{lang.upper()}: {prompt}")
        print(f"  Generated: {generated_continuation[:50]}...")
        print(f"  Expected: {expected_continuation}")
        print(f"  Success: {'✓' if success else '✗'}")
        print(f"  Perplexity: {perplexity:.2f}")

    # Per-language metrics.
    print("\n" + "="*60)
    print("EVALUATION RESULTS")
    print("="*60)

    for lang in results:
        if results[lang]["total"] > 0:
            accuracy = results[lang]["success"] / results[lang]["total"] * 100
            # Guard the mean: np.mean([]) warns and returns nan.
            ppls = results[lang]["perplexities"]
            avg_perplexity = np.mean(ppls) if ppls else float('inf')
            print(f"\n{lang.upper()}:")
            print(f"  Accuracy: {accuracy:.1f}% ({results[lang]['success']}/{results[lang]['total']})")
            print(f"  Avg Perplexity: {avg_perplexity:.2f}")

    # Overall score.
    total_tests = sum(r["total"] for r in results.values())
    total_success = sum(r["success"] for r in results.values())
    overall_accuracy = total_success / total_tests * 100 if total_tests > 0 else 0

    print(f"\nOVERALL ACCURACY: {overall_accuracy:.1f}%")

    # Save results (overall_accuracy key added for the JSON report).
    results["overall_accuracy"] = overall_accuracy
    with open("evaluation_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print("\nResults saved to evaluation_results.json")
136
+
137
# Entry point: evaluate the default checkpoint when run as a script.
if __name__ == "__main__":
    evaluate_multilingual_capabilities()
final_corpus/multilingual_corpus.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c94985b65991bc86b55f358ebeaf16709e40529c6dd885aeab2d06a96e63be1
3
+ size 107642577
final_corpus/multilingual_corpus_train.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5dcc9630ec5fc03986488bca2a394c70c885fa09f4888b87be55b458500982b
3
+ size 102242796
final_corpus/multilingual_corpus_val.txt ADDED
The diff for this file is too large to render. See raw diff
 
final_corpus/multilingual_spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faf8ae3d54cbc33b749cfff520a86c0e0cbc131ac949b233b8848cb1bf5fe940
3
+ size 166057
final_corpus/multilingual_spm.vocab ADDED
The diff for this file is too large to render. See raw diff
 
model_config.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Step 2: Model configuration
3
+ """
4
+
5
+ from dataclasses import dataclass
6
+ from transformers import GPT2Config
7
+
8
@dataclass
class ModelConfig:
    """Hyper-parameters for the 'realistic' multilingual GPT-2 training run.

    Constructing an instance prints a one-shot configuration summary
    (see ``__post_init__``).
    """

    # --- Model architecture ---
    vocab_size: int = 8000          # must match the trained SentencePiece vocab
    n_positions: int = 256          # context length
    n_embd: int = 512               # hidden size
    n_layer: int = 8                # number of transformer layers
    n_head: int = 8                 # attention heads
    n_inner: int = 1024             # FFN dimension

    # --- Optimisation (realistic values) ---
    batch_size: int = 8             # per-GPU batch size
    gradient_accumulation: int = 4  # effective batch = 32
    learning_rate: float = 3e-4
    warmup_steps: int = 1000
    total_steps: int = 20000        # ~8-9 epochs, NOT 50000
    weight_decay: float = 0.1
    max_grad_norm: float = 1.0

    # --- Data ---
    train_file: str = "./final_corpus/multilingual_corpus_train.txt"
    val_file: str = "./final_corpus/multilingual_corpus_val.txt"
    tokenizer_path: str = "./final_corpus/multilingual_spm.model"

    # --- Checkpointing / logging ---
    output_dir: str = "./checkpoints"
    save_steps: int = 1000
    eval_steps: int = 500
    logging_steps: int = 100

    # --- Mixed precision ---
    fp16: bool = True

    def __post_init__(self):
        # Log the effective configuration once per construction so every run
        # records what it trained with.
        summary = (
            f"\nModel Configuration (REALISTIC):",
            f" Parameters: ~{self.total_params:.1f}M",
            f" Hidden size: {self.n_embd}",
            f" Layers: {self.n_layer}",
            f" Context length: {self.n_positions}",
            f" Effective batch: {self.effective_batch_size}",
            f" Total steps: {self.total_steps} (~8-9 epochs)",
            f" Learning rate: {self.learning_rate}",
        )
        print("\n".join(summary))

    @property
    def effective_batch_size(self):
        """Samples consumed per optimiser step (micro-batch x accumulation)."""
        return self.batch_size * self.gradient_accumulation

    @property
    def total_params(self):
        """Rough parameter estimate in millions.

        Counts token embeddings plus, per layer, the attention projections,
        the two FFN matrices and the two layer norms (biases, position
        embeddings and the final LN are intentionally ignored — this is an
        order-of-magnitude figure, not an exact count).
        """
        per_layer = (
            4 * self.n_embd * self.n_embd      # attention q/k/v/out projections
            + 2 * self.n_embd * self.n_inner   # FFN in/out
            + 2 * self.n_embd                  # layer norms
        )
        return (self.vocab_size * self.n_embd + self.n_layer * per_layer) / 1e6
model_demo.html ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ <!DOCTYPE html>
3
+ <html>
4
+ <head>
5
+ <title>Multilingual LM Demo</title>
6
+ <style>
7
+ body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
8
+ .container { display: flex; flex-direction: column; gap: 20px; }
9
+ textarea { width: 100%; height: 100px; padding: 10px; font-size: 16px; }
10
+ button { padding: 10px 20px; background: #4CAF50; color: white; border: none; cursor: pointer; }
11
+ button:hover { background: #45a049; }
12
+ .output { border: 1px solid #ccc; padding: 15px; min-height: 100px; background: #f9f9f9; }
13
+ .language-tag { display: inline-block; margin: 5px; padding: 5px 10px; background: #e0e0e0; cursor: pointer; }
14
+ </style>
15
+ </head>
16
+ <body>
17
+ <div class="container">
18
+ <h1>Multilingual Language Model Demo</h1>
19
+
20
+ <div>
21
+ <strong>Language:</strong>
22
+ <span class="language-tag" onclick="setLanguage('[EN] ')">English</span>
23
+ <span class="language-tag" onclick="setLanguage('[HI] ')">Hindi</span>
24
+ <span class="language-tag" onclick="setLanguage('[PA] ')">Punjabi</span>
25
+ </div>
26
+
27
+ <textarea id="prompt" placeholder="Enter your prompt here..."></textarea>
28
+
29
+ <div>
30
+ <label>Temperature: <input type="range" id="temp" min="0.1" max="2.0" step="0.1" value="0.7"></label>
31
+ <label>Max Length: <input type="number" id="maxlen" min="20" max="500" value="100"></label>
32
+ </div>
33
+
34
+ <button onclick="generate()">Generate</button>
35
+
36
+ <div class="output" id="output">Response will appear here...</div>
37
+ </div>
38
+
39
+ <script>
40
+ function setLanguage(tag) {
41
+ document.getElementById('prompt').value = tag;
42
+ }
43
+
44
+ async function generate() {
45
+ const prompt = document.getElementById('prompt').value;
46
+ const temp = document.getElementById('temp').value;
47
+ const maxlen = document.getElementById('maxlen').value;
48
+
49
+ document.getElementById('output').innerHTML = 'Generating...';
50
+
51
+ try {
52
+ const response = await fetch('/generate', {
53
+ method: 'POST',
54
+ headers: {'Content-Type': 'application/json'},
55
+ body: JSON.stringify({prompt, temp, maxlen})
56
+ });
57
+
58
+ const data = await response.json();
59
+ document.getElementById('output').innerHTML = data.response;
60
+ } catch (error) {
61
+ document.getElementById('output').innerHTML = 'Error: ' + error;
62
+ }
63
+ }
64
+ </script>
65
+ </body>
66
+ </html>
67
+
preprocess.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Step 1: Create final shuffled corpus and train tokenizer
3
+ """
4
+
5
+ import random
6
+ from pathlib import Path
7
+ import sentencepiece as spm
8
+ from collections import defaultdict
9
+ import numpy as np
10
+
11
def create_final_corpus(en_file, hi_file, pa_file, output_file, lang_ratios=None):
    """Build a shuffled, language-tagged corpus plus a 95/5 train/val split.

    Args:
        en_file: English sentences file (one sentence per line).
        hi_file: Hindi sentences file.
        pa_file: Punjabi sentences file.
        output_file: path of the combined corpus to write.
        lang_ratios: optional mapping of language code -> sampling ratio,
            e.g. {'en': 0.4, 'hi': 0.4, 'pa': 0.2} (the default).

    Returns:
        Tuple ``(train_file, val_file)`` with the paths of the split corpora.
    """
    print("Creating final corpus...")

    # Default ratios
    if lang_ratios is None:
        lang_ratios = {'en': 0.4, 'hi': 0.4, 'pa': 0.2}

    def _read_lines(path):
        # Keep only non-empty, stripped lines.
        with open(path, 'r', encoding='utf-8') as handle:
            return [raw.strip() for raw in handle if raw.strip()]

    sentences = {
        'en': _read_lines(en_file),
        'hi': _read_lines(hi_file),
        'pa': _read_lines(pa_file),
    }

    print(f"Loaded {len(sentences['en']):,} English sentences")
    print(f"Loaded {len(sentences['hi']):,} Hindi sentences")
    print(f"Loaded {len(sentences['pa']):,} Punjabi sentences")

    # Total budget: twice the size of the smallest language, split by ratio.
    total_target = min(len(v) for v in sentences.values()) * 2
    target_counts = {
        code: int(total_target * lang_ratios[code]) for code in ('en', 'hi', 'pa')
    }

    print(f"\nTarget counts:")
    print(f" English: {target_counts['en']:,}")
    print(f" Hindi: {target_counts['hi']:,}")
    print(f" Punjabi: {target_counts['pa']:,}")

    # Sample each language (stable en -> hi -> pa order so the RNG call
    # sequence matches previous runs) and prefix every line with its tag.
    corpus = []
    for code in ('en', 'hi', 'pa'):
        pool = sentences[code]
        chosen = random.sample(pool, min(target_counts[code], len(pool)))
        tag = f"[{code.upper()}]"
        corpus.extend(f"{tag} {sent}" for sent in chosen)

    # Shuffle so languages are interleaved.
    random.shuffle(corpus)

    def _write_lines(path, rows):
        with open(path, 'w', encoding='utf-8') as handle:
            for row in rows:
                handle.write(f"{row}\n")

    _write_lines(output_file, corpus)

    # 95/5 train/validation split; validation is taken from the front of the
    # shuffled corpus.
    val_size = int(len(corpus) * 0.05)
    train_corpus, val_corpus = corpus[val_size:], corpus[:val_size]

    train_file = output_file.replace('.txt', '_train.txt')
    val_file = output_file.replace('.txt', '_val.txt')
    _write_lines(train_file, train_corpus)
    _write_lines(val_file, val_corpus)

    # Statistics
    print(f"\nCorpus created:")
    print(f" Total sentences: {len(corpus):,}")
    print(f" Training sentences: {len(train_corpus):,}")
    print(f" Validation sentences: {len(val_corpus):,}")

    # Realised language distribution (dict insertion order = order of first
    # occurrence in the shuffled corpus, matching the original behaviour).
    lang_counts = defaultdict(int)
    for line in corpus:
        for code in ('en', 'hi', 'pa'):
            if line.startswith(f"[{code.upper()}]"):
                lang_counts[code] += 1
                break

    print(f"\nLanguage distribution:")
    for lang, count in lang_counts.items():
        percentage = (count / len(corpus)) * 100
        print(f" {lang.upper()}: {count:,} ({percentage:.1f}%)")

    return train_file, val_file
116
+
117
def train_tokenizer(corpus_file, vocab_size=8000, model_prefix='multilingual'):
    """Train a SentencePiece unigram tokenizer on the tag-stripped corpus.

    Args:
        corpus_file: language-tagged corpus; [EN]/[HI]/[PA] prefixes are
            removed before training so the tokenizer never sees them.
        vocab_size: target vocabulary size.
        model_prefix: output prefix for the ``.model``/``.vocab`` files.

    Returns:
        The loaded ``SentencePieceProcessor``.
    """
    print(f"\nTraining SentencePiece tokenizer with vocab size {vocab_size}...")

    # Write a temporary copy of the corpus with the 5-character language tag
    # ("[EN] " etc.) stripped from the front of each tagged line.
    temp_corpus = 'temp_tokenizer_corpus.txt'
    language_tags = ('[EN]', '[HI]', '[PA]')
    with open(corpus_file, 'r', encoding='utf-8') as src, \
         open(temp_corpus, 'w', encoding='utf-8') as dst:
        for row in src:
            dst.write(row[5:] if row.startswith(language_tags) else row)

    # SentencePiece training parameters: unigram model with byte fallback and
    # per-script splitting suits the mixed Latin/Devanagari/Gurmukhi input;
    # 'identity' normalisation avoids mangling the Indic scripts.
    spm.SentencePieceTrainer.train(
        input=temp_corpus,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        character_coverage=0.9995,
        model_type='unigram',
        split_digits=True,
        allow_whitespace_only_pieces=True,
        remove_extra_whitespaces=False,
        byte_fallback=True,
        split_by_unicode_script=True,
        input_sentence_size=1000000,
        shuffle_input_sentence=True,
        normalization_rule_name='identity',
        seed_sentencepiece_size=1000000,
        num_threads=4
    )

    # Reload the freshly trained model and smoke-test it on each script.
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load(f'{model_prefix}.model')

    print(f"Tokenizer trained successfully!")
    print(f"Vocabulary size: {tokenizer.get_piece_size()}")

    print("\nTokenizer test:")
    for sample in ("Hello world",            # English
                   "नमस्ते दुनिया",              # Hindi
                   "ਸਤਿ ਸ੍ਰੀ ਅਕਾਲ ਦੁਨਿਆ"):     # Punjabi
        pieces = tokenizer.encode_as_pieces(sample)
        piece_ids = tokenizer.encode_as_ids(sample)
        print(f" '{sample}' -> {pieces} (ids: {piece_ids})")

    # Clean up the temporary tag-stripped corpus.
    Path(temp_corpus).unlink()

    return tokenizer
182
+
183
def analyze_tokenizer(sp, corpus_file):
    """Report the average tokens-per-sentence per language on a bounded sample.

    Streams the tagged corpus, tokenizes up to ``samples_per_lang`` sentences
    for each of EN/HI/PA, and prints the mean token count per sentence.

    Args:
        sp: a loaded SentencePiece processor (anything exposing
            ``encode_as_ids``).
        corpus_file: corpus whose lines start with [EN]/[HI]/[PA] tags;
            untagged lines are ignored.
    """
    print("\nAnalyzing tokenizer coverage...")

    languages = {'en': 0, 'hi': 0, 'pa': 0}
    lang_tokens = defaultdict(int)
    samples_per_lang = 1000
    tags = {'[EN]': 'en', '[HI]': 'hi', '[PA]': 'pa'}

    # Stream the file instead of loading it fully into memory (the corpus can
    # be ~100 MB).
    with open(corpus_file, 'r', encoding='utf-8') as f:
        for line in f:
            lang = tags.get(line[:4])
            if lang is None:
                continue  # untagged line: skip
            languages[lang] += 1
            if languages[lang] <= samples_per_lang:
                text = line[5:].strip()  # drop the "[XX] " prefix
                lang_tokens[lang] += len(sp.encode_as_ids(text))

    print(f"Token counts per language (sampled {samples_per_lang} sentences each):")
    for lang in ['en', 'hi', 'pa']:
        # BUG FIX: divide by the number of sentences actually sampled, not by
        # the cap — a language with fewer than samples_per_lang sentences
        # previously reported a deflated average.
        sampled = min(languages[lang], samples_per_lang)
        avg_tokens = lang_tokens[lang] / sampled if sampled else 0.0
        print(f" {lang.upper()}: {avg_tokens:.1f} tokens per sentence")
220
+
221
def main(en_file=None, hi_file=None, pa_file=None, output_dir="./final_corpus"):
    """Run the preprocessing pipeline: build corpus, train and analyze tokenizer.

    Args:
        en_file, hi_file, pa_file: optional paths to the per-language sentence
            files.  When omitted, the original machine-specific defaults are
            used, so existing callers (``main()``) behave exactly as before.
        output_dir: directory that receives the corpus splits and tokenizer.
    """
    # Configuration — the hard-coded development paths remain the defaults for
    # backward compatibility; pass explicit arguments on any other machine.
    EN_FILE = en_file or r"C:\Users\manis\Desktop\2026-projects\foundational-model\data\extracted_sentences\en.txt"
    HI_FILE = hi_file or r"C:\Users\manis\Desktop\2026-projects\foundational-model\data\extracted_sentences\hi.txt"
    PA_FILE = pa_file or r"C:\Users\manis\Desktop\2026-projects\foundational-model\data\extracted_sentences\pa.txt"

    OUTPUT_DIR = output_dir
    Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

    FINAL_CORPUS = f"{OUTPUT_DIR}/multilingual_corpus.txt"
    TOKENIZER_PREFIX = f"{OUTPUT_DIR}/multilingual_spm"

    # Step 1: create the shuffled, tagged corpus and its train/val split.
    train_file, val_file = create_final_corpus(
        EN_FILE, HI_FILE, PA_FILE, FINAL_CORPUS,
        lang_ratios={'en': 0.4, 'hi': 0.4, 'pa': 0.2}
    )

    # Step 2: train the SentencePiece tokenizer on the training split.
    sp = train_tokenizer(train_file, vocab_size=8000, model_prefix=TOKENIZER_PREFIX)

    # Step 3: report per-language token statistics.
    analyze_tokenizer(sp, train_file)

    print(f"\n{'='*60}")
    print("PREPROCESSING COMPLETE!")
    print(f"{'='*60}")
    print(f"Files created in {OUTPUT_DIR}:")
    print(f" 1. {FINAL_CORPUS} - Full corpus")
    print(f" 2. {train_file} - Training split")
    print(f" 3. {val_file} - Validation split")
    print(f" 4. {TOKENIZER_PREFIX}.model - SentencePiece model")
    print(f" 5. {TOKENIZER_PREFIX}.vocab - Vocabulary")
    print(f"\nNext step: Train the model with train_model.py")
255
+
256
# Script entry point: self-bootstraps the sentencepiece dependency, then runs
# the full preprocessing pipeline defined by main().
if __name__ == "__main__":
    # Install sentencepiece if not available
    # NOTE(review): installing packages at runtime via pip is a heavy side
    # effect; prefer declaring the dependency in requirements.txt.
    try:
        import sentencepiece as spm
    except ImportError:
        import subprocess
        import sys
        print("Installing sentencepiece...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "sentencepiece"])
        import sentencepiece as spm

    main()
test_model.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Step 4: Test your trained multilingual model
3
+ """
4
+
5
+ import torch
6
+ from transformers import GPT2LMHeadModel
7
+ import sentencepiece as spm
8
+ import os
9
+ from pathlib import Path
10
+
11
class MultilingualModel:
    """Inference wrapper around a trained GPT-2 checkpoint and its SentencePiece tokenizer.

    Responsibilities:
      * locate and load a checkpoint (falling back to any available one),
      * auto-prefix untagged prompts with [EN]/[HI]/[PA] via script detection,
      * expose generation, batch generation, perplexity scoring, and an
        interactive REPL.
    """

    def __init__(self, model_path="./checkpoints_tiny/final"):
        """Load tokenizer and model from ``model_path``, or fall back to any checkpoint found."""
        print("="*60)
        print("LOADING MULTILINGUAL MODEL")
        print("="*60)

        # Check if model exists; if not, scan ./checkpoints_tiny for fallbacks.
        if not os.path.exists(model_path):
            print(f"❌ Model not found at: {model_path}")
            print("Available checkpoints:")
            checkpoints = list(Path("./checkpoints_tiny").glob("checkpoint-*"))
            checkpoints += list(Path("./checkpoints_tiny").glob("step*"))
            checkpoints += list(Path("./checkpoints_tiny").glob("final"))

            for cp in checkpoints:
                if cp.is_dir():
                    print(f" - {cp}")

            if checkpoints:
                # NOTE(review): glob order is filesystem order, not mtime, so
                # "most recent" is an assumption here — confirm if it matters.
                model_path = str(checkpoints[-1])  # Use most recent
                print(f"Using: {model_path}")
            else:
                raise FileNotFoundError("No checkpoints found!")

        # Load tokenizer: prefer one bundled inside the checkpoint directory,
        # otherwise fall back to the corpus-level SentencePiece model.
        tokenizer_path = os.path.join(model_path, "tokenizer", "spiece.model")
        if not os.path.exists(tokenizer_path):
            tokenizer_path = "./final_corpus/multilingual_spm.model"

        print(f"Loading tokenizer from: {tokenizer_path}")
        self.tokenizer = spm.SentencePieceProcessor()
        self.tokenizer.load(tokenizer_path)

        # Load model weights.
        print(f"Loading model from: {model_path}")
        self.model = GPT2LMHeadModel.from_pretrained(model_path)

        # Move to GPU when available and switch to inference mode.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()

        print(f"✅ Model loaded on: {self.device}")
        print(f" Parameters: {sum(p.numel() for p in self.model.parameters())/1e6:.1f}M")
        print("="*60)

    def generate(self, prompt, max_length=100, temperature=0.7, top_k=50, top_p=0.95):
        """Sample a continuation for ``prompt``.

        If the prompt lacks a language tag, one is inferred from the script:
        Devanagari characters -> [HI], Gurmukhi -> [PA], otherwise [EN].
        Returns the generated text with the prompt stripped when it is
        repeated verbatim at the start of the output.
        """
        # Add language tag if missing
        if not any(prompt.startswith(tag) for tag in ['[EN]', '[HI]', '[PA]']):
            # Try to detect language by the presence of script-specific chars.
            if any(char in prompt for char in 'अआइईउऊएऐओऔकखगघचछजझटठडढणतथदधनपफबभमयरलवशषसह'):
                prompt = f"[HI] {prompt}"
            elif any(char in prompt for char in 'ਅਆਇਈਉਊਏਐਓਔਕਖਗਘਚਛਜਝਟਠਡਢਣਤਥਦਧਨਪਫਬਭਮਯਰਲਵਸ਼ਸਹ'):
                prompt = f"[PA] {prompt}"
            else:
                prompt = f"[EN] {prompt}"

        # Encode prompt to a [1, seq] tensor on the model's device.
        input_ids = self.tokenizer.encode(prompt)
        input_tensor = torch.tensor([input_ids], device=self.device)

        # Generate with sampling (temperature + top-k + nucleus).
        with torch.no_grad():
            output = self.model.generate(
                input_ids=input_tensor,
                max_length=max_length,
                temperature=temperature,
                do_sample=True,
                top_k=top_k,
                top_p=top_p,
                # SentencePiece returns -1 for unset pad/eos ids, hence the
                # "> 0" fallbacks to 0 and 2 — presumably the trained model's
                # pad/eos ids; TODO confirm against the tokenizer config.
                pad_token_id=self.tokenizer.pad_id() if self.tokenizer.pad_id() > 0 else 0,
                eos_token_id=self.tokenizer.eos_id() if self.tokenizer.eos_id() > 0 else 2,
                repetition_penalty=1.1,
            )

        # Decode the whole sequence (prompt + continuation).
        generated = self.tokenizer.decode(output[0].tolist())

        # Clean up (remove prompt if it's repeated at the start).
        if generated.startswith(prompt):
            result = generated[len(prompt):].strip()
        else:
            result = generated

        return result

    def batch_generate(self, prompts, **kwargs):
        """Generate sequentially for multiple prompts; kwargs forwarded to generate()."""
        results = []
        for prompt in prompts:
            result = self.generate(prompt, **kwargs)
            results.append(result)
        return results

    def calculate_perplexity(self, text):
        """Return exp(mean NLL) of ``text`` under the model (inf for < 2 tokens)."""
        input_ids = self.tokenizer.encode(text)
        if len(input_ids) < 2:
            # A single token yields no next-token targets, so perplexity is
            # undefined; report infinity.
            return float('inf')

        input_tensor = torch.tensor([input_ids], device=self.device)

        # labels=input_ids makes the model compute the LM cross-entropy loss.
        with torch.no_grad():
            outputs = self.model(input_ids=input_tensor, labels=input_tensor)
            loss = outputs.loss

        perplexity = torch.exp(loss).item()
        return perplexity

    def interactive_mode(self):
        """REPL: read prompts from stdin, print generations; supports /temp, /len, /quit, /help."""
        print("\n" + "="*60)
        print("INTERACTIVE MODE")
        print("="*60)
        print("Enter prompts in any language (add [EN], [HI], [PA] tags)")
        print("Commands: /temp X, /len X, /quit, /help")
        print("="*60)

        # Session-scoped generation settings, mutable via slash commands.
        temperature = 0.7
        max_length = 100

        while True:
            try:
                user_input = input("\nYou: ").strip()

                if not user_input:
                    continue

                # Handle slash commands before treating input as a prompt.
                if user_input.startswith('/'):
                    if user_input == '/quit':
                        break
                    elif user_input == '/help':
                        print("Commands:")
                        print(" /temp X - Set temperature (0.1 to 2.0)")
                        print(" /len X - Set max length (20 to 500)")
                        print(" /quit - Exit")
                        print(" /help - Show this help")
                        continue
                    elif user_input.startswith('/temp'):
                        # NOTE(review): bare except below swallows all errors
                        # (including KeyboardInterrupt) — consider narrowing
                        # to (IndexError, ValueError).
                        try:
                            temp = float(user_input.split()[1])
                            if 0.1 <= temp <= 2.0:
                                temperature = temp
                                print(f"Temperature set to {temperature}")
                            else:
                                print("Temperature must be between 0.1 and 2.0")
                        except:
                            print("Usage: /temp 0.7")
                        continue
                    elif user_input.startswith('/len'):
                        # NOTE(review): same bare-except caveat as /temp.
                        try:
                            length = int(user_input.split()[1])
                            if 20 <= length <= 500:
                                max_length = length
                                print(f"Max length set to {max_length}")
                            else:
                                print("Length must be between 20 and 500")
                        except:
                            print("Usage: /len 100")
                        continue

                # Generate and print the model's response.
                print("Model: ", end="", flush=True)
                response = self.generate(user_input, max_length=max_length, temperature=temperature)
                print(response)

            except KeyboardInterrupt:
                # Ctrl-C exits the REPL cleanly.
                print("\n\nExiting...")
                break
            except Exception as e:
                # Keep the REPL alive on generation errors.
                print(f"Error: {e}")
184
+
185
def run_tests():
    """Run the comprehensive generation test suites (per language, switching, code-mixing).

    Loads the model once, generates a short continuation for every prompt,
    and — where possible — reports the perplexity of the generated text.
    """
    print("\n" + "="*60)
    print("COMPREHENSIVE MODEL TESTS")
    print("="*60)

    # Load model
    model = MultilingualModel()

    # Test prompts grouped by scenario.
    test_suites = {
        "English": [
            "[EN] The weather today is",
            "[EN] I want to learn",
            "[EN] Artificial intelligence",
            "[EN] The capital of India is",
            "[EN] Once upon a time",
        ],
        "Hindi": [
            "[HI] आज का मौसम",
            "[HI] मैं सीखना चाहता हूं",
            "[HI] कृत्रिम बुद्धिमत्ता",
            "[HI] भारत की राजधानी है",
            "[HI] एक बार की बात है",
        ],
        "Punjabi": [
            "[PA] ਅੱਜ ਦਾ ਮੌਸਮ",
            "[PA] ਮੈਂ ਸਿੱਖਣਾ ਚਾਹੁੰਦਾ ਹਾਂ",
            "[PA] ਕ੍ਰਿਤਰਿਮ ਬੁੱਧੀ",
            "[PA] ਭਾਰਤ ਦੀ ਰਾਜਧਾਨੀ ਹੈ",
            "[PA] ਇੱਕ ਵਾਰ ਦੀ ਗੱਲ ਹੈ",
        ],
        "Language Switching": [
            "[EN] Hello [HI] नमस्ते",
            "[HI] यह अच्छा है [EN] this is good",
            "[PA] ਸਤਿ ਸ੍ਰੀ ਅਕਾਲ [EN] Hello everyone",
        ],
        "Code Mixing": [
            "Hello दुनिया",  # No tag, should auto-detect
            "मेरा name है",  # Hindi + English
            "Today मौसम is good",  # English + Hindi
        ]
    }

    for suite_name, prompts in test_suites.items():
        print(f"\n{'='*40}")
        print(f"{suite_name.upper()} TESTS")
        print('='*40)

        for i, prompt in enumerate(prompts):
            print(f"\nTest {i+1}:")
            print(f"Prompt: {prompt}")

            # Generate a short continuation.
            response = model.generate(prompt, max_length=50, temperature=0.7)
            print(f"Response: {response}")

            # Report perplexity of the generated text.
            # BUG FIX: was a bare "except: pass", which silently swallowed
            # every error (including KeyboardInterrupt); now narrowed and
            # made visible so failures are not hidden.
            try:
                perplexity = model.calculate_perplexity(response)
                print(f"Perplexity: {perplexity:.2f}")
            except Exception as e:
                print(f"Perplexity: unavailable ({e})")

            print("-" * 40)
250
+
251
def benchmark_model():
    """Benchmark generation latency and (when on CUDA) GPU memory usage."""
    print("\n" + "="*60)
    print("MODEL BENCHMARK")
    print("="*60)

    model = MultilingualModel()

    import time

    # Measure generation speed over 10 runs.
    # BUG FIX: use time.perf_counter() instead of time.time() — perf_counter
    # is monotonic and high-resolution, which is what latency measurement
    # requires (time.time can jump with clock adjustments).
    test_prompt = "[EN] The quick brown fox"

    times = []
    for _ in range(10):
        start = time.perf_counter()
        model.generate(test_prompt, max_length=50)
        times.append(time.perf_counter() - start)

    avg_time = sum(times) / len(times)
    # NOTE(review): max_length=50 includes the prompt tokens, so "tokens per
    # second" slightly overstates new-token throughput — confirm if precise
    # numbers are needed.
    print(f"Average generation time (50 tokens): {avg_time:.3f}s")
    print(f"Tokens per second: {50/avg_time:.1f}")

    # GPU memory usage, if applicable.
    if torch.cuda.is_available():
        memory_allocated = torch.cuda.memory_allocated() / 1e9
        memory_reserved = torch.cuda.memory_reserved() / 1e9
        print(f"GPU Memory allocated: {memory_allocated:.2f} GB")
        print(f"GPU Memory reserved: {memory_reserved:.2f} GB")
281
+
282
def create_web_interface():
    """Write a static HTML demo page (model_demo.html) for the model.

    The page is a self-contained frontend only: its "Generate" button POSTs to
    a /generate endpoint that must be served by a separate backend (see the
    note printed at the end).  The HTML below is a runtime string and is kept
    verbatim.
    """
    html_code = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Multilingual LM Demo</title>
        <style>
            body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
            .container { display: flex; flex-direction: column; gap: 20px; }
            textarea { width: 100%; height: 100px; padding: 10px; font-size: 16px; }
            button { padding: 10px 20px; background: #4CAF50; color: white; border: none; cursor: pointer; }
            button:hover { background: #45a049; }
            .output { border: 1px solid #ccc; padding: 15px; min-height: 100px; background: #f9f9f9; }
            .language-tag { display: inline-block; margin: 5px; padding: 5px 10px; background: #e0e0e0; cursor: pointer; }
        </style>
    </head>
    <body>
        <div class="container">
            <h1>Multilingual Language Model Demo</h1>

            <div>
                <strong>Language:</strong>
                <span class="language-tag" onclick="setLanguage('[EN] ')">English</span>
                <span class="language-tag" onclick="setLanguage('[HI] ')">Hindi</span>
                <span class="language-tag" onclick="setLanguage('[PA] ')">Punjabi</span>
            </div>

            <textarea id="prompt" placeholder="Enter your prompt here..."></textarea>

            <div>
                <label>Temperature: <input type="range" id="temp" min="0.1" max="2.0" step="0.1" value="0.7"></label>
                <label>Max Length: <input type="number" id="maxlen" min="20" max="500" value="100"></label>
            </div>

            <button onclick="generate()">Generate</button>

            <div class="output" id="output">Response will appear here...</div>
        </div>

        <script>
            function setLanguage(tag) {
                document.getElementById('prompt').value = tag;
            }

            async function generate() {
                const prompt = document.getElementById('prompt').value;
                const temp = document.getElementById('temp').value;
                const maxlen = document.getElementById('maxlen').value;

                document.getElementById('output').innerHTML = 'Generating...';

                try {
                    const response = await fetch('/generate', {
                        method: 'POST',
                        headers: {'Content-Type': 'application/json'},
                        body: JSON.stringify({prompt, temp, maxlen})
                    });

                    const data = await response.json();
                    document.getElementById('output').innerHTML = data.response;
                } catch (error) {
                    document.getElementById('output').innerHTML = 'Error: ' + error;
                }
            }
        </script>
    </body>
    </html>
    """

    # Save the page next to the script.
    with open("model_demo.html", "w", encoding="utf-8") as f:
        f.write(html_code)

    print("Web interface saved as model_demo.html")
    print("To use it, you need a backend server (see create_server.py)")
358
+
359
def main():
    """Interactive playground menu: chat, tests, benchmark, web UI, quick generation.

    The model is loaded lazily (first time an option needs it) and then reused
    across menu selections.
    """
    print("\n" + "="*60)
    print("MULTILINGUAL MODEL PLAYGROUND")
    print("="*60)
    print("\nOptions:")
    print("1. Interactive chat")
    print("2. Run comprehensive tests")
    print("3. Benchmark model")
    print("4. Create web interface")
    print("5. Quick generation test")
    print("6. Exit")

    # Load model once, lazily, and reuse it across menu options.
    model = None

    while True:
        try:
            choice = input("\nSelect option (1-6): ").strip()

            if choice == '1':
                if model is None:
                    model = MultilingualModel()
                model.interactive_mode()

            elif choice == '2':
                # NOTE: run_tests/benchmark_model load their own model
                # instance, separate from the lazily-cached one here.
                run_tests()

            elif choice == '3':
                benchmark_model()

            elif choice == '4':
                create_web_interface()

            elif choice == '5':
                if model is None:
                    model = MultilingualModel()

                prompt = input("Enter prompt: ").strip()
                if prompt:
                    response = model.generate(prompt)
                    print(f"\nResponse: {response}")

            elif choice == '6':
                print("Goodbye!")
                break

            else:
                print("Invalid choice. Please enter 1-6.")

        except KeyboardInterrupt:
            # Ctrl-C exits the menu loop cleanly.
            print("\n\nExiting...")
            break
        except Exception as e:
            # Keep the menu alive on errors, but show the full traceback for
            # debugging.
            print(f"Error: {e}")
            import traceback
            traceback.print_exc()
416
+
417
# Script entry point: launch the interactive playground menu.
if __name__ == "__main__":
    main()
train_model.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Step 3: STREAMLINED Training - Minimal, Fast
3
+ """
4
+
5
+ import torch
6
+ from torch.utils.data import Dataset, DataLoader
7
+ from transformers import GPT2LMHeadModel, GPT2Config
8
+ import sentencepiece as spm
9
+ from tqdm import tqdm
10
+ import time
11
+
12
# ===== CONFIG =====
# Single source of truth for the streamlined training run.  Sized for a
# ~4 GB GPU: a tiny GPT-2 variant, micro-batches plus gradient accumulation.
CONFIG = {
    'train_file': './final_corpus/multilingual_corpus_train.txt',
    'val_file': './final_corpus/multilingual_corpus_val.txt',
    'tokenizer_path': './final_corpus/multilingual_spm.model',

    # Tiny model for fast training
    'n_positions': 128,   # context window (tokens)
    'n_embd': 256,        # hidden size
    'n_layer': 4,         # transformer layers
    'n_head': 4,          # attention heads
    'n_inner': 512,       # FFN width

    # Training
    'batch_size': 2,      # Small batch for 4GB
    'grad_accum': 8,      # Effective batch = 16
    'learning_rate': 2e-4,
    'total_steps': 5000,  # Train for 5000 steps only
    'save_every': 1000,   # checkpoint interval (optimizer steps)
}
32
+
33
class SimpleDataset(Dataset):
    """Fixed-length token blocks built eagerly from a line-per-sentence corpus.

    Each kept sentence becomes exactly ``block_size`` token ids: long
    sentences are truncated, short ones are zero-padded.  Sentences of 10
    tokens or fewer are discarded.
    """

    def __init__(self, filepath, tokenizer, block_size):
        self.tokenizer = tokenizer
        self.block_size = block_size

        print("Loading data...")
        with open(filepath, 'r', encoding='utf-8') as fh:
            sentences = [row.strip() for row in fh if row.strip()]

        # Tokenize everything up-front (capped at 600,000 lines) so __getitem__
        # stays trivial.
        self.examples = []
        for sentence in tqdm(sentences[:600000], desc="Tokenizing"):
            ids = tokenizer.encode(sentence)
            if len(ids) > 10:
                self.examples.append(self._fit(ids))

        print(f"Created {len(self.examples)} examples")

    def _fit(self, ids):
        # Force a token list to exactly block_size: truncate long sequences,
        # zero-pad short ones (0 is the pad id used throughout).
        if len(ids) > self.block_size:
            return ids[:self.block_size]
        return ids + [0] * (self.block_size - len(ids))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return torch.tensor(self.examples[idx], dtype=torch.long)
60
+
61
def train_streamlined():
    """Minimal single-GPU training loop for the tiny multilingual GPT-2.

    Builds the model from CONFIG, trains with gradient accumulation for
    CONFIG['total_steps'] optimizer steps, checkpointing every
    CONFIG['save_every'] steps and saving the final model at the end.
    """
    print("\n" + "="*60)
    print("STREAMLINED TRAINING - FASTEST POSSIBLE")
    print("="*60)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: {device}")

    # Load the SentencePiece tokenizer; its vocab size drives the model size.
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load(CONFIG['tokenizer_path'])
    vocab_size = tokenizer.get_piece_size()

    # Create the tiny GPT-2 configuration from CONFIG.
    config = GPT2Config(
        vocab_size=vocab_size,
        n_positions=CONFIG['n_positions'],
        n_embd=CONFIG['n_embd'],
        n_layer=CONFIG['n_layer'],
        n_head=CONFIG['n_head'],
        n_inner=CONFIG['n_inner'],
        pad_token_id=0,
    )

    model = GPT2LMHeadModel(config)
    model.to(device)
    model.train()

    # Trade compute for memory so the model fits on a small GPU.
    model.gradient_checkpointing_enable()

    # Eagerly tokenized dataset (capped inside SimpleDataset).
    dataset = SimpleDataset(CONFIG['train_file'], tokenizer, CONFIG['n_positions'])
    dataloader = DataLoader(dataset, batch_size=CONFIG['batch_size'], shuffle=True)

    # Optimizer (no LR schedule / warmup in this streamlined variant).
    optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'])

    print(f"\nModel: {sum(p.numel() for p in model.parameters())/1e6:.1f}M params")
    print(f"Training steps: {CONFIG['total_steps']}")
    print(f"Estimated time: {CONFIG['total_steps']*0.3/3600:.1f} hours\n")

    # Training loop state: global_step counts optimizer steps, not batches.
    global_step = 0
    accumulation_steps = 0
    start_time = time.time()

    while global_step < CONFIG['total_steps']:
        for batch in dataloader:
            batch = batch.to(device)

            # Forward pass; labels=batch gives the standard LM loss.
            # NOTE(review): padded positions carry id 0 and are included in
            # the labels, so loss is also computed on padding — consider
            # masking pads (-100) if training quality matters.
            outputs = model(input_ids=batch, labels=batch)
            loss = outputs.loss / CONFIG['grad_accum']

            # Backward: gradients accumulate across micro-batches.
            loss.backward()
            accumulation_steps += 1

            # One optimizer step per grad_accum micro-batches.
            if accumulation_steps == CONFIG['grad_accum']:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                optimizer.zero_grad()

                global_step += 1
                accumulation_steps = 0

                # Progress report every 100 optimizer steps.
                # NOTE: the printed loss is from the last micro-batch only.
                if global_step % 100 == 0:
                    elapsed = time.time() - start_time
                    steps_per_second = global_step / elapsed
                    remaining = (CONFIG['total_steps'] - global_step) / steps_per_second

                    print(f"Step {global_step}/{CONFIG['total_steps']} | "
                          f"Loss: {loss.item()*CONFIG['grad_accum']:.3f} | "
                          f"Remaining: {remaining/3600:.1f}h")

                # Periodic checkpoint.
                if global_step % CONFIG['save_every'] == 0:
                    save_path = f"./checkpoints_tiny/step{global_step}"
                    model.save_pretrained(save_path)
                    print(f"Saved checkpoint: {save_path}")

            # Stop mid-epoch once the step budget is reached.
            if global_step >= CONFIG['total_steps']:
                break

    print(f"\nTraining completed in {(time.time()-start_time)/3600:.2f} hours")

    # Save the final model (tokenizer is NOT bundled here; loaders fall back
    # to ./final_corpus/multilingual_spm.model).
    model.save_pretrained("./checkpoints_tiny/final")
    print("Final model saved to ./checkpoints_tiny/final")
154
+
155
# Script entry point: run the streamlined training loop.
if __name__ == "__main__":
    train_streamlined()
web_interface.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ Simple web interface using Gradio
4
+ """
5
+
6
+ import torch
7
+ from transformers import GPT2LMHeadModel
8
+ import sentencepiece as spm
9
+ import gradio as gr
10
+ import os
11
+
12
class SimpleModel:
    """Thin inference wrapper: a trained GPT-2 checkpoint plus its SentencePiece tokenizer."""

    def __init__(self, model_path="./checkpoints_tiny/final"):
        """Load tokenizer and model weights from *model_path*.

        Prefers the tokenizer bundled inside the checkpoint directory and
        falls back to the shared corpus-level SentencePiece model when the
        checkpoint does not ship its own.
        """
        tokenizer_path = os.path.join(model_path, "tokenizer", "spiece.model")
        if not os.path.exists(tokenizer_path):
            tokenizer_path = "./final_corpus/multilingual_spm.model"

        self.tokenizer = spm.SentencePieceProcessor()
        self.tokenizer.load(tokenizer_path)

        self.model = GPT2LMHeadModel.from_pretrained(model_path)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()  # inference only — disable dropout etc.

    def generate(self, prompt, max_length=100, temperature=0.7, top_p=0.95):
        """Generate a continuation for *prompt* and return it without the prompt text.

        Args:
            prompt: Input text; a language tag ([EN]/[HI]/[PA]) is prepended
                if missing (defaults to [EN]).
            max_length: Total sequence length cap (prompt + continuation).
            temperature: Sampling temperature.
            top_p: Nucleus-sampling probability mass.

        Returns:
            The decoded continuation, stripped of surrounding whitespace.
        """
        # The corpus was built with language-tag prefixes; default to English.
        if not any(prompt.startswith(tag) for tag in ['[EN]', '[HI]', '[PA]']):
            prompt = f"[EN] {prompt}"

        input_ids = self.tokenizer.encode(prompt)
        input_tensor = torch.tensor([input_ids], device=self.device)

        with torch.no_grad():
            output = self.model.generate(
                input_ids=input_tensor,
                max_length=max_length,
                temperature=temperature,
                do_sample=True,
                top_p=top_p,
                pad_token_id=0,  # assumes id 0 is the pad/unk piece — TODO confirm against the SP model
                repetition_penalty=1.1,
            )

        # FIX: strip the prompt by slicing off the input token ids rather than
        # string-prefix matching. SentencePiece decode can normalize whitespace,
        # so `generated.startswith(prompt)` could silently fail and return the
        # prompt glued to the continuation.
        continuation_ids = output[0].tolist()[len(input_ids):]
        return self.tokenizer.decode(continuation_ids).strip()
51
+
52
def create_gradio_interface():
    """Build and return the Gradio Blocks demo around a freshly loaded SimpleModel."""
    lm = SimpleModel()

    def run_generation(user_prompt, length_limit, temp, nucleus_p):
        # Surface failures as text in the output box instead of crashing the UI.
        try:
            return lm.generate(user_prompt, int(length_limit), float(temp), float(nucleus_p))
        except Exception as err:
            return f"Error: {str(err)}"

    with gr.Blocks(title="Multilingual LM Demo", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🌍 Multilingual Language Model")
        gr.Markdown("Generate text in English, Hindi, or Punjabi")

        with gr.Row():
            with gr.Column():
                prompt_box = gr.Textbox(
                    label="Enter prompt",
                    placeholder="Start with [EN], [HI], or [PA] for language...",
                    lines=3,
                )

                with gr.Row():
                    length_slider = gr.Slider(20, 500, value=100, label="Max Length")
                    temp_slider = gr.Slider(0.1, 2.0, value=0.7, label="Temperature")
                    top_p_slider = gr.Slider(0.1, 1.0, value=0.95, label="Top-p")

                run_button = gr.Button("Generate", variant="primary")

            with gr.Column():
                output_box = gr.Textbox(label="Generated Text", lines=10)

        # Clickable sample prompts, two per supported language tag.
        gr.Examples(
            examples=[
                ["[EN] The weather today is"],
                ["[HI] आज का मौसम"],
                ["[PA] ਅੱਜ ਦਾ ਮੌਸਮ"],
                ["[EN] Once upon a time in India"],
                ["[HI] भारत एक महान देश है"],
                ["[PA] ਭਾਰਤ ਇੱਕ ਮਹਾਨ ਦੇਸ਼ ਹੈ"],
            ],
            inputs=prompt_box,
            label="Try these examples:",
        )

        # Wire both the button click and Enter-in-textbox to the same handler.
        for trigger in (run_button.click, prompt_box.submit):
            trigger(
                fn=run_generation,
                inputs=[prompt_box, length_slider, temp_slider, top_p_slider],
                outputs=output_box,
            )

    return demo
115
+
116
if __name__ == "__main__":
    # NOTE(review): gradio is already imported unconditionally at module top,
    # so a missing install would raise ImportError before this guard runs.
    # The fallback below is kept as best-effort self-healing for direct runs.
    try:
        import gradio as gr
    except ImportError:
        print("Installing gradio...")
        import subprocess
        import sys
        # FIX: install into the interpreter actually running this script,
        # not whichever `pip` happens to be first on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "gradio"])
        import gradio as gr

    # Build the UI and serve it.
    demo = create_gradio_interface()
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces
        server_port=7860,
        share=False,  # Set to True to get public link
        debug=False,
    )