Upload folder using huggingface_hub
- README.md +53 -0
- added_tokens.json +3 -0
- config.json +32 -0
- generation_config.json +7 -0
- merges.txt +25 -0
- model.safetensors +3 -0
- special_tokens_map.json +7 -0
- tokenizer.json +0 -0
- tokenizer_config.json +53 -0
- training_args.bin +3 -0
- vocab.json +1 -0
README.md ADDED
@@ -0,0 +1,53 @@
+---
+language:
+- tr
+- otk
+tags:
+- gokturk
+- text-generation
+- hobby
+license: mit
+---
+
+# Bitig-Nano
+
+Bitig-Nano is a small language model that generates text in the Göktürk (Old Turkic) script. It was trained on the Turkish Wikipedia dataset, transliterated into Göktürk letters.
+
+> [!IMPORTANT]
+> **Disclaimer:** This project is for **fun and hobby purposes only**. It is not a professional tool. The model may make mistakes or produce text that is not historically accurate. It is a "Nano"-sized model created for educational experiments.
+
+## How to Use
+
+You can use this model with the Python `transformers` library.
+
+```python
+from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast
+
+model_name = "eokayakca/Bitig-Nano"
+
+tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)
+model = GPT2LMHeadModel.from_pretrained(model_name)
+
+prompt = "𐱅𐰇𐰼"  # Start with "Tür"
+input_ids = tokenizer.encode(prompt, return_tensors="pt")
+
+output = model.generate(input_ids, max_length=50)
+generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
+
+# The output is in logical order (left-to-right).
+# For correct display, you may need to reverse it to right-to-left.
+print(f"Logical (LTR): {generated_text}")
+print(f"Visual (RTL): {generated_text[::-1]}")
+```
+
+## About the Data
+
+The model learned from Turkish Wikipedia articles. The Latin-script text was converted to Göktürk letters with a custom converter script.
+
+**Technical Note:** The text is stored in **logical order (left-to-right)** for Unicode compatibility. However, the Göktürk script is historically written and read from **right-to-left**, so you may need to reverse the output visually.
+
+## Limitations
+
+- The model is very small (Nano size).
+- It may generate nonsense words or grammatically incorrect sentences.
+- It is designed for testing and learning, not for serious translation or historical research.
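The "custom converter script" mentioned in the README is not part of this upload. As a rough illustration of the idea only, here is a minimal, hypothetical sketch: the mapping covers just the three letters from the README's own example ("Tür" → 𐱅𐰇𐰼) and ignores the real orthography's context-dependent front/back consonant forms.

```python
# Hypothetical Latin -> Göktürk mapping; only the letters from the
# README's example prompt are included. A real converter would need a
# full alphabet table and vowel-harmony-aware consonant selection.
LATIN_TO_GOKTURK = {
    "t": "𐱅",
    "ü": "𐰇",
    "r": "𐰼",
}

def transliterate(text: str) -> str:
    """Replace mapped Latin letters; pass everything else through.

    Output stays in logical order (left-to-right), matching how the
    training data is stored per the Technical Note above.
    """
    return "".join(LATIN_TO_GOKTURK.get(ch, ch) for ch in text.lower())

print(transliterate("Tür"))  # -> 𐱅𐰇𐰼
```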
added_tokens.json ADDED
@@ -0,0 +1,3 @@
+{
+  "<|endoftext|>": 285
+}
config.json ADDED
@@ -0,0 +1,32 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 0,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 2,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 384,
+  "n_head": 6,
+  "n_inner": null,
+  "n_layer": 6,
+  "n_positions": 256,
+  "pad_token_id": 1,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.52.4",
+  "use_cache": true,
+  "vocab_size": 8000
+}
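This config describes a 6-layer, 6-head GPT-2 with 384-dimensional embeddings, a 256-token context window, and an 8000-entry vocabulary. As a sanity check, the architecture can be rebuilt offline from these values; the resulting parameter count (~13.8M) matches the ~55 MB float32 `model.safetensors` below. A sketch, assuming only that `transformers` is installed:

```python
from transformers import GPT2Config, GPT2LMHeadModel

# Rebuild the architecture from the values in config.json above,
# without downloading the checkpoint.
config = GPT2Config(
    vocab_size=8000,
    n_positions=256,
    n_embd=384,
    n_layer=6,
    n_head=6,
    bos_token_id=0,
    eos_token_id=2,
    pad_token_id=1,
)
model = GPT2LMHeadModel(config)
print(f"{model.num_parameters():,} parameters")  # ~13.8M; x4 bytes/param ≈ 55 MB in float32
```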
generation_config.json ADDED
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "pad_token_id": 1,
+  "transformers_version": "4.52.4"
+}
merges.txt ADDED
@@ -0,0 +1,25 @@
+#version: 0.2
+ð IJ
+ðIJ °
+Ġ Ġ
+ðIJ° Ģ
+ðIJ°Ģ ðIJ°
+ĥ ðIJ°
+ðIJ° ĥðIJ°
+ðIJ° º
+ĠĠ ĠĠ
+ðIJ ±
+ðIJ° Ĩ
+ðIJ° ĩ
+Ġ ðIJ°ĢðIJ°
+ðIJ°Ĩ ðIJ°
+£ ðIJ°ĢðIJ°
+² ðIJ°
+´ ðIJ°º
+Ġ :
+Ġ ðIJ°º
+ðIJ° į
+ðIJ°ĢðIJ° ¢
+ðIJ°ĢðIJ° ĺ
+ðIJ± ĥ
+ðIJ± ĥðIJ°
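The merge entries above look garbled but are stored correctly: like GPT-2, this is a byte-level BPE, where each UTF-8 byte is represented by a printable stand-in character, so the four bytes of an Old Turkic codepoint surface as sequences like `ðIJ°Ģ`. A minimal sketch of that byte-to-character table (the standard GPT-2 construction) and the encoding of one letter:

```python
def bytes_to_unicode():
    # GPT-2's byte-level table: visible ASCII/Latin-1 bytes map to
    # themselves; the remaining bytes are shifted up past U+0100.
    bs = (list(range(ord("!"), ord("~") + 1))
          + list(range(ord("\xa1"), ord("\xac") + 1))
          + list(range(ord("\xae"), ord("\xff") + 1)))
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:
            bs.append(b)
            cs.append(256 + n)
            n += 1
    return dict(zip(bs, map(chr, cs)))

BYTE_TO_CHAR = bytes_to_unicode()

def to_bpe_form(text: str) -> str:
    """Render text the way it appears in merges.txt and vocab.json."""
    return "".join(BYTE_TO_CHAR[b] for b in text.encode("utf-8"))

print(to_bpe_form("𐰀"))  # U+10C00 -> 'ðIJ°Ģ', a token listed in vocab.json below
```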
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bbd3c1a2f371e24a6101de87edc10f8979c6d121f18e1386c8bafeab159508d4
+size 55278960
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json ADDED
@@ -0,0 +1,53 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "PreTrainedTokenizer",
+  "unk_token": "<unk>"
+}
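The ids in `added_tokens_decoder` mirror `special_tokens_map.json` and line up with the `bos_token_id`/`pad_token_id`/`eos_token_id` values in config.json (0, 1, 2). A quick check, assuming the repo loads as published:

```python
from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast.from_pretrained("eokayakca/Bitig-Nano")
# Expected from added_tokens_decoder above: <s>=0, <pad>=1, </s>=2, <unk>=3, <mask>=4
for name in ("bos_token", "pad_token", "eos_token", "unk_token", "mask_token"):
    token = getattr(tok, name)
    print(f"{name}: {token!r} -> id {tok.convert_tokens_to_ids(token)}")
```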
training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e69ad3250b6174dc2a2582ab4078c80c27638de330a9511d7b48b1d50b2fbe8
+size 5713
vocab.json ADDED
@@ -0,0 +1 @@
+{"<s>":0,"<pad>":1,"</s>":2,"<unk>":3,"<mask>":4,"!":5,"\"":6,"#":7,"$":8,"%":9,"&":10,"'":11,"(":12,")":13,"*":14,"+":15,",":16,"-":17,".":18,"/":19,"0":20,"1":21,"2":22,"3":23,"4":24,"5":25,"6":26,"7":27,"8":28,"9":29,":":30,";":31,"<":32,"=":33,">":34,"?":35,"@":36,"A":37,"B":38,"C":39,"D":40,"E":41,"F":42,"G":43,"H":44,"I":45,"J":46,"K":47,"L":48,"M":49,"N":50,"O":51,"P":52,"Q":53,"R":54,"S":55,"T":56,"U":57,"V":58,"W":59,"X":60,"Y":61,"Z":62,"[":63,"\\":64,"]":65,"^":66,"_":67,"`":68,"a":69,"b":70,"c":71,"d":72,"e":73,"f":74,"g":75,"h":76,"i":77,"j":78,"k":79,"l":80,"m":81,"n":82,"o":83,"p":84,"q":85,"r":86,"s":87,"t":88,"u":89,"v":90,"w":91,"x":92,"y":93,"z":94,"{":95,"|":96,"}":97,"~":98,"¡":99,"¢":100,"£":101,"¤":102,"¥":103,"¦":104,"§":105,"¨":106,"©":107,"ª":108,"«":109,"¬":110,"®":111,"¯":112,"°":113,"±":114,"²":115,"³":116,"´":117,"µ":118,"¶":119,"·":120,"¸":121,"¹":122,"º":123,"»":124,"¼":125,"½":126,"¾":127,"¿":128,"À":129,"Á":130,"Â":131,"Ã":132,"Ä":133,"Å":134,"Æ":135,"Ç":136,"È":137,"É":138,"Ê":139,"Ë":140,"Ì":141,"Í":142,"Î":143,"Ï":144,"Ð":145,"Ñ":146,"Ò":147,"Ó":148,"Ô":149,"Õ":150,"Ö":151,"×":152,"Ø":153,"Ù":154,"Ú":155,"Û":156,"Ü":157,"Ý":158,"Þ":159,"ß":160,"à":161,"á":162,"â":163,"ã":164,"ä":165,"å":166,"æ":167,"ç":168,"è":169,"é":170,"ê":171,"ë":172,"ì":173,"í":174,"î":175,"ï":176,"ð":177,"ñ":178,"ò":179,"ó":180,"ô":181,"õ":182,"ö":183,"÷":184,"ø":185,"ù":186,"ú":187,"û":188,"ü":189,"ý":190,"þ":191,"ÿ":192,"Ā":193,"ā":194,"Ă":195,"ă":196,"Ą":197,"ą":198,"Ć":199,"ć":200,"Ĉ":201,"ĉ":202,"Ċ":203,"ċ":204,"Č":205,"č":206,"Ď":207,"ď":208,"Đ":209,"đ":210,"Ē":211,"ē":212,"Ĕ":213,"ĕ":214,"Ė":215,"ė":216,"Ę":217,"ę":218,"Ě":219,"ě":220,"Ĝ":221,"ĝ":222,"Ğ":223,"ğ":224,"Ġ":225,"ġ":226,"Ģ":227,"ģ":228,"Ĥ":229,"ĥ":230,"Ħ":231,"ħ":232,"Ĩ":233,"ĩ":234,"Ī":235,"ī":236,"Ĭ":237,"ĭ":238,"Į":239,"į":240,"İ":241,"ı":242,"IJ":243,"ij":244,"Ĵ":245,"ĵ":246,"Ķ":247,"ķ":248,"ĸ":249,"Ĺ":250,"ĺ":251,"Ļ":252,"ļ":253,"Ľ":254,"ľ":255,"Ŀ":256,"ŀ":257,"Ł":258,"ł":259,"Ń":260,"ðIJ":261,"ðIJ°":262,"ĠĠ":263,"ðIJ°Ģ":264,"ðIJ°ĢðIJ°":265,"ĥðIJ°":266,"ðIJ°ĥðIJ°":267,"ðIJ°º":268,"ĠĠĠĠ":269,"ðIJ±":270,"ðIJ°Ĩ":271,"ðIJ°ĩ":272,"ĠðIJ°ĢðIJ°":273,"ðIJ°ĨðIJ°":274,"£ðIJ°ĢðIJ°":275,"²ðIJ°":276,"´ðIJ°º":277,"Ġ:":278,"ĠðIJ°º":279,"ðIJ°į":280,"ðIJ°ĢðIJ°¢":281,"ðIJ°ĢðIJ°ĺ":282,"ðIJ±ĥ":283,"ðIJ±ĥðIJ°":284}
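Going the other direction, a byte-level token string from this vocabulary decodes back to readable text by inverting the `bytes_to_unicode` table from the merges.txt sketch above:

```python
# Assumes BYTE_TO_CHAR from the bytes_to_unicode() sketch above.
CHAR_TO_BYTE = {c: b for b, c in BYTE_TO_CHAR.items()}

def from_bpe_form(token: str) -> str:
    """Invert to_bpe_form: stand-in characters back to UTF-8 bytes."""
    return bytes(CHAR_TO_BYTE[c] for c in token).decode("utf-8")

print(from_bpe_form("ðIJ°Ģ"))  # -> '𐰀' (Old Turkic letter A)
```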