Hanbaike committed
Commit 059fc10 · verified · 1 Parent(s): bc4b166

Upload folder using huggingface_hub

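The commit message indicates the files were pushed with huggingface_hub's folder-upload helper. A minimal sketch of how such a commit is typically produced (the repo id, repo type, and folder path below are assumptions for illustration, not values taken from this commit):

from huggingface_hub import upload_folder

# Hypothetical call that would produce a commit like this one.
upload_folder(
    repo_id="Hanbiike/kyrgyz-tokenizers",  # assumption: the target repo is not named in the commit
    repo_type="model",                     # assumption: could equally be a dataset repo
    folder_path=".",                       # local folder containing the corpus files and upload_models.py
    commit_message="Upload folder using huggingface_hub",
)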
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+kyrgyz_clean_sentences.txt filter=lfs diff=lfs merge=lfs -text
+text/kir_community_2017-sentences.txt filter=lfs diff=lfs merge=lfs -text
+text/kir_newscrawl_2011_300K-sentences.txt filter=lfs diff=lfs merge=lfs -text
+text/kir_newscrawl_2016_1M-sentences.txt filter=lfs diff=lfs merge=lfs -text
+text/kir_wikipedia_2016_300K-sentences.txt filter=lfs diff=lfs merge=lfs -text
+text/kir_wikipedia_2021_300K-sentences.txt filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,18 +0,0 @@
-# Ignore big text files
-# The original corpus files are too large to be included in the repository.
-text/*.txt
-
-# Ignore large corpus files
-kyrgyz_clean_sentences.txt
-
-upload_models.py
-
-# Python cache
-__pycache__/
-*.pyc
-
-# Jupyter
-.ipynb_checkpoints/
-
-# System files
-.DS_Store
kyrgyz_clean_sentences.txt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2fd06f17964e5b6f2d6b7ed5084ad12314009bd1da8120685dc36826cf790006
+size 299850804
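The corpus files added in this commit are tracked with Git LFS, so each diff shows only the three-line pointer (spec version, sha256 oid, byte size) rather than the text itself. A minimal sketch of verifying a locally downloaded copy against the pointer above (the local file name is assumed to match the path in the repo):

import hashlib

# Hypothetical integrity check against the LFS pointer shown above.
expected_oid = "2fd06f17964e5b6f2d6b7ed5084ad12314009bd1da8120685dc36826cf790006"
expected_size = 299850804

sha = hashlib.sha256()
size = 0
with open("kyrgyz_clean_sentences.txt", "rb") as f:  # assumed local path
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
        size += len(chunk)

assert sha.hexdigest() == expected_oid
assert size == expected_size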
text/kir_community_2017-sentences.txt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a42efd45d7431a731b3ba7c65a2d05114291cd5775ce9886eeb57c6f8ffbecc
+size 55906279
text/kir_newscrawl_2011_300K-sentences.txt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f45a5609f28be72ffbdbe31d417af10e6bc2739d6db65391409afb83fa39370f
+size 58955097
text/kir_newscrawl_2016_1M-sentences.txt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c99c33b32d3f30313984bef189075c5f555888e4a1eb3665b27270e848544812
+size 211331519
text/kir_wikipedia_2010_10K-sentences.txt ADDED
The diff for this file is too large to render. See raw diff
 
text/kir_wikipedia_2016_300K-sentences.txt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:438ac5addefe83a59fe5d70fe50567073c22e1018426c4ced4ae6e168e5e5288
+size 57890718
text/kir_wikipedia_2021_300K-sentences.txt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cceaeb7dc03b6044565a6d7e09f0eafe0d035c8be8497d8156abed54cca0b06
+size 56255562
upload_models.py ADDED
@@ -0,0 +1,135 @@
+import os
+import json
+from huggingface_hub import create_repo, upload_file
+from tokenizers import Tokenizer, pre_tokenizers, decoders, processors
+from tokenizers.models import SentencePiece as HF_SentencePiece
+import sentencepiece as spm
+
+username = "Hanbiike"
+model_folder = "models"
+graph_file = "graph.jpg"
+readme_file = "README.md"
+special_tokens_file = "special_tokens_map.json"
+
+def generate_tokenizer_config(model_type: str, model_file: str) -> dict:
+    return {
+        "model_type": model_type,
+        "unk_token": "<unk>",
+        "bos_token": "<s>",
+        "eos_token": "</s>",
+        "pad_token": "<pad>",
+        "tokenizer_class": "PreTrainedTokenizerFast",
+        "tokenizer_file": model_file
+    }
+
+# 📁 Collect all .model files
+model_files = [f for f in os.listdir(model_folder) if f.endswith(".model")]
+
+# 📖 Load the special-tokens map
+special_token_ids = {}
+if os.path.exists(special_tokens_file):
+    with open(special_tokens_file, "r", encoding="utf-8") as f:
+        special_tokens = json.load(f)
+    for token_type, token in special_tokens.items():
+        special_token_ids[token] = None  # the ID is resolved later via spm
+
+for model_file in model_files:
+    model_name = model_file.replace(".model", "")
+    vocab_file = model_name + ".vocab"
+    repo_id = f"{username}/{model_name}"
+
+    print(f"\n📦 Creating repository: {repo_id}")
+    create_repo(repo_id, repo_type="model", exist_ok=True)
+
+    # ✅ Upload .model
+    upload_file(
+        path_or_fileobj=os.path.join(model_folder, model_file),
+        path_in_repo=model_file,
+        repo_id=repo_id,
+        repo_type="model"
+    )
+
+    # ✅ Upload .vocab (if present)
+    vocab_path = os.path.join(model_folder, vocab_file)
+    if os.path.exists(vocab_path):
+        upload_file(
+            path_or_fileobj=vocab_path,
+            path_in_repo=vocab_file,
+            repo_id=repo_id,
+            repo_type="model"
+        )
+
+    # ✅ Upload graph.jpg
+    if os.path.exists(graph_file):
+        upload_file(
+            path_or_fileobj=graph_file,
+            path_in_repo="graph.jpg",
+            repo_id=repo_id,
+            repo_type="model"
+        )
+
+    # ✅ Upload special_tokens_map.json
+    if os.path.exists(special_tokens_file):
+        upload_file(
+            path_or_fileobj=special_tokens_file,
+            path_in_repo="special_tokens_map.json",
+            repo_id=repo_id,
+            repo_type="model"
+        )
+
+    # ✅ Generate tokenizer_config.json
+    model_type = "bpe" if "bpe" in model_name.lower() else "unigram"
+    tokenizer_config = generate_tokenizer_config(model_type, model_file)
+
+    config_path = "tokenizer_config.json"
+    with open(config_path, "w", encoding="utf-8") as f:
+        json.dump(tokenizer_config, f, indent=2, ensure_ascii=False)
+
+    upload_file(
+        path_or_fileobj=config_path,
+        path_in_repo="tokenizer_config.json",
+        repo_id=repo_id,
+        repo_type="model"
+    )
+
+    # ✅ Generate tokenizer.json (works for both BPE and Unigram)
+    try:
+        sp_model_path = os.path.join(model_folder, model_file)
+        sp = spm.SentencePieceProcessor()
+        sp.load(sp_model_path)
+
+        # Resolve the IDs of the special tokens
+        for token in special_token_ids:
+            try:
+                special_token_ids[token] = sp.piece_to_id(token)
+            except Exception:
+                special_token_ids[token] = 0  # fallback
+
+        tokenizer = Tokenizer(HF_SentencePiece(sp_model_path))
+        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
+        tokenizer.decoder = decoders.Replace("▁", " ")
+
+        tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{special_tokens.get('bos_token', '<s>')} $A {special_tokens.get('eos_token', '</s>')}",
+            pair=f"{special_tokens.get('bos_token', '<s>')} $A {special_tokens.get('eos_token', '</s>')} {special_tokens.get('bos_token', '<s>')} $B {special_tokens.get('eos_token', '</s>')}",
+            special_tokens=[
+                (special_tokens.get("bos_token", "<s>"), special_token_ids.get(special_tokens.get("bos_token", "<s>"), 1)),
+                (special_tokens.get("eos_token", "</s>"), special_token_ids.get(special_tokens.get("eos_token", "</s>"), 2))
+            ]
+        )
+
+        tokenizer.enable_truncation(max_length=512)
+
+        tokenizer_path = "tokenizer.json"
+        tokenizer.save(tokenizer_path)
+
+        upload_file(
+            path_or_fileobj=tokenizer_path,
+            path_in_repo="tokenizer.json",
+            repo_id=repo_id,
+            repo_type="model"
+        )
+    except Exception as e:
+        print(f"⚠️ Could not create tokenizer.json for {model_name}: {e}")
+
+    print(f"✅ Uploaded: {repo_id}")
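The tokenizer_config.json generated by the script declares PreTrainedTokenizerFast with the <unk>/<s>/</s>/<pad> special tokens, so the tokenizer.json it writes is presumably meant to be consumed roughly as in the sketch below (a local load of the file the script saves before uploading; the sample sentence is arbitrary):

from transformers import PreTrainedTokenizerFast

# Hypothetical consumer of the tokenizer.json written by upload_models.py.
tok = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    unk_token="<unk>",
    bos_token="<s>",
    eos_token="</s>",
    pad_token="<pad>",
)
print(tok.tokenize("Кыргыз тили"))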