| | import os
|
| | import json
|
| | from huggingface_hub import create_repo, upload_file
|
| | from tokenizers import Tokenizer, pre_tokenizers, decoders, processors
|
| | from tokenizers.models import SentencePiece as HF_SentencePiece
|
| | import sentencepiece as spm
|
| |
|
# --- Upload configuration --------------------------------------------------

# Hugging Face account that owns the target repositories.
username = "Hanbiike"

# Local directory containing the trained SentencePiece *.model files.
model_folder = "models"

# Optional artifacts uploaded alongside each model when present locally.
graph_file = "graph.jpg"
readme_file = "README.md"
special_tokens_file = "special_tokens_map.json"
|
| |
|
def generate_tokenizer_config(model_type: str, model_file: str) -> dict:
    """Build the ``tokenizer_config.json`` payload for a Hub upload.

    Args:
        model_type: Tokenizer algorithm name (e.g. ``"bpe"`` or ``"unigram"``).
        model_file: Filename of the serialized tokenizer the config points at.

    Returns:
        Dict in the ``transformers`` tokenizer-config schema with the
        conventional special-token strings filled in.
    """
    config = {"model_type": model_type}
    config.update(
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        tokenizer_class="PreTrainedTokenizerFast",
        tokenizer_file=model_file,
    )
    return config
|
| |
|
| |
|
# Discover every trained SentencePiece model in the local model folder.
# Guard against a missing folder so the script degrades to a no-op instead
# of crashing with FileNotFoundError from os.listdir.
if os.path.isdir(model_folder):
    model_files = [f for f in os.listdir(model_folder) if f.endswith(".model")]
else:
    model_files = []

# Map each special-token string (e.g. "<s>") to its SentencePiece id.
# Ids are resolved later, per model, once each .model file is loaded.
# `special_tokens` is always bound (empty dict when the map file is absent)
# so later reads of it cannot raise NameError.
special_tokens = {}
special_token_ids = {}
if os.path.exists(special_tokens_file):
    with open(special_tokens_file, "r", encoding="utf-8") as f:
        special_tokens = json.load(f)
    # Only the token strings matter here; the JSON keys (token roles such
    # as "bos_token") are not needed for id resolution.
    special_token_ids = {token: None for token in special_tokens.values()}
|
| |
|
for model_file in model_files:
    # Derive the repo name from the filename; splitext strips only the
    # trailing extension (the previous str.replace(".model", "") would also
    # mangle names containing ".model" in the middle).
    model_name = os.path.splitext(model_file)[0]
    vocab_file = model_name + ".vocab"
    repo_id = f"{username}/{model_name}"

    print(f"\n📦 Создаю репозиторий: {repo_id}")
    create_repo(repo_id, repo_type="model", exist_ok=True)

    # The raw SentencePiece model is always uploaded.
    upload_file(
        path_or_fileobj=os.path.join(model_folder, model_file),
        path_in_repo=model_file,
        repo_id=repo_id,
        repo_type="model",
    )

    # Optional sidecar files are uploaded only when present locally.
    vocab_path = os.path.join(model_folder, vocab_file)
    if os.path.exists(vocab_path):
        upload_file(
            path_or_fileobj=vocab_path,
            path_in_repo=vocab_file,
            repo_id=repo_id,
            repo_type="model",
        )

    if os.path.exists(graph_file):
        upload_file(
            path_or_fileobj=graph_file,
            path_in_repo="graph.jpg",
            repo_id=repo_id,
            repo_type="model",
        )

    if os.path.exists(special_tokens_file):
        upload_file(
            path_or_fileobj=special_tokens_file,
            path_in_repo="special_tokens_map.json",
            repo_id=repo_id,
            repo_type="model",
        )

    # Infer the algorithm from the filename convention: anything with "bpe"
    # in the name is BPE, everything else is assumed to be a unigram model.
    model_type = "bpe" if "bpe" in model_name.lower() else "unigram"
    tokenizer_config = generate_tokenizer_config(model_type, model_file)

    # The config is written to the CWD and reused/overwritten per model.
    config_path = "tokenizer_config.json"
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer_config, f, indent=2, ensure_ascii=False)

    upload_file(
        path_or_fileobj=config_path,
        path_in_repo="tokenizer_config.json",
        repo_id=repo_id,
        repo_type="model",
    )

    # Best-effort: build a fast-tokenizer file (tokenizer.json) from the
    # SentencePiece model. Any failure is reported but does not abort the
    # remaining uploads for this or other models.
    try:
        sp_model_path = os.path.join(model_folder, model_file)
        sp = spm.SentencePieceProcessor()
        sp.load(sp_model_path)

        # Resolve each special token to its id in *this* model's vocab.
        # piece_to_id normally returns the unk id for unknown pieces rather
        # than raising; the fallback to 0 is kept defensively. Narrowed from
        # a bare `except:` so SystemExit/KeyboardInterrupt propagate.
        for token in special_token_ids:
            try:
                special_token_ids[token] = sp.piece_to_id(token)
            except Exception:
                special_token_ids[token] = 0

        # NOTE(review): released `tokenizers` versions expose BPE/Unigram/
        # WordPiece/WordLevel but no `SentencePiece` model class — confirm
        # the L9 import actually resolves; otherwise this branch can never
        # succeed and always lands in the except below.
        tokenizer = Tokenizer(HF_SentencePiece(sp_model_path))
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        tokenizer.decoder = decoders.Replace("▁", " ")

        # `special_tokens` is only bound when the map file exists; fall back
        # to an empty dict so the default markers are used instead of
        # raising NameError.
        tokens_map = special_tokens if os.path.exists(special_tokens_file) else {}
        bos = tokens_map.get("bos_token", "<s>")
        eos = tokens_map.get("eos_token", "</s>")
        tokenizer.post_processor = processors.TemplateProcessing(
            single=f"{bos} $A {eos}",
            pair=f"{bos} $A {eos} {bos} $B {eos}",
            special_tokens=[
                (bos, special_token_ids.get(bos, 1)),
                (eos, special_token_ids.get(eos, 2)),
            ],
        )

        tokenizer.enable_truncation(max_length=512)

        tokenizer_path = "tokenizer.json"
        tokenizer.save(tokenizer_path)

        upload_file(
            path_or_fileobj=tokenizer_path,
            path_in_repo="tokenizer.json",
            repo_id=repo_id,
            repo_type="model",
        )
    except Exception as e:
        print(f"⚠️ Не удалось создать tokenizer.json для {model_name}: {e}")

    print(f"✅ Загружено: {repo_id}")
|
| |
|