Spaces:
Runtime error
Runtime error
| import os | |
| import gc | |
| import json | |
| import time | |
| import random | |
| import threading | |
| import traceback | |
| from pathlib import Path | |
| from datetime import datetime | |
| import gradio as gr | |
| import psutil | |
| from tqdm.auto import tqdm | |
| from datasets import load_dataset | |
| from tokenizers import Tokenizer | |
| from tokenizers.models import BPE | |
| from tokenizers.trainers import BpeTrainer | |
| from tokenizers.pre_tokenizers import ByteLevel | |
| from tokenizers.decoders import ByteLevel as ByteLevelDecoder | |
| from tokenizers.processors import TemplateProcessing | |
| from huggingface_hub import HfApi, create_repo, upload_folder | |
| # ============================================================ | |
| # Env-driven config | |
| # ============================================================ | |
| HF_TOKEN = os.environ.get("HF_TOKEN", "").strip() | |
| REPO_ID = os.environ.get("MODEL_REPO_ID", "Efe2898/new-model").strip() | |
| PUSH_TO_HUB = os.environ.get("PUSH_TO_HUB", "1").strip().lower() in {"1", "true", "yes", "y"} | |
| DATASET_ID = os.environ.get("DATASET_ID", "turkish-nlp-suite/BellaTurca").strip() | |
| SUBSETS = [ | |
| s.strip() | |
| for s in os.environ.get( | |
| "SUBSETS", | |
| "AkademikDerlem,OzenliDerlem,temiz-OSCAR,temiz-mC4", | |
| ).split(",") | |
| if s.strip() | |
| ] | |
| TEXT_COLUMN = os.environ.get("TEXT_COLUMN", "text").strip() | |
| SPLIT = os.environ.get("SPLIT", "train").strip() | |
| VOCAB_SIZE = int(os.environ.get("VOCAB_SIZE", "65536")) | |
| MODEL_MAX_LENGTH = int(os.environ.get("MODEL_MAX_LENGTH", "262144")) | |
| BPE_MIN_FREQUENCY = int(os.environ.get("BPE_MIN_FREQUENCY", "3")) | |
| TARGET_TOTAL_EST_TOKENS = int(os.environ.get("TARGET_TOTAL_EST_TOKENS", "700000000")) | |
| TARGET_PER_SUBSET_EST_TOKENS = TARGET_TOTAL_EST_TOKENS // max(1, len(SUBSETS)) | |
| CHARS_PER_EST_TOKEN = float(os.environ.get("CHARS_PER_EST_TOKEN", "4.0")) | |
| CPU_THREADS = int(os.environ.get("CPU_THREADS", "2")) | |
| ROW_BATCH_SIZE = int(os.environ.get("ROW_BATCH_SIZE", "2048")) | |
| MAX_CHARS_PER_BATCH = int(os.environ.get("MAX_CHARS_PER_BATCH", "2000000")) | |
| MAX_TEXT_CHARS = int(os.environ.get("MAX_TEXT_CHARS", "512000")) | |
| MIN_TEXT_CHARS = int(os.environ.get("MIN_TEXT_CHARS", "20")) | |
| ENABLE_CPU_COOLDOWN = os.environ.get("ENABLE_CPU_COOLDOWN", "1").strip().lower() in {"1", "true", "yes", "y"} | |
| MICRO_SLEEP_AFTER_EACH_BATCH = float(os.environ.get("MICRO_SLEEP_AFTER_EACH_BATCH", "0.25")) | |
| COOLDOWN_EVERY_SECONDS = int(os.environ.get("COOLDOWN_EVERY_SECONDS", "60")) | |
| COOLDOWN_SECONDS = float(os.environ.get("COOLDOWN_SECONDS", "5")) | |
| CPU_CHECK_INTERVAL = float(os.environ.get("CPU_CHECK_INTERVAL", "0.10")) | |
| CPU_SOFT_LIMIT_PERCENT = float(os.environ.get("CPU_SOFT_LIMIT_PERCENT", "80")) | |
| CPU_HARD_LIMIT_PERCENT = float(os.environ.get("CPU_HARD_LIMIT_PERCENT", "95")) | |
| CPU_SOFT_SLEEP_SECONDS = float(os.environ.get("CPU_SOFT_SLEEP_SECONDS", "2")) | |
| CPU_HARD_SLEEP_SECONDS = float(os.environ.get("CPU_HARD_SLEEP_SECONDS", "6")) | |
| RAM_HARD_LIMIT_PERCENT = float(os.environ.get("RAM_HARD_LIMIT_PERCENT", "90")) | |
| SHUFFLE_BATCH = os.environ.get("SHUFFLE_BATCH", "0").strip().lower() in {"1", "true", "yes", "y"} | |
| RANDOM_SEED = int(os.environ.get("RANDOM_SEED", "42")) | |
| AUTO_START = os.environ.get("AUTO_START", "0").strip().lower() in {"1", "true", "yes", "y"} | |
| SPACE_PASSWORD = os.environ.get("SPACE_PASSWORD", "738825").strip() | |
| def gradio_auth(username: str, password: str) -> bool: | |
| """Protect the Space UI. Username is ignored; only the password matters.""" | |
| return str(password).strip() == SPACE_PASSWORD | |
| # Use /data when persistent storage is attached; otherwise Space working dir. | |
| ROOT_DIR = Path(os.environ.get("OUTPUT_ROOT", "/data" if Path("/data").exists() else ".")) | |
| OUT_DIR = ROOT_DIR / "rslm_tokenizer_bellaturca_65k" | |
| OUT_DIR.mkdir(parents=True, exist_ok=True) | |
| LOG_PATH = OUT_DIR / "training.log" | |
| STATUS_PATH = OUT_DIR / "status.json" | |
| SPECIAL_TOKENS = [ | |
| "<|pad|>", | |
| "<|bos|>", | |
| "<|eos|>", | |
| "<|unk|>", | |
| "<|system|>", | |
| "<|user|>", | |
| "<|assistant|>", | |
| "<|answer|>", | |
| "<|end|>", | |
| "<think>", | |
| "</think>", | |
| ] | |
| # Keep CPU usage sane. | |
| os.environ["RAYON_NUM_THREADS"] = str(CPU_THREADS) | |
| os.environ["OMP_NUM_THREADS"] = str(CPU_THREADS) | |
| os.environ["MKL_NUM_THREADS"] = str(CPU_THREADS) | |
| os.environ["OPENBLAS_NUM_THREADS"] = str(CPU_THREADS) | |
| os.environ["NUMEXPR_NUM_THREADS"] = str(CPU_THREADS) | |
| os.environ["VECLIB_MAXIMUM_THREADS"] = str(CPU_THREADS) | |
| os.environ["BLIS_NUM_THREADS"] = str(CPU_THREADS) | |
| os.environ["TOKENIZERS_PARALLELISM"] = "false" | |
| os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") | |
| _proc = psutil.Process(os.getpid()) | |
| psutil.cpu_percent(interval=None) | |
| _proc.cpu_percent(interval=None) | |
| random.seed(RANDOM_SEED) | |
| _state = { | |
| "thread": None, | |
| "running": False, | |
| "finished": False, | |
| "error": None, | |
| "uploaded": False, | |
| } | |
| def log(msg: str): | |
| ts = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") | |
| line = f"[{ts} UTC] {msg}" | |
| print(line, flush=True) | |
| with open(LOG_PATH, "a", encoding="utf-8") as f: | |
| f.write(line + "\n") | |
| def write_status(**kwargs): | |
| data = { | |
| "running": _state["running"], | |
| "finished": _state["finished"], | |
| "error": _state["error"], | |
| "uploaded": _state["uploaded"], | |
| "repo_id": REPO_ID, | |
| "out_dir": str(OUT_DIR), | |
| "updated_at_utc": datetime.utcnow().isoformat() + "Z", | |
| } | |
| data.update(kwargs) | |
| data["progress_percent"] = round(calculate_progress_percent(data), 4) | |
| stats_obj = data.get("stats") | |
| if isinstance(stats_obj, dict): | |
| per_subset = {} | |
| for subset_name, subset_stats in stats_obj.items(): | |
| try: | |
| subset_est = int(subset_stats.get("est_tokens", 0)) | |
| per_subset[subset_name] = round( | |
| max(0.0, min(100.0, 100.0 * subset_est / max(1, TARGET_PER_SUBSET_EST_TOKENS))), | |
| 4, | |
| ) | |
| except Exception: | |
| pass | |
| data["subset_progress_percent"] = per_subset | |
| with open(STATUS_PATH, "w", encoding="utf-8") as f: | |
| json.dump(data, f, ensure_ascii=False, indent=2) | |
| def calculate_progress_percent(status_data=None): | |
| status_data = status_data or {} | |
| total = status_data.get("total_est_tokens") | |
| stats_obj = status_data.get("stats") | |
| if total is None and isinstance(stats_obj, dict): | |
| try: | |
| total = sum(int(v.get("est_tokens", 0)) for v in stats_obj.values()) | |
| except Exception: | |
| total = 0 | |
| try: | |
| total = int(total or 0) | |
| except Exception: | |
| total = 0 | |
| target = max(1, int(TARGET_TOTAL_EST_TOKENS)) | |
| percent = max(0.0, min(100.0, 100.0 * total / target)) | |
| return percent | |
| def load_status_file(): | |
| if not STATUS_PATH.exists(): | |
| return {} | |
| try: | |
| return json.loads(STATUS_PATH.read_text(encoding="utf-8")) | |
| except Exception: | |
| return {} | |
| def tail_log(n=220): | |
| if not LOG_PATH.exists(): | |
| return "Henüz log yok." | |
| lines = LOG_PATH.read_text(encoding="utf-8", errors="replace").splitlines() | |
| return "\n".join(lines[-n:]) | |
| def system_snapshot(): | |
| vm = psutil.virtual_memory() | |
| cpu = psutil.cpu_percent(interval=CPU_CHECK_INTERVAL) | |
| proc_cpu = _proc.cpu_percent(interval=None) | |
| proc_cpu_norm = proc_cpu / max(1, CPU_THREADS) | |
| return { | |
| "cpu": cpu, | |
| "proc_cpu": proc_cpu, | |
| "proc_cpu_norm": proc_cpu_norm, | |
| "ram_percent": vm.percent, | |
| "rss_gb": _proc.memory_info().rss / (1024 ** 3), | |
| "available_gb": vm.available / (1024 ** 3), | |
| } | |
| _last_cooldown_at = time.time() | |
| def maybe_cooldown(reason="batch"): | |
| global _last_cooldown_at | |
| if not ENABLE_CPU_COOLDOWN: | |
| return | |
| if MICRO_SLEEP_AFTER_EACH_BATCH > 0: | |
| time.sleep(MICRO_SLEEP_AFTER_EACH_BATCH) | |
| snap = system_snapshot() | |
| if snap["ram_percent"] >= RAM_HARD_LIMIT_PERCENT: | |
| log( | |
| f"[RAM-FREN] ram={snap['ram_percent']:.1f}% rss={snap['rss_gb']:.2f}GB " | |
| f"available={snap['available_gb']:.2f}GB -> gc + sleep {CPU_HARD_SLEEP_SECONDS}s" | |
| ) | |
| gc.collect() | |
| time.sleep(CPU_HARD_SLEEP_SECONDS) | |
| _last_cooldown_at = time.time() | |
| return | |
| cpu_load = max(snap["cpu"], snap["proc_cpu_norm"]) | |
| if cpu_load >= CPU_HARD_LIMIT_PERCENT: | |
| log( | |
| f"[CPU-HARD] reason={reason} cpu={snap['cpu']:.1f}% " | |
| f"proc_norm={snap['proc_cpu_norm']:.1f}% -> sleep {CPU_HARD_SLEEP_SECONDS}s" | |
| ) | |
| time.sleep(CPU_HARD_SLEEP_SECONDS) | |
| _last_cooldown_at = time.time() | |
| elif cpu_load >= CPU_SOFT_LIMIT_PERCENT: | |
| log( | |
| f"[CPU-SOFT] reason={reason} cpu={snap['cpu']:.1f}% " | |
| f"proc_norm={snap['proc_cpu_norm']:.1f}% -> sleep {CPU_SOFT_SLEEP_SECONDS}s" | |
| ) | |
| time.sleep(CPU_SOFT_SLEEP_SECONDS) | |
| _last_cooldown_at = time.time() | |
| now = time.time() | |
| if now - _last_cooldown_at >= COOLDOWN_EVERY_SECONDS: | |
| log( | |
| f"[COOLDOWN] reason={reason} regular sleep {COOLDOWN_SECONDS}s | " | |
| f"cpu={snap['cpu']:.1f}% proc_norm={snap['proc_cpu_norm']:.1f}% " | |
| f"rss={snap['rss_gb']:.2f}GB ram={snap['ram_percent']:.1f}%" | |
| ) | |
| gc.collect() | |
| time.sleep(COOLDOWN_SECONDS) | |
| _last_cooldown_at = time.time() | |
| def clean_text(value): | |
| if value is None: | |
| return None | |
| if not isinstance(value, str): | |
| value = str(value) | |
| value = value.strip() | |
| if len(value) < MIN_TEXT_CHARS: | |
| return None | |
| if MAX_TEXT_CHARS and len(value) > MAX_TEXT_CHARS: | |
| value = value[:MAX_TEXT_CHARS] | |
| return value | |
| def estimate_tokens(text: str) -> int: | |
| return max(1, int(len(text) / CHARS_PER_EST_TOKEN)) | |
| def train_tokenizer(): | |
| _state["running"] = True | |
| _state["finished"] = False | |
| _state["error"] = None | |
| _state["uploaded"] = False | |
| write_status(phase="starting") | |
| stats = { | |
| subset: { | |
| "rows": 0, | |
| "chars": 0, | |
| "est_tokens": 0, | |
| "batches": 0, | |
| "exhausted": False, | |
| "started_at": None, | |
| "finished_at": None, | |
| } | |
| for subset in SUBSETS | |
| } | |
| def total_est_tokens_seen(): | |
| return sum(stats[s]["est_tokens"] for s in SUBSETS) | |
| def print_stats(prefix=""): | |
| snap = system_snapshot() | |
| total_rows = sum(stats[s]["rows"] for s in SUBSETS) | |
| total_chars = sum(stats[s]["chars"] for s in SUBSETS) | |
| total_est = total_est_tokens_seen() | |
| log( | |
| f"{prefix} cpu={snap['cpu']:.1f}% proc_norm={snap['proc_cpu_norm']:.1f}% " | |
| f"rss={snap['rss_gb']:.2f}GB ram={snap['ram_percent']:.1f}%" | |
| ) | |
| log(f"TOTAL rows={total_rows:,} chars={total_chars:,} est_tokens={total_est:,}") | |
| for subset in SUBSETS: | |
| st = stats[subset] | |
| pct = 100 * st["est_tokens"] / max(1, TARGET_PER_SUBSET_EST_TOKENS) | |
| log( | |
| f" {subset:16s} rows={st['rows']:,} batches={st['batches']:,} " | |
| f"chars={st['chars']:,} est={st['est_tokens']:,} target%={pct:.2f} " | |
| f"exhausted={st['exhausted']}" | |
| ) | |
| write_status(phase="training", stats=stats, total_est_tokens=total_est) | |
| def load_stream(subset: str): | |
| log(f"Loading stream: {DATASET_ID} / {subset}") | |
| kwargs = dict(path=DATASET_ID, name=subset, split=SPLIT, streaming=True) | |
| if HF_TOKEN: | |
| kwargs["token"] = HF_TOKEN | |
| return iter(load_dataset(**kwargs)) | |
| def pull_batch(iterator, subset: str): | |
| batch = [] | |
| batch_chars = 0 | |
| pulled = 0 | |
| while ( | |
| pulled < ROW_BATCH_SIZE | |
| and batch_chars < MAX_CHARS_PER_BATCH | |
| and stats[subset]["est_tokens"] < TARGET_PER_SUBSET_EST_TOKENS | |
| and total_est_tokens_seen() < TARGET_TOTAL_EST_TOKENS | |
| ): | |
| try: | |
| row = next(iterator) | |
| except StopIteration: | |
| stats[subset]["exhausted"] = True | |
| log(f"[WARN] Subset exhausted before target: {subset}") | |
| break | |
| text = clean_text(row.get(TEXT_COLUMN)) | |
| if text is None: | |
| continue | |
| batch.append(text) | |
| stats[subset]["rows"] += 1 | |
| stats[subset]["chars"] += len(text) | |
| stats[subset]["est_tokens"] += estimate_tokens(text) | |
| batch_chars += len(text) | |
| pulled += 1 | |
| if batch: | |
| stats[subset]["batches"] += 1 | |
| return batch | |
| def text_batches(): | |
| start = time.time() | |
| last_total = 0 | |
| global_batch_idx = 0 | |
| for subset in SUBSETS: | |
| if total_est_tokens_seen() >= TARGET_TOTAL_EST_TOKENS: | |
| break | |
| stats[subset]["started_at"] = time.time() | |
| subset_start_est = total_est_tokens_seen() | |
| stream = load_stream(subset) | |
| log(f"===== START SUBSET: {subset} =====") | |
| while ( | |
| stats[subset]["est_tokens"] < TARGET_PER_SUBSET_EST_TOKENS | |
| and total_est_tokens_seen() < TARGET_TOTAL_EST_TOKENS | |
| and not stats[subset]["exhausted"] | |
| ): | |
| batch = pull_batch(stream, subset) | |
| if not batch: | |
| break | |
| if SHUFFLE_BATCH: | |
| random.shuffle(batch) | |
| total_est = total_est_tokens_seen() | |
| global_batch_idx += 1 | |
| if global_batch_idx % 10 == 0: | |
| elapsed = max(1e-9, time.time() - start) | |
| speed = (total_est - last_total) / max(1e-9, elapsed) / 1e6 | |
| last_total = total_est | |
| print_stats(prefix=f"[batch {global_batch_idx}] elapsed={elapsed/60:.1f}min speed~{speed:.3f}M est_tok/s") | |
| yield batch | |
| maybe_cooldown(reason=f"batch_{global_batch_idx}_{subset}") | |
| stats[subset]["finished_at"] = time.time() | |
| new_est = total_est_tokens_seen() - subset_start_est | |
| print_stats(prefix=f"===== FINISH SUBSET: {subset} | new_est={new_est:,} =====") | |
| del stream | |
| gc.collect() | |
| maybe_cooldown(reason=f"finish_{subset}") | |
| print_stats(prefix="Final iterator stats:") | |
| try: | |
| LOG_PATH.parent.mkdir(parents=True, exist_ok=True) | |
| log("Tokenizer training started.") | |
| log(f"Repo target: {REPO_ID}") | |
| log(f"Dataset: {DATASET_ID} / subsets={SUBSETS}") | |
| log(f"VOCAB_SIZE={VOCAB_SIZE} target_est_tokens={TARGET_TOTAL_EST_TOKENS:,}") | |
| log(f"CPU_THREADS={CPU_THREADS} ROW_BATCH_SIZE={ROW_BATCH_SIZE} MAX_CHARS_PER_BATCH={MAX_CHARS_PER_BATCH:,}") | |
| if PUSH_TO_HUB and not HF_TOKEN: | |
| raise RuntimeError("HF_TOKEN secret/env yok. Push için Space Settings > Secrets içine HF_TOKEN ekle.") | |
| try: | |
| bpe_model = BPE(unk_token="<|unk|>", fuse_unk=True, byte_fallback=True) | |
| except TypeError: | |
| log("[WARN] tokenizers BPE(byte_fallback=True) desteklemiyor; byte_fallback olmadan devam.") | |
| bpe_model = BPE(unk_token="<|unk|>", fuse_unk=True) | |
| tokenizer = Tokenizer(bpe_model) | |
| tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False) | |
| tokenizer.decoder = ByteLevelDecoder() | |
| trainer = BpeTrainer( | |
| vocab_size=VOCAB_SIZE, | |
| min_frequency=BPE_MIN_FREQUENCY, | |
| special_tokens=SPECIAL_TOKENS, | |
| initial_alphabet=ByteLevel.alphabet(), | |
| show_progress=True, | |
| ) | |
| t0 = time.time() | |
| tokenizer.train_from_iterator(text_batches(), trainer=trainer, length=TARGET_TOTAL_EST_TOKENS) | |
| elapsed = time.time() - t0 | |
| log(f"Tokenizer training finished in {elapsed/60:.2f} minutes.") | |
| log(f"Final vocab size: {tokenizer.get_vocab_size()}") | |
| bos_id = tokenizer.token_to_id("<|bos|>") | |
| eos_id = tokenizer.token_to_id("<|eos|>") | |
| assert bos_id is not None and eos_id is not None, "BOS/EOS token id bulunamadı." | |
| tokenizer.post_processor = TemplateProcessing( | |
| single="<|bos|> $A <|eos|>", | |
| pair="<|bos|> $A <|eos|> $B:1 <|eos|>:1", | |
| special_tokens=[("<|bos|>", bos_id), ("<|eos|>", eos_id)], | |
| ) | |
| tokenizer.save(str(OUT_DIR / "tokenizer.json")) | |
| special_tokens_map = { | |
| "bos_token": "<|bos|>", | |
| "eos_token": "<|eos|>", | |
| "unk_token": "<|unk|>", | |
| "pad_token": "<|pad|>", | |
| "additional_special_tokens": [ | |
| "<|system|>", | |
| "<|user|>", | |
| "<|assistant|>", | |
| "<|answer|>", | |
| "<|end|>", | |
| "<think>", | |
| "</think>", | |
| ], | |
| } | |
| chat_template = ( | |
| "{% for message in messages %}" | |
| "{% if message['role'] == 'system' %}<|system|>\n{{ message['content'] }}<|end|>\n" | |
| "{% elif message['role'] == 'user' %}<|user|>\n{{ message['content'] }}<|end|>\n" | |
| "{% elif message['role'] == 'assistant' %}<|assistant|>\n{{ message['content'] }}<|end|>\n" | |
| "{% endif %}" | |
| "{% endfor %}" | |
| "{% if add_generation_prompt %}<|assistant|>\n{% endif %}" | |
| ) | |
| tokenizer_config = { | |
| "model_max_length": MODEL_MAX_LENGTH, | |
| "tokenizer_class": "PreTrainedTokenizerFast", | |
| "clean_up_tokenization_spaces": False, | |
| "padding_side": "right", | |
| "truncation_side": "right", | |
| "bos_token": "<|bos|>", | |
| "eos_token": "<|eos|>", | |
| "unk_token": "<|unk|>", | |
| "pad_token": "<|pad|>", | |
| "additional_special_tokens": special_tokens_map["additional_special_tokens"], | |
| "chat_template": chat_template, | |
| } | |
| with open(OUT_DIR / "special_tokens_map.json", "w", encoding="utf-8") as f: | |
| json.dump(special_tokens_map, f, ensure_ascii=False, indent=2) | |
| with open(OUT_DIR / "tokenizer_config.json", "w", encoding="utf-8") as f: | |
| json.dump(tokenizer_config, f, ensure_ascii=False, indent=2) | |
| report = { | |
| "created_at_utc": datetime.utcnow().isoformat() + "Z", | |
| "dataset_id": DATASET_ID, | |
| "subsets": SUBSETS, | |
| "split": SPLIT, | |
| "text_column": TEXT_COLUMN, | |
| "vocab_size": VOCAB_SIZE, | |
| "actual_vocab_size": tokenizer.get_vocab_size(), | |
| "bpe_min_frequency": BPE_MIN_FREQUENCY, | |
| "model_max_length": MODEL_MAX_LENGTH, | |
| "target_total_est_tokens": TARGET_TOTAL_EST_TOKENS, | |
| "target_per_subset_est_tokens": TARGET_PER_SUBSET_EST_TOKENS, | |
| "chars_per_est_token": CHARS_PER_EST_TOKEN, | |
| "cpu_threads": CPU_THREADS, | |
| "row_batch_size": ROW_BATCH_SIZE, | |
| "max_chars_per_batch": MAX_CHARS_PER_BATCH, | |
| "max_text_chars": MAX_TEXT_CHARS, | |
| "stats": stats, | |
| "special_tokens": SPECIAL_TOKENS, | |
| } | |
| with open(OUT_DIR / "tokenizer_training_report.json", "w", encoding="utf-8") as f: | |
| json.dump(report, f, ensure_ascii=False, indent=2) | |
| subsets_md = "\n".join(f"- `{s}`" for s in SUBSETS) | |
| specials_md = "\n".join(f"- `{t}`" for t in SPECIAL_TOKENS) | |
| readme = f"""--- | |
| library_name: tokenizers | |
| language: | |
| - tr | |
| tags: | |
| - turkish | |
| - tokenizer | |
| - byte-level-bpe | |
| - rslm | |
| - think | |
| --- | |
| # RSLM Tokenizer 65K | |
| CPU-safe Byte-Level BPE tokenizer for RSLM. | |
| ## Training data | |
| Dataset: `{DATASET_ID}` | |
| Subsets: | |
| {subsets_md} | |
| Column: `{TEXT_COLUMN}` | |
| Target estimated tokens: `{TARGET_TOTAL_EST_TOKENS:,}` total, approximately `{TARGET_PER_SUBSET_EST_TOKENS:,}` per subset. | |
| ## Vocab | |
| - Requested vocab size: `{VOCAB_SIZE:,}` | |
| - Actual vocab size: `{tokenizer.get_vocab_size():,}` | |
| - BPE min frequency: `{BPE_MIN_FREQUENCY}` | |
| ## Special tokens | |
| {specials_md} | |
| """ | |
| with open(OUT_DIR / "README.md", "w", encoding="utf-8") as f: | |
| f.write(readme) | |
| log("Saved files:") | |
| for p in sorted(OUT_DIR.iterdir()): | |
| log(f" - {p.name} ({p.stat().st_size / 1024 / 1024:.3f} MB)") | |
| if PUSH_TO_HUB: | |
| log("Creating/updating model repo and uploading tokenizer files...") | |
| create_repo(repo_id=REPO_ID, token=HF_TOKEN, repo_type="model", exist_ok=True) | |
| upload_folder( | |
| repo_id=REPO_ID, | |
| folder_path=str(OUT_DIR), | |
| token=HF_TOKEN, | |
| repo_type="model", | |
| commit_message="Add 65K RSLM tokenizer trained on BellaTurca from Space", | |
| ignore_patterns=["training.log", "status.json"], | |
| ) | |
| _state["uploaded"] = True | |
| log(f"Uploaded tokenizer to: https://huggingface.co/{REPO_ID}") | |
| _state["finished"] = True | |
| write_status(phase="finished", stats=stats) | |
| except Exception as e: | |
| _state["error"] = repr(e) | |
| log("ERROR: " + repr(e)) | |
| log(traceback.format_exc()) | |
| write_status(phase="error") | |
| finally: | |
| _state["running"] = False | |
| def start_training(): | |
| if _state["running"]: | |
| return "Zaten çalışıyor.\n\n" + tail_log() | |
| if _state["finished"]: | |
| return "Bu oturumda eğitim bitmiş görünüyor. Yeniden başlatmak için Space'i restart edip tekrar deneyebilirsin.\n\n" + tail_log() | |
| t = threading.Thread(target=train_tokenizer, daemon=True) | |
| _state["thread"] = t | |
| t.start() | |
| return "Eğitim başlatıldı. Logları Refresh ile takip et.\n\n" + tail_log() | |
| def refresh(): | |
| saved_status = load_status_file() | |
| percent = calculate_progress_percent(saved_status) | |
| status = { | |
| "running": _state["running"], | |
| "finished": _state["finished"], | |
| "error": _state["error"], | |
| "uploaded": _state["uploaded"], | |
| "repo_id": REPO_ID, | |
| "out_dir": str(OUT_DIR), | |
| "progress_percent": round(percent, 4), | |
| } | |
| if saved_status: | |
| status.update(saved_status) | |
| status["running"] = _state["running"] | |
| status["finished"] = _state["finished"] or saved_status.get("phase") == "finished" | |
| status["error"] = _state["error"] or saved_status.get("error") | |
| status["uploaded"] = _state["uploaded"] or bool(saved_status.get("uploaded")) | |
| percent = calculate_progress_percent(status) | |
| status["progress_percent"] = round(percent, 4) | |
| return json.dumps(status, ensure_ascii=False, indent=2), tail_log(), round(percent, 4) | |
| if AUTO_START and not _state["running"] and not _state["finished"]: | |
| t = threading.Thread(target=train_tokenizer, daemon=True) | |
| _state["thread"] = t | |
| t.start() | |
| with gr.Blocks(title="RSLM Tokenizer Trainer") as demo: | |
| gr.Markdown( | |
| """ | |
| # RSLM Tokenizer Trainer | |
| Bu Space, BellaTurca üzerinden 65K Byte-Level BPE tokenizer eğitir ve sonucu Hugging Face model repo'suna push eder. | |
| Tokeni koda yazma. Space Settings → Secrets içine `HF_TOKEN` olarak ekle. | |
| """ | |
| ) | |
| with gr.Row(): | |
| start_btn = gr.Button("Start training", variant="primary") | |
| refresh_btn = gr.Button("Refresh logs") | |
| progress_box = gr.Slider( | |
| minimum=0, | |
| maximum=100, | |
| value=0, | |
| step=0.01, | |
| label="Toplam ilerleme (%)", | |
| interactive=False, | |
| ) | |
| status_box = gr.Code(label="Status", language="json") | |
| log_box = gr.Textbox(label="Logs", lines=28, max_lines=50) | |
| start_btn.click(start_training, outputs=log_box) | |
| refresh_btn.click(refresh, outputs=[status_box, log_box, progress_box]) | |
| demo.load(refresh, outputs=[status_box, log_box, progress_box]) | |
| demo.queue(default_concurrency_limit=1).launch(auth=gradio_auth) | |