{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.12.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"%%writefile install.sh\npip install -q transformers datasets tokenizers evaluate accelerate lm-eval","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true,"execution":{"iopub.status.busy":"2026-04-26T08:37:26.466055Z","iopub.execute_input":"2026-04-26T08:37:26.466325Z","iopub.status.idle":"2026-04-26T08:37:26.477305Z","shell.execute_reply.started":"2026-04-26T08:37:26.466291Z","shell.execute_reply":"2026-04-26T08:37:26.476356Z"}},"outputs":[{"name":"stdout","text":"Writing install.sh\n","output_type":"stream"}],"execution_count":1},{"cell_type":"code","source":"!bash install.sh","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T08:38:08.522844Z","iopub.execute_input":"2026-04-26T08:38:08.523643Z","iopub.status.idle":"2026-04-26T08:38:24.439891Z","shell.execute_reply.started":"2026-04-26T08:38:08.523613Z","shell.execute_reply":"2026-04-26T08:38:24.439193Z"}},"outputs":[{"name":"stdout","text":"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.4/56.4 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.7/8.7 MB\u001b[0m \u001b[31m99.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m100.8/100.8 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m91.1/91.1 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25h Building wheel for sqlitedict (setup.py) ... \u001b[?25l\u001b[?25hdone\n Building wheel for word2number (setup.py) ... \u001b[?25l\u001b[?25hdone\n","output_type":"stream"}],"execution_count":2},{"cell_type":"code","source":"%%writefile train_tokenizer.py\nfrom tokenizers import ByteLevelBPETokenizer\nfrom datasets import load_dataset\nimport os\nfrom transformers import PreTrainedTokenizerFast\n\nprint(\"Loading dataset for tokenizer training...\")\ndataset = load_dataset(\"HuggingFaceFW/fineweb-edu\", name=\"sample-10BT\", split='train', streaming=True)\n\ndef get_training_corpus():\n for i, example in enumerate(dataset):\n yield example[\"text\"]\n if i >= 50_000:\n break\n\nprint(\"Training tokenizer (4096 vocab)...\")\ntokenizer = ByteLevelBPETokenizer()\ntokenizer.train_from_iterator(\n get_training_corpus(),\n vocab_size=4096,\n min_frequency=2,\n special_tokens=[\"<|endoftext|>\"]\n)\n\nfast_tokenizer = PreTrainedTokenizerFast(\n tokenizer_object=tokenizer,\n bos_token=\"<|endoftext|>\",\n eos_token=\"<|endoftext|>\",\n unk_token=\"<|endoftext|>\",\n pad_token=\"<|endoftext|>\"\n)\n\nos.makedirs(\"spark_tokenizer\", exist_ok=True)\nfast_tokenizer.save_pretrained(\"spark_tokenizer\")\nprint(\"Tokenizer saved in 'spark_tokenizer'!\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T08:38:35.756835Z","iopub.execute_input":"2026-04-26T08:38:35.757137Z","iopub.status.idle":"2026-04-26T08:38:35.763094Z","shell.execute_reply.started":"2026-04-26T08:38:35.757110Z","shell.execute_reply":"2026-04-26T08:38:35.762397Z"}},"outputs":[{"name":"stdout","text":"Writing train_tokenizer.py\n","output_type":"stream"}],"execution_count":3},{"cell_type":"code","source":"!python3 train_tokenizer.py","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T08:38:37.912612Z","iopub.execute_input":"2026-04-26T08:38:37.913465Z","iopub.status.idle":"2026-04-26T08:39:53.269430Z","shell.execute_reply.started":"2026-04-26T08:38:37.913416Z","shell.execute_reply":"2026-04-26T08:39:53.268694Z"}},"outputs":[{"name":"stdout","text":"Lade Datensatz für Tokenizer-Training...\nREADME.md: 26.4kB [00:00, 51.7MB/s]\nWarning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\nResolving data files: 100%|██████████████| 2410/2410 [00:00<00:00, 30662.35it/s]\nTrainiere Tokenizer (4096 Vocab)...\n\u001b[2K[00:00:00] Tokenize words ██████████████████ 585540 / 585540[00:00:00] Tokenize words ██████████████████ 0 / 0\n\u001b[2K[00:00:00] Count pairs ██████████████████ 585540 / 585540\n\u001b[2K[00:00:03] Compute merges ██████████████████ 3839 / 3839\nTokenizer in 'spark_tokenizer' gespeichert!\n","output_type":"stream"}],"execution_count":4},{"cell_type":"code","source":"%%writefile prep_data.py\nfrom datasets import load_dataset\nfrom transformers import PreTrainedTokenizerFast\nimport os\n\ntokenizer = PreTrainedTokenizerFast.from_pretrained(\"spark_tokenizer\")\nblock_size = 512\n\nprint(\"Loading FineWeb-Edu (500000 Samples)...\")\ndataset = load_dataset(\"HuggingFaceFW/fineweb-edu\", name=\"sample-10BT\", split='train[:500000]')\n\ndef tokenize_function(examples):\n return tokenizer(examples[\"text\"])\n\nprint(\"Tokenizing dataset...\")\ncols_to_remove = dataset.column_names\ntokenized_datasets = dataset.map(\n tokenize_function, \n batched=True, \n num_proc=4, \n remove_columns=cols_to_remove\n)\n\ndef group_texts(examples):\n concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}\n total_length = len(concatenated_examples[list(examples.keys())[0]])\n total_length = (total_length // block_size) * block_size\n result = {\n k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n for k, t in concatenated_examples.items()\n }\n result[\"labels\"] = result[\"input_ids\"].copy()\n return result\n\nprint(\"Grouping into blocks (512)...\")\nlm_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=4)\n\ntotal_tokens = len(lm_datasets) * block_size\ntokens_in_mrd = total_tokens / 1_000_000_000\n\nprint(\"-\" * 40)\nprint(f\"✅ Token Count: {tokens_in_mrd:.6f} Mrd. ({total_tokens:,} Tokens)\")\nprint(\"-\" * 40)\n\nlm_datasets.save_to_disk(\"spark_v4_data\")\nprint(\"Training data saved in 'spark_v4_data' successfully!\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T08:40:06.565748Z","iopub.execute_input":"2026-04-26T08:40:06.566389Z","iopub.status.idle":"2026-04-26T08:40:06.572644Z","shell.execute_reply.started":"2026-04-26T08:40:06.566354Z","shell.execute_reply":"2026-04-26T08:40:06.571875Z"}},"outputs":[{"name":"stdout","text":"Writing prep_data.py\n","output_type":"stream"}],"execution_count":5},{"cell_type":"code","source":"!python3 prep_data.py","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T08:40:09.124594Z","iopub.execute_input":"2026-04-26T08:40:09.125422Z","iopub.status.idle":"2026-04-26T09:27:31.711360Z","shell.execute_reply.started":"2026-04-26T08:40:09.125383Z","shell.execute_reply":"2026-04-26T09:27:31.710512Z"}},"outputs":[{"name":"stdout","text":"Lade FineWeb-Edu (500.000 Beispiele für den Data-Scaling-Effekt!)...\nWarning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\nResolving data files: 100%|██████████████| 2410/2410 [00:00<00:00, 30659.56it/s]\nsample/10BT/000_00000.parquet: 22%|█▋ | 469M/2.15G [00:08<00:31, 53.4MB/s]\nsample/10BT/001_00000.parquet: 37%|██▉ | 805M/2.15G [00:08<00:14, 94.3MB/s]\nsample/10BT/002_00000.parquet: 37%|██▉ | 805M/2.15G [00:08<00:13, 96.5MB/s]\nsample/10BT/003_00000.parquet: 41%|███▏ | 872M/2.15G [00:08<00:13, 97.2MB/s]\nsample/10BT/004_00000.parquet: 31%|██▍ | 670M/2.15G [00:08<00:19, 75.4MB/s]\nsample/10BT/005_00000.parquet: 22%|█▋ | 469M/2.15G [00:08<00:31, 53.3MB/s]\nsample/10BT/006_00000.parquet: 78%|██████▏ | 1.68G/2.15G [00:14<00:04, 116MB/s]\nsample/10BT/007_00000.parquet: 36%|██▊ | 768M/2.15G [00:09<00:16, 84.9MB/s]\nsample/10BT/008_00000.parquet: 48%|███▊ | 1.02G/2.15G [00:08<00:09, 114MB/s]\nsample/10BT/009_00000.parquet: 28%|██▏ | 604M/2.15G [00:09<00:23, 65.0MB/s]\nsample/10BT/010_00000.parquet: 36%|██▊ | 768M/2.15G [00:09<00:16, 84.2MB/s]\nsample/10BT/011_00000.parquet: 41%|███▏ | 872M/2.15G [00:08<00:13, 97.1MB/s]\nsample/10BT/012_00000.parquet: 44%|███▉ | 939M/2.15G [00:09<00:11, 103MB/s]\nsample/10BT/013_00000.parquet: 50%|████▍ | 268M/541M [00:03<00:03, 85.1MB/s]\nGenerating train split: 100%|█| 9672101/9672101 [04:37<00:00, 34909.16 examples/\nTokenisiere Datensatz (Multiprozessing aktiviert)...\nMap (num_proc=4): 100%|█████████| 500000/500000 [10:19<00:00, 806.64 examples/s]\nGruppiere in Blöcke (512)...\nMap (num_proc=4): 100%|█████████| 500000/500000 [29:47<00:00, 279.74 examples/s]\n----------------------------------------\n✅ Token Count: 0.693131 Mrd. (693,130,752 Tokens)\n----------------------------------------\nSaving the dataset (19/19 shards): 100%|█| 1353771/1353771 [00:18<00:00, 71702.0\nTrainingsdaten in 'spark_v4_data' gespeichert!\n","output_type":"stream"}],"execution_count":6},{"cell_type":"code","source":"%%writefile train_model.py\nfrom transformers import LlamaConfig, LlamaForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling\nfrom transformers import PreTrainedTokenizerFast\nfrom datasets import load_from_disk\nimport torch\n\ntokenizer = PreTrainedTokenizerFast.from_pretrained(\"spark_tokenizer\")\ndataset = load_from_disk(\"spark_v4_data\")\n\nsplit_dataset = dataset.train_test_split(test_size=0.05, seed=42)\ntrain_dataset = split_dataset[\"train\"]\neval_dataset = split_dataset[\"test\"]\n\nconfig = LlamaConfig(\n vocab_size=len(tokenizer),\n hidden_size=256,\n intermediate_size=512,\n num_hidden_layers=6,\n num_attention_heads=8,\n max_position_embeddings=512,\n tie_word_embeddings=True, \n bos_token_id=tokenizer.bos_token_id,\n eos_token_id=tokenizer.eos_token_id,\n)\n\nmodel = LlamaForCausalLM(config)\nprint(f\"Model parameters: {model.num_parameters() / 1e6:.2f}M\")\n\ntraining_args = TrainingArguments(\n output_dir=\"./spark_v4_out\",\n eval_strategy=\"steps\",\n eval_steps=1500,\n logging_steps=100,\n save_steps=3000,\n learning_rate=1e-3,\n weight_decay=0.1,\n per_device_train_batch_size=128,\n per_device_eval_batch_size=128,\n max_steps=15000,\n lr_scheduler_type=\"cosine\",\n warmup_steps=1000,\n fp16=True, \n report_to=\"none\",\n optim=\"adamw_torch_fused\",\n dataloader_num_workers=4,\n dataloader_pin_memory=True,\n)\n\ndata_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=train_dataset,\n eval_dataset=eval_dataset,\n data_collator=data_collator,\n)\n\nprint(\"Starting v4 training...\")\ntrainer.train()\n\nmodel.save_pretrained(\"./spark_v4_final\")\ntokenizer.save_pretrained(\"./spark_v4_final\")\nprint(\"Training finished and v4 model saved successfully!\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T09:30:37.346192Z","iopub.execute_input":"2026-04-26T09:30:37.346497Z","iopub.status.idle":"2026-04-26T09:30:37.353238Z","shell.execute_reply.started":"2026-04-26T09:30:37.346471Z","shell.execute_reply":"2026-04-26T09:30:37.352371Z"}},"outputs":[{"name":"stdout","text":"Overwriting train_model.py\n","output_type":"stream"}],"execution_count":9},{"cell_type":"code","source":"!python3 train_model.py","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2026-04-26T09:30:39.118288Z","iopub.execute_input":"2026-04-26T09:30:39.119024Z"}},"outputs":[{"name":"stdout","text":"Loading dataset from disk: 100%|████████████████| 19/19 [00:01<00:00, 10.23it/s]\nModel parameters: 4.98M\nStarting v4 training...\n{'loss': '7.792', 'grad_norm': '0.7288', 'learning_rate': '9.9e-05', 'epoch': '0.0199'}\n{'loss': '6.79', 'grad_norm': '0.6882', 'learning_rate': '0.000199', 'epoch': '0.03981'}\n{'loss': '5.921', 'grad_norm': '1.194', 'learning_rate': '0.000299', 'epoch': '0.05971'}\n{'loss': '5.244', 'grad_norm': '1.192', 'learning_rate': '0.000399', 'epoch': '0.07962'}\n{'loss': '4.753', 'grad_norm': '1.035', 'learning_rate': '0.000499', 'epoch': '0.09952'}\n{'loss': '4.447', 'grad_norm': '0.9472', 'learning_rate': '0.000599', 'epoch': '0.1194'}\n{'loss': '4.254', 'grad_norm': '0.7994', 'learning_rate': '0.000699', 'epoch': '0.1393'}\n{'loss': '4.118', 'grad_norm': '1.023', 'learning_rate': '0.000799', 'epoch': '0.1592'}\n{'loss': '4.027', 'grad_norm': '0.6585', 'learning_rate': '0.000899', 'epoch': '0.1791'}\n{'loss': '3.937', 'grad_norm': '0.5435', 'learning_rate': '0.000999', 'epoch': '0.199'}\n{'loss': '3.845', 'grad_norm': '0.7014', 'learning_rate': '0.0009999', 'epoch': '0.2189'}\n{'loss': '3.757', 'grad_norm': '0.6224', 'learning_rate': '0.0009995', 'epoch': '0.2389'}\n{'loss': '3.698', 'grad_norm': '0.608', 'learning_rate': '0.0009989', 'epoch': '0.2588'}\n{'loss': '3.656', 'grad_norm': '0.5573', 'learning_rate': '0.000998', 'epoch': '0.2787'}\n{'loss': '3.621', 'grad_norm': '0.5656', 'learning_rate': '0.0009969', 'epoch': '0.2986'}\n 10%|███▌ | 1500/15000 [31:27<4:43:51, 1.26s/it]\n 0%| | 0/265 [00:00