| """Apply all recent fixes to the training notebook.""" |
| import json |
|
|
| nb = json.load(open("notebooks/financial_doc_training.ipynb", "r", encoding="utf-8")) |
|
|
| |
# Replacement source for the notebook's setup/config cell (credentials,
# data-path auto-detection, and T4-VRAM-tuned hyperparameters).
# NOTE(review): emoji markers below were reconstructed from mojibake in the
# original file (assumed ✅ / 🔍 / ❌ / ⚠️ / └──) — confirm they render as intended.
step1_source = [
    "import os\n",
    "import json\n",
    "import glob\n",
    "import random\n",
    "import hashlib\n",
    "import torch\n",
    "\n",
    "# ============================================\n",
    "# Credentials from Kaggle Secrets\n",
    "# ============================================\n",
    "from kaggle_secrets import UserSecretsClient\n",
    "secrets = UserSecretsClient()\n",
    "HF_USERNAME = secrets.get_secret(\"HF_USERNAME\")\n",
    "HF_TOKEN = secrets.get_secret(\"HF_TOKEN\")\n",
    "print(f\"✅ Credentials loaded for {HF_USERNAME}\")\n",
    "\n",
    "# ============================================\n",
    "# Auto-detect Data Paths\n",
    "# ============================================\n",
    "print(\"\\n🔍 Available input data:\")\n",
    "for item in os.listdir(\"/kaggle/input/\"):\n",
    "    subpath = f\"/kaggle/input/{item}\"\n",
    "    print(f\"  /kaggle/input/{item}/\")\n",
    "    if os.path.isdir(subpath):\n",
    "        for sub in os.listdir(subpath)[:5]:\n",
    "            print(f\"    └── {sub}\")\n",
    "\n",
    "# Find synthetic data\n",
    "SYNTHETIC_PATH = None\n",
    "for folder in os.listdir(\"/kaggle/input/\"):\n",
    "    candidate = f\"/kaggle/input/{folder}/with_anomalies.jsonl\"\n",
    "    if os.path.exists(candidate):\n",
    "        SYNTHETIC_PATH = candidate\n",
    "        break\n",
    "if SYNTHETIC_PATH:\n",
    "    print(f\"\\n✅ Synthetic data: {SYNTHETIC_PATH}\")\n",
    "else:\n",
    "    print(\"\\n❌ Synthetic data NOT found!\")\n",
    "\n",
    "# Find SROIE data\n",
    "SROIE_PATH = None\n",
    "for folder in os.listdir(\"/kaggle/input/\"):\n",
    "    for root, dirs, files in os.walk(f\"/kaggle/input/{folder}\"):\n",
    "        if 'train' in dirs and 'test' in dirs and 'sroie' in folder.lower():\n",
    "            SROIE_PATH = root\n",
    "            break\n",
    "    if SROIE_PATH:\n",
    "        break\n",
    "if SROIE_PATH:\n",
    "    print(f\"✅ SROIE data: {SROIE_PATH}\")\n",
    "else:\n",
    "    print(\"⚠️ SROIE data not found (will skip)\")\n",
    "\n",
    "# ============================================\n",
    "# Model & Training Config (VRAM-optimized for T4)\n",
    "# ============================================\n",
    "MODEL_NAME = \"unsloth/Qwen2.5-7B-Instruct-bnb-4bit\"\n",
    "MAX_SEQ_LENGTH = 2048 # Reduced from 4096 to fit T4 VRAM\n",
    "LORA_RANK = 16\n",
    "LORA_ALPHA = 32\n",
    "BATCH_SIZE = 1 # Reduced from 2 to avoid OOM\n",
    "GRAD_ACCUM = 8 # Increased from 4 to keep effective batch = 8\n",
    "LEARNING_RATE = 2e-4\n",
    "NUM_EPOCHS = 3\n",
    "\n",
    "# Output paths\n",
    "MERGED_JSONL = \"/kaggle/working/merged_raw.jsonl\"\n",
    "TRAIN_JSONL = \"/kaggle/working/train.jsonl\"\n",
    "TEST_JSONL = \"/kaggle/working/test.jsonl\"\n",
    "GROUND_TRUTH = \"/kaggle/working/ground_truth.json\"\n",
    "\n",
    "print(f\"\\nGPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}\")\n",
    "print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory/1024**3:.1f} GB\" if torch.cuda.is_available() else \"No GPU\")\n",
    "print(f\"Model: {MODEL_NAME}\")\n",
    "print(f\"Seq Length: {MAX_SEQ_LENGTH}, Batch: {BATCH_SIZE}, Grad Accum: {GRAD_ACCUM}\")\n",
    "print(f\"LoRA Rank: {LORA_RANK}, Alpha: {LORA_ALPHA}\")"
]
|
|
| |
| nb["cells"][3]["source"] = step1_source |
|
|
| |
# Replacement source for the notebook's training cell (TRL SFTTrainer run with
# VRAM-safe settings and auto fp16/bf16 selection).
# NOTE(review): the ✅ marker was reconstructed from mojibake in the original
# file — confirm it renders as intended.
training_source = [
    "from trl import SFTTrainer\n",
    "from transformers import TrainingArguments\n",
    "\n",
    "# Clear GPU cache before training\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "trainer = SFTTrainer(\n",
    "    model=model,\n",
    "    tokenizer=tokenizer,\n",
    "    train_dataset=dataset,\n",
    "    dataset_text_field=\"text\",\n",
    "    max_seq_length=MAX_SEQ_LENGTH,\n",
    "    dataset_num_proc=2,\n",
    "    packing=False,\n",
    "    args=TrainingArguments(\n",
    "        per_device_train_batch_size=BATCH_SIZE,\n",
    "        gradient_accumulation_steps=GRAD_ACCUM,\n",
    "        warmup_steps=10,\n",
    "        num_train_epochs=NUM_EPOCHS,\n",
    "        learning_rate=LEARNING_RATE,\n",
    "        fp16=not torch.cuda.is_bf16_supported(),\n",
    "        bf16=torch.cuda.is_bf16_supported(),\n",
    "        logging_steps=5,\n",
    "        optim=\"adamw_8bit\",\n",
    "        weight_decay=0.01,\n",
    "        lr_scheduler_type=\"cosine\",\n",
    "        seed=42,\n",
    "        output_dir=\"./outputs\",\n",
    "        save_strategy=\"epoch\",\n",
    "        report_to=\"none\",\n",
    "    ),\n",
    ")\n",
    "\n",
    "print(f\"Starting training...\")\n",
    "print(f\"  Epochs: {NUM_EPOCHS}\")\n",
    "print(f\"  Effective batch size: {BATCH_SIZE * GRAD_ACCUM}\")\n",
    "print(f\"  Learning rate: {LEARNING_RATE}\")\n",
    "print(f\"  VRAM: {torch.cuda.memory_allocated()/1024**3:.1f} GB\\n\")\n",
    "\n",
    "stats = trainer.train()\n",
    "\n",
    "print(f\"\\n✅ Training complete!\")\n",
    "print(f\"  Final loss: {stats.training_loss:.4f}\")\n",
    "print(f\"  Runtime: {stats.metrics['train_runtime']:.0f}s ({stats.metrics['train_runtime']/60:.1f} min)\")\n",
    "print(f\"  Samples/sec: {stats.metrics['train_samples_per_second']:.1f}\")"
]
|
|
| |
| |
# Locate the training cell — the first code cell that both constructs an
# SFTTrainer and calls trainer.train() — and swap in the corrected source.
for i, cell in enumerate(nb["cells"]):
    if cell["cell_type"] != "code":
        continue
    cell_text = "".join(cell["source"])
    if "SFTTrainer" not in cell_text or "trainer.train()" not in cell_text:
        continue
    nb["cells"][i]["source"] = training_source
    print(f"Fixed training cell at index {i}")
    break
|
|
# Persist the patched notebook. The original passed a bare open() to
# json.dump, which never explicitly closes (or reliably flushes) the handle;
# a context manager guarantees the write completes.
# NOTE(review): the ✅ marker was reconstructed from mojibake in the original
# file — confirm it renders as intended.
with open("notebooks/financial_doc_training.ipynb", "w", encoding="utf-8") as fh:
    json.dump(nb, fh, indent=1, ensure_ascii=False)

# Summarize the fixes that this script bakes into the notebook.
print("✅ All fixes applied to notebook!")
print(" - Auto-detect data paths (no hardcoded paths)")
print(" - total_mem -> total_memory (PyTorch fix)")
print(" - MAX_SEQ_LENGTH: 4096 -> 2048 (VRAM fix)")
print(" - BATCH_SIZE: 2 -> 1 (OOM fix)")
print(" - GRAD_ACCUM: 4 -> 8 (compensate batch)")
print(" - Added torch.cuda.empty_cache()")
print(" - Auto fp16/bf16 detection")
|
|