"""Apply all recent fixes to the training notebook.""" import json nb = json.load(open("notebooks/financial_doc_training.ipynb", "r", encoding="utf-8")) # === Fix 1: Step 1 - Auto-detect paths + reduced VRAM settings === step1_source = [ "import os\n", "import json\n", "import glob\n", "import random\n", "import hashlib\n", "import torch\n", "\n", "# ============================================\n", "# Credentials from Kaggle Secrets\n", "# ============================================\n", "from kaggle_secrets import UserSecretsClient\n", "secrets = UserSecretsClient()\n", "HF_USERNAME = secrets.get_secret(\"HF_USERNAME\")\n", "HF_TOKEN = secrets.get_secret(\"HF_TOKEN\")\n", "print(f\"āœ… Credentials loaded for {HF_USERNAME}\")\n", "\n", "# ============================================\n", "# Auto-detect Data Paths\n", "# ============================================\n", "print(\"\\nšŸ“ Available input data:\")\n", "for item in os.listdir(\"/kaggle/input/\"):\n", " subpath = f\"/kaggle/input/{item}\"\n", " print(f\" /kaggle/input/{item}/\")\n", " if os.path.isdir(subpath):\n", " for sub in os.listdir(subpath)[:5]:\n", " print(f\" └── {sub}\")\n", "\n", "# Find synthetic data\n", "SYNTHETIC_PATH = None\n", "for folder in os.listdir(\"/kaggle/input/\"):\n", " candidate = f\"/kaggle/input/{folder}/with_anomalies.jsonl\"\n", " if os.path.exists(candidate):\n", " SYNTHETIC_PATH = candidate\n", " break\n", "if SYNTHETIC_PATH:\n", " print(f\"\\nāœ… Synthetic data: {SYNTHETIC_PATH}\")\n", "else:\n", " print(\"\\nāŒ Synthetic data NOT found!\")\n", "\n", "# Find SROIE data\n", "SROIE_PATH = None\n", "for folder in os.listdir(\"/kaggle/input/\"):\n", " for root, dirs, files in os.walk(f\"/kaggle/input/{folder}\"):\n", " if 'train' in dirs and 'test' in dirs and 'sroie' in folder.lower():\n", " SROIE_PATH = root\n", " break\n", " if SROIE_PATH:\n", " break\n", "if SROIE_PATH:\n", " print(f\"āœ… SROIE data: {SROIE_PATH}\")\n", "else:\n", " print(\"āš ļø SROIE data not found (will skip)\")\n", "\n", "# ============================================\n", "# Model & Training Config (VRAM-optimized for T4)\n", "# ============================================\n", "MODEL_NAME = \"unsloth/Qwen2.5-7B-Instruct-bnb-4bit\"\n", "MAX_SEQ_LENGTH = 2048 # Reduced from 4096 to fit T4 VRAM\n", "LORA_RANK = 16\n", "LORA_ALPHA = 32\n", "BATCH_SIZE = 1 # Reduced from 2 to avoid OOM\n", "GRAD_ACCUM = 8 # Increased from 4 to keep effective batch = 8\n", "LEARNING_RATE = 2e-4\n", "NUM_EPOCHS = 3\n", "\n", "# Output paths\n", "MERGED_JSONL = \"/kaggle/working/merged_raw.jsonl\"\n", "TRAIN_JSONL = \"/kaggle/working/train.jsonl\"\n", "TEST_JSONL = \"/kaggle/working/test.jsonl\"\n", "GROUND_TRUTH = \"/kaggle/working/ground_truth.json\"\n", "\n", "print(f\"\\nGPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}\")\n", "print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory/1024**3:.1f} GB\" if torch.cuda.is_available() else \"No GPU\")\n", "print(f\"Model: {MODEL_NAME}\")\n", "print(f\"Seq Length: {MAX_SEQ_LENGTH}, Batch: {BATCH_SIZE}, Grad Accum: {GRAD_ACCUM}\")\n", "print(f\"LoRA Rank: {LORA_RANK}, Alpha: {LORA_ALPHA}\")" ] # Find Step 1 cell (cell index 3 — after 2 markdown + 1 install code) nb["cells"][3]["source"] = step1_source # === Fix 2: Training cell — OOM fixes === training_source = [ "from trl import SFTTrainer\n", "from transformers import TrainingArguments\n", "\n", "# Clear GPU cache before training\n", "torch.cuda.empty_cache()\n", "\n", "trainer = SFTTrainer(\n", " model=model,\n", " tokenizer=tokenizer,\n", " train_dataset=dataset,\n", " dataset_text_field=\"text\",\n", " max_seq_length=MAX_SEQ_LENGTH,\n", " dataset_num_proc=2,\n", " packing=False,\n", " args=TrainingArguments(\n", " per_device_train_batch_size=BATCH_SIZE,\n", " gradient_accumulation_steps=GRAD_ACCUM,\n", " warmup_steps=10,\n", " num_train_epochs=NUM_EPOCHS,\n", " learning_rate=LEARNING_RATE,\n", " fp16=not torch.cuda.is_bf16_supported(),\n", " bf16=torch.cuda.is_bf16_supported(),\n", " logging_steps=5,\n", " optim=\"adamw_8bit\",\n", " weight_decay=0.01,\n", " lr_scheduler_type=\"cosine\",\n", " seed=42,\n", " output_dir=\"./outputs\",\n", " save_strategy=\"epoch\",\n", " report_to=\"none\",\n", " ),\n", ")\n", "\n", "print(f\"Starting training...\")\n", "print(f\" Epochs: {NUM_EPOCHS}\")\n", "print(f\" Effective batch size: {BATCH_SIZE * GRAD_ACCUM}\")\n", "print(f\" Learning rate: {LEARNING_RATE}\")\n", "print(f\" VRAM: {torch.cuda.memory_allocated()/1024**3:.1f} GB\\n\")\n", "\n", "stats = trainer.train()\n", "\n", "print(f\"\\nāœ… Training complete!\")\n", "print(f\" Final loss: {stats.training_loss:.4f}\")\n", "print(f\" Runtime: {stats.metrics['train_runtime']:.0f}s ({stats.metrics['train_runtime']/60:.1f} min)\")\n", "print(f\" Samples/sec: {stats.metrics['train_samples_per_second']:.1f}\")" ] # Find the training cell — it's the one after "Step 5: Load Training Data & Train" # That's cell index 9 (after markdown headers and data cells) for i, cell in enumerate(nb["cells"]): if cell["cell_type"] == "code": src_text = "".join(cell["source"]) if "SFTTrainer" in src_text and "trainer.train()" in src_text: nb["cells"][i]["source"] = training_source print(f"Fixed training cell at index {i}") break json.dump(nb, open("notebooks/financial_doc_training.ipynb", "w", encoding="utf-8"), indent=1, ensure_ascii=False) print("āœ… All fixes applied to notebook!") print(" - Auto-detect data paths (no hardcoded paths)") print(" - total_mem -> total_memory (PyTorch fix)") print(" - MAX_SEQ_LENGTH: 4096 -> 2048 (VRAM fix)") print(" - BATCH_SIZE: 2 -> 1 (OOM fix)") print(" - GRAD_ACCUM: 4 -> 8 (compensate batch)") print(" - Added torch.cuda.empty_cache()") print(" - Auto fp16/bf16 detection")