"""Apply all recent fixes to the training notebook."""
import json

# Load the notebook JSON. Use a context manager so the file handle is
# closed deterministically (the original `json.load(open(...))` leaked it).
NOTEBOOK_PATH = "notebooks/financial_doc_training.ipynb"
with open(NOTEBOOK_PATH, "r", encoding="utf-8") as _nb_file:
    nb = json.load(_nb_file)
# === Fix 1: Step 1 - Auto-detect paths + reduced VRAM settings ===
# Replacement source for the notebook's config cell. Each element is one
# physical line of the cell (nbformat "source" is a list of strings).
step1_source = [
    "import os\n",
    "import json\n",
    "import glob\n",
    "import random\n",
    "import hashlib\n",
    "import torch\n",
    "\n",
    "# ============================================\n",
    "# Credentials from Kaggle Secrets\n",
    "# ============================================\n",
    "from kaggle_secrets import UserSecretsClient\n",
    "secrets = UserSecretsClient()\n",
    "HF_USERNAME = secrets.get_secret(\"HF_USERNAME\")\n",
    "HF_TOKEN = secrets.get_secret(\"HF_TOKEN\")\n",
    "print(f\"\u2705 Credentials loaded for {HF_USERNAME}\")\n",
    "\n",
    "# ============================================\n",
    "# Auto-detect Data Paths\n",
    "# ============================================\n",
    "print(\"\\n\U0001F50D Available input data:\")\n",
    "for item in os.listdir(\"/kaggle/input/\"):\n",
    "    subpath = f\"/kaggle/input/{item}\"\n",
    "    print(f\"  /kaggle/input/{item}/\")\n",
    "    if os.path.isdir(subpath):\n",
    "        for sub in os.listdir(subpath)[:5]:\n",
    "            print(f\"    \u2514\u2500\u2500 {sub}\")\n",
    "\n",
    "# Find synthetic data\n",
    "SYNTHETIC_PATH = None\n",
    "for folder in os.listdir(\"/kaggle/input/\"):\n",
    "    candidate = f\"/kaggle/input/{folder}/with_anomalies.jsonl\"\n",
    "    if os.path.exists(candidate):\n",
    "        SYNTHETIC_PATH = candidate\n",
    "        break\n",
    "if SYNTHETIC_PATH:\n",
    "    print(f\"\\n\u2705 Synthetic data: {SYNTHETIC_PATH}\")\n",
    "else:\n",
    "    print(\"\\n\u274C Synthetic data NOT found!\")\n",
    "\n",
    "# Find SROIE data\n",
    "SROIE_PATH = None\n",
    "for folder in os.listdir(\"/kaggle/input/\"):\n",
    "    for root, dirs, files in os.walk(f\"/kaggle/input/{folder}\"):\n",
    "        if 'train' in dirs and 'test' in dirs and 'sroie' in folder.lower():\n",
    "            SROIE_PATH = root\n",
    "            break\n",
    "    if SROIE_PATH:\n",
    "        break\n",
    "if SROIE_PATH:\n",
    "    print(f\"\u2705 SROIE data: {SROIE_PATH}\")\n",
    "else:\n",
    "    print(\"\u26A0\uFE0F SROIE data not found (will skip)\")\n",
    "\n",
    "# ============================================\n",
    "# Model & Training Config (VRAM-optimized for T4)\n",
    "# ============================================\n",
    "MODEL_NAME = \"unsloth/Qwen2.5-7B-Instruct-bnb-4bit\"\n",
    "MAX_SEQ_LENGTH = 2048  # Reduced from 4096 to fit T4 VRAM\n",
    "LORA_RANK = 16\n",
    "LORA_ALPHA = 32\n",
    "BATCH_SIZE = 1  # Reduced from 2 to avoid OOM\n",
    "GRAD_ACCUM = 8  # Increased from 4 to keep effective batch = 8\n",
    "LEARNING_RATE = 2e-4\n",
    "NUM_EPOCHS = 3\n",
    "\n",
    "# Output paths\n",
    "MERGED_JSONL = \"/kaggle/working/merged_raw.jsonl\"\n",
    "TRAIN_JSONL = \"/kaggle/working/train.jsonl\"\n",
    "TEST_JSONL = \"/kaggle/working/test.jsonl\"\n",
    "GROUND_TRUTH = \"/kaggle/working/ground_truth.json\"\n",
    "\n",
    "print(f\"\\nGPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}\")\n",
    "print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory/1024**3:.1f} GB\" if torch.cuda.is_available() else \"No GPU\")\n",
    "print(f\"Model: {MODEL_NAME}\")\n",
    "print(f\"Seq Length: {MAX_SEQ_LENGTH}, Batch: {BATCH_SIZE}, Grad Accum: {GRAD_ACCUM}\")\n",
    "print(f\"LoRA Rank: {LORA_RANK}, Alpha: {LORA_ALPHA}\")",
]

# Replace the Step 1 cell. NOTE(review): this assumes the config cell is at
# index 3 (2 markdown cells + 1 install code cell before it) — fragile if the
# notebook layout changes; the training cell below is found by content search
# instead. Confirm index 3 is still correct against the notebook.
nb["cells"][3]["source"] = step1_source
# === Fix 2: Training cell - OOM fixes ===
# Replacement source for the SFTTrainer cell: batch size / grad-accum come
# from the Step 1 config, GPU cache is cleared up front, and fp16/bf16 is
# chosen automatically from hardware capability.
training_source = [
    "from trl import SFTTrainer\n",
    "from transformers import TrainingArguments\n",
    "\n",
    "# Clear GPU cache before training\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "trainer = SFTTrainer(\n",
    "    model=model,\n",
    "    tokenizer=tokenizer,\n",
    "    train_dataset=dataset,\n",
    "    dataset_text_field=\"text\",\n",
    "    max_seq_length=MAX_SEQ_LENGTH,\n",
    "    dataset_num_proc=2,\n",
    "    packing=False,\n",
    "    args=TrainingArguments(\n",
    "        per_device_train_batch_size=BATCH_SIZE,\n",
    "        gradient_accumulation_steps=GRAD_ACCUM,\n",
    "        warmup_steps=10,\n",
    "        num_train_epochs=NUM_EPOCHS,\n",
    "        learning_rate=LEARNING_RATE,\n",
    "        fp16=not torch.cuda.is_bf16_supported(),\n",
    "        bf16=torch.cuda.is_bf16_supported(),\n",
    "        logging_steps=5,\n",
    "        optim=\"adamw_8bit\",\n",
    "        weight_decay=0.01,\n",
    "        lr_scheduler_type=\"cosine\",\n",
    "        seed=42,\n",
    "        output_dir=\"./outputs\",\n",
    "        save_strategy=\"epoch\",\n",
    "        report_to=\"none\",\n",
    "    ),\n",
    ")\n",
    "\n",
    "print(f\"Starting training...\")\n",
    "print(f\"  Epochs: {NUM_EPOCHS}\")\n",
    "print(f\"  Effective batch size: {BATCH_SIZE * GRAD_ACCUM}\")\n",
    "print(f\"  Learning rate: {LEARNING_RATE}\")\n",
    "print(f\"  VRAM: {torch.cuda.memory_allocated()/1024**3:.1f} GB\\n\")\n",
    "\n",
    "stats = trainer.train()\n",
    "\n",
    "print(f\"\\n\u2705 Training complete!\")\n",
    "print(f\"  Final loss: {stats.training_loss:.4f}\")\n",
    "print(f\"  Runtime: {stats.metrics['train_runtime']:.0f}s ({stats.metrics['train_runtime']/60:.1f} min)\")\n",
    "print(f\"  Samples/sec: {stats.metrics['train_samples_per_second']:.1f}\")",
]
# Find the training cell by content (not by index): it is the code cell
# that builds an SFTTrainer and calls trainer.train().
for i, cell in enumerate(nb["cells"]):
    if cell["cell_type"] != "code":
        continue
    src_text = "".join(cell["source"])
    if "SFTTrainer" in src_text and "trainer.train()" in src_text:
        nb["cells"][i]["source"] = training_source
        print(f"Fixed training cell at index {i}")
        break
else:
    # Previously this failed silently; make the no-match case visible.
    print("WARNING: training cell not found - no cell was replaced")
# Write the patched notebook back. Use a context manager so the file is
# flushed and closed deterministically (the original `json.dump(nb, open(...))`
# left the handle open, risking a truncated notebook on interpreter exit).
with open("notebooks/financial_doc_training.ipynb", "w", encoding="utf-8") as out:
    json.dump(nb, out, indent=1, ensure_ascii=False)

print("\u2705 All fixes applied to notebook!")
print("  - Auto-detect data paths (no hardcoded paths)")
print("  - total_mem -> total_memory (PyTorch fix)")
print("  - MAX_SEQ_LENGTH: 4096 -> 2048 (VRAM fix)")
print("  - BATCH_SIZE: 2 -> 1 (OOM fix)")
print("  - GRAD_ACCUM: 4 -> 8 (compensate batch)")
print("  - Added torch.cuda.empty_cache()")
print("  - Auto fp16/bf16 detection")
|