File size: 6,602 Bytes
10ff0db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""Apply all recent fixes to the training notebook."""
import json

nb = json.load(open("notebooks/financial_doc_training.ipynb", "r", encoding="utf-8"))

# === Fix 1: Step 1 - Auto-detect paths + reduced VRAM settings ===
# Replacement source for the notebook's "Step 1" configuration cell, in
# nbformat style: one list element per cell line, each keeping its trailing
# "\n" except the last. The cell loads HF credentials from Kaggle secrets,
# auto-discovers the input dataset paths under /kaggle/input/, and defines
# T4-friendly model/training constants. Data literal — do not reformat:
# every byte here is written verbatim into the notebook.
step1_source = [
    "import os\n",
    "import json\n",
    "import glob\n",
    "import random\n",
    "import hashlib\n",
    "import torch\n",
    "\n",
    "# ============================================\n",
    "# Credentials from Kaggle Secrets\n",
    "# ============================================\n",
    "from kaggle_secrets import UserSecretsClient\n",
    "secrets = UserSecretsClient()\n",
    "HF_USERNAME = secrets.get_secret(\"HF_USERNAME\")\n",
    "HF_TOKEN = secrets.get_secret(\"HF_TOKEN\")\n",
    "print(f\"βœ… Credentials loaded for {HF_USERNAME}\")\n",
    "\n",
    "# ============================================\n",
    "# Auto-detect Data Paths\n",
    "# ============================================\n",
    "print(\"\\nπŸ“ Available input data:\")\n",
    "for item in os.listdir(\"/kaggle/input/\"):\n",
    "    subpath = f\"/kaggle/input/{item}\"\n",
    "    print(f\"  /kaggle/input/{item}/\")\n",
    "    if os.path.isdir(subpath):\n",
    "        for sub in os.listdir(subpath)[:5]:\n",
    "            print(f\"    └── {sub}\")\n",
    "\n",
    "# Find synthetic data\n",
    "SYNTHETIC_PATH = None\n",
    "for folder in os.listdir(\"/kaggle/input/\"):\n",
    "    candidate = f\"/kaggle/input/{folder}/with_anomalies.jsonl\"\n",
    "    if os.path.exists(candidate):\n",
    "        SYNTHETIC_PATH = candidate\n",
    "        break\n",
    "if SYNTHETIC_PATH:\n",
    "    print(f\"\\nβœ… Synthetic data: {SYNTHETIC_PATH}\")\n",
    "else:\n",
    "    print(\"\\n❌ Synthetic data NOT found!\")\n",
    "\n",
    "# Find SROIE data\n",
    "SROIE_PATH = None\n",
    "for folder in os.listdir(\"/kaggle/input/\"):\n",
    "    for root, dirs, files in os.walk(f\"/kaggle/input/{folder}\"):\n",
    "        if 'train' in dirs and 'test' in dirs and 'sroie' in folder.lower():\n",
    "            SROIE_PATH = root\n",
    "            break\n",
    "    if SROIE_PATH:\n",
    "        break\n",
    "if SROIE_PATH:\n",
    "    print(f\"βœ… SROIE data: {SROIE_PATH}\")\n",
    "else:\n",
    "    print(\"⚠️  SROIE data not found (will skip)\")\n",
    "\n",
    "# ============================================\n",
    "# Model & Training Config (VRAM-optimized for T4)\n",
    "# ============================================\n",
    "MODEL_NAME = \"unsloth/Qwen2.5-7B-Instruct-bnb-4bit\"\n",
    "MAX_SEQ_LENGTH = 2048      # Reduced from 4096 to fit T4 VRAM\n",
    "LORA_RANK = 16\n",
    "LORA_ALPHA = 32\n",
    "BATCH_SIZE = 1             # Reduced from 2 to avoid OOM\n",
    "GRAD_ACCUM = 8             # Increased from 4 to keep effective batch = 8\n",
    "LEARNING_RATE = 2e-4\n",
    "NUM_EPOCHS = 3\n",
    "\n",
    "# Output paths\n",
    "MERGED_JSONL = \"/kaggle/working/merged_raw.jsonl\"\n",
    "TRAIN_JSONL = \"/kaggle/working/train.jsonl\"\n",
    "TEST_JSONL = \"/kaggle/working/test.jsonl\"\n",
    "GROUND_TRUTH = \"/kaggle/working/ground_truth.json\"\n",
    "\n",
    "print(f\"\\nGPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}\")\n",
    "print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory/1024**3:.1f} GB\" if torch.cuda.is_available() else \"No GPU\")\n",
    "print(f\"Model: {MODEL_NAME}\")\n",
    "print(f\"Seq Length: {MAX_SEQ_LENGTH}, Batch: {BATCH_SIZE}, Grad Accum: {GRAD_ACCUM}\")\n",
    "print(f\"LoRA Rank: {LORA_RANK}, Alpha: {LORA_ALPHA}\")"
]

# === Apply Fix 1: replace the Step 1 cell ===
# The Step 1 cell is assumed to sit at index 3 (after 2 markdown cells and
# 1 install code cell). Validate that assumption before overwriting: blindly
# assigning into index 3 would silently clobber a markdown cell if the
# notebook layout ever shifts.
_step1_cell = nb["cells"][3]
if _step1_cell.get("cell_type") != "code":
    raise ValueError(
        f"Expected a code cell at index 3, found {_step1_cell.get('cell_type')!r}; "
        "notebook layout may have changed."
    )
_step1_cell["source"] = step1_source

# === Fix 2: Training cell β€” OOM fixes ===
# Replacement source for the notebook's SFTTrainer cell, in nbformat style
# (one element per line, trailing "\n" on all but the last). Relies on names
# defined by earlier notebook cells (model, tokenizer, dataset, and the
# config constants from Step 1). Data literal — do not reformat: every byte
# here is written verbatim into the notebook.
training_source = [
    "from trl import SFTTrainer\n",
    "from transformers import TrainingArguments\n",
    "\n",
    "# Clear GPU cache before training\n",
    "torch.cuda.empty_cache()\n",
    "\n",
    "trainer = SFTTrainer(\n",
    "    model=model,\n",
    "    tokenizer=tokenizer,\n",
    "    train_dataset=dataset,\n",
    "    dataset_text_field=\"text\",\n",
    "    max_seq_length=MAX_SEQ_LENGTH,\n",
    "    dataset_num_proc=2,\n",
    "    packing=False,\n",
    "    args=TrainingArguments(\n",
    "        per_device_train_batch_size=BATCH_SIZE,\n",
    "        gradient_accumulation_steps=GRAD_ACCUM,\n",
    "        warmup_steps=10,\n",
    "        num_train_epochs=NUM_EPOCHS,\n",
    "        learning_rate=LEARNING_RATE,\n",
    "        fp16=not torch.cuda.is_bf16_supported(),\n",
    "        bf16=torch.cuda.is_bf16_supported(),\n",
    "        logging_steps=5,\n",
    "        optim=\"adamw_8bit\",\n",
    "        weight_decay=0.01,\n",
    "        lr_scheduler_type=\"cosine\",\n",
    "        seed=42,\n",
    "        output_dir=\"./outputs\",\n",
    "        save_strategy=\"epoch\",\n",
    "        report_to=\"none\",\n",
    "    ),\n",
    ")\n",
    "\n",
    "print(f\"Starting training...\")\n",
    "print(f\"  Epochs: {NUM_EPOCHS}\")\n",
    "print(f\"  Effective batch size: {BATCH_SIZE * GRAD_ACCUM}\")\n",
    "print(f\"  Learning rate: {LEARNING_RATE}\")\n",
    "print(f\"  VRAM: {torch.cuda.memory_allocated()/1024**3:.1f} GB\\n\")\n",
    "\n",
    "stats = trainer.train()\n",
    "\n",
    "print(f\"\\nβœ… Training complete!\")\n",
    "print(f\"  Final loss: {stats.training_loss:.4f}\")\n",
    "print(f\"  Runtime: {stats.metrics['train_runtime']:.0f}s ({stats.metrics['train_runtime']/60:.1f} min)\")\n",
    "print(f\"  Samples/sec: {stats.metrics['train_samples_per_second']:.1f}\")"
]

# === Apply Fix 2: locate and replace the training cell ===
# Match by content (contains both "SFTTrainer" and "trainer.train()") rather
# than a fixed index, so the patch survives cells being inserted or removed
# earlier in the notebook.
for i, cell in enumerate(nb["cells"]):
    if cell["cell_type"] != "code":
        continue
    src_text = "".join(cell["source"])
    if "SFTTrainer" in src_text and "trainer.train()" in src_text:
        nb["cells"][i]["source"] = training_source
        print(f"Fixed training cell at index {i}")
        break
else:
    # Previously a missing match fell through silently and the script still
    # reported success; fail loudly before anything is written back.
    raise ValueError(
        "Training cell not found: no code cell contains both "
        "'SFTTrainer' and 'trainer.train()'."
    )

# Write the patched notebook back. A context manager guarantees the output is
# flushed and the handle closed (the original passed a bare open() to
# json.dump, leaving flush/close to the garbage collector).
with open("notebooks/financial_doc_training.ipynb", "w", encoding="utf-8") as fh:
    json.dump(nb, fh, indent=1, ensure_ascii=False)
print("βœ… All fixes applied to notebook!")
print("  - Auto-detect data paths (no hardcoded paths)")
print("  - total_mem -> total_memory (PyTorch fix)")
print("  - MAX_SEQ_LENGTH: 4096 -> 2048 (VRAM fix)")
print("  - BATCH_SIZE: 2 -> 1 (OOM fix)")
print("  - GRAD_ACCUM: 4 -> 8 (compensate batch)")
print("  - Added torch.cuda.empty_cache()")
print("  - Auto fp16/bf16 detection")