# Vaibuzzz's picture
# Upload folder using huggingface_hub
# 10ff0db verified
"""Apply all recent fixes to the training notebook."""
import json

# Load the notebook JSON. Use a context manager so the file handle is
# closed deterministically — the original `json.load(open(...))` relied on
# garbage collection to close the file.
with open("notebooks/financial_doc_training.ipynb", "r", encoding="utf-8") as f:
    nb = json.load(f)
# === Fix 1: Step 1 - Auto-detect paths + reduced VRAM settings ===
# Replacement "source" for the notebook's Step 1 configuration cell, in the
# nbformat convention of a list of newline-terminated strings. The new cell:
#   * loads HF_USERNAME / HF_TOKEN from Kaggle secrets,
#   * scans /kaggle/input/ to auto-detect the synthetic-data JSONL and the
#     SROIE dataset root (no hardcoded dataset paths),
#   * sets T4-friendly hyperparameters: MAX_SEQ_LENGTH 2048 (was 4096),
#     BATCH_SIZE 1 (was 2), GRAD_ACCUM 8 (effective batch stays 8),
#   * prints GPU name/VRAM using `total_memory` (fixes the earlier
#     `total_mem` attribute typo noted in the summary below).
# NOTE: these strings are written verbatim into the .ipynb — do not edit
# them casually; every byte ends up as executable notebook code.
step1_source = [
"import os\n",
"import json\n",
"import glob\n",
"import random\n",
"import hashlib\n",
"import torch\n",
"\n",
"# ============================================\n",
"# Credentials from Kaggle Secrets\n",
"# ============================================\n",
"from kaggle_secrets import UserSecretsClient\n",
"secrets = UserSecretsClient()\n",
"HF_USERNAME = secrets.get_secret(\"HF_USERNAME\")\n",
"HF_TOKEN = secrets.get_secret(\"HF_TOKEN\")\n",
"print(f\"βœ… Credentials loaded for {HF_USERNAME}\")\n",
"\n",
"# ============================================\n",
"# Auto-detect Data Paths\n",
"# ============================================\n",
"print(\"\\nπŸ“ Available input data:\")\n",
"for item in os.listdir(\"/kaggle/input/\"):\n",
" subpath = f\"/kaggle/input/{item}\"\n",
" print(f\" /kaggle/input/{item}/\")\n",
" if os.path.isdir(subpath):\n",
" for sub in os.listdir(subpath)[:5]:\n",
" print(f\" └── {sub}\")\n",
"\n",
"# Find synthetic data\n",
"SYNTHETIC_PATH = None\n",
"for folder in os.listdir(\"/kaggle/input/\"):\n",
" candidate = f\"/kaggle/input/{folder}/with_anomalies.jsonl\"\n",
" if os.path.exists(candidate):\n",
" SYNTHETIC_PATH = candidate\n",
" break\n",
"if SYNTHETIC_PATH:\n",
" print(f\"\\nβœ… Synthetic data: {SYNTHETIC_PATH}\")\n",
"else:\n",
" print(\"\\n❌ Synthetic data NOT found!\")\n",
"\n",
"# Find SROIE data\n",
"SROIE_PATH = None\n",
"for folder in os.listdir(\"/kaggle/input/\"):\n",
" for root, dirs, files in os.walk(f\"/kaggle/input/{folder}\"):\n",
" if 'train' in dirs and 'test' in dirs and 'sroie' in folder.lower():\n",
" SROIE_PATH = root\n",
" break\n",
" if SROIE_PATH:\n",
" break\n",
"if SROIE_PATH:\n",
" print(f\"βœ… SROIE data: {SROIE_PATH}\")\n",
"else:\n",
" print(\"⚠️ SROIE data not found (will skip)\")\n",
"\n",
"# ============================================\n",
"# Model & Training Config (VRAM-optimized for T4)\n",
"# ============================================\n",
"MODEL_NAME = \"unsloth/Qwen2.5-7B-Instruct-bnb-4bit\"\n",
"MAX_SEQ_LENGTH = 2048 # Reduced from 4096 to fit T4 VRAM\n",
"LORA_RANK = 16\n",
"LORA_ALPHA = 32\n",
"BATCH_SIZE = 1 # Reduced from 2 to avoid OOM\n",
"GRAD_ACCUM = 8 # Increased from 4 to keep effective batch = 8\n",
"LEARNING_RATE = 2e-4\n",
"NUM_EPOCHS = 3\n",
"\n",
"# Output paths\n",
"MERGED_JSONL = \"/kaggle/working/merged_raw.jsonl\"\n",
"TRAIN_JSONL = \"/kaggle/working/train.jsonl\"\n",
"TEST_JSONL = \"/kaggle/working/test.jsonl\"\n",
"GROUND_TRUTH = \"/kaggle/working/ground_truth.json\"\n",
"\n",
"print(f\"\\nGPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}\")\n",
"print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory/1024**3:.1f} GB\" if torch.cuda.is_available() else \"No GPU\")\n",
"print(f\"Model: {MODEL_NAME}\")\n",
"print(f\"Seq Length: {MAX_SEQ_LENGTH}, Batch: {BATCH_SIZE}, Grad Accum: {GRAD_ACCUM}\")\n",
"print(f\"LoRA Rank: {LORA_RANK}, Alpha: {LORA_ALPHA}\")"
]
# Find Step 1 cell (cell index 3 β€” after 2 markdown + 1 install code)
# Guard against notebook reordering: before, a shifted layout would have
# silently overwritten whatever cell happened to sit at index 3 (possibly
# a markdown cell). Fail loudly instead.
if nb["cells"][3]["cell_type"] != "code":
    raise RuntimeError(
        "Expected a code cell at index 3 for the Step 1 fix; "
        "notebook layout may have changed"
    )
nb["cells"][3]["source"] = step1_source
# === Fix 2: Training cell β€” OOM fixes ===
# Replacement "source" for the notebook's training cell (nbformat list of
# newline-terminated strings). Compared with the cell it replaces (per the
# summary prints at the bottom of this script) it:
#   * clears the CUDA cache before building the trainer,
#   * consumes the VRAM-reduced BATCH_SIZE / GRAD_ACCUM / MAX_SEQ_LENGTH
#     constants defined by the Step 1 cell above,
#   * auto-selects fp16 vs bf16 via torch.cuda.is_bf16_supported().
# NOTE: these strings are written verbatim into the .ipynb — every byte
# becomes executable notebook code; do not reformat them.
training_source = [
"from trl import SFTTrainer\n",
"from transformers import TrainingArguments\n",
"\n",
"# Clear GPU cache before training\n",
"torch.cuda.empty_cache()\n",
"\n",
"trainer = SFTTrainer(\n",
" model=model,\n",
" tokenizer=tokenizer,\n",
" train_dataset=dataset,\n",
" dataset_text_field=\"text\",\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
" dataset_num_proc=2,\n",
" packing=False,\n",
" args=TrainingArguments(\n",
" per_device_train_batch_size=BATCH_SIZE,\n",
" gradient_accumulation_steps=GRAD_ACCUM,\n",
" warmup_steps=10,\n",
" num_train_epochs=NUM_EPOCHS,\n",
" learning_rate=LEARNING_RATE,\n",
" fp16=not torch.cuda.is_bf16_supported(),\n",
" bf16=torch.cuda.is_bf16_supported(),\n",
" logging_steps=5,\n",
" optim=\"adamw_8bit\",\n",
" weight_decay=0.01,\n",
" lr_scheduler_type=\"cosine\",\n",
" seed=42,\n",
" output_dir=\"./outputs\",\n",
" save_strategy=\"epoch\",\n",
" report_to=\"none\",\n",
" ),\n",
")\n",
"\n",
"print(f\"Starting training...\")\n",
"print(f\" Epochs: {NUM_EPOCHS}\")\n",
"print(f\" Effective batch size: {BATCH_SIZE * GRAD_ACCUM}\")\n",
"print(f\" Learning rate: {LEARNING_RATE}\")\n",
"print(f\" VRAM: {torch.cuda.memory_allocated()/1024**3:.1f} GB\\n\")\n",
"\n",
"stats = trainer.train()\n",
"\n",
"print(f\"\\nβœ… Training complete!\")\n",
"print(f\" Final loss: {stats.training_loss:.4f}\")\n",
"print(f\" Runtime: {stats.metrics['train_runtime']:.0f}s ({stats.metrics['train_runtime']/60:.1f} min)\")\n",
"print(f\" Samples/sec: {stats.metrics['train_samples_per_second']:.1f}\")"
]
# Find the training cell by content rather than by index: it is the code
# cell that both constructs an SFTTrainer and calls trainer.train().
for i, cell in enumerate(nb["cells"]):
    if cell["cell_type"] != "code":
        continue
    src_text = "".join(cell["source"])
    if "SFTTrainer" in src_text and "trainer.train()" in src_text:
        nb["cells"][i]["source"] = training_source
        print(f"Fixed training cell at index {i}")
        break
else:
    # Previously a missing match was silently ignored and the script still
    # reported success below; at least surface the failure.
    print("WARNING: training cell not found β€” Fix 2 was NOT applied!")
# Write the patched notebook back. A context manager guarantees the handle
# is flushed and closed — the original `json.dump(nb, open(..., "w"))`
# never closed the file, risking a truncated notebook on abrupt exit.
with open("notebooks/financial_doc_training.ipynb", "w", encoding="utf-8") as f:
    json.dump(nb, f, indent=1, ensure_ascii=False)
print("βœ… All fixes applied to notebook!")
print(" - Auto-detect data paths (no hardcoded paths)")
print(" - total_mem -> total_memory (PyTorch fix)")
print(" - MAX_SEQ_LENGTH: 4096 -> 2048 (VRAM fix)")
print(" - BATCH_SIZE: 2 -> 1 (OOM fix)")
print(" - GRAD_ACCUM: 4 -> 8 (compensate batch)")
print(" - Added torch.cuda.empty_cache()")
print(" - Auto fp16/bf16 detection")