{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "84e9f72e-84ff-49e5-b8ba-faa6ee9bc4df",
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "2b495825-0d1a-4a46-9297-6ceae1ccd2a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "import shutil\n",
    "import subprocess\n",
    "import time\n",
    "import requests\n",
    "import torch\n",
    "from pathlib import Path\n",
    "\n",
    "# Fix paths so we can import 'extract.py'\n",
    "project_root = Path(os.getcwd()).parent\n",
    "script_dir = project_root / \"src/transform\"\n",
    "if str(script_dir) not in sys.path:\n",
    "    sys.path.append(str(script_dir))\n",
    "\n",
    "# Import your optimized processor\n",
    "from extract import MarkerFolderProcessor, configure_parallelism"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8d04e7ad-abf2-40e4-b308-fc0863464935",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Setup complete.\n"
     ]
    }
   ],
   "source": [
    "# Paths\n",
    "SCRATCH = Path(os.environ.get(\"SCRATCH\"))\n",
    "INPUT_PDFS = SCRATCH / \"mshauri-fedha/data/cbk/pdfs\"\n",
    "OUTPUT_DIR = SCRATCH / \"mshauri-fedha/data/cbk/marker-output\"\n",
    "\n",
    "# Ollama Setup\n",
    "OLLAMA_HOME = SCRATCH / \"ollama_core\"\n",
    "OLLAMA_BIN = OLLAMA_HOME / \"bin/ollama\"\n",
    "OLLAMA_MODELS_DIR = OLLAMA_HOME / \"models\" \n",
    "OLLAMA_HOST = \"http://localhost:11434\"\n",
    "\n",
    "print(\"✅ Setup complete.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2a7846b4-2041-4b4f-9210-16a891d6c9f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🔍 GH200/A100 Detected: 4 GPUs | 94.5 GB VRAM\n",
      "⚙️  Stability Config: 5 workers/GPU | 20 Total Slots\n"
     ]
    }
   ],
   "source": [
    "total_slots, workers_per_gpu, num_gpus = configure_parallelism()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "039a0a95-91e2-495c-a0ab-d2185f98461c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Kill any old server first\n",
    "subprocess.run([\"pkill\", \"-f\", \"ollama serve\"], stderr=subprocess.DEVNULL)\n",
    "time.sleep(2)\n",
    "\n",
    "server_env = os.environ.copy()\n",
    "server_env[\"OLLAMA_NUM_PARALLEL\"] = str(32) # Matches your total slots\n",
    "server_env[\"OLLAMA_MAX_LOADED_MODELS\"] = \"1\"\n",
    "server_env[\"OLLAMA_MAX_QUEUE\"] = \"2048\"\n",
    "\n",
    "# Start new server\n",
    "process = subprocess.Popen(\n",
    "    [str(OLLAMA_BIN), \"serve\"], \n",
    "    stdout=subprocess.DEVNULL, \n",
    "    stderr=subprocess.DEVNULL,\n",
    "    env=server_env\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "accd6a19-e216-450e-9cca-beaeaa7749a9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "⏳ Waiting for server heartbeat...\n",
      "✅ Server is UP and listening!\n"
     ]
    }
   ],
   "source": [
    "# Robust Wait Loop\n",
    "print(\"Waiting for server heartbeat...\")\n",
    "server_ready = False\n",
    "for _ in range(60): # Wait 60 seconds max\n",
    "    try:\n",
    "        if requests.get(OLLAMA_HOST).status_code == 200:\n",
    "            server_ready = True\n",
    "            break\n",
    "    except:\n",
    "        time.sleep(1)\n",
    "\n",
    "if server_ready:\n",
    "    print(\"✅ Server is UP and listening!\")\n",
    "else:\n",
    "    raise RuntimeError(\"Server failed to start. Check logs.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "454617a5-c1ef-489f-b9c0-8e6b4fe39b47",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "⬇️  Checking/Pulling qwen2.5:7b...\n",
      "📝 Creating 'qwen2.5-7b-16k' (16k Context)...\n"
     ]
    }
   ],
   "source": [
    "# pull model\n",
    "BASE_MODEL = \"qwen2.5:7b\" \n",
    "CUSTOM_MODEL_NAME = \"qwen2.5-7b-16k\"\n",
    "\n",
    "print(f\" Checking/Pulling {BASE_MODEL}...\")\n",
    "subprocess.run(\n",
    "    [str(OLLAMA_BIN), \"pull\", BASE_MODEL], \n",
    "    check=True, \n",
    "    stdout=subprocess.DEVNULL,\n",
    "    stderr=subprocess.DEVNULL,\n",
    "    env=os.environ.copy()\n",
    ")\n",
    "\n",
    "print(f\"Creating '{CUSTOM_MODEL_NAME}' (16k Context)...\")\n",
    "modelfile_content = f\"FROM {BASE_MODEL}\\nPARAMETER num_ctx 16384\"\n",
    "with open(\"Modelfile_qwen_16k\", \"w\") as f:\n",
    "    f.write(modelfile_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0fe26ed7-f31f-43eb-acca-1795b5528219",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Model Ready.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[?2026h\u001b[?25l\u001b[1Ggathering model components \u001b[K\n",
      "using existing layer sha256:2bada8a7450677000f678be90653b85d364de7db25eb5ea54136ada5f3933730 \u001b[K\n",
      "using existing layer sha256:66b9ea09bd5b7099cbb4fc820f31b575c0366fa439b08245566692c6784e281e \u001b[K\n",
      "using existing layer sha256:eb4402837c7829a690fa845de4d7f3fd842c2adee476d5341da8a46ea9255175 \u001b[K\n",
      "using existing layer sha256:832dd9e00a68dd83b3c3fb9f5588dad7dcf337a0db50f7d9483f310cd292e92e \u001b[K\n",
      "using existing layer sha256:db8fbfd0cb288a053f83ac9014ca9bac2558b1bbcd80b5c408a548e7acba8a24 \u001b[K\n",
      "writing manifest ⠋ \u001b[K\u001b[?25h\u001b[?2026l\u001b[?2026h\u001b[?25l\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[1Ggathering model components \u001b[K\n",
      "using existing layer sha256:2bada8a7450677000f678be90653b85d364de7db25eb5ea54136ada5f3933730 \u001b[K\n",
      "using existing layer sha256:66b9ea09bd5b7099cbb4fc820f31b575c0366fa439b08245566692c6784e281e \u001b[K\n",
      "using existing layer sha256:eb4402837c7829a690fa845de4d7f3fd842c2adee476d5341da8a46ea9255175 \u001b[K\n",
      "using existing layer sha256:832dd9e00a68dd83b3c3fb9f5588dad7dcf337a0db50f7d9483f310cd292e92e \u001b[K\n",
      "using existing layer sha256:db8fbfd0cb288a053f83ac9014ca9bac2558b1bbcd80b5c408a548e7acba8a24 \u001b[K\n",
      "writing manifest \u001b[K\n",
      "success \u001b[K\u001b[?25h\u001b[?2026l\n"
     ]
    }
   ],
   "source": [
    "# run model\n",
    "subprocess.run(\n",
    "    [str(OLLAMA_BIN), \"create\", CUSTOM_MODEL_NAME, \"-f\", \"Modelfile_qwen\"], \n",
    "    check=True, \n",
    "    stdout=subprocess.DEVNULL, \n",
    "    env=os.environ.copy()\n",
    ")\n",
    "print(\"Model Ready.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "4750bd0f-3cd2-4d62-a6c4-75c2f19e45f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir(SCRATCH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9581ee47-f690-46c8-b331-084411fb8535",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Detected 4 GPUs (Dynamic Mode)\n",
      "🚀 Processing PDFs from: /capstor/scratch/cscs/tligawa/mshauri-fedha/data/cbk/pdfs\n",
      "📦 Created 1089 batches of 5 files each.\n",
      "🚀 Launching 20 workers on 4 GPUs...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "20:26:57 - [GPU-3:Dev3] - Initializing Worker 3...\n",
      "20:26:58 - [GPU-0:Dev0] - Initializing Worker 0...\n",
      "20:27:05 - [GPU-1:Dev1] - Initializing Worker 1...\n",
      "20:27:06 - [GPU-2:Dev2] - Initializing Worker 2...\n",
      "20:27:09 - [GPU-4:Dev0] - Initializing Worker 4...\n",
      "20:27:11 - [GPU-5:Dev1] - Initializing Worker 5...\n",
      "20:27:12 - [GPU-6:Dev2] - Initializing Worker 6...\n",
      "20:27:12 - [GPU-9:Dev1] - Initializing Worker 9...\n",
      "20:27:14 - [GPU-7:Dev3] - Initializing Worker 7...\n",
      "20:27:15 - [GPU-8:Dev0] - Initializing Worker 8...\n"
     ]
    }
   ],
   "source": [
    "# Initialize the Processor\n",
    "processor = MarkerFolderProcessor(\n",
    "    output_dir=OUTPUT_DIR,\n",
    "    ollama_url=OLLAMA_HOST,\n",
    "    ollama_model=CUSTOM_MODEL_NAME,\n",
    "    batch_multiplier=4,                  \n",
    "    workers_per_gpu=workers_per_gpu,\n",
    "    num_gpus=num_gpus                   \n",
    ")\n",
    "\n",
    "# 3. Run the extraction\n",
    "print(f\"🚀 Processing PDFs from: {INPUT_PDFS}\")\n",
    "processor.process_folder(INPUT_PDFS, batch_size=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "931650d0-50f1-48c1-a1a1-a561392e004b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}