{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "84e9f72e-84ff-49e5-b8ba-faa6ee9bc4df", "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "id": "2b495825-0d1a-4a46-9297-6ceae1ccd2a2", "metadata": {}, "outputs": [], "source": [ "import sys\n", "import os\n", "import shutil\n", "import subprocess\n", "import time\n", "import requests\n", "import torch\n", "from pathlib import Path\n", "\n", "# Fix paths so we can import 'extract.py'\n", "project_root = Path(os.getcwd()).parent\n", "script_dir = project_root / \"src/transform\"\n", "if str(script_dir) not in sys.path:\n", " sys.path.append(str(script_dir))\n", "\n", "# Import your optimized processor\n", "from extract import MarkerFolderProcessor, configure_parallelism" ] }, { "cell_type": "code", "execution_count": 3, "id": "8d04e7ad-abf2-40e4-b308-fc0863464935", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Setup complete.\n" ] } ], "source": [ "# Paths\n", "SCRATCH = Path(os.environ.get(\"SCRATCH\"))\n", "INPUT_PDFS = SCRATCH / \"mshauri-fedha/data/cbk/pdfs\"\n", "OUTPUT_DIR = SCRATCH / \"mshauri-fedha/data/cbk/marker-output\"\n", "\n", "# Ollama Setup\n", "OLLAMA_HOME = SCRATCH / \"ollama_core\"\n", "OLLAMA_BIN = OLLAMA_HOME / \"bin/ollama\"\n", "OLLAMA_MODELS_DIR = OLLAMA_HOME / \"models\" \n", "OLLAMA_HOST = \"http://localhost:11434\"\n", "\n", "print(\"✅ Setup complete.\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "2a7846b4-2041-4b4f-9210-16a891d6c9f4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "🔍 GH200/A100 Detected: 4 GPUs | 94.5 GB VRAM\n", "⚙️ Stability Config: 5 workers/GPU | 20 Total Slots\n" ] } ], "source": [ "total_slots, workers_per_gpu, num_gpus = configure_parallelism()" ] }, { "cell_type": "code", "execution_count": 5, "id": "039a0a95-91e2-495c-a0ab-d2185f98461c", "metadata": {}, "outputs": [], 
# --- Launch the Ollama server -------------------------------------------------
# Kill any old server first so the port is free.
subprocess.run(["pkill", "-f", "ollama serve"], stderr=subprocess.DEVNULL)
time.sleep(2)

server_env = os.environ.copy()
# BUGFIX: this was hard-coded to str(32) while the comment claimed it matched
# the total slots (configure_parallelism() reported 20). Keep the server's
# parallel request limit in sync with the actual worker slot count.
server_env["OLLAMA_NUM_PARALLEL"] = str(total_slots)
server_env["OLLAMA_MAX_LOADED_MODELS"] = "1"
server_env["OLLAMA_MAX_QUEUE"] = "2048"

# Start new server in the background; output is discarded on purpose
# (the heartbeat check below is how we detect readiness/failure).
process = subprocess.Popen(
    [str(OLLAMA_BIN), "serve"],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
    env=server_env,
)

# --- Robust wait loop for the server heartbeat --------------------------------
# BUGFIX: the original slept only inside `except`, so repeated non-200
# responses made the loop busy-spin through all 60 attempts instantly, and the
# bare `except:` also swallowed KeyboardInterrupt. Sleep every iteration,
# catch only request errors, and bound each probe with a timeout.
print("Waiting for server heartbeat...")
server_ready = False
for _ in range(60):  # wait up to ~60 seconds
    try:
        if requests.get(OLLAMA_HOST, timeout=2).status_code == 200:
            server_ready = True
            break
    except requests.exceptions.RequestException:
        pass  # server not accepting connections yet
    time.sleep(1)

if server_ready:
    print("✅ Server is UP and listening!")
else:
    raise RuntimeError("Server failed to start. Check logs.")
# --- Pull the base model and write the 16k-context Modelfile ------------------
BASE_MODEL = "qwen2.5:7b"
CUSTOM_MODEL_NAME = "qwen2.5-7b-16k"

print(f" Checking/Pulling {BASE_MODEL}...")
# BUGFIX: stderr was piped to DEVNULL with check=True, so a failed pull raised
# CalledProcessError with zero diagnostic output. Capture stderr and surface
# it in the error message instead.
pull_result = subprocess.run(
    [str(OLLAMA_BIN), "pull", BASE_MODEL],
    capture_output=True,
    text=True,
    env=os.environ.copy(),
)
if pull_result.returncode != 0:
    raise RuntimeError(f"ollama pull failed: {pull_result.stderr.strip()}")

print(f"Creating '{CUSTOM_MODEL_NAME}' (16k Context)...")
# Extend the base model's context window to 16k tokens via a Modelfile.
modelfile_content = f"FROM {BASE_MODEL}\nPARAMETER num_ctx 16384"
with open("Modelfile_qwen_16k", "w") as f:
    f.write(modelfile_content)
# --- Build the custom model from the Modelfile --------------------------------
# BUGFIX: the previous cell writes 'Modelfile_qwen_16k', but `create` was
# pointed at '-f Modelfile_qwen', so the 16k-context model was built from a
# stale or nonexistent Modelfile. Point it at the file actually written.
subprocess.run(
    [str(OLLAMA_BIN), "create", CUSTOM_MODEL_NAME, "-f", "Modelfile_qwen_16k"],
    check=True,
    stdout=subprocess.DEVNULL,
    env=os.environ.copy(),
)
print("Model Ready.")

# Run the extraction from $SCRATCH so any relative paths resolve under scratch.
os.chdir(SCRATCH)
# --- Run the Marker extraction ------------------------------------------------
# Configure the folder processor against the local Ollama instance, using the
# per-GPU worker layout computed by configure_parallelism().
marker_kwargs = dict(
    output_dir=OUTPUT_DIR,
    ollama_url=OLLAMA_HOST,
    ollama_model=CUSTOM_MODEL_NAME,
    batch_multiplier=4,
    workers_per_gpu=workers_per_gpu,
    num_gpus=num_gpus,
)
processor = MarkerFolderProcessor(**marker_kwargs)

# Process every PDF in the input folder, 5 files per worker batch.
print(f"🚀 Processing PDFs from: {INPUT_PDFS}")
processor.process_folder(INPUT_PDFS, batch_size=5)