Buckets:

hf-doc-build/doc-dev / sagemaker /pr_2188 /en /examples /sagemaker-sdk-deepseek-ocr-sagemaker.html
rtrm's picture
download
raw
121 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;DeepSeek OCR Pipeline on SageMaker Training Jobs&quot;,&quot;local&quot;:&quot;deepseek-ocr-pipeline-on-sagemaker-training-jobs&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Prerequisites&quot;,&quot;local&quot;:&quot;prerequisites&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿ“‘ Table of Contents&quot;,&quot;local&quot;:&quot;-table-of-contents&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;โš™๏ธ Setup&quot;,&quot;local&quot;:&quot;-setup&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;๐Ÿ”ง AWS SageMaker Training Jobs&quot;,&quot;local&quot;:&quot;-aws-sagemaker-training-jobs&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;๐Ÿ“ฆ How the pipeline code is shipped&quot;,&quot;local&quot;:&quot;-how-the-pipeline-code-is-shipped&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;๐Ÿ“š The dataset&quot;,&quot;local&quot;:&quot;-the-dataset&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;The olmOCR subsets&quot;,&quot;local&quot;:&quot;the-olmocr-subsets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;โšก Inference Backend: vLLM&quot;,&quot;local&quot;:&quot;-inference-backend-vllm&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;๐Ÿ“ DeepSeek-OCR Prompts&quot;,&quot;local&quot;:&quot;-deepseek-ocr-prompts&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿ” Authentication&quot;,&quot;local&quot;:&quot;-authentication&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;๐Ÿ”‘ AWS Authentication&quot;,&quot;local&quot;:&quot;-aws-authentication&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;โš™๏ธ 
Configuration&quot;,&quot;local&quot;:&quot;-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿ“ฆ Bundle Pipeline Code&quot;,&quot;local&quot;:&quot;-bundle-pipeline-code&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿ”ง Define Base Environment Variables&quot;,&quot;local&quot;:&quot;-define-base-environment-variables&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿ› ๏ธ Helper Functions&quot;,&quot;local&quot;:&quot;-helper-functions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿ” Stage 1: Extract&quot;,&quot;local&quot;:&quot;-stage-1-extract&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;How to set up batch size for efficient processing&quot;,&quot;local&quot;:&quot;how-to-set-up-batch-size-for-efficient-processing&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿท๏ธ Stage 2: Describe&quot;,&quot;local&quot;:&quot;-stage-2-describe&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿงฉ Stage 3: Assemble&quot;,&quot;local&quot;:&quot;-stage-3-assemble&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿ’ฐ Cost Analysis (Extract stage only)&quot;,&quot;local&quot;:&quot;-cost-analysis-extract-stage-only&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;โœ… Pipeline Complete&quot;,&quot;local&quot;:&quot;-pipeline-complete&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/sagemaker/pr_2188/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
<link rel="modulepreload" href="/docs/sagemaker/pr_2188/en/_app/immutable/entry/start.48a18f09.js">
<link rel="modulepreload" href="/docs/sagemaker/pr_2188/en/_app/immutable/chunks/scheduler.aec39e6a.js">
<link rel="modulepreload" href="/docs/sagemaker/pr_2188/en/_app/immutable/chunks/singletons.8e7a9ddc.js">
<link rel="modulepreload" href="/docs/sagemaker/pr_2188/en/_app/immutable/chunks/paths.a4e52f32.js">
<link rel="modulepreload" href="/docs/sagemaker/pr_2188/en/_app/immutable/entry/app.8481cdda.js">
<link rel="modulepreload" href="/docs/sagemaker/pr_2188/en/_app/immutable/chunks/preload-helper.382cad4e.js">
<link rel="modulepreload" href="/docs/sagemaker/pr_2188/en/_app/immutable/chunks/index.4ee0a2d0.js">
<link rel="modulepreload" href="/docs/sagemaker/pr_2188/en/_app/immutable/nodes/0.52c1f0fb.js">
<link rel="modulepreload" href="/docs/sagemaker/pr_2188/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/sagemaker/pr_2188/en/_app/immutable/nodes/4.5067b529.js">
<link rel="modulepreload" href="/docs/sagemaker/pr_2188/en/_app/immutable/chunks/Tip.e2132029.js">
<link rel="modulepreload" href="/docs/sagemaker/pr_2188/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.3c60bfa3.js">
<link rel="modulepreload" href="/docs/sagemaker/pr_2188/en/_app/immutable/chunks/CodeBlock.543f5448.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;DeepSeek OCR Pipeline on SageMaker Training Jobs&quot;,&quot;local&quot;:&quot;deepseek-ocr-pipeline-on-sagemaker-training-jobs&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Prerequisites&quot;,&quot;local&quot;:&quot;prerequisites&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿ“‘ Table of Contents&quot;,&quot;local&quot;:&quot;-table-of-contents&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;โš™๏ธ Setup&quot;,&quot;local&quot;:&quot;-setup&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;๐Ÿ”ง AWS SageMaker Training Jobs&quot;,&quot;local&quot;:&quot;-aws-sagemaker-training-jobs&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;๐Ÿ“ฆ How the pipeline code is shipped&quot;,&quot;local&quot;:&quot;-how-the-pipeline-code-is-shipped&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;๐Ÿ“š The dataset&quot;,&quot;local&quot;:&quot;-the-dataset&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;The olmOCR subsets&quot;,&quot;local&quot;:&quot;the-olmocr-subsets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;โšก Inference Backend: vLLM&quot;,&quot;local&quot;:&quot;-inference-backend-vllm&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;๐Ÿ“ DeepSeek-OCR Prompts&quot;,&quot;local&quot;:&quot;-deepseek-ocr-prompts&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿ” Authentication&quot;,&quot;local&quot;:&quot;-authentication&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;๐Ÿ”‘ AWS 
Authentication&quot;,&quot;local&quot;:&quot;-aws-authentication&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;โš™๏ธ Configuration&quot;,&quot;local&quot;:&quot;-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿ“ฆ Bundle Pipeline Code&quot;,&quot;local&quot;:&quot;-bundle-pipeline-code&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿ”ง Define Base Environment Variables&quot;,&quot;local&quot;:&quot;-define-base-environment-variables&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿ› ๏ธ Helper Functions&quot;,&quot;local&quot;:&quot;-helper-functions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿ” Stage 1: Extract&quot;,&quot;local&quot;:&quot;-stage-1-extract&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;How to set up batch size for efficient processing&quot;,&quot;local&quot;:&quot;how-to-set-up-batch-size-for-efficient-processing&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿท๏ธ Stage 2: Describe&quot;,&quot;local&quot;:&quot;-stage-2-describe&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿงฉ Stage 3: Assemble&quot;,&quot;local&quot;:&quot;-stage-3-assemble&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;๐Ÿ’ฐ Cost Analysis (Extract stage only)&quot;,&quot;local&quot;:&quot;-cost-analysis-extract-stage-only&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;โœ… Pipeline Complete&quot;,&quot;local&quot;:&quot;-pipeline-complete&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div 
class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="deepseek-ocr-pipeline-on-sagemaker-training-jobs" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" 
href="#deepseek-ocr-pipeline-on-sagemaker-training-jobs"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DeepSeek OCR Pipeline on SageMaker Training Jobs</span></h1> <p data-svelte-h="svelte-1hkuclo">📖 <strong>Context</strong>: This notebook is part of the <a href="https://huggingface.co/blog/florentgbelidji/vlm-ocr-recipes-gpu-infra" rel="nofollow">VLM-OCR Recipes on GPU Infrastructure</a> article, which explains the architecture and design decisions behind this pipeline.</p> <p data-svelte-h="svelte-1ovs591">This notebook runs a three-stage OCR pipeline using SageMaker Training Jobs:</p> <ol data-svelte-h="svelte-e633mi"><li><strong>Extract</strong> - Run DeepSeek OCR over a dataset, save Markdown and crop detected figures</li> <li><strong>Describe</strong> - Generate captions for extracted figures</li> <li><strong>Assemble</strong> - Enrich Markdown with figure captions</li></ol> <p data-svelte-h="svelte-xpaqzv">This is the SageMaker equivalent of the HuggingFace Jobs pipeline. It uses SageMaker ModelTrainer V3
with a vLLM container to run GPU-accelerated inference.</p> <p data-svelte-h="svelte-com5qc"><strong>Key difference from HF Jobs:</strong> This notebook saves datasets to S3 instead of HuggingFace Hub.</p> <h2 class="relative group"><a id="prerequisites" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#prerequisites"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Prerequisites</span></h2> <ul data-svelte-h="svelte-13vc6jp"><li>AWS credentials configured</li> <li>SageMaker execution role with S3 access</li> <li>HuggingFace token for accessing source models and datasets</li> <li>SageMaker SDK V3 installed (<code>pip install sagemaker --upgrade</code>)</li></ul> <h2 class="relative group"><a id="-table-of-contents" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-table-of-contents"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path 
d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>📑 Table of Contents</span></h2> <ol data-svelte-h="svelte-1e6luyc"><li><a href="#-setup">Setup</a> <ul><li><a href="#-aws-sagemaker-training-jobs">AWS SageMaker Training Jobs</a></li> <li><a href="#-how-the-pipeline-code-is-shipped">How the pipeline code is shipped</a></li> <li><a href="#-the-dataset">The dataset</a></li> <li><a href="#-inference-backend-vllm">Inference Backend: vLLM</a></li> <li><a href="#-deepseek-ocr-prompts">DeepSeek-OCR Prompts</a></li></ul></li> <li><a href="#-authentication">Authentication</a></li> <li><a href="#-configuration">Configuration</a></li> <li><a href="#-bundle-pipeline-code">Bundle Pipeline Code</a></li> <li><a href="#-define-base-environment-variables">Define Base Environment Variables</a></li> <li><a href="#-helper-functions">Helper Functions</a></li> <li><a href="#-stage-1-extract">Stage 1: Extract</a> <ul><li><a href="#how-to-set-up-batch-size-for-efficient-processing">Batch size optimization</a></li></ul></li> <li><a href="#-stage-2-describe">Stage 2: Describe</a></li> <li><a href="#-stage-3-assemble">Stage 3: Assemble</a></li> <li><a href="#-cost-analysis-extract-stage-only">Cost Analysis</a></li> <li><a href="#-pipeline-complete">Pipeline Complete</a></li></ol> <h2 class="relative group"><a id="-setup" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-setup"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" 
viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>⚙️ Setup</span></h2> <h3 class="relative group"><a id="-aws-sagemaker-training-jobs" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-aws-sagemaker-training-jobs"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🔧 AWS SageMaker Training Jobs</span></h3> <p data-svelte-h="svelte-19lve7u"><a href="https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-training.html" rel="nofollow">SageMaker Training Jobs</a> provide managed infrastructure for running compute-intensive workloads. 
While traditionally used for model training, they’re equally well-suited for batch inference.</p> <p data-svelte-h="svelte-j9s7a5"><strong>Why Training Jobs for batch OCR?</strong></p> <p data-svelte-h="svelte-1w0udgv">Training Jobs are the <strong>best option for accessing GPUs on SageMaker for offline/batch workloads</strong>:</p> <ul data-svelte-h="svelte-1w16zc9"><li><strong>Direct GPU access</strong>: Spin up powerful instances like <code>ml.g6e.2xlarge</code> (L40S GPU) or <code>ml.p4d.24xlarge</code> (8x A100) on demand</li> <li><strong>Pay per use</strong>: Billed per second only while the job runs - no idle costs</li> <li><strong>Automatic cleanup</strong>: Jobs terminate automatically on completion, releasing resources</li> <li><strong>S3 integration</strong>: Native support for reading/writing large datasets directly to S3</li> <li><strong>vLLM DLC</strong>: Pre-built <a href="https://github.com/aws/deep-learning-containers" rel="nofollow">Deep Learning Containers</a> with vLLM for efficient inference</li> <li><strong>No infrastructure management</strong>: No cluster setup, scaling, or maintenance required</li></ul> <p data-svelte-h="svelte-1fnzda6"><strong>Alternative: SageMaker Endpoints</strong></p> <p data-svelte-h="svelte-658luw">For different use cases, <a href="https://docs.aws.amazon.com/sagemaker/latest/dg/deploy-model.html" rel="nofollow">SageMaker Endpoints</a> could be explored:</p> <table data-svelte-h="svelte-1dfu4tk"><thead><tr><th>Aspect</th> <th>Training Jobs</th> <th>Endpoints</th></tr></thead> <tbody><tr><td><strong>Best for</strong></td> <td>Batch/offline processing</td> <td>Real-time inference</td></tr> <tr><td><strong>Billing</strong></td> <td>Per-second while running</td> <td>Per-hour while deployed</td></tr> <tr><td><strong>Latency</strong></td> <td>Minutes to start</td> <td>Always ready (when deployed)</td></tr> <tr><td><strong>Cost model</strong></td> <td>Pay only during processing</td> <td>Pay for uptime</td></tr> 
<tr><td><strong>Scaling</strong></td> <td>Single job, fixed resources</td> <td>Auto-scaling on demand</td></tr></tbody></table> <p data-svelte-h="svelte-gemga1">For this pipeline’s batch OCR workload - processing thousands of documents in one go - <strong>Training Jobs are more cost-effective</strong> since we only pay for actual compute time rather than keeping an endpoint running.</p> <h3 class="relative group"><a id="-how-the-pipeline-code-is-shipped" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-how-the-pipeline-code-is-shipped"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>📦 How the pipeline code is shipped</span></h3> <p data-svelte-h="svelte-p42qcv">This notebook uses the <a href="https://sagemaker.readthedocs.io/en/stable/" rel="nofollow"><strong>SageMaker Python SDK v3</strong></a> with the new <code>ModelTrainer</code> API to launch training jobs. 
The <code>ModelTrainer</code> class provides a simplified, declarative interface for configuring and running jobs.</p> <p data-svelte-h="svelte-1hgqc8i">For every SageMaker Training Job we launch, the logic is similar:</p> <p data-svelte-h="svelte-2lbsmb"><strong>From this notebook</strong>, we bundle and upload to S3:</p> <ol data-svelte-h="svelte-x1e6bi"><li>The entrypoint script (<code>entry.sh</code> + <code>sm_job_runner.py</code>)</li> <li>The pipeline code in <code>llm_ocr/</code></li></ol> <p data-svelte-h="svelte-1tabzdk">SageMaker automatically makes this code available at <code>/opt/ml/input/data/code</code> inside the container.</p> <p data-svelte-h="svelte-1n1hsvy"><strong>Then we launch a SageMaker Training Job</strong> using <code>ModelTrainer</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trainer = 
ModelTrainer(
    training_image=TRAINING_IMAGE, <span class="hljs-comment"># vLLM DLC from ECR</span>
    source_code=SourceCode(source_dir), <span class="hljs-comment"># Code bundle</span>
    compute=Compute(instance_type, instance_count),
    hyperparameters={...}, <span class="hljs-comment"># Environment variables</span>
    output_data_config=OutputDataConfig(s3_output_path),
)
trainer.train(wait=<span class="hljs-literal">False</span>) <span class="hljs-comment"># Async execution</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jrdkbu"><strong>The job then:</strong></p> <ol data-svelte-h="svelte-jj5ap7"><li>Pulls the vLLM Deep Learning Container from ECR</li> <li>Downloads the code bundle from S3</li> <li>Runs <code>entry.sh</code> which installs dependencies via <code>uv</code> and executes <code>sm_job_runner.py</code></li> <li>The runner starts a vLLM server, then imports <code>llm_ocr.cli</code> and calls <code>main()</code> to run the pipeline stage</li> <li>Results are saved back to S3</li></ol> <h3 class="relative group"><a id="-the-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-the-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>📚 The dataset</span></h3> <p data-svelte-h="svelte-u97tjr">This pipeline uses <strong>FineVision</strong> (<code>HuggingFaceM4/FineVision</code>) as a large, mixed <strong>image+text</strong> corpus for vision-language training/evaluation. 
FineVision aggregates many public sub-datasets into one unified interface, and you select a specific <em>subset/config</em> when loading.</p> <ul data-svelte-h="svelte-rgxhh"><li><strong>Dataset</strong>: <a href="https://huggingface.co/datasets/HuggingFaceM4/FineVision" rel="nofollow"><code>HuggingFaceM4/FineVision</code></a></li> <li><strong>Overview / exploration space</strong>: <a href="https://huggingface.co/spaces/HuggingFaceM4/FineVision" rel="nofollow"><code>HuggingFaceM4/FineVision</code> Space</a></li></ul> <h4 class="relative group"><a id="the-olmocr-subsets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-olmocr-subsets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>The olmOCR subsets</span></h4> <p data-svelte-h="svelte-lmn70i">The <a href="https://arxiv.org/pdf/2502.18443" rel="nofollow"><code>olmOCR-mix-0225</code></a> dataset from Allen AI contains <strong>260,000 crawled PDF pages</strong> from over 100,000 diverse PDFs - academic papers, legal documents, public domain books, brochures, and more. 
It includes challenging content: graphics, handwritten text, multi-column layouts, tables, equations, and poor quality scans.</p> <p data-svelte-h="svelte-z7ft52">Available configs:</p> <ul data-svelte-h="svelte-8ybrvt"><li><code>olmOCR-mix-0225-documents</code> - general documents</li> <li><code>olmOCR-mix-0225-books</code> - book pages</li></ul> <p data-svelte-h="svelte-16etafv">📄 <strong>Note</strong>: In this pipeline, <strong>one document = one page</strong> of a PDF.</p> <p data-svelte-h="svelte-13r0rd6">These mirror real-world enterprise use cases: contracts, invoices, reports, forms, and scanned documents that organizations need to digitize and extract structured information from.</p> <p data-svelte-h="svelte-1qkjh55"><strong>Licensing note</strong>: FineVision is a <em>collection</em> of many datasets, each with its own license/terms. Make sure the subset you use is compatible with your intended downstream use (see the dataset card for details).</p> <h3 class="relative group"><a id="-inference-backend-vllm" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-inference-backend-vllm"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>⚡ Inference Backend: vLLM</span></h3> <p data-svelte-h="svelte-10738bx">This pipeline uses <a href="https://github.com/vllm-project/vllm" rel="nofollow"><strong>vLLM</strong></a> as the inference backend for DeepSeek-OCR. vLLM provides:</p> <ul data-svelte-h="svelte-jotf6n"><li><strong>High throughput</strong> via continuous batching and PagedAttention</li> <li><strong>OpenAI-compatible API</strong> - easy to integrate with existing code</li> <li><strong>Efficient memory management</strong> - run large models on limited GPU memory</li></ul> <p data-svelte-h="svelte-110jblm">The SageMaker Training Job uses the official <a href="https://github.com/aws/deep-learning-containers" rel="nofollow">AWS vLLM Deep Learning Container</a> (<code>vllm:0.12.0-gpu-py312-cu129-ubuntu22.04-sagemaker</code>). The pipeline sends batched requests (64 concurrent) to saturate the GPU and maximize throughput (~70 docs/min on <code>ml.g6e.2xlarge</code>).</p> <h3 class="relative group"><a id="-deepseek-ocr-prompts" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-deepseek-ocr-prompts"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>📝 DeepSeek-OCR Prompts</span></h3> <p data-svelte-h="svelte-188b290">DeepSeek-OCR supports different prompts for various OCR tasks. See the <a href="https://github.com/deepseek-ai/DeepSeek-OCR/blob/main/DeepSeek-OCR-master/DeepSeek-OCR-vllm/config.py" rel="nofollow">official config.py</a> for examples:</p> <table data-svelte-h="svelte-1dk4qux"><thead><tr><th>Use Case</th> <th>Prompt</th></tr></thead> <tbody><tr><td><strong>Document → Markdown</strong></td> <td><code>&lt;image&gt;\n&lt;|grounding|&gt;Convert the document to markdown.</code></td></tr> <tr><td><strong>General OCR</strong></td> <td><code>&lt;image&gt;\n&lt;|grounding|&gt;OCR this image.</code></td></tr> <tr><td><strong>Free OCR (no layout)</strong></td> <td><code>&lt;image&gt;\nFree OCR.</code></td></tr> <tr><td><strong>Parse figures</strong></td> <td><code>&lt;image&gt;\nParse the figure.</code></td></tr> <tr><td><strong>Describe image</strong></td> <td><code>&lt;image&gt;\nDescribe this image in detail.</code></td></tr></tbody></table> <p data-svelte-h="svelte-p0zxc7">We configure these prompts via environment variables <code>DOC_PROMPT</code> and <code>FIGURE_PROMPT</code> in our job configuration, re-using the special tokens from the <a href="https://github.com/deepseek-ai/DeepSeek-OCR/blob/main/DeepSeek-OCR-master/DeepSeek-OCR-vllm/config.py" rel="nofollow">official DeepSeek-OCR config</a>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&gt;&gt; <span class="hljs-comment"># Preview 3 random document images</span>
&gt;&gt; <span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
&gt;&gt; <span class="hljs-keyword">from</span> itertools <span class="hljs-keyword">import</span> islice
&gt;&gt; <span class="hljs-keyword">from</span> IPython.display <span class="hljs-keyword">import</span> display
&gt;&gt; ds = load_dataset(<span class="hljs-string">&quot;HuggingFaceM4/FineVision&quot;</span>, <span class="hljs-string">&quot;olmOCR-mix-0225-documents&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>, streaming=<span class="hljs-literal">True</span>).shuffle(seed=<span class="hljs-number">123</span>)
&gt;&gt; <span class="hljs-keyword">for</span> i, s <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(islice(ds, <span class="hljs-number">3</span>)):
<span class="hljs-meta">... </span> <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;--- Doc <span class="hljs-subst">{i}</span> ---&quot;</span>)
<span class="hljs-meta">... </span> img = s[<span class="hljs-string">&quot;images&quot;</span>][<span class="hljs-number">0</span>]
<span class="hljs-meta">... </span> img.thumbnail((<span class="hljs-number">500</span>, <span class="hljs-number">500</span>)) <span class="hljs-comment"># Resize to max 500px</span>
<span class="hljs-meta">... </span> display(img)<!-- HTML_TAG_END --></pre></div> <pre data-svelte-h="svelte-guypk0">--- Doc 0 ---
</pre> <h2 class="relative group"><a id="-authentication" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-authentication"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🔐 Authentication</span></h2> <h3 class="relative group"><a id="-aws-authentication" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-aws-authentication"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 
11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🔑 AWS Authentication</span></h3> <p data-svelte-h="svelte-1ry3zal">Before running this notebook, ensure your AWS credentials are configured. You can authenticate using one of these methods:</p> <p data-svelte-h="svelte-1of3qcs"><strong>Option 1: AWS SSO (recommended for organizations)</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->aws configure sso
aws sso login --profile your-profile-name<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-12hj7k5"><strong>Option 2: IAM credentials via environment variables</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">export</span> AWS_ACCESS_KEY_ID=your-access-key
<span class="hljs-built_in">export</span> AWS_SECRET_ACCESS_KEY=your-secret-key
<span class="hljs-built_in">export</span> AWS_DEFAULT_REGION=us-east-1<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xfabem"><strong>Option 3: AWS CLI configuration</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->aws configure<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-18rxjv2">For more details, see the <a href="https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html" rel="nofollow">AWS CLI Configuration Guide</a>.</p> <p data-svelte-h="svelte-104zp9q"><strong>Note</strong>: When running on SageMaker Studio or EC2 with an IAM role attached, credentials are automatically available via the instance metadata service.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 
cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!pip3 install sagemaker --upgrade --quiet<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded 
font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!pip install -U <span class="hljs-string">&quot;datasets&gt;=4.0.0&quot;</span> <span class="hljs-string">&quot;s3fs&quot;</span> <span class="hljs-string">&quot;fsspec&quot;</span><!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># 🔑 Authenticate with Hugging Face</span>
<span class="hljs-comment"># Required for accessing private datasets and pushing results</span>
<span class="hljs-comment"># Get your token at: https://huggingface.co/settings/tokens</span>
<span class="hljs-keyword">import</span> os
<span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> login, get_token
login()
<span class="hljs-comment"># Store token in env var for SageMaker Jobs</span>
HF_TOKEN = get_token()
os.environ[<span class="hljs-string">&quot;HF_TOKEN&quot;</span>] = HF_TOKEN
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;HF_TOKEN set: <span class="hljs-subst">{HF_TOKEN[:<span class="hljs-number">8</span>]}</span>...&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&gt;&gt; <span class="hljs-keyword">import</span> os
&gt;&gt; <span class="hljs-keyword">import</span> json
&gt;&gt; <span class="hljs-keyword">import</span> shutil
&gt;&gt; <span class="hljs-keyword">import</span> tempfile
&gt;&gt; <span class="hljs-keyword">import</span> time
&gt;&gt; <span class="hljs-keyword">from</span> pathlib <span class="hljs-keyword">import</span> Path
&gt;&gt; <span class="hljs-keyword">import</span> boto3
&gt;&gt; <span class="hljs-keyword">import</span> sagemaker
&gt;&gt; <span class="hljs-keyword">from</span> sagemaker.train.model_trainer <span class="hljs-keyword">import</span> ModelTrainer
&gt;&gt; <span class="hljs-keyword">from</span> sagemaker.train.configs <span class="hljs-keyword">import</span> SourceCode, Compute, StoppingCondition, OutputDataConfig
&gt;&gt; <span class="hljs-keyword">from</span> sagemaker.core.helper.session_helper <span class="hljs-keyword">import</span> Session, get_execution_role<!-- HTML_TAG_END --></pre></div> <pre data-svelte-h="svelte-1k0tiap">sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
</pre> <h2 class="relative group"><a id="-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>⚙️ Configuration</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 
opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Initialize SageMaker session</span>
sagemaker_session = Session()
iam = boto3.client(<span class="hljs-string">&#x27;iam&#x27;</span>)
role = iam.get_role(RoleName=<span class="hljs-string">&#x27;&lt;YOUR-ROLE-NAME&gt;&#x27;</span>)[<span class="hljs-string">&#x27;Role&#x27;</span>][<span class="hljs-string">&#x27;Arn&#x27;</span>]
region = sagemaker_session.boto_region_name
account_id = boto3.client(<span class="hljs-string">&quot;sts&quot;</span>).get_caller_identity()[<span class="hljs-string">&quot;Account&quot;</span>]
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Region: <span class="hljs-subst">{region}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Account: <span class="hljs-subst">{account_id}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Role: <span class="hljs-subst">{role}</span>&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&gt;&gt; <span class="hljs-comment"># Pipeline Configuration</span>
&gt;&gt; PROJECT_NAME = <span class="hljs-string">&quot;deepseek-ocr-sagemaker&quot;</span>
&gt;&gt; BUCKET_NAME = sagemaker_session.default_bucket()
&gt;&gt; S3_PREFIX = <span class="hljs-string">f&quot;<span class="hljs-subst">{PROJECT_NAME}</span>&quot;</span>
&gt;&gt; <span class="hljs-comment"># S3 output path (single location for all stages - dataset gets updated in place)</span>
&gt;&gt; S3_OUTPUT_URI = <span class="hljs-string">f&quot;s3://<span class="hljs-subst">{BUCKET_NAME}</span>/<span class="hljs-subst">{S3_PREFIX}</span>&quot;</span>
&gt;&gt; <span class="hljs-comment"># vLLM Container - use SageMaker vLLM DLC</span>
&gt;&gt; TRAINING_IMAGE = <span class="hljs-string">f&quot;763104351884.dkr.ecr.<span class="hljs-subst">{region}</span>.amazonaws.com/vllm:0.12.0-gpu-py312-cu129-ubuntu22.04-sagemaker-v1.0&quot;</span> <span class="hljs-comment"># GPU stages</span>
&gt;&gt; LIGHTWEIGHT_IMAGE = <span class="hljs-string">f&quot;763104351884.dkr.ecr.<span class="hljs-subst">{region}</span>.amazonaws.com/pytorch-training:2.4.0-cpu-py311-ubuntu22.04-sagemaker&quot;</span> <span class="hljs-comment"># CPU-only assemble</span>
&gt;&gt; <span class="hljs-comment"># Instance configuration</span>
&gt;&gt; INSTANCE_TYPE = <span class="hljs-string">&quot;ml.g6e.2xlarge&quot;</span> <span class="hljs-comment"># GPU instance for extract/describe stages (single L40S GPU)</span>
&gt;&gt; INSTANCE_TYPE_CPU = <span class="hljs-string">&quot;ml.c5.xlarge&quot;</span> <span class="hljs-comment"># CPU-only instance for assemble stage (much cheaper)</span>
&gt;&gt; <span class="hljs-comment"># INSTANCE_TYPE = &quot;ml.p4d.24xlarge&quot; # 8x A100 GPUs for larger scale</span>
&gt;&gt; VOLUME_SIZE_GB = <span class="hljs-number">100</span>
&gt;&gt; MAX_RUNTIME_SECONDS = <span class="hljs-number">3</span> * <span class="hljs-number">60</span> * <span class="hljs-number">60</span> <span class="hljs-comment"># 3 hours</span>
&gt;&gt; <span class="hljs-comment"># Source dataset (from HuggingFace)</span>
&gt;&gt; SOURCE_DATASET = <span class="hljs-string">&quot;HuggingFaceM4/FineVision&quot;</span>
&gt;&gt; SOURCE_CONFIG = <span class="hljs-string">&quot;olmOCR-mix-0225-documents&quot;</span>
&gt;&gt; MAX_SAMPLES = <span class="hljs-number">1024</span> <span class="hljs-comment"># Start small for testing</span>
&gt;&gt; <span class="hljs-comment"># HuggingFace token for accessing source datasets</span>
&gt;&gt; HF_TOKEN = os.environ.get(<span class="hljs-string">&quot;HF_TOKEN&quot;</span>, <span class="hljs-string">&quot;&quot;</span>)
&gt;&gt; <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;S3 Bucket: s3://<span class="hljs-subst">{BUCKET_NAME}</span>/<span class="hljs-subst">{S3_PREFIX}</span>&quot;</span>)
&gt;&gt; <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;S3 Output URI: <span class="hljs-subst">{S3_OUTPUT_URI}</span>&quot;</span>)
&gt;&gt; <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Instance: <span class="hljs-subst">{INSTANCE_TYPE}</span>&quot;</span>)
&gt;&gt; <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Source: <span class="hljs-subst">{SOURCE_DATASET}</span>/<span class="hljs-subst">{SOURCE_CONFIG}</span> (<span class="hljs-subst">{MAX_SAMPLES}</span> samples)&quot;</span>)<!-- HTML_TAG_END --></pre></div> <pre data-svelte-h="svelte-1mo1cg1">S3 Bucket: s3://sagemaker-us-east-1-754289655784/deepseek-ocr-sagemaker
S3 Output URI: s3://sagemaker-us-east-1-754289655784/deepseek-ocr-sagemaker
Instance: ml.g6e.2xlarge
Source: HuggingFaceM4/FineVision/olmOCR-mix-0225-documents (1024 samples)
</pre> <h2 class="relative group"><a id="-bundle-pipeline-code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-bundle-pipeline-code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>📦 Bundle Pipeline Code</span></h2> <p data-svelte-h="svelte-116mcp2">SageMaker automatically uploads this bundle to S3 and makes it available at <code>/opt/ml/input/data/code</code>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute 
pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&gt;&gt; <span class="hljs-comment"># Paths to pipeline code</span>
&gt;&gt; CODE_PATHS = [
<span class="hljs-meta">... </span> Path(<span class="hljs-string">&quot;entry.sh&quot;</span>),
<span class="hljs-meta">... </span> Path(<span class="hljs-string">&quot;sm_job_runner.py&quot;</span>),
<span class="hljs-meta">... </span> Path(<span class="hljs-string">&quot;../llm_ocr&quot;</span>),
<span class="hljs-meta">... </span>]
&gt;&gt; <span class="hljs-comment"># Create a source directory bundle</span>
&gt;&gt; source_dir = Path(tempfile.mkdtemp(prefix=<span class="hljs-string">&quot;sm-ocr-code-&quot;</span>))
&gt;&gt; <span class="hljs-keyword">for</span> path <span class="hljs-keyword">in</span> CODE_PATHS:
<span class="hljs-meta">... </span> src = Path.cwd() / path <span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> path.is_absolute() <span class="hljs-keyword">else</span> path
<span class="hljs-meta">... </span> <span class="hljs-keyword">if</span> src.is_dir():
<span class="hljs-meta">... </span> shutil.copytree(src, source_dir / path.name, dirs_exist_ok=<span class="hljs-literal">True</span>)
<span class="hljs-meta">... </span> <span class="hljs-keyword">else</span>:
<span class="hljs-meta">... </span> shutil.copy2(src, source_dir / path.name)
&gt;&gt; <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Source directory: <span class="hljs-subst">{source_dir}</span>&quot;</span>)
&gt;&gt; <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Contents: <span class="hljs-subst">{<span class="hljs-built_in">list</span>(source_dir.iterdir())}</span>&quot;</span>)<!-- HTML_TAG_END --></pre></div> <pre data-svelte-h="svelte-19zsnmr">Source directory: /tmp/sm-ocr-code-xymxcvqk
Contents: [PosixPath(&#39;/tmp/sm-ocr-code-xymxcvqk/llm_ocr&#39;), PosixPath(&#39;/tmp/sm-ocr-code-xymxcvqk/sm_job_runner.py&#39;), PosixPath(&#39;/tmp/sm-ocr-code-xymxcvqk/entry.sh&#39;)]
</pre> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Dependencies are declared in sm_job_runner.py inline metadata (PEP 723)</span>
<span class="hljs-comment"># entry.sh installs uv and runs: uv run sm_job_runner.py</span>
<span class="hljs-comment"># This automatically installs all dependencies</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="-define-base-environment-variables" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-define-base-environment-variables"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🔧 Define Base Environment Variables</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute 
pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Base environment variables for all stages</span>
<span class="hljs-comment"># All configuration is passed via environment variables (same as HF Jobs)</span>
BASE_ENV = {
<span class="hljs-comment"># vLLM configuration</span>
<span class="hljs-string">&quot;MODEL_ID&quot;</span>: <span class="hljs-string">&quot;deepseek-ai/DeepSeek-OCR&quot;</span>,
<span class="hljs-string">&quot;SERVED_MODEL_NAME&quot;</span>: <span class="hljs-string">&quot;deepseek-ocr&quot;</span>,
<span class="hljs-string">&quot;HOST&quot;</span>: <span class="hljs-string">&quot;0.0.0.0&quot;</span>,
<span class="hljs-string">&quot;PORT&quot;</span>: <span class="hljs-string">&quot;8000&quot;</span>,
<span class="hljs-string">&quot;MAX_MODEL_LEN&quot;</span>: <span class="hljs-string">&quot;8192&quot;</span>,
<span class="hljs-string">&quot;GPU_MEMORY_UTILIZATION&quot;</span>: <span class="hljs-string">&quot;0.90&quot;</span>,
<span class="hljs-string">&quot;TENSOR_PARALLEL_SIZE&quot;</span>: <span class="hljs-string">&quot;1&quot;</span>,
<span class="hljs-comment"># HuggingFace authentication (for source datasets)</span>
<span class="hljs-comment"># Note: For production, consider using AWS Secrets Manager instead of env vars</span>
<span class="hljs-string">&quot;HF_TOKEN&quot;</span>: os.environ.get(<span class="hljs-string">&quot;HF_TOKEN&quot;</span>, <span class="hljs-string">&quot;&quot;</span>),
<span class="hljs-string">&quot;HF_HUB_ENABLE_HF_TRANSFER&quot;</span>: <span class="hljs-string">&quot;1&quot;</span>,
<span class="hljs-comment"># Prompts</span>
<span class="hljs-string">&quot;DOC_PROMPT&quot;</span>: <span class="hljs-string">&quot;&lt;image&gt;\n&lt;|grounding|&gt;Convert this document to Markdown.&quot;</span>,
<span class="hljs-string">&quot;DOC_MAX_TOKENS&quot;</span>: <span class="hljs-string">&quot;4096&quot;</span>,
<span class="hljs-string">&quot;DOC_TEMPERATURE&quot;</span>: <span class="hljs-string">&quot;0.1&quot;</span>,
<span class="hljs-string">&quot;FIGURE_PROMPT&quot;</span>: <span class="hljs-string">&quot;&lt;image&gt;\nDescribe this image in detail.&quot;</span>,
<span class="hljs-string">&quot;FIGURE_MAX_TOKENS&quot;</span>: <span class="hljs-string">&quot;512&quot;</span>,
<span class="hljs-string">&quot;FIGURE_TEMPERATURE&quot;</span>: <span class="hljs-string">&quot;0.6&quot;</span>,
}<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="-helper-functions" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-helper-functions"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🛠️ Helper Functions</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full 
transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Import IO and rendering utilities from llm_ocr</span>
<span class="hljs-keyword">import</span> sys; sys.path.insert(<span class="hljs-number">0</span>, <span class="hljs-string">&quot;..&quot;</span>)
<span class="hljs-keyword">from</span> llm_ocr.sm_io <span class="hljs-keyword">import</span> load_dataset_from_s3
<span class="hljs-keyword">from</span> llm_ocr.document <span class="hljs-keyword">import</span> render_sample_markdown, display_markdown, display_samples
<span class="hljs-keyword">def</span> <span class="hljs-title function_">launch_stage</span>(<span class="hljs-params">stage: <span class="hljs-built_in">str</span>, env: <span class="hljs-built_in">dict</span> = <span class="hljs-literal">None</span>, use_gpu: <span class="hljs-built_in">bool</span> = <span class="hljs-literal">True</span></span>):
<span class="hljs-string">&quot;&quot;&quot;Launch a pipeline stage as a SageMaker Training Job.
Args:
stage: Pipeline stage (extract, describe, assemble)
env: Stage-specific environment variables (optional)
use_gpu: Whether to use GPU instance and image (default True)
Returns:
Tuple of (ModelTrainer, job_name)
&quot;&quot;&quot;</span>
<span class="hljs-keyword">import</span> uuid
<span class="hljs-comment"># Generate unique base job name</span>
unique_id = uuid.uuid4().<span class="hljs-built_in">hex</span>[:<span class="hljs-number">8</span>]
base_name = <span class="hljs-string">f&quot;<span class="hljs-subst">{PROJECT_NAME}</span>-<span class="hljs-subst">{stage}</span>-<span class="hljs-subst">{unique_id}</span>&quot;</span>
<span class="hljs-comment"># Merge base env with stage-specific env</span>
full_env = {**BASE_ENV, <span class="hljs-string">&quot;PIPELINE_STAGE&quot;</span>: stage}
<span class="hljs-keyword">if</span> env:
full_env.update(env)
<span class="hljs-comment"># Select image and instance based on GPU usage</span>
<span class="hljs-keyword">if</span> use_gpu:
image_uri = TRAINING_IMAGE
instance_type = INSTANCE_TYPE
<span class="hljs-keyword">else</span>:
<span class="hljs-comment"># Lightweight config for CPU-only stages (assemble)</span>
image_uri = LIGHTWEIGHT_IMAGE
instance_type = INSTANCE_TYPE_CPU
<span class="hljs-comment"># Create trainer</span>
trainer = ModelTrainer(
sagemaker_session=sagemaker_session,
role=role,
training_mode=<span class="hljs-string">&quot;SAGEMAKER_TRAINING_JOB&quot;</span>,
source_code=SourceCode(
source_dir=<span class="hljs-built_in">str</span>(source_dir),
entry_script=<span class="hljs-string">&quot;entry.sh&quot;</span>,
),
compute=Compute(
instance_type=instance_type,
instance_count=<span class="hljs-number">1</span>,
volume_size_in_gb=VOLUME_SIZE_GB,
),
stopping_condition=StoppingCondition(
max_runtime_in_seconds=MAX_RUNTIME_SECONDS,
),
output_data_config=OutputDataConfig(
s3_output_path=<span class="hljs-string">f&quot;s3://<span class="hljs-subst">{BUCKET_NAME}</span>/<span class="hljs-subst">{S3_PREFIX}</span>/output/&quot;</span>,
),
base_job_name=base_name,
environment=full_env,
training_image=image_uri,
)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Launching <span class="hljs-subst">{stage}</span> stage...&quot;</span>)
trainer.train(wait=<span class="hljs-literal">False</span>)
<span class="hljs-comment"># Find the actual job name using list_training_jobs API</span>
sm_client = sagemaker_session.sagemaker_client
time.sleep(<span class="hljs-number">2</span>) <span class="hljs-comment"># Brief wait for job to register</span>
response = sm_client.list_training_jobs(
NameContains=base_name,
SortBy=<span class="hljs-string">&#x27;CreationTime&#x27;</span>,
SortOrder=<span class="hljs-string">&#x27;Descending&#x27;</span>,
MaxResults=<span class="hljs-number">1</span>
)
<span class="hljs-keyword">if</span> response[<span class="hljs-string">&#x27;TrainingJobSummaries&#x27;</span>]:
actual_job_name = response[<span class="hljs-string">&#x27;TrainingJobSummaries&#x27;</span>][<span class="hljs-number">0</span>][<span class="hljs-string">&#x27;TrainingJobName&#x27;</span>]
<span class="hljs-keyword">else</span>:
actual_job_name = base_name <span class="hljs-comment"># Fallback</span>
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Job started: <span class="hljs-subst">{actual_job_name}</span>&quot;</span>)
<span class="hljs-keyword">return</span> trainer, actual_job_name
<span class="hljs-keyword">def</span> <span class="hljs-title function_">wait_for_job</span>(<span class="hljs-params">job_name: <span class="hljs-built_in">str</span>, poll_interval: <span class="hljs-built_in">int</span> = <span class="hljs-number">30</span>, timeout: <span class="hljs-built_in">int</span> = <span class="hljs-number">10800</span></span>):
<span class="hljs-string">&quot;&quot;&quot;Wait for a SageMaker Training Job to complete.
Args:
job_name: The exact job name
poll_interval: Seconds between status checks
timeout: Maximum seconds to wait
&quot;&quot;&quot;</span>
sm_client = sagemaker_session.sagemaker_client
start_time = time.time()
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Waiting for job <span class="hljs-subst">{job_name}</span>...&quot;</span>)
<span class="hljs-keyword">while</span> time.time() - start_time &lt; timeout:
response = sm_client.describe_training_job(TrainingJobName=job_name)
status = response[<span class="hljs-string">&#x27;TrainingJobStatus&#x27;</span>]
elapsed = time.time() - start_time
mins, secs = <span class="hljs-built_in">divmod</span>(<span class="hljs-built_in">int</span>(elapsed), <span class="hljs-number">60</span>)
<span class="hljs-keyword">if</span> status == <span class="hljs-string">&#x27;Completed&#x27;</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot; <span class="hljs-subst">{job_name}</span>: Completed ✓ (<span class="hljs-subst">{mins:02d}</span>:<span class="hljs-subst">{secs:02d}</span>)&quot;</span>)
<span class="hljs-keyword">return</span> response
<span class="hljs-keyword">elif</span> status == <span class="hljs-string">&#x27;Failed&#x27;</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot; <span class="hljs-subst">{job_name}</span>: Failed ✗&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot; Reason: <span class="hljs-subst">{response.get(<span class="hljs-string">&#x27;FailureReason&#x27;</span>, <span class="hljs-string">&#x27;Unknown&#x27;</span>)}</span>&quot;</span>)
<span class="hljs-keyword">return</span> response
<span class="hljs-keyword">elif</span> status == <span class="hljs-string">&#x27;Stopped&#x27;</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot; <span class="hljs-subst">{job_name}</span>: Stopped&quot;</span>)
<span class="hljs-keyword">return</span> response
<span class="hljs-keyword">else</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot; <span class="hljs-subst">{job_name}</span>: <span class="hljs-subst">{status}</span>... (<span class="hljs-subst">{mins:02d}</span>:<span class="hljs-subst">{secs:02d}</span>)&quot;</span>)
time.sleep(poll_interval)
<span class="hljs-keyword">raise</span> TimeoutError(<span class="hljs-string">f&quot;Job <span class="hljs-subst">{job_name}</span> did not complete within <span class="hljs-subst">{timeout}</span>s&quot;</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="-stage-1-extract" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-stage-1-extract"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>๐Ÿ” Stage 1: Extract</span></h2> <p data-svelte-h="svelte-vzodgv">Run OCR on the source dataset to extract markdown and figures.
Output is saved to S3 (not HF Hub).</p> <p data-svelte-h="svelte-73owjc"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/sagemaker/extract-aws.png" alt="Extract Stage"></p> <h3 class="relative group"><a id="how-to-set-up-batch-size-for-efficient-processing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-to-set-up-batch-size-for-efficient-processing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How to set up batch size for efficient processing</span></h3> <p data-svelte-h="svelte-1vtmi77">Since we’re running <strong>batch inference</strong> (not serving live users), we can aggressively maximize GPU utilization without worrying about latency SLAs. The goal is to keep the GPU fully saturated by maintaining enough concurrent requests in flight.</p> <p data-svelte-h="svelte-12vywjn"><strong>Understanding vLLM’s KV cache capacity</strong></p> <p data-svelte-h="svelte-1z0b8om">vLLM allocates GPU memory for its KV cache, which determines how many concurrent requests can be processed. 
When vLLM starts, it calculates and logs the KV cache capacity for your specific GPU:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">INFO</span> [kv_cache_utils.py] GPU KV cache size: 567,488 tokens
<span class="hljs-built_in">INFO</span> [kv_cache_utils.py] Maximum concurrency <span class="hljs-keyword">for</span> 8,192 tokens per request: 69.27x<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1nzwjpu">Check your job logs to find these values for your hardware. The maximum concurrency depends on:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attr">max_concurrency</span> = KV_cache_tokens / tokens_per_request<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1cky92v">For the <strong>G6E instance (L40S GPU, 48GB)</strong> with a sizing length of <strong>8,192 total tokens</strong> (prompt + generated):</p> <table data-svelte-h="svelte-17a9w03"><thead><tr><th>GPU</th> <th>KV Cache Tokens</th> <th>Hard Cap</th> <th>Safe Target (70-85%)</th></tr></thead> 
<tbody><tr><td><strong>L40S</strong> (48GB)</td> <td>~567,488</td> <td>69</td> <td><strong>50-60</strong></td></tr></tbody></table> <p data-svelte-h="svelte-1002kr"><strong>Setting safe concurrency</strong></p> <p data-svelte-h="svelte-sqd3ey">The <code>EXTRACT_BATCH_SIZE</code> parameter controls concurrent requests sent to vLLM. To set it safely:</p> <ol data-svelte-h="svelte-clazri"><li><strong>Estimate total tokens per request</strong>: <code>L_total = prompt_tokens + generated_tokens</code>. For OCR, generated markdown can be substantial - use your <strong>p95</strong> (not average) to avoid preemption.</li> <li><strong>Apply 70-85% headroom</strong>: This accounts for variance in document lengths and prevents KV cache pressure.</li> <li><strong>If most docs are well below 8,192 tokens</strong>, you can push higher concurrency.</li></ol> <p data-svelte-h="svelte-1yrqdxi"><strong>Our dataset: 1 page = 1 request</strong></p> <p data-svelte-h="svelte-c07w">In this pipeline, each request processes a <strong>single PDF page</strong>, which typically produces far fewer tokens than the 8,192 sizing length. This allows us to push concurrency well beyond the conservative estimates above. 
In practice, <strong>128 concurrent requests worked safely on G6E (L40S)</strong> - nearly 2x the theoretical hard cap - because actual token usage per page is much lower.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&gt;&gt; <span class="hljs-comment"># Stage 1: Extract</span>
&gt;&gt; <span class="hljs-comment"># Output dataset will be saved to S3</span>
&gt;&gt; stage1_env = {
<span class="hljs-meta">... </span> <span class="hljs-comment"># Source dataset (from HuggingFace)</span>
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;DATASET_NAME&quot;</span>: SOURCE_DATASET,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;DATASET_CONFIG&quot;</span>: SOURCE_CONFIG,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;DATASET_SPLIT&quot;</span>: <span class="hljs-string">&quot;train&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;MAX_SAMPLES&quot;</span>: <span class="hljs-built_in">str</span>(MAX_SAMPLES),
<span class="hljs-meta">... </span> <span class="hljs-comment"># Local output directory</span>
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;OUTPUT_DIR&quot;</span>: <span class="hljs-string">&quot;./outputs&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-comment"># Batch settings</span>
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;EXTRACT_BATCH_SIZE&quot;</span>: <span class="hljs-string">&quot;128&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-comment"># S3 output (single location for all stages)</span>
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;S3_OUTPUT_URI&quot;</span>: S3_OUTPUT_URI,
<span class="hljs-meta">... </span>}
&gt;&gt; stage1_trainer, stage1_job_name = launch_stage(<span class="hljs-string">&quot;extract&quot;</span>, stage1_env)<!-- HTML_TAG_END --></pre></div> <pre data-svelte-h="svelte-2lgsk4">Launching extract stage...
</pre> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Wait for Stage 1 to complete</span>
<span class="hljs-comment"># Estimated time: ~15-20 min for 1024 samples on ml.g6e.2xlarge (scales linearly)</span>
stage1_result = wait_for_job(stage1_job_name)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Extract stage completed: <span class="hljs-subst">{stage1_result[<span class="hljs-string">&#x27;TrainingJobStatus&#x27;</span>]}</span>&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&gt;&gt; <span class="hljs-comment"># Load and display samples after Extract</span>
&gt;&gt; ds_extract = load_dataset_from_s3(<span class="hljs-string">f&quot;<span class="hljs-subst">{S3_OUTPUT_URI}</span>/dataset&quot;</span>)
&gt;&gt; display_samples(ds_extract, num_samples=<span class="hljs-number">2</span>)<!-- HTML_TAG_END --></pre></div> <pre data-svelte-h="svelte-tca2us">Dataset: 1023 samples
Columns: [&#39;sample_id&#39;, &#39;dataset_index&#39;, &#39;source_image&#39;, &#39;document_with_boxes_image&#39;, &#39;document_markdown&#39;, &#39;extracted_figures&#39;, &#39;extracted_figures_metadata&#39;, &#39;document_final_markdown&#39;]
=== Sample 0: sample_00000 ===
Source image:
</pre> <h2 class="relative group"><a id="-stage-2-describe" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-stage-2-describe"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🏷️ Stage 2: Describe</span></h2> <p data-svelte-h="svelte-1ylwerz">Generate captions for extracted figures.
Input is read from S3 (output of Stage 1), output is saved to S3.</p> <p data-svelte-h="svelte-pluf5s"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/sagemaker/describe-aws.png" alt="Describe Stage"></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&gt;&gt; <span class="hljs-comment"># Stage 2: Describe</span>
&gt;&gt; <span class="hljs-comment"># Updates dataset in place (same location as extract)</span>
&gt;&gt; stage2_env = {
<span class="hljs-meta">... </span> <span class="hljs-comment"># Local output directory</span>
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;OUTPUT_DIR&quot;</span>: <span class="hljs-string">&quot;./outputs&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-comment"># Batch settings</span>
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;DESCRIBE_BATCH_SIZE&quot;</span>: <span class="hljs-string">&quot;128&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-comment"># S3 input and output (same location - updates in place)</span>
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;S3_INPUT_URI&quot;</span>: <span class="hljs-string">f&quot;<span class="hljs-subst">{S3_OUTPUT_URI}</span>/dataset&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;S3_OUTPUT_URI&quot;</span>: S3_OUTPUT_URI,
<span class="hljs-meta">... </span>}
&gt;&gt; stage2_trainer, stage2_job_name = launch_stage(<span class="hljs-string">&quot;describe&quot;</span>, stage2_env)<!-- HTML_TAG_END --></pre></div> <pre data-svelte-h="svelte-1njxhkw">Launching describe stage...
</pre> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&gt;&gt; <span class="hljs-comment"># Wait for Stage 2 to complete</span>
&gt;&gt; <span class="hljs-comment"># Estimated time: ~8-10 min for 1024 samples on ml.g6e.2xlarge</span>
&gt;&gt; stage2_result = wait_for_job(stage2_job_name)
&gt;&gt; <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Describe stage completed: <span class="hljs-subst">{stage2_result[<span class="hljs-string">&#x27;TrainingJobStatus&#x27;</span>]}</span>&quot;</span>)<!-- HTML_TAG_END --></pre></div> <pre data-svelte-h="svelte-emei6z">Waiting for job deepseek-ocr-sagemaker-describe-e7a0a2b5-20260115112243...
deepseek-ocr-sagemaker-describe-e7a0a2b5-20260115112243: Completed ✓ (00:00)
Describe stage completed: Completed
</pre> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&gt;&gt; <span class="hljs-comment"># Load and display samples after Describe</span>
&gt;&gt; ds_describe = load_dataset_from_s3(<span class="hljs-string">f&quot;<span class="hljs-subst">{S3_OUTPUT_URI}</span>/dataset&quot;</span>)
&gt;&gt; display_samples(ds_describe, num_samples=<span class="hljs-number">2</span>)<!-- HTML_TAG_END --></pre></div> <pre data-svelte-h="svelte-tca2us">Dataset: 1023 samples
Columns: [&#39;sample_id&#39;, &#39;dataset_index&#39;, &#39;source_image&#39;, &#39;document_with_boxes_image&#39;, &#39;document_markdown&#39;, &#39;extracted_figures&#39;, &#39;extracted_figures_metadata&#39;, &#39;document_final_markdown&#39;]
=== Sample 0: sample_00000 ===
Source image:
</pre> <h2 class="relative group"><a id="-stage-3-assemble" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-stage-3-assemble"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🧩 Stage 3: Assemble</span></h2> <p data-svelte-h="svelte-1dswy8c">Enrich markdown with figure captions to create the final dataset. 
This stage runs <strong>CPU-only</strong> with a lightweight image and smaller instance type - no vLLM or GPU needed.</p> <p data-svelte-h="svelte-zm8hxo">💡 Uses <code>LIGHTWEIGHT_IMAGE</code> + <code>INSTANCE_TYPE_CPU</code> instead of the full vLLM setup, significantly reducing costs.</p> <p data-svelte-h="svelte-k721qi"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/sagemaker/assemble-aws.png" alt="Assemble Stage"></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&gt;&gt; <span class="hljs-comment"># Stage 3: Assemble</span>
&gt;&gt; <span class="hljs-comment"># Updates dataset in place + saves final markdown files</span>
&gt;&gt; stage3_env = {
<span class="hljs-meta">... </span> <span class="hljs-comment"># Local output directory</span>
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;OUTPUT_DIR&quot;</span>: <span class="hljs-string">&quot;./outputs&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-comment"># S3 input and output (same location - updates in place)</span>
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;S3_INPUT_URI&quot;</span>: <span class="hljs-string">f&quot;<span class="hljs-subst">{S3_OUTPUT_URI}</span>/dataset&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;S3_OUTPUT_URI&quot;</span>: S3_OUTPUT_URI,
<span class="hljs-meta">... </span> <span class="hljs-comment"># Assemble stage doesn&#x27;t need GPU</span>
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;SKIP_SERVER_LAUNCH&quot;</span>: <span class="hljs-string">&quot;true&quot;</span>,
<span class="hljs-meta">... </span>}
&gt;&gt; stage3_trainer, stage3_job_name = launch_stage(<span class="hljs-string">&quot;assemble&quot;</span>, stage3_env, use_gpu=<span class="hljs-literal">False</span>) <span class="hljs-comment"># CPU-only</span><!-- HTML_TAG_END --></pre></div> <pre data-svelte-h="svelte-n6sm87">Launching assemble stage...
</pre> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&gt;&gt; <span class="hljs-comment"># Wait for Stage 3 to complete</span>
&gt;&gt; <span class="hljs-comment"># Estimated time: ~3-5 min (CPU-only, just text processing)</span>
&gt;&gt; stage3_result = wait_for_job(stage3_job_name)
&gt;&gt; <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Assemble stage completed: <span class="hljs-subst">{stage3_result[<span class="hljs-string">&#x27;TrainingJobStatus&#x27;</span>]}</span>&quot;</span>)<!-- HTML_TAG_END --></pre></div> <pre data-svelte-h="svelte-1ius64">Waiting for job deepseek-ocr-sagemaker-assemble-107f1257-20260115114038...
deepseek-ocr-sagemaker-assemble-107f1257-20260115114038: InProgress... (00:00)
deepseek-ocr-sagemaker-assemble-107f1257-20260115114038: InProgress... (00:30)
deepseek-ocr-sagemaker-assemble-107f1257-20260115114038: InProgress... (01:00)
deepseek-ocr-sagemaker-assemble-107f1257-20260115114038: InProgress... (01:30)
deepseek-ocr-sagemaker-assemble-107f1257-20260115114038: InProgress... (02:00)
deepseek-ocr-sagemaker-assemble-107f1257-20260115114038: InProgress... (02:30)
deepseek-ocr-sagemaker-assemble-107f1257-20260115114038: Completed ✓ (03:00)
Assemble stage completed: Completed
</pre> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&gt;&gt; <span class="hljs-comment"># Load and display final samples after Assemble</span>
&gt;&gt; ds_final = load_dataset_from_s3(<span class="hljs-string">f&quot;<span class="hljs-subst">{S3_OUTPUT_URI}</span>/dataset&quot;</span>)
&gt;&gt; display_samples(ds_final, num_samples=<span class="hljs-number">2</span>)<!-- HTML_TAG_END --></pre></div> <pre data-svelte-h="svelte-tca2us">Dataset: 1023 samples
Columns: [&#39;sample_id&#39;, &#39;dataset_index&#39;, &#39;source_image&#39;, &#39;document_with_boxes_image&#39;, &#39;document_markdown&#39;, &#39;extracted_figures&#39;, &#39;extracted_figures_metadata&#39;, &#39;document_final_markdown&#39;]
=== Sample 0: sample_00000 ===
Source image:
</pre> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Display rendered markdown with images for sample 1</span>
<span class="hljs-comment"># This properly renders figure: URIs using images from extracted_figures column</span>
display_markdown(ds_final[<span class="hljs-number">1</span>])<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="-cost-analysis-extract-stage-only" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-cost-analysis-extract-stage-only"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>💰 Cost Analysis (Extract stage only)</span></h2> <table data-svelte-h="svelte-1oi7qic"><thead><tr><th>Metric</th> <th>Value</th></tr></thead> <tbody><tr><td>🖥️ <strong>Hardware</strong></td> <td>ml.g6e.2xlarge (L40S, 48GB)</td></tr> <tr><td>⚡ <strong>Throughput</strong></td> <td>~83 pages/min</td></tr> <tr><td>🔄 <strong>Concurrency</strong></td> <td>128 parallel requests (saturates GPU batch)</td></tr> <tr><td>💵 <strong>Hourly rate</strong></td> <td>~$2.80/hour</td></tr></tbody></table> <table data-svelte-h="svelte-dzev5y"><thead><tr><th>Scale</th> <th>⏱️ Time</th> <th>💲 Cost</th></tr></thead> <tbody><tr><td>1,000 pages</td> <td>~12 min</td> <td>~$0.56</td></tr> <tr><td>10,000 pages</td> <td>~2 hours</td> <td>~$5.60</td></tr> <tr><td>100,000 pages</td> <td>~20 hours</td> <td>~$56</td></tr></tbody></table> <p 
data-svelte-h="svelte-8l8jt">📄 <strong>Note</strong>: 1 page = 1 PDF page in these benchmarks. Pricing based on <a href="https://aws.amazon.com/sagemaker/pricing/" rel="nofollow">SageMaker on-demand pricing</a>.</p> <p data-svelte-h="svelte-1rlqdi8">💡 <strong>Cost optimization</strong>: These costs can be further optimized by evaluating the best instance type and hardware utilization based on your dataset characteristics (average page complexity, token lengths, batch sizes). Consider testing different instance types (e.g., ml.g5, ml.p4d) to find the optimal price/performance ratio for your workload.</p> <h2 class="relative group"><a id="-pipeline-complete" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-pipeline-complete"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>✅ Pipeline Complete</span></h2> <p data-svelte-h="svelte-1ahom1c">The OCR pipeline has finished. 
Your dataset is available in S3:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;\n&quot;</span> + <span class="hljs-string">&quot;=&quot;</span>*<span class="hljs-number">60</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Pipeline Complete!&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;=&quot;</span>*<span class="hljs-number">60</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;\nS3 Output Location: <span class="hljs-subst">{S3_OUTPUT_URI}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot; - Dataset: <span class="hljs-subst">{S3_OUTPUT_URI}</span>/dataset/&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot; - Files: <span class="hljs-subst">{S3_OUTPUT_URI}</span>/outputs/&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;\nS3 Job Output: s3://<span class="hljs-subst">{BUCKET_NAME}</span>/<span class="hljs-subst">{S3_PREFIX}</span>/output/&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;\nJob Summary:&quot;</span>)
<span class="hljs-keyword">for</span> i, (name, result) <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>([
(<span class="hljs-string">&quot;Extract&quot;</span>, stage1_result),
(<span class="hljs-string">&quot;Describe&quot;</span>, stage2_result),
(<span class="hljs-string">&quot;Assemble&quot;</span>, stage3_result),
], <span class="hljs-number">1</span>):
status = result[<span class="hljs-string">&quot;TrainingJobStatus&quot;</span>]
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot; <span class="hljs-subst">{i}</span>. <span class="hljs-subst">{name}</span>: <span class="hljs-subst">{status}</span>&quot;</span>)<!-- HTML_TAG_END --></pre></div> <hr> <blockquote class="tip"><p data-svelte-h="svelte-176vrl1">📍 Find the complete example on GitHub <a href="https://github.com/huggingface/hub-docs/tree/main/notebooks/sagemaker-sdk/deepseek-ocr-sagemaker/sagemaker-notebook.ipynb" rel="nofollow">here</a>!</p></blockquote> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/hub-docs/blob/main/docs/sagemaker/source/examples/sagemaker-sdk-deepseek-ocr-sagemaker.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
// Client-side bootstrap for this page: sets a global config object, then
// dynamically imports the two entry modules and calls kit.start() on them.
{
// Global config object. Presumably read by the imported client runtime
// (SvelteKit, judging by the name) — confirm. The name suffix looks
// build-generated, so do not rename it by hand.
// `assets` and `base` both point at this PR build's docs root.
__sveltekit_16a821l = {
assets: "/docs/sagemaker/pr_2188/en",
base: "/docs/sagemaker/pr_2188/en",
env: {}
};
// Must be read synchronously here: document.currentScript is only set while
// this <script> is executing, not later inside the .then() callback.
const element = document.currentScript.parentElement;
// Two null placeholders; presumably one slot per entry in node_ids below — confirm.
const data = [null,null];
// Fetch both entry chunks in parallel; the app starts only once both have loaded.
Promise.all([
import("/docs/sagemaker/pr_2188/en/_app/immutable/entry/start.48a18f09.js"),
import("/docs/sagemaker/pr_2188/en/_app/immutable/entry/app.8481cdda.js")
]).then(([kit, app]) => {
// Start the app against the element captured above, with no form or error state.
kit.start(app, element, {
node_ids: [0, 4],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
121 kB
·
Xet hash:
52155cc7891a9a232036201eb36f211871becff7ea3becdb3649d7b85b64ea39

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.