u-ashish commited on
Commit
49d3a56
·
1 Parent(s): 6937dae

Add an example showcasing modal deployment for marker

Browse files
examples/README_MODAL.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Usage Examples
2
+
3
+ This directory contains examples of running `marker` in different contexts.
4
+
5
+ ### Usage with Modal
6
+
7
+ We have a [self-contained example](./marker_modal_deployment.py) that shows how you can quickly use [Modal](https://modal.com) to deploy `marker` by provisioning a container with a GPU and exposing it through an API, so you can submit PDFs for conversion into Markdown, HTML, or JSON.
8
+
9
+ It's a limited example that you can extend into different use cases.
10
+
11
+ #### Pre-requisites
12
+
13
+ Make sure you have the `modal` client installed by [following their instructions here](https://modal.com/docs/guide#getting-started)
14
+
15
+ #### Running the example
16
+
17
+ Once `modal` is configured, you can deploy it to your workspace by running:
18
+
19
+ > modal deploy marker_modal_deployment.py --env <YOUR_MODAL_ENV>
20
+
21
+ Notes:
22
+ - `marker` uses a few models. By default, the endpoint checks whether these models are already cached and downloads them if not (so the first request will be slow). You can avoid this by running:
23
+
24
+ > modal run --env <YOUR_MODAL_ENV> marker_modal_deployment.py::download_models
25
+
26
+ This will create a [`Modal Volume`](https://modal.com/docs/reference/modal.Volume) that stores the models for re-use.
27
+
28
+ - Regardless, once the deploy is finished, you can submit a request. To do so, get the base URL for your endpoint:
29
+ - Go into Modal
30
+ - Find the app (default name: `datalab-marker-modal-demo`)
31
+ - Click on `MarkerModalDemoService`
32
+ - You should see the URL there
33
+
34
+ - Make a request to `{BASE_URL}/convert` like this (you can also use Insomnia, etc. to help):
35
+ ```
36
+ curl --request POST \
37
+ --url {BASE_URL}/convert \
38
+ --header 'Content-Type: multipart/form-data' \
39
+ --form file=@/Users/cooldev/sample.pdf \
40
+ --form output_format=html
41
+ ```
42
+
43
+ You should get a response like this:
44
+
45
+ ```
46
+ {
47
+ "success": true,
48
+ "filename": "sample.pdf",
49
+ "output_format": "html",
50
+ "json": null,
51
+ "html": "<YOUR_RESPONSE_CONTENT>",
52
+ "markdown": null,
53
+ "images": {},
54
+ "metadata": {... page level metadata ...},
55
+ "page_count": 2
56
+ }
57
+ ```
examples/marker_modal_deployment.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Modal deployment for Datalab Marker PDF conversion service.
3
+ """
4
+
5
+ import modal
6
+ import os
7
+
8
# Shared constants: the GPU type requested for every function/class below and
# the in-container path at which the models volume is mounted.
GPU_TYPE = "L40S"
MODEL_PATH_PREFIX = "/root/.cache/datalab/models"

# The Modal app that owns the download function and the web service.
app = modal.App("datalab-marker-modal-demo")

# Persistent volume used to cache model weights between container starts.
models_volume = modal.Volume.from_name(
    "marker-models-modal-demo",
    create_if_missing=True,
)

# Container image: Debian slim + system tools + marker and the web stack.
# NOTE(review): MARKER_CACHE_DIR points at "/models" while the volume is
# mounted at MODEL_PATH_PREFIX — confirm which path marker actually uses.
image = (
    modal.Image.debian_slim(python_version="3.10")
    .apt_install(["git", "wget"])
    .env({"TORCH_DEVICE": "cuda", "MARKER_CACHE_DIR": "/models"})
    .pip_install(
        [
            "marker-pdf[full]",
            "fastapi==0.104.1",
            "uvicorn==0.24.0",
            "python-multipart==0.0.6",
            "torch>=2.2.2,<3.0.0",
            "torchvision>=0.17.0",
            "torchaudio>=2.2.0",
        ]
    )
)
31
+
32
def setup_models_with_cache_check(logger, commit_volume=False):
    """
    Build the marker model dict, logging the state of the model cache.

    The cache directory (``MODEL_PATH_PREFIX``) is inspected and logged both
    before and after ``create_model_dict()`` runs, so the logs show whether
    the models came from the cache or had to be fetched.

    Args:
        logger: Logger used for all progress messages.
        commit_volume: When true, call ``models_volume.commit()`` after the
            models have been created.

    Returns:
        The object returned by ``marker.models.create_model_dict()``.
    """
    import gc
    import os
    from marker.models import create_model_dict

    # Inspect the cache before loading anything.
    cache_present = os.path.exists(MODEL_PATH_PREFIX)
    cached_entries = []
    if cache_present:
        cached_entries = os.listdir(MODEL_PATH_PREFIX)

    logger.info(f"Models cache directory exists: {cache_present}")
    logger.info(f"Models cache directory contents: {cached_entries}")

    if not (cache_present and cached_entries):
        logger.warning("No models found in volume cache. Models will be downloaded now (this may take several minutes).")
    else:
        logger.info("Found existing models in volume cache, loading from cache...")

    # Create/load the models.
    loaded = create_model_dict()
    logger.info(f"Successfully loaded {len(loaded)} models")

    # Report what the cache holds now that loading has finished.
    if os.path.exists(MODEL_PATH_PREFIX):
        logger.info(f"Models in cache: {os.listdir(MODEL_PATH_PREFIX)}")

    if not commit_volume:
        return loaded

    # Run a garbage-collection pass, then commit the volume.
    gc.collect()
    logger.info("Attempting to commit volume...")
    models_volume.commit()
    logger.info("Volume committed successfully")
    return loaded
69
+
70
@app.function(
    image=image,
    gpu=GPU_TYPE,
    timeout=600,
    volumes={MODEL_PATH_PREFIX: models_volume},
)
def download_models():
    """
    Populate the Modal volume with the models marker needs.

    Delegates to ``setup_models_with_cache_check`` with volume committing
    enabled.

    Returns:
        A human-readable summary string listing the loaded model keys.

    Raises:
        Exception: Any error raised while loading models is logged and
            re-raised unchanged.
    """
    import logging

    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger(__name__)

    log.info("Downloading models to persistent volume...")
    log.info(f"Volume mounted at: {MODEL_PATH_PREFIX}")

    try:
        loaded = setup_models_with_cache_check(log, commit_volume=True)
        summary = f"Models downloaded successfully: {list(loaded.keys())}"
        return summary
    except Exception as exc:
        log.error(f"Failed to download models: {exc}")
        raise
94
+
95
@app.cls(
    image=image,
    gpu=GPU_TYPE,
    memory=16384,
    timeout=600,  # 10 minute timeout for large documents
    volumes={MODEL_PATH_PREFIX: models_volume},
    scaledown_window=300,
)
class MarkerModalDemoService:
    """
    Modal class that loads the marker models once per container and serves a
    FastAPI app with ``/health`` and ``/convert`` endpoints.
    """

    @modal.enter()
    def load_models(self):
        """
        Load models once per container using @modal.enter() for efficiency.

        On failure the error is logged with its traceback and ``self.models``
        is set to ``None``; the endpoints then report that models are not
        loaded instead of the container failing to start.
        """
        import logging

        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        logger = logging.getLogger(__name__)

        logger.info("Loading Marker models using @modal.enter()...")
        try:
            self.models = setup_models_with_cache_check(logger, commit_volume=True)
        except Exception as e:
            # logger.exception records the traceback through the logger.
            logger.exception("Error loading models: %s", e)
            self.models = None

    @modal.asgi_app()
    def fastapi_app(self):
        """
        Build and return the FastAPI application served by this class.

        Returns:
            The configured ``FastAPI`` instance exposing ``GET /health`` and
            ``POST /convert``.
        """
        import base64
        import io
        import logging
        import os
        import tempfile
        from contextlib import asynccontextmanager, suppress
        from pathlib import Path
        from typing import Optional

        from fastapi import FastAPI, Form, File, UploadFile, HTTPException
        from fastapi.responses import JSONResponse

        from marker.converters.pdf import PdfConverter
        from marker.config.parser import ConfigParser
        from marker.settings import settings

        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        logger = logging.getLogger(__name__)

        @asynccontextmanager
        async def lifespan(app: FastAPI):
            # Models are already loaded in @modal.enter()
            logger.info("Datalab Marker / Modal demo app starting up...")
            yield
            logger.info("Datalab Marker / Modal demo app shutting down...")

        # Create FastAPI app
        web_app = FastAPI(
            title="Datalab Marker PDF Conversion Service - Modal Demo",
            description="Convert PDFs and documents to markdown, JSON, or HTML using Marker, deployed on Modal",
            version="1.0.0",
            lifespan=lifespan
        )

        @web_app.get("/health")
        async def health_check():
            """Report whether models are loaded plus a peek at the cache directory."""
            models_loaded = getattr(self, 'models', None) is not None
            model_count = len(self.models) if models_loaded else 0

            # Check volume contents for debugging
            cache_exists = os.path.exists(MODEL_PATH_PREFIX)
            cache_contents = os.listdir(MODEL_PATH_PREFIX) if cache_exists else []

            return {
                "status": "healthy" if models_loaded else "loading",
                "models_loaded": models_loaded,
                "model_count": model_count,
                "cache_dir": MODEL_PATH_PREFIX,
                "cache_exists": cache_exists,
                "cache_contents": cache_contents[:10]
            }

        @web_app.post("/convert")
        async def convert_document(
            file: UploadFile = File(..., description="Document to convert"),
            page_range: Optional[str] = Form(None),
            force_ocr: bool = Form(False),
            paginate_output: bool = Form(False),
            output_format: str = Form("markdown"),
            use_llm: bool = Form(False),
        ):
            """
            Convert uploaded document to specified format.

            Raises:
                HTTPException: 503 when models are not loaded, 400 for an
                    unsupported file extension or output format, 500 when
                    the conversion itself fails.
            """

            if getattr(self, 'models', None) is None:
                logger.error("Models not available for conversion")
                raise HTTPException(status_code=503, detail="Models not loaded yet. Please wait for model initialization.")

            # Validate file type. The client may omit the filename, so fall
            # back to an empty name (empty suffix -> rejected with a 400).
            allowed_extensions = {'.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.bmp'}
            file_ext = Path(file.filename or "").suffix.lower()
            if file_ext not in allowed_extensions:
                raise HTTPException(
                    status_code=400,
                    detail=f"Unsupported file type: {file_ext}. Supported: {allowed_extensions}"
                )

            # Validate output format
            if output_format not in ["markdown", "json", "html", "chunks"]:
                raise HTTPException(
                    status_code=400,
                    detail="Output format must be one of: markdown, json, html, chunks"
                )

            # Bound before the try block so the cleanup in `finally` can
            # always reference it, even if reading the upload fails.
            temp_path = None
            try:
                # Read file content
                file_content = await file.read()

                # Save to a uniquely named temporary file. The client-supplied
                # filename is never used as a path: that would allow path
                # traversal ("../../x.pdf") and collisions between concurrent
                # uploads that share a name. Only the validated extension is kept.
                with tempfile.NamedTemporaryFile(suffix=file_ext, delete=False) as temp_file:
                    temp_path = temp_file.name
                    temp_file.write(file_content)

                # Configure conversion parameters
                config = {
                    "filepath": temp_path,
                    "page_range": page_range,
                    "force_ocr": force_ocr,
                    "paginate_output": paginate_output,
                    "output_format": output_format,
                    "use_llm": use_llm,
                }

                # Create converter
                config_parser = ConfigParser(config)
                config_dict = config_parser.generate_config_dict()
                config_dict["pdftext_workers"] = 1

                converter = PdfConverter(
                    config=config_dict,
                    artifact_dict=self.models,
                    processor_list=config_parser.get_processors(),
                    renderer=config_parser.get_renderer(),
                    llm_service=config_parser.get_llm_service() if use_llm else None,
                )

                # Convert document - converter already applies the appropriate renderer
                logger.info(f"Converting {file.filename} to {output_format}...")
                rendered_output = converter(temp_path)

                # Extract content based on output format
                json_content = None
                html_content = None
                markdown_content = None
                encoded_images = {}

                if output_format == "json":
                    # For JSON, return the structured data directly
                    json_content = rendered_output.model_dump()
                else:
                    from marker.output import text_from_rendered
                    text, _, images = text_from_rendered(rendered_output)

                    # Assign to appropriate content field.
                    # NOTE(review): "chunks" output also lands in the
                    # `markdown` field here — confirm this is intended.
                    if output_format == "html":
                        html_content = text
                    else:
                        markdown_content = text

                    # Encode images as base64
                    for img_name, img_obj in images.items():
                        byte_stream = io.BytesIO()
                        img_obj.save(byte_stream, format=settings.OUTPUT_IMAGE_FORMAT)
                        encoded_images[img_name] = base64.b64encode(byte_stream.getvalue()).decode('utf-8')

                metadata = rendered_output.metadata

                logger.info(f"Conversion completed for {file.filename}")

                return JSONResponse({
                    "success": True,
                    "filename": file.filename,
                    "output_format": output_format,
                    "json": json_content,
                    "html": html_content,
                    "markdown": markdown_content,
                    "images": encoded_images,
                    "metadata": metadata,
                    "page_count": len(metadata.get("page_stats", [])),
                })

            except Exception as e:
                logger.exception("Conversion error for %s: %s", file.filename, e)

                raise HTTPException(
                    status_code=500,
                    detail=f"Conversion failed: {str(e)}"
                ) from e

            finally:
                # Single cleanup point for both success and failure paths.
                if temp_path is not None:
                    with suppress(FileNotFoundError):
                        os.unlink(temp_path)

        return web_app