from pathlib import Path
from typing import Optional

import modal

app = modal.App("example-app-with-volumes")

# Download checkpoints

# Named Modal Volume that holds the downloaded model files; it is created on
# first use if it does not exist yet.
model_cache = modal.Volume.from_name("checkpoints-cache", create_if_missing=True)

# Container image used only by the download function: pinned `huggingface_hub`
# with the `hf_transfer` extra, and HF_HUB_ENABLE_HF_TRANSFER=1 set in the
# container environment.
download_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install("huggingface_hub[hf_transfer]==0.26.2")
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)


# NOTE(review): `cache_dir` and `MINUTES` are not defined in the visible part
# of this file; they must be bound before this decorator is evaluated.
@app.function(
    image=download_image,
    volumes={cache_dir: model_cache},
    timeout=30 * MINUTES,
)
def download_model(repo_id, allow_patterns, revision: Optional[str] = None):
    """Download a Hugging Face repo snapshot into the shared model cache Volume.

    Args:
        repo_id: Hugging Face repository id, e.g. ``"org/name"``.
        allow_patterns: Patterns forwarded to ``snapshot_download`` to select
            which files of the repository are fetched.
        revision: Optional revision forwarded to ``snapshot_download``;
            ``None`` leaves the choice to ``snapshot_download``.
    """
    # Imported here because the package is installed in `download_image`,
    # not necessarily in the local environment.
    from huggingface_hub import snapshot_download

    print(f"🦙 downloading model from {repo_id} if not present")

    snapshot_download(
        repo_id=repo_id,
        revision=revision,
        local_dir=cache_dir,
        allow_patterns=allow_patterns,
    )

    model_cache.commit()  # ensure other Modal Functions can see our writes before we quit

    print("🦙 model loaded")


@app.local_entrypoint()
def main(
    video: Optional[str] = None,
    audio: str = "DeepSeek-R1",
    n_predict: int = -1,
    args: Optional[str] = None,
):
    """Run llama.cpp inference on Modal for phi-4 or deepseek r1.

    Args:
        video: Prompt text forwarded to the inference function (see the
            review note in the body about this parameter's name).
        audio: Model selector, matched case-insensitively against
            ``"phi-4"`` and ``"deepseek-r1"``.
        n_predict: Value forwarded unchanged to ``llama_cpp_inference``.
            NOTE(review): the ``-1`` default is assumed to mean "no limit",
            following llama.cpp's convention -- confirm against the callee.
        args: Optional string of extra command-line arguments; it is split
            with ``shlex.split``. When ``None``, the per-model default
            argument list is used instead.

    Raises:
        ValueError: If the model selector is not one of the two known models.
    """
    import shlex

    # NOTE(review): the body originally read `prompt`, `model`, `n_predict`
    # and `args`, none of which were parameters, so every call failed with
    # NameError. `audio` carries the model name (its default is
    # "DeepSeek-R1"); `video` is presumed to be the prompt -- confirm. The
    # original parameter names are kept so existing CLI flags keep working.
    prompt = video
    model = audio
    model_key = model.lower()

    org_name = "unsloth"
    # two sample models: the diminutive phi-4 and the chonky deepseek r1
    if model_key == "phi-4":
        model_name = "phi-4-GGUF"
        quant = "Q2_K"
        model_entrypoint_file = f"phi-4-{quant}.gguf"
        model_pattern = f"*{quant}*"
        revision = None
        parsed_args = DEFAULT_PHI_ARGS if args is None else shlex.split(args)
    elif model_key == "deepseek-r1":
        model_name = "DeepSeek-R1-GGUF"
        quant = "UD-IQ1_S"
        # Use the canonical casing for the directory: the selector is matched
        # case-insensitively, so interpolating the raw user string here would
        # produce a path that disagrees with the hard-coded file name.
        model_entrypoint_file = (
            f"DeepSeek-R1-{quant}/DeepSeek-R1-{quant}-00001-of-00003.gguf"
        )
        model_pattern = f"*{quant}*"
        revision = "02656f62d2aa9da4d3f0cdb34c341d30dd87c3b6"
        parsed_args = (
            DEFAULT_DEEPSEEK_R1_ARGS if args is None else shlex.split(args)
        )
    else:
        raise ValueError(f"Unknown model {model}")

    repo_id = f"{org_name}/{model_name}"
    download_model.remote(repo_id, [model_pattern], revision)

    # call out to a `.remote` Function on Modal for inference
    result = llama_cpp_inference.remote(
        model_entrypoint_file,
        prompt,
        n_predict,
        parsed_args,
        store_output=model_key == "deepseek-r1",
    )

    # Save the returned text locally, named after the model as the user typed it.
    output_path = Path("/tmp") / f"llama-cpp-{model}.txt"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"🦙 writing response to {output_path}")
    output_path.write_text(result)