Upload folder using huggingface_hub

714cf46 verified 20 days ago

31.2 kB

	"""
	Backend-only Modal app for GUI-driven Protify workflows.

	This module intentionally avoids browser UI dependencies. It exposes remote
	functions that the local Tk GUI can call to deploy, submit jobs, monitor status,
	cancel jobs, and fetch artifacts.
	"""

	import base64
	import json
	import os
	import random
	import shutil
	import string
	import subprocess
	import threading
	import time
	from datetime import datetime, timezone
	from pathlib import Path
	from typing import Any, Dict, Optional

	import modal
	import yaml


	SCRIPT_DIR = Path(__file__).parent.resolve()
	PROJECT_ROOT = SCRIPT_DIR.parents[1]

	APP_NAME = "protify-backend"
	PROTIFY_DEFAULT_GPU = "A10"
	AVAILABLE_GPUS = ["H200", "H100", "A100-80GB", "A100", "L40S", "A10", "L4", "T4"]

	GPU_CPU_MIN, GPU_CPU_MAX = 8.0, 16.0
	GPU_MEMORY_MIN, GPU_MEMORY_MAX = 65536, 262144
	MAX_CONTAINERS_GPU = 8
	CPU_MEMORY_MIN, CPU_MEMORY_MAX = 4096, 8192
	CPU_COUNT_MIN, CPU_COUNT_MAX = 2.0, 4.0
	MAX_CONTAINERS_CPU = 10
	SCALEDOWN_WINDOW_GPU = 10
	SCALEDOWN_WINDOW_CPU = 300
	TIMEOUT_SECONDS = 86400
	HEARTBEAT_SECONDS = 10

	STATUS_FILE_PATH = "/data/job_status.json"
	LOG_DIR_DEFAULT = "/data/logs"
	RESULTS_DIR_DEFAULT = "/data/results"
	PLOTS_DIR_DEFAULT = "/data/plots"
	WEIGHTS_DIR_DEFAULT = "/data/weights"
	EMBED_DIR_DEFAULT = "/data/embeddings"
	DOWNLOAD_DIR_DEFAULT = "/data/downloads"


	def _build_image():
	image = (
	modal.Image.debian_slim(python_version="3.10")
	.apt_install("git", "wget", "curl")
	.run_commands("pip install --upgrade pip setuptools")
	)

	req_file_path = "requirements.txt"
	if (PROJECT_ROOT / req_file_path).exists():
	image = image.add_local_file(req_file_path, "/tmp/requirements.txt", copy=True)
	image = image.run_commands("pip install -r /tmp/requirements.txt")
	else:
	image = image.run_commands("pip install torch transformers datasets")

	src_dir_path = "src"
	if (PROJECT_ROOT / src_dir_path).exists():
	image = image.add_local_dir(src_dir_path, "/root/src", copy=True)

	readme_file_path = "README.md"
	if (PROJECT_ROOT / readme_file_path).exists():
	image = image.add_local_file(readme_file_path, "/root/README.md", copy=True)

	image = image.env(
	{
	"TF_CPP_MIN_LOG_LEVEL": "2",
	"TF_ENABLE_ONEDNN_OPTS": "0",
	"TOKENIZERS_PARALLELISM": "true",
	"CUBLAS_WORKSPACE_CONFIG": ":4096:8",
	}
	)
	return image


	app = modal.App(APP_NAME)
	image = _build_image()
	volume = modal.Volume.from_name("protify-data", create_if_missing=True)

	_status_lock = threading.Lock()


	def _now_utc_iso() -> str:
	return datetime.now(timezone.utc).isoformat()


	def _generate_job_id() -> str:
	random_letters = "".join(random.choices(string.ascii_uppercase, k=4))
	date_str = datetime.now().strftime("%Y-%m-%d-%H-%M")
	return f"{date_str}_{random_letters}"


	def _safe_read_json(json_path: str) -> Dict[str, Any]:
	if not os.path.exists(json_path):
	return {}
	try:
	with open(json_path, "r", encoding="utf-8") as file:
	return json.load(file)
	except Exception:
	return {}


	def _safe_write_json(json_path: str, payload: Dict[str, Any]) -> None:
	os.makedirs(os.path.dirname(json_path), exist_ok=True)
	with open(json_path, "w", encoding="utf-8") as file:
	json.dump(payload, file, indent=2)


	def _update_job_status(job_id: str, patch: Dict[str, Any]) -> Dict[str, Any]:
	with _status_lock:
	status_store = _safe_read_json(STATUS_FILE_PATH)
	if job_id not in status_store:
	status_store[job_id] = {
	"job_id": job_id,
	"status": "PENDING",
	"phase": "created",
	"created_at_utc": _now_utc_iso(),
	"updated_at_utc": _now_utc_iso(),
	}
	status_store[job_id].update(patch)
	status_store[job_id]["updated_at_utc"] = _now_utc_iso()
	_safe_write_json(STATUS_FILE_PATH, status_store)
	volume.commit()
	return status_store[job_id]


	def _infer_phase_from_line(line: str, current_phase: str) -> str:
	lowered = line.lower()
	if "loading and preparing datasets" in lowered or "getting data" in lowered:
	return "data_loading"
	if "computing embeddings" in lowered or "saving embeddings" in lowered or "download embeddings" in lowered:
	return "embedding"
	if "starting training" in lowered or "training probe" in lowered or "run_wandb_hyperopt" in lowered:
	return "training"
	if "proteingym" in lowered:
	return "proteingym"
	if "generating visualization plots" in lowered:
	return "plotting"
	if "successfully saved model to huggingface hub" in lowered:
	return "pushing_to_hub"
	return current_phase


	def _tail_text(text: str, max_chars: int) -> str:
	if len(text) <= max_chars:
	return text
	return text[-max_chars:]


	def _fix_paths(config_obj: Any) -> Any:
	if isinstance(config_obj, dict):
	for key in list(config_obj.keys()):
	value = config_obj[key]
	if isinstance(value, str):
	if value.startswith("data/") or value.startswith("local_data/"):
	config_obj[key] = f"/data/{value.split('/', 1)[1]}"
	elif key.endswith("_dir") and (not os.path.isabs(value)):
	config_obj[key] = f"/data/{value}"
	elif isinstance(value, list):
	config_obj[key] = [_fix_paths(item) for item in value]
	elif isinstance(value, dict):
	config_obj[key] = _fix_paths(value)
	elif isinstance(config_obj, list):
	return [_fix_paths(item) for item in config_obj]
	return config_obj


	def _prepare_config(config: Dict[str, Any]) -> Dict[str, Any]:
	config_copy = dict(config)
	config_copy = _fix_paths(config_copy)

	if ("log_dir" not in config_copy) or (not config_copy["log_dir"]):
	config_copy["log_dir"] = LOG_DIR_DEFAULT
	if ("results_dir" not in config_copy) or (not config_copy["results_dir"]):
	config_copy["results_dir"] = RESULTS_DIR_DEFAULT
	if ("model_save_dir" not in config_copy) or (not config_copy["model_save_dir"]):
	config_copy["model_save_dir"] = WEIGHTS_DIR_DEFAULT
	if ("embedding_save_dir" not in config_copy) or (not config_copy["embedding_save_dir"]):
	config_copy["embedding_save_dir"] = EMBED_DIR_DEFAULT
	if ("plots_dir" not in config_copy) or (not config_copy["plots_dir"]):
	config_copy["plots_dir"] = PLOTS_DIR_DEFAULT
	if ("download_dir" not in config_copy) or (not config_copy["download_dir"]):
	config_copy["download_dir"] = DOWNLOAD_DIR_DEFAULT
	if "replay_path" not in config_copy:
	config_copy["replay_path"] = None
	if "pretrained_probe_path" not in config_copy:
	config_copy["pretrained_probe_path"] = None
	if "hf_home" not in config_copy:
	config_copy["hf_home"] = None

	path_keys = ["log_dir", "results_dir", "model_save_dir", "embedding_save_dir", "plots_dir", "download_dir"]
	for path_key in path_keys:
	os.makedirs(config_copy[path_key], exist_ok=True)

	return config_copy


	def _execute_protify_job(
	config: Dict[str, Any],
	hf_token: Optional[str] = None,
	wandb_api_key: Optional[str] = None,
	synthyra_api_key: Optional[str] = None,
	job_id: Optional[str] = None,
	gpu_type: Optional[str] = None,
	timeout_seconds: int = TIMEOUT_SECONDS,
	) -> Dict[str, Any]:
	if job_id is None:
	job_id = _generate_job_id()

	selected_gpu = gpu_type if gpu_type in AVAILABLE_GPUS else PROTIFY_DEFAULT_GPU
	_update_job_status(
	job_id,
	{
	"status": "RUNNING",
	"phase": "startup",
	"gpu_type": selected_gpu,
	"last_heartbeat_utc": _now_utc_iso(),
	"started_at_utc": _now_utc_iso(),
	"error": None,
	},
	)

	active_hf_token = hf_token
	if active_hf_token is None:
	active_hf_token = os.environ.get("HF_TOKEN")

	if active_hf_token is not None:
	try:
	from huggingface_hub import login

	os.environ["HF_TOKEN"] = active_hf_token
	login(active_hf_token)
	except Exception:
	pass

	prepared_config = _prepare_config(config)
	log_file_path = os.path.join(prepared_config["log_dir"], f"{job_id}.txt")
	_update_job_status(
	job_id,
	{
	"log_file_path": log_file_path,
	"results_dir": prepared_config["results_dir"],
	"plots_dir": prepared_config["plots_dir"],
	},
	)

	run_dir = Path("/tmp/protify_run") / job_id
	run_dir.mkdir(parents=True, exist_ok=True)
	config_path = run_dir / "config.yaml"

	config_to_dump = dict(prepared_config)
	config_to_dump["hf_token"] = None
	config_to_dump["wandb_api_key"] = None
	config_to_dump["synthyra_api_key"] = None
	with open(config_path, "w", encoding="utf-8") as config_file:
	yaml.dump(config_to_dump, config_file, default_flow_style=False, allow_unicode=True, sort_keys=False)

	command = ["python", "-u", "main.py", "--yaml_path", str(config_path)]
	if active_hf_token is not None:
	command.extend(["--hf_token", active_hf_token])
	if wandb_api_key is not None:
	command.extend(["--wandb_api_key", wandb_api_key])
	if synthyra_api_key is not None:
	command.extend(["--synthyra_api_key", synthyra_api_key])

	process_env = os.environ.copy()
	process_env["PYTHONPATH"] = "/root/src"
	process_env["WORKING_DIR"] = "/root"
	process_env["PYTHONUNBUFFERED"] = "1"
	process_env["PROTIFY_JOB_ID"] = job_id
	process_env["CUDA_VISIBLE_DEVICES"] = "0"
	if active_hf_token is not None:
	process_env["HF_TOKEN"] = active_hf_token
	if wandb_api_key is not None:
	process_env["WANDB_API_KEY"] = wandb_api_key

	os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
	with open(log_file_path, "w", encoding="utf-8") as log_file:
	log_file.write(f"[{_now_utc_iso()}] Starting job {job_id}\n")
	log_file.write(f"GPU={selected_gpu}\n")
	log_file.write(f"Command={' '.join(command)}\n")
	volume.commit()

	stdout_lines = []
	stderr_lines = []
	log_lock = threading.Lock()
	phase_state = {"phase": "startup"}

	def append_log(log_line: str) -> None:
	with log_lock:
	with open(log_file_path, "a", encoding="utf-8", errors="ignore") as log_file:
	log_file.write(log_line + "\n")

	def stream_output(pipe, output_list, prefix: str = ""):
	try:
	for line in iter(pipe.readline, ""):
	if not line:
	continue
	clean_line = line.rstrip("\n")
	full_line = f"{prefix}{clean_line}"
	output_list.append(full_line)
	phase_state["phase"] = _infer_phase_from_line(clean_line, phase_state["phase"])
	append_log(full_line)
	print(full_line, flush=True)
	finally:
	pipe.close()

	timed_out = False
	process = None
	try:
	process = subprocess.Popen(
	command,
	stdout=subprocess.PIPE,
	stderr=subprocess.PIPE,
	text=True,
	bufsize=1,
	cwd="/root/src/protify",
	env=process_env,
	)

	stdout_thread = threading.Thread(target=stream_output, args=(process.stdout, stdout_lines, ""), daemon=True)
	stderr_thread = threading.Thread(target=stream_output, args=(process.stderr, stderr_lines, "[STDERR] "), daemon=True)
	stdout_thread.start()
	stderr_thread.start()

	max_runtime_seconds = timeout_seconds
	if max_runtime_seconds > TIMEOUT_SECONDS - 60:
	max_runtime_seconds = TIMEOUT_SECONDS - 60

	start_time = time.time()
	last_heartbeat = 0.0
	while process.poll() is None:
	now = time.time()
	if now - start_time > max_runtime_seconds:
	timed_out = True
	process.kill()
	break
	if now - last_heartbeat >= HEARTBEAT_SECONDS:
	_update_job_status(
	job_id,
	{
	"status": "RUNNING",
	"phase": phase_state["phase"],
	"last_heartbeat_utc": _now_utc_iso(),
	},
	)
	last_heartbeat = now
	time.sleep(1)

	stdout_thread.join(timeout=5)
	stderr_thread.join(timeout=5)

	return_code = process.returncode if process is not None else -1
	stdout_text = "\n".join(stdout_lines)
	stderr_text = "\n".join(stderr_lines)

	if timed_out:
	_update_job_status(
	job_id,
	{
	"status": "TIMEOUT",
	"phase": "timeout",
	"last_heartbeat_utc": _now_utc_iso(),
	"error": f"Process timed out after {max_runtime_seconds} seconds.",
	"exit_code": -1,
	"finished_at_utc": _now_utc_iso(),
	},
	)
	return {
	"success": False,
	"job_id": job_id,
	"status": "TIMEOUT",
	"error": f"Process timed out after {max_runtime_seconds} seconds.",
	"stdout": _tail_text(stdout_text, 5000),
	}

	if return_code != 0:
	_update_job_status(
	job_id,
	{
	"status": "FAILED",
	"phase": "failed",
	"last_heartbeat_utc": _now_utc_iso(),
	"error": _tail_text(stderr_text, 5000) if stderr_text else "Unknown subprocess error.",
	"exit_code": return_code,
	"finished_at_utc": _now_utc_iso(),
	},
	)
	return {
	"success": False,
	"job_id": job_id,
	"status": "FAILED",
	"error": _tail_text(stderr_text, 5000) if stderr_text else "Unknown subprocess error.",
	"stdout": _tail_text(stdout_text, 5000),
	}

	_update_job_status(
	job_id,
	{
	"status": "SUCCESS",
	"phase": "completed",
	"last_heartbeat_utc": _now_utc_iso(),
	"error": None,
	"exit_code": return_code,
	"finished_at_utc": _now_utc_iso(),
	},
	)
	return {
	"success": True,
	"job_id": job_id,
	"status": "SUCCESS",
	"stdout": _tail_text(stdout_text, 5000),
	}
	except Exception as error:
	_update_job_status(
	job_id,
	{
	"status": "FAILED",
	"phase": "exception",
	"last_heartbeat_utc": _now_utc_iso(),
	"error": str(error),
	"exit_code": -1,
	"finished_at_utc": _now_utc_iso(),
	},
	)
	return {
	"success": False,
	"job_id": job_id,
	"status": "FAILED",
	"error": str(error),
	"stdout": "",
	}


	@app.function(
	image=image,
	gpu="H200",
	volumes={"/data": volume},
	memory=(GPU_MEMORY_MIN, GPU_MEMORY_MAX),
	cpu=(GPU_CPU_MIN, GPU_CPU_MAX),
	max_containers=MAX_CONTAINERS_GPU,
	scaledown_window=SCALEDOWN_WINDOW_GPU,
	timeout=TIMEOUT_SECONDS,
	)
	def run_protify_job_h200(
	config: Dict[str, Any],
	hf_token: Optional[str] = None,
	wandb_api_key: Optional[str] = None,
	synthyra_api_key: Optional[str] = None,
	job_id: Optional[str] = None,
	gpu_type: Optional[str] = None,
	timeout_seconds: int = TIMEOUT_SECONDS,
	) -> Dict[str, Any]:
	return _execute_protify_job(config, hf_token, wandb_api_key, synthyra_api_key, job_id, gpu_type, timeout_seconds)


	@app.function(
	image=image,
	gpu="H100",
	volumes={"/data": volume},
	memory=(GPU_MEMORY_MIN, GPU_MEMORY_MAX),
	cpu=(GPU_CPU_MIN, GPU_CPU_MAX),
	max_containers=MAX_CONTAINERS_GPU,
	scaledown_window=SCALEDOWN_WINDOW_GPU,
	timeout=TIMEOUT_SECONDS,
	)
	def run_protify_job_h100(
	config: Dict[str, Any],
	hf_token: Optional[str] = None,
	wandb_api_key: Optional[str] = None,
	synthyra_api_key: Optional[str] = None,
	job_id: Optional[str] = None,
	gpu_type: Optional[str] = None,
	timeout_seconds: int = TIMEOUT_SECONDS,
	) -> Dict[str, Any]:
	return _execute_protify_job(config, hf_token, wandb_api_key, synthyra_api_key, job_id, gpu_type, timeout_seconds)


	@app.function(
	image=image,
	gpu="A100-80GB",
	volumes={"/data": volume},
	memory=(GPU_MEMORY_MIN, GPU_MEMORY_MAX),
	cpu=(GPU_CPU_MIN, GPU_CPU_MAX),
	max_containers=MAX_CONTAINERS_GPU,
	scaledown_window=SCALEDOWN_WINDOW_GPU,
	timeout=TIMEOUT_SECONDS,
	)
	def run_protify_job_a100_80gb(
	config: Dict[str, Any],
	hf_token: Optional[str] = None,
	wandb_api_key: Optional[str] = None,
	synthyra_api_key: Optional[str] = None,
	job_id: Optional[str] = None,
	gpu_type: Optional[str] = None,
	timeout_seconds: int = TIMEOUT_SECONDS,
	) -> Dict[str, Any]:
	return _execute_protify_job(config, hf_token, wandb_api_key, synthyra_api_key, job_id, gpu_type, timeout_seconds)


	@app.function(
	image=image,
	gpu="A100",
	volumes={"/data": volume},
	memory=(GPU_MEMORY_MIN, GPU_MEMORY_MAX),
	cpu=(GPU_CPU_MIN, GPU_CPU_MAX),
	max_containers=MAX_CONTAINERS_GPU,
	scaledown_window=SCALEDOWN_WINDOW_GPU,
	timeout=TIMEOUT_SECONDS,
	)
	def run_protify_job_a100(
	config: Dict[str, Any],
	hf_token: Optional[str] = None,
	wandb_api_key: Optional[str] = None,
	synthyra_api_key: Optional[str] = None,
	job_id: Optional[str] = None,
	gpu_type: Optional[str] = None,
	timeout_seconds: int = TIMEOUT_SECONDS,
	) -> Dict[str, Any]:
	return _execute_protify_job(config, hf_token, wandb_api_key, synthyra_api_key, job_id, gpu_type, timeout_seconds)


	@app.function(
	image=image,
	gpu="L40S",
	volumes={"/data": volume},
	memory=(GPU_MEMORY_MIN, GPU_MEMORY_MAX),
	cpu=(GPU_CPU_MIN, GPU_CPU_MAX),
	max_containers=MAX_CONTAINERS_GPU,
	scaledown_window=SCALEDOWN_WINDOW_GPU,
	timeout=TIMEOUT_SECONDS,
	)
	def run_protify_job_l40s(
	config: Dict[str, Any],
	hf_token: Optional[str] = None,
	wandb_api_key: Optional[str] = None,
	synthyra_api_key: Optional[str] = None,
	job_id: Optional[str] = None,
	gpu_type: Optional[str] = None,
	timeout_seconds: int = TIMEOUT_SECONDS,
	) -> Dict[str, Any]:
	return _execute_protify_job(config, hf_token, wandb_api_key, synthyra_api_key, job_id, gpu_type, timeout_seconds)


	@app.function(
	image=image,
	gpu="A10",
	volumes={"/data": volume},
	memory=(GPU_MEMORY_MIN, GPU_MEMORY_MAX),
	cpu=(GPU_CPU_MIN, GPU_CPU_MAX),
	max_containers=MAX_CONTAINERS_GPU,
	scaledown_window=SCALEDOWN_WINDOW_GPU,
	timeout=TIMEOUT_SECONDS,
	)
	def run_protify_job_a10(
	config: Dict[str, Any],
	hf_token: Optional[str] = None,
	wandb_api_key: Optional[str] = None,
	synthyra_api_key: Optional[str] = None,
	job_id: Optional[str] = None,
	gpu_type: Optional[str] = None,
	timeout_seconds: int = TIMEOUT_SECONDS,
	) -> Dict[str, Any]:
	return _execute_protify_job(config, hf_token, wandb_api_key, synthyra_api_key, job_id, gpu_type, timeout_seconds)


	@app.function(
	image=image,
	gpu="L4",
	volumes={"/data": volume},
	memory=(GPU_MEMORY_MIN, GPU_MEMORY_MAX),
	cpu=(GPU_CPU_MIN, GPU_CPU_MAX),
	max_containers=MAX_CONTAINERS_GPU,
	scaledown_window=SCALEDOWN_WINDOW_GPU,
	timeout=TIMEOUT_SECONDS,
	)
	def run_protify_job_l4(
	config: Dict[str, Any],
	hf_token: Optional[str] = None,
	wandb_api_key: Optional[str] = None,
	synthyra_api_key: Optional[str] = None,
	job_id: Optional[str] = None,
	gpu_type: Optional[str] = None,
	timeout_seconds: int = TIMEOUT_SECONDS,
	) -> Dict[str, Any]:
	return _execute_protify_job(config, hf_token, wandb_api_key, synthyra_api_key, job_id, gpu_type, timeout_seconds)


	@app.function(
	image=image,
	gpu="T4",
	volumes={"/data": volume},
	memory=(GPU_MEMORY_MIN, GPU_MEMORY_MAX),
	cpu=(GPU_CPU_MIN, GPU_CPU_MAX),
	max_containers=MAX_CONTAINERS_GPU,
	scaledown_window=SCALEDOWN_WINDOW_GPU,
	timeout=TIMEOUT_SECONDS,
	)
	def run_protify_job_t4(
	config: Dict[str, Any],
	hf_token: Optional[str] = None,
	wandb_api_key: Optional[str] = None,
	synthyra_api_key: Optional[str] = None,
	job_id: Optional[str] = None,
	gpu_type: Optional[str] = None,
	timeout_seconds: int = TIMEOUT_SECONDS,
	) -> Dict[str, Any]:
	return _execute_protify_job(config, hf_token, wandb_api_key, synthyra_api_key, job_id, gpu_type, timeout_seconds)


	gpu_functions = {
	"H200": run_protify_job_h200,
	"H100": run_protify_job_h100,
	"A100-80GB": run_protify_job_a100_80gb,
	"A100": run_protify_job_a100,
	"L40S": run_protify_job_l40s,
	"A10": run_protify_job_a10,
	"L4": run_protify_job_l4,
	"T4": run_protify_job_t4,
	}


	@app.function(
	image=image,
	volumes={"/data": volume},
	memory=(CPU_MEMORY_MIN, CPU_MEMORY_MAX),
	cpu=(CPU_COUNT_MIN, CPU_COUNT_MAX),
	max_containers=MAX_CONTAINERS_CPU,
	scaledown_window=SCALEDOWN_WINDOW_CPU,
	)
	def submit_protify_job(
	config: Dict[str, Any],
	gpu_type: str = PROTIFY_DEFAULT_GPU,
	hf_token: Optional[str] = None,
	wandb_api_key: Optional[str] = None,
	synthyra_api_key: Optional[str] = None,
	timeout_seconds: int = TIMEOUT_SECONDS,
	job_id: Optional[str] = None,
	) -> Dict[str, Any]:
	if job_id is None:
	job_id = _generate_job_id()

	selected_gpu = gpu_type if gpu_type in AVAILABLE_GPUS else PROTIFY_DEFAULT_GPU
	_update_job_status(
	job_id,
	{
	"status": "PENDING",
	"phase": "queued",
	"gpu_type": selected_gpu,
	"last_heartbeat_utc": _now_utc_iso(),
	"error": None,
	},
	)

	selected_gpu_function = gpu_functions[selected_gpu]
	handle = selected_gpu_function.spawn(
	config=config,
	hf_token=hf_token,
	wandb_api_key=wandb_api_key,
	synthyra_api_key=synthyra_api_key,
	job_id=job_id,
	gpu_type=selected_gpu,
	timeout_seconds=timeout_seconds,
	)
	function_call_id = handle.object_id
	_update_job_status(
	job_id,
	{
	"status": "RUNNING",
	"phase": "queued",
	"function_call_id": function_call_id,
	"last_heartbeat_utc": _now_utc_iso(),
	},
	)
	return {
	"success": True,
	"job_id": job_id,
	"function_call_id": function_call_id,
	"status": "RUNNING",
	"gpu_type": selected_gpu,
	}


	@app.function(
	image=image,
	volumes={"/data": volume},
	memory=(CPU_MEMORY_MIN, CPU_MEMORY_MAX),
	cpu=(CPU_COUNT_MIN, CPU_COUNT_MAX),
	max_containers=MAX_CONTAINERS_CPU,
	scaledown_window=SCALEDOWN_WINDOW_CPU,
	)
	def get_job_status(job_id: str) -> Dict[str, Any]:
	volume.reload()
	status_store = _safe_read_json(STATUS_FILE_PATH)
	if job_id not in status_store:
	return {"success": False, "job_id": job_id, "error": "Job ID not found."}

	job_status = status_store[job_id]
	heartbeat_age_seconds = None
	if "last_heartbeat_utc" in job_status and job_status["last_heartbeat_utc"]:
	try:
	heartbeat_time = datetime.fromisoformat(job_status["last_heartbeat_utc"])
	heartbeat_age_seconds = (datetime.now(timezone.utc) - heartbeat_time).total_seconds()
	except Exception:
	heartbeat_age_seconds = None
	job_status["heartbeat_age_seconds"] = heartbeat_age_seconds
	job_status["success"] = True
	return job_status


	@app.function(
	image=image,
	volumes={"/data": volume},
	memory=(CPU_MEMORY_MIN, CPU_MEMORY_MAX),
	cpu=(CPU_COUNT_MIN, CPU_COUNT_MAX),
	max_containers=MAX_CONTAINERS_CPU,
	scaledown_window=SCALEDOWN_WINDOW_CPU,
	)
	def get_job_log_tail(job_id: str, max_chars: int = 5000) -> Dict[str, Any]:
	volume.reload()
	status_store = _safe_read_json(STATUS_FILE_PATH)
	status_entry = status_store[job_id] if job_id in status_store else None
	if status_entry is not None and "log_file_path" in status_entry and status_entry["log_file_path"]:
	log_file_path = status_entry["log_file_path"]
	else:
	log_file_path = os.path.join(LOG_DIR_DEFAULT, f"{job_id}.txt")

	if not os.path.exists(log_file_path):
	if status_entry is None:
	return {"success": False, "job_id": job_id, "error": "Job ID not found.", "log_tail": ""}
	return {"success": True, "job_id": job_id, "log_tail": ""}

	with open(log_file_path, "r", encoding="utf-8", errors="ignore") as log_file:
	text = log_file.read()
	return {"success": True, "job_id": job_id, "log_tail": _tail_text(text, max_chars)}


	@app.function(
	image=image,
	volumes={"/data": volume},
	memory=(CPU_MEMORY_MIN, CPU_MEMORY_MAX),
	cpu=(CPU_COUNT_MIN, CPU_COUNT_MAX),
	max_containers=MAX_CONTAINERS_CPU,
	scaledown_window=SCALEDOWN_WINDOW_CPU,
	)
	def get_job_log_delta(job_id: str, offset: int = 0, max_chars: int = 5000) -> Dict[str, Any]:
	volume.reload()
	if offset < 0:
	offset = 0
	if max_chars <= 0:
	max_chars = 1

	status_store = _safe_read_json(STATUS_FILE_PATH)
	status_entry = status_store[job_id] if job_id in status_store else None
	if status_entry is not None and "log_file_path" in status_entry and status_entry["log_file_path"]:
	log_file_path = status_entry["log_file_path"]
	else:
	log_file_path = os.path.join(LOG_DIR_DEFAULT, f"{job_id}.txt")

	if not os.path.exists(log_file_path):
	return {
	"success": True,
	"job_id": job_id,
	"file_exists": False,
	"chunk": "",
	"next_offset": offset,
	"file_size": 0,
	}

	with open(log_file_path, "r", encoding="utf-8", errors="ignore") as log_file:
	text = log_file.read()
	file_size = len(text)
	if offset > file_size:
	offset = file_size
	end_offset = offset + max_chars
	if end_offset > file_size:
	end_offset = file_size
	chunk = text[offset:end_offset]
	return {
	"success": True,
	"job_id": job_id,
	"file_exists": True,
	"chunk": chunk,
	"next_offset": end_offset,
	"file_size": file_size,
	}


	@app.function(
	image=image,
	volumes={"/data": volume},
	memory=(CPU_MEMORY_MIN, CPU_MEMORY_MAX),
	cpu=(CPU_COUNT_MIN, CPU_COUNT_MAX),
	max_containers=MAX_CONTAINERS_CPU,
	scaledown_window=SCALEDOWN_WINDOW_CPU,
	)
	def delete_modal_embeddings() -> Dict[str, Any]:
	volume.reload()
	embedding_dir = Path(EMBED_DIR_DEFAULT)
	if not embedding_dir.exists():
	return {
	"success": True,
	"message": f"Embedding directory does not exist: {EMBED_DIR_DEFAULT}",
	"deleted_files": 0,
	"deleted_dirs": 0,
	}

	deleted_files = 0
	deleted_dirs = 0
	for path in embedding_dir.glob("*"):
	if path.is_file():
	path.unlink()
	deleted_files += 1
	elif path.is_dir():
	shutil.rmtree(path)
	deleted_dirs += 1

	volume.commit()
	return {
	"success": True,
	"message": f"Deleted modal embedding cache contents ({deleted_files} files, {deleted_dirs} directories).",
	"deleted_files": deleted_files,
	"deleted_dirs": deleted_dirs,
	}


	@app.function(
	image=image,
	volumes={"/data": volume},
	memory=(CPU_MEMORY_MIN, CPU_MEMORY_MAX),
	cpu=(CPU_COUNT_MIN, CPU_COUNT_MAX),
	max_containers=MAX_CONTAINERS_CPU,
	scaledown_window=SCALEDOWN_WINDOW_CPU,
	)
	def cancel_protify_job(function_call_id: str, job_id: Optional[str] = None) -> Dict[str, Any]:
	function_call = modal.FunctionCall.from_id(function_call_id)
	function_call.cancel()
	if job_id is not None:
	_update_job_status(
	job_id,
	{
	"status": "TERMINATED",
	"phase": "cancelled",
	"last_heartbeat_utc": _now_utc_iso(),
	"finished_at_utc": _now_utc_iso(),
	},
	)
	return {"success": True, "function_call_id": function_call_id, "job_id": job_id}


	@app.function(
	image=image,
	volumes={"/data": volume},
	memory=(CPU_MEMORY_MIN, CPU_MEMORY_MAX),
	cpu=(CPU_COUNT_MIN, CPU_COUNT_MAX),
	max_containers=MAX_CONTAINERS_CPU,
	scaledown_window=SCALEDOWN_WINDOW_CPU,
	)
	def get_results(job_id: str) -> Dict[str, Any]:
	volume.reload()
	image_extensions = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp"}
	results = {"success": True, "files": {}, "images": {}}

	results_dir = Path(RESULTS_DIR_DEFAULT)
	plots_dir = Path(PLOTS_DIR_DEFAULT)
	logs_dir = Path(LOG_DIR_DEFAULT)

	collected_files = set()
	result_file = results_dir / f"{job_id}.tsv"
	if result_file.exists():
	collected_files.add(result_file)
	log_file = logs_dir / f"{job_id}.txt"
	if log_file.exists():
	collected_files.add(log_file)
	plot_dir = plots_dir / job_id
	if plot_dir.exists() and plot_dir.is_dir():
	for file_path in plot_dir.rglob("*"):
	if file_path.is_file():
	collected_files.add(file_path)

	for file_path in collected_files:
	relative_path = str(file_path.relative_to(Path("/data")))
	suffix = file_path.suffix.lower()
	try:
	if suffix in image_extensions:
	with open(file_path, "rb") as image_file:
	encoded = base64.b64encode(image_file.read()).decode("utf-8")
	mime_type = f"image/{suffix[1:]}" if suffix != ".svg" else "image/svg+xml"
	results["images"][relative_path] = {"data": encoded, "mime_type": mime_type}
	else:
	with open(file_path, "r", encoding="utf-8", errors="ignore") as text_file:
	results["files"][relative_path] = text_file.read()
	except Exception as error:
	if suffix in image_extensions:
	results["images"][relative_path] = {"error": str(error)}
	else:
	results["files"][relative_path] = f"Error reading file: {error}"

	return results


	@app.function(
	image=image,
	volumes={"/data": volume},
	memory=(CPU_MEMORY_MIN, CPU_MEMORY_MAX),
	cpu=(CPU_COUNT_MIN, CPU_COUNT_MAX),
	max_containers=MAX_CONTAINERS_CPU,
	scaledown_window=SCALEDOWN_WINDOW_CPU,
	)
	def list_jobs() -> Dict[str, Any]:
	volume.reload()
	status_store = _safe_read_json(STATUS_FILE_PATH)
	jobs = []
	for job_id in status_store:
	jobs.append(status_store[job_id])
	jobs.sort(key=lambda item: item["job_id"], reverse=True)
	return {"success": True, "jobs": jobs}


	if __name__ == "__main__":
	with app.run():
	pass