# Slurm_Gen / app.py
# Rick-Xu315's picture
# Create app.py
# 6e7c413 verified
import gradio as gr
# File suffixes that are accepted without an "is it executable?" warning.
_COMMON_PROGRAM_SUFFIXES = (
    '.py', '.sh', '.R', '.m', '.cpp', '.c', '.f90', '.f', '.pl', '.rb', '.go', '.rs',
)


def _clean_text(value):
    """Return *value* as a stripped string, treating ``None`` as empty."""
    return "" if value is None else str(value).strip()


def _to_count(value):
    """Convert a numeric widget value to ``int``; ``None`` or blank becomes 0."""
    if value is None or value == "":
        return 0
    return int(value)


def _walltime_errors(walltime):
    """Return the list of validation errors for an ``HH:MM:SS`` wall time."""
    if not walltime:
        return ["Wall Time is required"]
    parts = walltime.split(':')
    if len(parts) != 3:
        return ["Wall Time must be in HH:MM:SS format"]
    try:
        hours, minutes, seconds = map(int, parts)
    except ValueError:
        return ["Wall Time must contain only numbers and colons (HH:MM:SS)"]
    if hours < 0 or not 0 <= minutes < 60 or not 0 <= seconds < 60:
        return ["Invalid time values in Wall Time"]
    return []


def _bullet_list(items):
    """Format *items* as one bullet per line."""
    return "\n".join(f"• {item}" for item in items)


def generate_slurm_script(job_name, account, partition, nodes, ntasks_per_node,
                          cpus_per_task, memory, walltime, program_file, program_args,
                          gpu_count, output_file, error_file, combine_output,
                          array_indices, dependency_type, dependency_job_ids,
                          mail_type, mail_user, export_env, nodelist, signal_time):
    """Validate the form values and build the text of a Slurm batch script.

    Text arguments are stripped of surrounding whitespace before they are
    validated and written, and ``None`` is treated as "not provided".
    The sentinel strings ``"Default"`` (partition, memory, export_env) and
    ``"None"`` (dependency_type, mail_type) mean "omit this directive".
    Numeric arguments are converted with ``int``; a value of 0 for
    ntasks_per_node, cpus_per_task, gpu_count or signal_time omits the
    corresponding directive.

    Returns:
        str: If any validation error is found, a human-readable report that
        lists the errors (and any warnings). Otherwise the complete script
        text, with warnings embedded as comment lines near the top.
    """
    # Normalise every input once so validation and output agree.
    job_name = _clean_text(job_name)
    account = _clean_text(account)
    partition = _clean_text(partition) or "Default"
    memory = _clean_text(memory) or "Default"
    walltime = _clean_text(walltime)
    program_file = _clean_text(program_file)
    program_args = _clean_text(program_args)
    output_file = _clean_text(output_file)
    error_file = _clean_text(error_file)
    array_indices = _clean_text(array_indices)
    dependency_type = _clean_text(dependency_type) or "None"
    dependency_job_ids = _clean_text(dependency_job_ids)
    mail_type = _clean_text(mail_type) or "None"
    mail_user = _clean_text(mail_user)
    export_env = _clean_text(export_env) or "Default"
    nodelist = _clean_text(nodelist)

    node_count = _to_count(nodes)
    tasks_per_node = _to_count(ntasks_per_node)
    cpus = _to_count(cpus_per_task)
    gpus = _to_count(gpu_count)
    signal_seconds = _to_count(signal_time)

    errors = []
    warnings = []

    # Required fields
    if not job_name:
        errors.append("Job Name is required")
    elif len(job_name) > 64:
        warnings.append("Job Name is longer than 64 characters (may be truncated)")

    if not program_file:
        errors.append("Program/Script to Run is required")
    elif not (program_file.endswith(_COMMON_PROGRAM_SUFFIXES)
              or program_file.startswith(('./', '/'))):
        warnings.append("Program file doesn't have a common extension - ensure it's executable")

    errors.extend(_walltime_errors(walltime))

    # Additional validations
    if node_count < 1:
        errors.append("Number of nodes must be at least 1")

    # Only warn when GPUs are actually requested AND no partition was chosen.
    if gpus > 0 and partition == "Default":
        warnings.append("GPU requested but no GPU partition selected")

    if array_indices and not any(ch.isdigit() for ch in array_indices):
        errors.append("Array indices must contain numbers")

    if dependency_type != "None" and not dependency_job_ids:
        errors.append("Dependency Job IDs required when dependency type is selected")

    if mail_type != "None" and not mail_user:
        warnings.append("Email address recommended when email notifications are enabled")

    if errors:
        report = "❌ ERRORS FOUND:\n" + _bullet_list(errors)
        if warnings:
            report += "\n\n⚠️ WARNINGS:\n" + _bullet_list(warnings)
        return report

    # Build the script as a list of lines and join once at the end.
    lines = ["#!/bin/bash", "", "# Slurm job script generated by GUI"]
    if warnings:
        lines.append("# ⚠️ WARNINGS:")
        lines.extend(f"# • {warning}" for warning in warnings)
    lines.append("")

    # Required directives
    lines.append(f"#SBATCH --job-name={job_name}")
    lines.append(f"#SBATCH --time={walltime}")

    if account:
        lines.append(f"#SBATCH --account={account}")
    if partition != "Default":
        lines.append(f"#SBATCH --partition={partition}")

    # Resource allocation
    lines.append(f"#SBATCH --nodes={node_count}")
    if tasks_per_node > 0:
        lines.append(f"#SBATCH --ntasks-per-node={tasks_per_node}")
    if cpus > 0:
        lines.append(f"#SBATCH --cpus-per-task={cpus}")
    if memory != "Default":
        lines.append(f"#SBATCH --mem={memory}")
    if gpus > 0:
        lines.append(f"#SBATCH --gres=gpu:{gpus}")

    # Output/Error files: with combine_output only --output is written.
    lines.append(f"#SBATCH --output={output_file or f'{job_name}_%j.out'}")
    if not combine_output:
        lines.append(f"#SBATCH --error={error_file or f'{job_name}_%j.err'}")

    if array_indices:
        lines.append(f"#SBATCH --array={array_indices}")
    if dependency_type != "None" and dependency_job_ids:
        lines.append(f"#SBATCH --dependency={dependency_type}:{dependency_job_ids}")

    if mail_type != "None":
        lines.append(f"#SBATCH --mail-type={mail_type}")
        if mail_user:
            lines.append(f"#SBATCH --mail-user={mail_user}")

    if export_env != "Default":
        lines.append(f"#SBATCH --export={export_env}")
    if nodelist:
        lines.append(f"#SBATCH --nodelist={nodelist}")
    if signal_seconds > 0:
        lines.append(f"#SBATCH --signal=B:USR1@{signal_seconds}")
    lines.append("")

    # Reference list of environment variables, emitted as comments.
    lines.extend([
        "# Available Slurm environment variables:",
        "# $SLURM_JOB_NAME - Job name",
        "# $SLURM_JOB_ID - Job ID",
        "# $SLURM_SUBMIT_DIR - Submit directory",
        "# $SLURM_SUBMIT_HOST - Submit host",
        "# $SLURM_JOB_NODELIST - Node list",
        "# $SLURM_JOB_PARTITION - Partition name",
        "# $SLURM_JOB_NUM_NODES - Number of allocated nodes",
        "# $SLURM_NTASKS - Number of processes",
        "# $SLURM_TASKS_PER_NODE - Processes per node",
        "# $SLURM_ARRAY_TASK_ID - Array task ID (if array job)",
        "",
        "# Change to the directory from which the job was submitted",
        # Quoted so a submit directory containing spaces still works.
        'cd "$SLURM_SUBMIT_DIR"',
        "",
        "# Load required modules here",
        "# module load python/3.9",
        "# module load gcc/9.3.0",
        "",
        "# Run the program",
    ])

    # Bare file names are run relative to the submit directory.
    if program_file.startswith(('./', '/')):
        command = program_file
    else:
        command = f"./{program_file}"
    if program_args:
        command += f" {program_args}"
    lines.append(command)

    return "\n".join(lines) + "\n"
# Define all the input components
def create_interface():
    """Build the Gradio Blocks UI for the Slurm script generator.

    Creates one input component per parameter of ``generate_slurm_script``,
    a button that calls it, and a textbox that shows the result.

    Returns:
        The ``gr.Blocks`` object; the caller is responsible for launching it.
    """
    # Blank lines separate Markdown paragraphs, and "-" starts real list
    # items, so the instructions and tips render as distinct lists.
    intro_markdown = "\n".join([
        "# 🚀 Comprehensive Slurm Script Generator",
        "",
        "Generate complete Slurm job scripts with all available options.",
        "",
        "**Instructions:**",
        "",
        "1. Fill in required fields (marked with *)",
        "2. Configure your job parameters",
        "3. Click \"Generate Slurm Script\" to generate the script",
        "4. Copy the generated script and save as .sh file",
        "5. Submit with: sbatch your_script.sh",
        "",
        "**Tips:**",
        "",
        "- Job Name, Program/Script and Wall Time are required",
        "- Use array jobs (e.g., 1-10) for parameter sweeps",
        "- Set dependencies to chain jobs together",
        "- Check available partitions with 'sinfo' command",
        "",
    ])

    with gr.Blocks(title="🚀 Comprehensive Slurm Script Generator") as interface:
        gr.Markdown(intro_markdown)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📝 Basic Job Information")
                job_name = gr.Textbox(
                    label="Job Name *",
                    placeholder="my_job_name",
                    info="Required: Name for your job",
                )
                account = gr.Textbox(label="Account (optional)", placeholder="your_account_name")
                partition = gr.Dropdown(
                    choices=["Default", "general", "debug", "gpu", "gpuq", "highmem", "standard"],
                    value="Default",
                    label="Partition/Queue",
                )
                program_file = gr.Textbox(
                    label="Program/Script to Run *",
                    placeholder="my_program.py",
                    info="Required: Script or program to execute",
                )
                program_args = gr.Textbox(
                    label="Program Arguments (optional)",
                    placeholder="input.txt --verbose",
                )
            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Resource Allocation")
                nodes = gr.Slider(minimum=1, maximum=20, value=1, step=1, label="Number of Nodes")
                ntasks_per_node = gr.Slider(
                    minimum=0, maximum=128, value=1, step=1, label="Tasks per Node (0=auto)"
                )
                cpus_per_task = gr.Slider(
                    minimum=0, maximum=64, value=1, step=1, label="CPUs per Task (0=auto)"
                )
                memory = gr.Dropdown(
                    choices=["Default", "1G", "2G", "4G", "8G", "16G", "32G", "64G", "128G", "256G"],
                    value="8G",
                    label="Memory per Node",
                )
                walltime = gr.Dropdown(
                    choices=["", "00:15:00", "00:30:00", "01:00:00", "02:00:00", "04:00:00",
                             "08:00:00", "12:00:00", "24:00:00"],
                    value="01:00:00",
                    label="Wall Time (HH:MM:SS) *",
                    info="Required: Maximum runtime for your job",
                )
                gpu_count = gr.Slider(
                    minimum=0, maximum=8, value=0, step=1, label="Number of GPUs (0=no GPU)"
                )

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📁 Output Configuration")
                output_file = gr.Textbox(label="Output File (optional)", placeholder="output_%j.out")
                error_file = gr.Textbox(label="Error File (optional)", placeholder="error_%j.err")
                combine_output = gr.Checkbox(label="Combine stdout and stderr", value=False)
            with gr.Column(scale=1):
                gr.Markdown("### 🔗 Job Dependencies & Arrays")
                array_indices = gr.Textbox(
                    label="Array Job Indices (optional)",
                    placeholder="1-10 or 1,3,5-8",
                )
                dependency_type = gr.Dropdown(
                    choices=["None", "after", "afterok", "afternotok", "afterany"],
                    value="None",
                    label="Job Dependency Type",
                )
                dependency_job_ids = gr.Textbox(
                    label="Dependency Job IDs (optional)",
                    placeholder="12345,12346",
                )

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📧 Email Notifications")
                mail_type = gr.Dropdown(
                    choices=["None", "BEGIN", "END", "FAIL", "ALL", "BEGIN,END", "END,FAIL",
                             "BEGIN,END,FAIL"],
                    value="FAIL",
                    label="Email Notification Types",
                )
                mail_user = gr.Textbox(
                    label="Email Address (optional)",
                    placeholder="user@university.edu",
                )
            with gr.Column(scale=1):
                gr.Markdown("### 🔧 Advanced Options")
                export_env = gr.Dropdown(
                    choices=["Default", "ALL", "NONE"],
                    value="Default",
                    label="Export Environment",
                )
                nodelist = gr.Textbox(label="Specific Nodes (optional)", placeholder="node001,node002")
                signal_time = gr.Slider(
                    minimum=0, maximum=300, value=0, step=10,
                    label="Signal Before End (seconds, 0=disabled)",
                )

        # Submit button
        submit_btn = gr.Button("🚀 Generate Slurm Script", variant="primary", size="lg")

        # Output
        output = gr.Textbox(label="Generated Slurm Script", lines=20, max_lines=30)

        # The inputs list must stay in the same order as the parameters of
        # generate_slurm_script, because values are passed positionally.
        submit_btn.click(
            fn=generate_slurm_script,
            inputs=[
                job_name, account, partition, nodes, ntasks_per_node, cpus_per_task,
                memory, walltime, program_file, program_args, gpu_count, output_file,
                error_file, combine_output, array_indices, dependency_type,
                dependency_job_ids, mail_type, mail_user, export_env, nodelist, signal_time,
            ],
            outputs=output,
        )
    return interface
# Launch the interface
if __name__ == "__main__":
    # Build the UI and start serving it only when run as a script.
    create_interface().launch(share=True)