Victarry committed on
Commit
2bb73ed
·
2 Parent(s): f300aa9 fa7e466

Merge branch 'main' into hf_space

Browse files

* main:
Make P2P in warmup/cooldown stage to sync comm.
Add support to set custom stage time.
Add visualization script and README for Pipeline Parallelism in Megatron-LM
Update link.

app.py CHANGED
@@ -292,6 +292,35 @@ timing_params_card = dbc.Card([
292
  ])
293
  ], style=card_style)
294
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  # Updated app layout with improved structure
296
  app.layout = html.Div([
297
  header,
@@ -346,6 +375,7 @@ app.layout = html.Div([
346
  basic_params_card,
347
  scheduling_params_card,
348
  timing_params_card,
 
349
 
350
  # Generate button with better styling
351
  dbc.Button([
@@ -398,15 +428,11 @@ app.layout = html.Div([
398
  html.A([
399
  html.I(className="bi bi-github me-2"),
400
  "View on GitHub"
401
- ], href="#", className="small text-muted d-block mb-2"),
402
- html.A([
403
- html.I(className="bi bi-book me-2"),
404
- "Documentation"
405
- ], href="#", className="small text-muted d-block mb-2"),
406
  html.A([
407
  html.I(className="bi bi-question-circle me-2"),
408
  "Report an Issue"
409
- ], href="#", className="small text-muted d-block")
410
  ])
411
  ], md=4)
412
  ]),
@@ -525,6 +551,75 @@ def toggle_advanced_options(n_clicks, is_open):
525
  return not is_open
526
  return is_open
527
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
  # --- Client-side Callback for Strategy Card Selection ---
529
  app.clientside_callback(
530
  """
@@ -580,12 +675,14 @@ app.clientside_callback(
580
  State('op_time_overlapped_fwd_bwd', 'value'),
581
  State('microbatch_group_size_per_vp_stage', 'value'),
582
  State('selected-strategies-store', 'data'),
 
 
583
  prevent_initial_call=True
584
  )
585
  def update_graph(n_clicks, num_devices, num_stages, num_batches, p2p_latency,
586
  op_time_forward, op_time_backward, op_time_backward_d, op_time_backward_w,
587
  op_time_overlapped_fwd_bwd, microbatch_group_size_per_vp_stage,
588
- selected_strategies):
589
 
590
  strategy_display_order = ["1f1b", "1f1b_interleave", "1f1b_overlap", "1f1b_interleave_overlap", "dualpipe", "zb1p"]
591
 
@@ -673,14 +770,40 @@ def update_graph(n_clicks, num_devices, num_stages, num_batches, p2p_latency,
673
  if adjustment_msg not in automatic_adjustments:
674
  automatic_adjustments.append(adjustment_msg)
675
 
676
- op_times = { "forward": float(op_time_forward) * time_scale_factor }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
677
 
 
678
  if split_backward:
679
  op_times["backward_D"] = float(op_time_backward_d) * time_scale_factor
680
  op_times["backward_W"] = float(op_time_backward_w) * time_scale_factor
681
  op_times["backward"] = (float(op_time_backward_d) + float(op_time_backward_w)) * time_scale_factor
682
  else:
683
- op_times["backward"] = float(op_time_backward) * time_scale_factor
 
 
 
 
 
 
 
 
 
 
684
 
685
  if op_time_overlapped_fwd_bwd is not None:
686
  try:
 
292
  ])
293
  ], style=card_style)
294
 
295
+ # Per-stage timing configuration card
296
+ per_stage_timing_card = dbc.Card([
297
+ dbc.CardBody([
298
+ html.H5([
299
+ html.I(className="bi bi-list-ol section-icon"),
300
+ "Per-Stage Timing Configuration"
301
+ ], className="section-title"),
302
+
303
+ dbc.Button([
304
+ html.I(className="bi bi-sliders2 me-2"),
305
+ "Customize Per-Stage Timing"
306
+ ],
307
+ id="per-stage-timing-toggle",
308
+ color="light",
309
+ className="mb-3 w-100",
310
+ size="sm"
311
+ ),
312
+
313
+ dbc.Collapse([
314
+ dbc.Alert([
315
+ html.I(className="bi bi-info-circle-fill me-2"),
316
+ "Override global timing values for individual stages. Leave empty to use global values."
317
+ ], color="info", className="mb-3"),
318
+
319
+ html.Div(id='per-stage-inputs-container', children=[])
320
+ ], id="per-stage-timing-collapse", is_open=False)
321
+ ])
322
+ ], style=card_style)
323
+
324
  # Updated app layout with improved structure
325
  app.layout = html.Div([
326
  header,
 
375
  basic_params_card,
376
  scheduling_params_card,
377
  timing_params_card,
378
+ per_stage_timing_card,
379
 
380
  # Generate button with better styling
381
  dbc.Button([
 
428
  html.A([
429
  html.I(className="bi bi-github me-2"),
430
  "View on GitHub"
431
+ ], href="https://github.com/Victarry/PP-Schedule-Visualization", className="small text-muted d-block mb-2"),
 
 
 
 
432
  html.A([
433
  html.I(className="bi bi-question-circle me-2"),
434
  "Report an Issue"
435
+ ], href="https://github.com/Victarry/PP-Schedule-Visualization/issues", className="small text-muted d-block")
436
  ])
437
  ], md=4)
438
  ]),
 
551
  return not is_open
552
  return is_open
553
 
554
+ # --- Callback to toggle Per-Stage Timing Collapse ---
555
+ @app.callback(
556
+ Output("per-stage-timing-collapse", "is_open"),
557
+ Input("per-stage-timing-toggle", "n_clicks"),
558
+ State("per-stage-timing-collapse", "is_open"),
559
+ prevent_initial_call=True,
560
+ )
561
+ def toggle_per_stage_timing(n_clicks, is_open):
562
+ if n_clicks:
563
+ return not is_open
564
+ return is_open
565
+
566
+ # --- Callback to dynamically generate per-stage timing inputs ---
567
+ @app.callback(
568
+ Output("per-stage-inputs-container", "children"),
569
+ Input("num_stages", "value"),
570
+ )
571
+ def generate_per_stage_inputs(num_stages):
572
+ if num_stages is None or num_stages < 1:
573
+ return []
574
+
575
+ # Limit to reasonable number of stages for UI
576
+ num_stages = min(int(num_stages), 32)
577
+
578
+ stage_inputs = []
579
+ for stage_id in range(num_stages):
580
+ stage_inputs.append(
581
+ dbc.Row([
582
+ dbc.Col([
583
+ html.Strong(f"Stage {stage_id}", className="text-muted")
584
+ ], width=2, className="d-flex align-items-center"),
585
+ dbc.Col([
586
+ dbc.InputGroup([
587
+ dbc.InputGroupText("F", style={"minWidth": "30px"}),
588
+ dbc.Input(
589
+ id={"type": "stage-forward", "index": stage_id},
590
+ type="number",
591
+ placeholder="1.0",
592
+ min=0.01,
593
+ step=0.01,
594
+ size="sm"
595
+ ),
596
+ ], size="sm")
597
+ ], width=5),
598
+ dbc.Col([
599
+ dbc.InputGroup([
600
+ dbc.InputGroupText("B", style={"minWidth": "30px"}),
601
+ dbc.Input(
602
+ id={"type": "stage-backward", "index": stage_id},
603
+ type="number",
604
+ placeholder="1.0",
605
+ min=0.01,
606
+ step=0.01,
607
+ size="sm"
608
+ ),
609
+ ], size="sm")
610
+ ], width=5),
611
+ ], className="mb-2 g-2")
612
+ )
613
+
614
+ # Add header row
615
+ header = dbc.Row([
616
+ dbc.Col([html.Small("Stage", className="text-muted fw-bold")], width=2),
617
+ dbc.Col([html.Small("Forward Time", className="text-muted fw-bold")], width=5),
618
+ dbc.Col([html.Small("Backward Time", className="text-muted fw-bold")], width=5),
619
+ ], className="mb-2")
620
+
621
+ return [header] + stage_inputs
622
+
623
  # --- Client-side Callback for Strategy Card Selection ---
624
  app.clientside_callback(
625
  """
 
675
  State('op_time_overlapped_fwd_bwd', 'value'),
676
  State('microbatch_group_size_per_vp_stage', 'value'),
677
  State('selected-strategies-store', 'data'),
678
+ State({'type': 'stage-forward', 'index': ALL}, 'value'),
679
+ State({'type': 'stage-backward', 'index': ALL}, 'value'),
680
  prevent_initial_call=True
681
  )
682
  def update_graph(n_clicks, num_devices, num_stages, num_batches, p2p_latency,
683
  op_time_forward, op_time_backward, op_time_backward_d, op_time_backward_w,
684
  op_time_overlapped_fwd_bwd, microbatch_group_size_per_vp_stage,
685
+ selected_strategies, stage_forward_values, stage_backward_values):
686
 
687
  strategy_display_order = ["1f1b", "1f1b_interleave", "1f1b_overlap", "1f1b_interleave_overlap", "dualpipe", "zb1p"]
688
 
 
770
  if adjustment_msg not in automatic_adjustments:
771
  automatic_adjustments.append(adjustment_msg)
772
 
773
+ # Check if per-stage timing values are provided
774
+ has_per_stage_forward = stage_forward_values and any(v is not None for v in stage_forward_values)
775
+ has_per_stage_backward = stage_backward_values and any(v is not None for v in stage_backward_values)
776
+
777
+ # Build forward timing - either per-stage dict or global value
778
+ if has_per_stage_forward:
779
+ forward_times = {}
780
+ for stage_id in range(current_num_stages):
781
+ if stage_id < len(stage_forward_values) and stage_forward_values[stage_id] is not None:
782
+ forward_times[stage_id] = float(stage_forward_values[stage_id]) * time_scale_factor
783
+ else:
784
+ # Use global value as fallback (default 1.0 if not specified)
785
+ forward_times[stage_id] = float(op_time_forward if op_time_forward else 1.0) * time_scale_factor
786
+ op_times = {"forward": forward_times}
787
+ else:
788
+ op_times = {"forward": float(op_time_forward) * time_scale_factor}
789
 
790
+ # Build backward timing
791
  if split_backward:
792
  op_times["backward_D"] = float(op_time_backward_d) * time_scale_factor
793
  op_times["backward_W"] = float(op_time_backward_w) * time_scale_factor
794
  op_times["backward"] = (float(op_time_backward_d) + float(op_time_backward_w)) * time_scale_factor
795
  else:
796
+ if has_per_stage_backward:
797
+ backward_times = {}
798
+ for stage_id in range(current_num_stages):
799
+ if stage_id < len(stage_backward_values) and stage_backward_values[stage_id] is not None:
800
+ backward_times[stage_id] = float(stage_backward_values[stage_id]) * time_scale_factor
801
+ else:
802
+ # Use global value as fallback (default 1.0 if not specified)
803
+ backward_times[stage_id] = float(op_time_backward if op_time_backward else 1.0) * time_scale_factor
804
+ op_times["backward"] = backward_times
805
+ else:
806
+ op_times["backward"] = float(op_time_backward) * time_scale_factor
807
 
808
  if op_time_overlapped_fwd_bwd is not None:
809
  try:
assets/dumped_example.jpg ADDED

Git LFS Details

  • SHA256: a12aecf1e52d57ef7f13c2b6bfd0a7320dd8ed389c8722a86fd999953b301a3b
  • Pointer size: 131 Bytes
  • Size of remote file: 796 kB
examples/megatron-lm/README.md ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pipeline Parallelism Visualization for Megatron-LM
2
+
3
+ This tool provides visualization capabilities for Pipeline Parallelism (PP) scheduling in Megatron-LM training, helping you analyze load balancing issues and debug abnormal PP bubble problems that are difficult to inspect directly from Nsight Systems profiling.
4
+
5
+ ## Overview
6
+
7
+ The visualization tool offers intuitive visual representation of PP scheduling, making it easier to:
8
+ - Identify load balancing issues across pipeline stages
9
+ - Debug PP bubble problems
10
+ - Analyze pipeline efficiency and bottlenecks
11
+ - Optimize pipeline parallelism configurations
12
+
13
+ ## Prerequisites
14
+
15
+ - Megatron-LM with PP timer support
16
+ - Python environment with required dependencies
17
+ - UV package manager (recommended)
18
+
19
+ ## Usage
20
+
21
+ ### Step 1: Enable PP Timer in Megatron-LM
22
+
23
+ First, you need to apply the PP timer patch to your Megatron-LM installation:
24
+
25
+ 1. Cherry-pick the commit from the modified Megatron-LM repository:
26
+ ```bash
27
+ # Navigate to your Megatron-LM directory
28
+ cd /path/to/Megatron-LM
29
+
30
+ # Cherry-pick the PP timer commit
31
+ git remote add victarry https://github.com/Victarry/PP-Schedule-Visualization.git
32
+ git fetch victarry
33
+ git cherry-pick ad3bc3a22adc79827dc1b35619ad6813078e621b
34
+ ```
35
+
36
+ **Note**: The commit can be viewed at: https://github.com/Victarry/Megatron-LM/commit/ad3bc3a22adc79827dc1b35619ad6813078e621b
37
+
38
+ ### Step 2: Configure Environment Variables
39
+
40
+ Set the following environment variables before running your training script:
41
+
42
+ ```bash
43
+ # Enable PP timer functionality
44
+ export ENABLE_PP_TIMER=1
45
+
46
+ # Specify which iteration to dump (e.g., iteration 1)
47
+ export ENABLE_PP_TIMER_ITER=1
48
+
49
+ # Set directory to save the dumped timer results
50
+ export PP_TIMER_LOG_DIR=/path/to/save/timer/logs
51
+
52
+ # Run your training script
53
+ bash your_training_script.sh
54
+ ```
55
+
56
+ ### Step 3: Generate Visualization
57
+
58
+ Once you have collected the timer data, use the visualization script:
59
+
60
+ ```bash
61
+ # Navigate to the PP-Schedule-Visualization directory
62
+ cd /path/to/PP-Schedule-Visualization
63
+
64
+ # Set your configuration parameters
65
+ PP_SIZE=4 # Number of pipeline parallel stages
66
+ VPP_SIZE=1 # Virtual pipeline parallel size (usually 1)
67
+ DATA_DIR=/path/to/timer/logs # Directory containing the dumped timer data
68
+
69
+ # Run the visualization script
70
+ uv run examples/megatron-lm/plot.py --data-dir $DATA_DIR --pp-size $PP_SIZE --vpp-size $VPP_SIZE
71
+ ```
72
+
73
+ **Parameters:**
74
+ - `--data-dir`: Path to the directory containing PP timer log files
75
+ - `--pp-size`: Number of pipeline parallel stages in your training setup
76
+ - `--vpp-size`: Virtual pipeline parallel size (typically 1 unless using virtual PP)
77
+
78
+ ### Example Output
79
+
80
+ After running the visualization script, you will see a detailed PP schedule visualization similar to:
81
+
82
+ ![PP Schedule Visualization](../../assets/dumped_example.jpg)
83
+
84
+ The visualization shows:
85
+ - Timeline of each pipeline stage
86
+ - Forward and backward pass scheduling
87
+ - Bubble time and idle periods
88
+ - Communication overhead between stages
89
+
90
+ ## Known Issue
91
+ - If the global batch size is very large, it may take > 1 minute to see the visualization results.
92
+
93
+ ## Contributing
94
+
95
+ If you encounter issues or have suggestions for improvements, please open an issue or submit a pull request.
examples/megatron-lm/plot.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import argparse
4
+ import re
5
+ from collections import defaultdict
6
+ from src.execution_model import Schedule, ScheduleConfig, Operation
7
+ from src.visualizer import visualize_pipeline_parallelism_dash
8
+
9
+
10
+ def is_valid_event_filename(filename, pp_size, vpp_size):
11
+ """
12
+ Check if filename matches the expected format:
13
+ event_times_PP{pp_size}_VPP{vpp_size}_TPxCPxDP_rank_{rank}_pp_rank_{pp_rank}_rank_{final_rank}.json
14
+
15
+ Returns True if valid, False otherwise.
16
+ """
17
+ # Create regex pattern for the expected format
18
+ pattern = rf"^event_times_PP{pp_size}_VPP{vpp_size}_TPxCPxDP_rank_\d+_pp_rank_\d+_rank_\d+\.json$"
19
+ return bool(re.match(pattern, filename))
20
+
21
+
22
+ def parse_event_filename(filename):
23
+ """
24
+ Parse the event filename and extract rank information.
25
+
26
+ Expected format: event_times_PP{pp_size}_VPP{vpp_size}_TPxCPxDP_rank_{rank}_pp_rank_{pp_rank}_rank_{final_rank}.json
27
+
28
+ Returns: (TPxCPxDP_rank, pp_rank, global_rank) or None if parsing fails
29
+ """
30
+ try:
31
+ # Remove .json extension
32
+ name_without_ext = filename.replace(".json", "")
33
+ parts = name_without_ext.split("_")
34
+
35
+ # Find the TPxCPxDP part and the rank values
36
+ tpxcpxdp_rank = None
37
+ pp_rank = None
38
+ global_rank = None
39
+
40
+ for i, part in enumerate(parts):
41
+ # Look for TPxCPxDP pattern followed by 'rank'
42
+ if part.startswith("TP") and "CP" in part and part.endswith("DP"):
43
+ if i + 2 < len(parts) and parts[i + 1] == "rank":
44
+ tpxcpxdp_rank = int(parts[i + 2])
45
+ # Look for 'pp_rank' pattern
46
+ elif part == "pp" and i + 2 < len(parts) and parts[i + 1] == "rank":
47
+ pp_rank = int(parts[i + 2])
48
+ # Look for the final 'rank' (global rank) - this should be the last rank in the filename
49
+ elif part == "rank" and i + 1 < len(parts) and i == len(parts) - 2:
50
+ global_rank = int(parts[i + 1])
51
+
52
+ if tpxcpxdp_rank is None or pp_rank is None or global_rank is None:
53
+ return None
54
+
55
+ return (tpxcpxdp_rank, pp_rank, global_rank)
56
+
57
+ except (ValueError, IndexError):
58
+ return None
59
+
60
+
61
+ def load_event_times_from_json(data_dir, pp_size, vpp_size):
62
+ """Load event times from JSON files in the specified directory."""
63
+ all_files = [f for f in os.listdir(data_dir) if f.endswith(".json")]
64
+
65
+ # Filter files that match the expected format
66
+ event_files = [
67
+ f for f in all_files if is_valid_event_filename(f, pp_size, vpp_size)
68
+ ]
69
+
70
+ if len(event_files) == 0:
71
+ print(f"Available files in {data_dir}:")
72
+ for f in all_files[:10]: # Show first 10 files for debugging
73
+ print(f" {f}")
74
+ raise ValueError(
75
+ f"No event files found matching pattern event_times_PP{pp_size}_VPP{vpp_size}_*.json"
76
+ )
77
+
78
+ print(f"Found {len(event_files)} matching event files")
79
+ event_times = {}
80
+
81
+ for file_name in event_files:
82
+ parsed_result = parse_event_filename(file_name)
83
+ if parsed_result is None:
84
+ print(f"Warning: Could not parse filename {file_name}")
85
+ continue
86
+
87
+ tpxcpxdp_rank, pp_rank, global_rank = parsed_result
88
+
89
+ if tpxcpxdp_rank == 0:
90
+ try:
91
+ with open(os.path.join(data_dir, file_name), "r") as f:
92
+ event_data = json.load(f)
93
+ event_times[(pp_rank, tpxcpxdp_rank)] = event_data
94
+ print(
95
+ f"Loaded data from {file_name}: global_rank={global_rank}, pp_rank={pp_rank}, tpxcpxdp_rank={tpxcpxdp_rank}"
96
+ )
97
+ except Exception as e:
98
+ print(f"Error loading {file_name}: {e}")
99
+
100
+ return event_times
101
+
102
+
103
+ def create_pp_schedule_from_event_times(event_times, pp_size):
104
+ """Create a Schedule object from event times data."""
105
+ # Determine number of devices/stages from the data
106
+ num_devices = pp_size
107
+
108
+ # Find the maximum batch ID by parsing event names
109
+ max_batch_id = 0
110
+ for events in event_times.values():
111
+ for event_name in events:
112
+ if event_name.startswith(("forward-", "backward-")):
113
+ parts = event_name.split("-")
114
+ if len(parts) >= 2 and parts[1].isdigit():
115
+ batch_id = int(parts[1])
116
+ max_batch_id = max(max_batch_id, batch_id)
117
+
118
+ num_batches = max_batch_id + 1
119
+
120
+ # Create a simple config (actual times will come from event data)
121
+ config = ScheduleConfig(
122
+ num_devices=num_devices,
123
+ num_stages=num_devices, # Assuming 1:1 mapping of devices to stages
124
+ num_batches=num_batches,
125
+ p2p_latency=0, # Will be implicit in the event timing
126
+ op_times={}, # Not needed as we'll use real timing data
127
+ placement_strategy="standard",
128
+ )
129
+
130
+ # Create a schedule
131
+ schedule = Schedule(config)
132
+
133
+ # Populate the schedule with operations based on event times
134
+ for (pp_rank, tpxcpxdp_rank), events in event_times.items():
135
+ # Process forward passes
136
+ for batch_id in range(num_batches):
137
+ forward_start_key = f"forward-{batch_id}-start"
138
+ forward_end_key = f"forward-{batch_id}-end"
139
+
140
+ if forward_start_key in events and forward_end_key in events:
141
+ # Create an operation and set its timing directly
142
+ forward_op = Operation(batch_id, pp_rank, "forward")
143
+ forward_op.execution_time = (
144
+ events[forward_end_key] - events[forward_start_key]
145
+ )
146
+ forward_op.start_time = events[forward_start_key]
147
+ forward_op.end_time = events[forward_end_key]
148
+
149
+ # Add to schedule
150
+ schedule.ops[(batch_id, pp_rank, "forward")] = forward_op
151
+ schedule.device_queues[pp_rank].add_operation(forward_op)
152
+
153
+ # Process backward passes
154
+ for batch_id in range(num_batches):
155
+ backward_start_key = f"backward-{batch_id}-start"
156
+ backward_end_key = f"backward-{batch_id}-end"
157
+
158
+ if backward_start_key in events and backward_end_key in events:
159
+ # Create an operation and set its timing directly
160
+ backward_op = Operation(batch_id, pp_rank, "backward")
161
+ backward_op.execution_time = (
162
+ events[backward_end_key] - events[backward_start_key]
163
+ )
164
+ backward_op.start_time = events[backward_start_key]
165
+ backward_op.end_time = events[backward_end_key]
166
+
167
+ # Add to schedule
168
+ schedule.ops[(batch_id, pp_rank, "backward")] = backward_op
169
+ schedule.device_queues[pp_rank].add_operation(backward_op)
170
+
171
+ return schedule
172
+
173
+
174
+ def create_vpp_schedule_from_event_times(event_times, pp_size, vpp_size):
175
+ """Create a VPP Schedule object from event times data."""
176
+ # Determine number of devices/stages from the data
177
+ # Find the maximum batch ID by parsing event names
178
+ max_batch_id = 0
179
+ for events in event_times.values():
180
+ for event_name in events:
181
+ if event_name.startswith(("forward-", "backward-")):
182
+ parts = event_name.split("-")
183
+ assert len(parts) == 4
184
+ assert parts[0] in ["forward", "backward"]
185
+ assert parts[1].isdigit() and parts[2].isdigit()
186
+ assert parts[3] in ["start", "end"]
187
+ batch_id = int(parts[2]) # backward-0-19-end
188
+ max_batch_id = max(max_batch_id, batch_id)
189
+
190
+ num_batches = max_batch_id + 1
191
+
192
+ # Create a simple config (actual times will come from event data)
193
+ config = ScheduleConfig(
194
+ num_devices=pp_size,
195
+ num_stages=pp_size * vpp_size,
196
+ num_batches=num_batches,
197
+ p2p_latency=0, # Will be implicit in the event timing
198
+ op_times={}, # Not needed as we'll use real timing data
199
+ placement_strategy="interleave",
200
+ )
201
+
202
+ # Create a schedule
203
+ schedule = Schedule(config)
204
+
205
+ # Populate the schedule with operations based on event times
206
+ for (pp_rank, tpxcpxdp_rank), events in event_times.items():
207
+ # Process forward passes
208
+ for model_chunk_id in range(vpp_size):
209
+ for batch_id in range(num_batches):
210
+ forward_start_key = f"forward-{model_chunk_id}-{batch_id}-start"
211
+ forward_end_key = f"forward-{model_chunk_id}-{batch_id}-end"
212
+
213
+ # Create an operation and set its timing directly
214
+ stage_id = pp_size * model_chunk_id + pp_rank
215
+ forward_op = Operation(batch_id, stage_id=stage_id, op_type="forward")
216
+ forward_op.execution_time = (
217
+ events[forward_end_key] - events[forward_start_key]
218
+ )
219
+ forward_op.start_time = events[forward_start_key]
220
+ forward_op.end_time = events[forward_end_key]
221
+
222
+ # Add to schedule
223
+ schedule.ops[(batch_id, stage_id, "forward")] = forward_op
224
+ schedule.device_queues[pp_rank].add_operation(forward_op)
225
+
226
+ # Process backward passes
227
+ for model_chunk_id in range(vpp_size):
228
+ for batch_id in range(num_batches):
229
+ backward_start_key = f"backward-{model_chunk_id}-{batch_id}-start"
230
+ backward_end_key = f"backward-{model_chunk_id}-{batch_id}-end"
231
+
232
+ stage_id = pp_size * model_chunk_id + pp_rank
233
+ if backward_start_key in events and backward_end_key in events:
234
+ # Create an operation and set its timing directly
235
+ backward_op = Operation(
236
+ batch_id, stage_id=stage_id, op_type="backward"
237
+ )
238
+ backward_op.execution_time = (
239
+ events[backward_end_key] - events[backward_start_key]
240
+ )
241
+ backward_op.start_time = events[backward_start_key]
242
+ backward_op.end_time = events[backward_end_key]
243
+
244
+ # Add to schedule
245
+ schedule.ops[(batch_id, stage_id, "backward")] = backward_op
246
+ schedule.device_queues[pp_rank].add_operation(backward_op)
247
+
248
+ return schedule
249
+
250
+
251
+ def main():
252
+ # Parse command-line arguments
253
+ parser = argparse.ArgumentParser(
254
+ description="Visualize pipeline parallelism from event data"
255
+ )
256
+ parser.add_argument(
257
+ "--data-dir",
258
+ type=str,
259
+ required=True,
260
+ help="Directory containing event_times_*.json files",
261
+ )
262
+ parser.add_argument(
263
+ "--pp-size", type=int, required=True, help="Pipeline parallelism size"
264
+ )
265
+ parser.add_argument(
266
+ "--vpp-size", type=int, required=True, help="Virtual pipeline parallelism size"
267
+ )
268
+ parser.add_argument(
269
+ "--port",
270
+ type=int,
271
+ default=8050,
272
+ help="Port for the visualization dashboard (default: 8050)",
273
+ )
274
+ args = parser.parse_args()
275
+
276
+ # Load event times from JSON files
277
+ event_times = load_event_times_from_json(args.data_dir, args.pp_size, args.vpp_size)
278
+
279
+ # Create schedule from event times
280
+ if args.vpp_size == 1:
281
+ schedule = create_pp_schedule_from_event_times(event_times, args.pp_size)
282
+ else:
283
+ schedule = create_vpp_schedule_from_event_times(
284
+ event_times, args.pp_size, args.vpp_size
285
+ )
286
+
287
+ # Calculate and print execution metrics
288
+ total_execution_time = max(
289
+ op.end_time for op in schedule.ops.values() if op.end_time is not None
290
+ )
291
+ print(f"Total execution time: {total_execution_time:.2f} ms")
292
+
293
+ # Calculate bubble time percentage
294
+ device_times = defaultdict(float)
295
+ for device_id, device_queue in enumerate(schedule.device_queues):
296
+ for op in device_queue.ops:
297
+ if op.start_time is not None and op.end_time is not None:
298
+ device_times[device_id] += op.end_time - op.start_time
299
+
300
+ # Print bubble percentage for each device
301
+ for device_id, active_time in device_times.items():
302
+ bubble_percentage = (
303
+ (total_execution_time - active_time) / total_execution_time * 100
304
+ )
305
+ print(f"Device {device_id} bubble: {bubble_percentage:.2f}%")
306
+
307
+ # Visualize the schedule
308
+ print("Launching visualization...")
309
+ visualize_pipeline_parallelism_dash(
310
+ schedule, schedule_type="1F1B-Imported", port=args.port
311
+ )
312
+
313
+
314
+ if __name__ == "__main__":
315
+ main()
src/execution_model.py CHANGED
@@ -243,6 +243,39 @@ class Schedule:
243
  return None
244
  return self.ops[(batch_id, stage_id, op_type)]
245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  def get_dependencies(self, op: Operation, include_device_dependency=True):
247
  deps = []
248
  if isinstance(op, OverlappedOperation):
@@ -327,7 +360,34 @@ class Schedule:
327
  if include_device_dependency:
328
  device_index = self.device_queues[op.device_id].ops.index(op)
329
  if device_index > 0:
330
- deps.append((self.device_queues[op.device_id].ops[device_index - 1], 0.0))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
  return deps
332
 
333
  def show(self):
 
243
  return None
244
  return self.ops[(batch_id, stage_id, op_type)]
245
 
246
+ def get_p2p_receiver_op(self, sender_op: Operation) -> Optional[Operation]:
247
+ """
248
+ Get the operation that receives P2P data from sender_op.
249
+
250
+ For forward ops: sender on stage N sends to receiver on stage N+1
251
+ For backward ops: sender on stage N sends to receiver on stage N-1
252
+
253
+ Returns None if there is no P2P receiver (first/last stage).
254
+ """
255
+ if isinstance(sender_op, OverlappedOperation):
256
+ # For overlapped ops, return None (P2P is overlapped with computation)
257
+ return None
258
+
259
+ if sender_op.op_type == "forward":
260
+ # Forward sends to next stage
261
+ next_stage = sender_op.stage_id + 1
262
+ if next_stage >= self.config.num_stages:
263
+ return None # Last stage, no P2P
264
+ return self.get_op(sender_op.batch_id, next_stage, "forward", allow_none=True)
265
+
266
+ elif sender_op.op_type in ("backward", "backward_D"):
267
+ # Backward sends to previous stage
268
+ prev_stage = sender_op.stage_id - 1
269
+ if prev_stage < 0:
270
+ return None # First stage, no P2P
271
+ # Try backward_D first, then backward
272
+ receiver = self.get_op(sender_op.batch_id, prev_stage, "backward_D", allow_none=True)
273
+ if receiver is None:
274
+ receiver = self.get_op(sender_op.batch_id, prev_stage, "backward", allow_none=True)
275
+ return receiver
276
+
277
+ return None
278
+
279
  def get_dependencies(self, op: Operation, include_device_dependency=True):
280
  deps = []
281
  if isinstance(op, OverlappedOperation):
 
360
  if include_device_dependency:
361
  device_index = self.device_queues[op.device_id].ops.index(op)
362
  if device_index > 0:
363
+ prev_op = self.device_queues[op.device_id].ops[device_index - 1]
364
+
365
+ # Check if sync P2P should apply
366
+ # Sync P2P means sender waits for P2P transfer to complete before next op
367
+ # This adds p2p_latency to the device dependency gap
368
+ sync_p2p_gap = 0.0
369
+ if self.config.p2p_latency > 0:
370
+ is_prev_overlapped = isinstance(prev_op, OverlappedOperation)
371
+ is_current_overlapped = isinstance(op, OverlappedOperation)
372
+
373
+ # Only add sync P2P gap when:
374
+ # 1. Both ops are not overlapped (not in overlap schedule's steady state)
375
+ # 2. Both ops have the same base type (both forward or both backward)
376
+ # 3. Both ops are on the same stage (ensures we're in a pure warmup/cooldown
377
+ # sequence for a specific stage, avoiding cycles in interleaved schedules)
378
+ if not is_prev_overlapped and not is_current_overlapped:
379
+ prev_base_type = "backward" if prev_op.op_type.startswith("backward") else prev_op.op_type
380
+ curr_base_type = "backward" if op.op_type.startswith("backward") else op.op_type
381
+
382
+ if prev_base_type == curr_base_type and prev_op.stage_id == op.stage_id:
383
+ receiver_op = self.get_p2p_receiver_op(prev_op)
384
+ if receiver_op is not None and not isinstance(receiver_op, OverlappedOperation):
385
+ # Sync P2P: sender waits for P2P transfer to complete
386
+ # Current op starts after prev_op.end_time + p2p_latency
387
+ # (not after receiver completes, just after transfer completes)
388
+ sync_p2p_gap = self.config.p2p_latency
389
+
390
+ deps.append((prev_op, sync_p2p_gap))
391
  return deps
392
 
393
  def show(self):