Commit: e48391e
Parent(s): 7540444
update whisperkit version and release
.github/scripts/check_dataset_update.py
CHANGED

@@ -27,8 +27,8 @@ def check_dataset_updates(dataset_id):
         {
             "last_modified": last_modified,
             "sha": current_sha,
-            "releases": ["
-            "whisperkit_version": "0.9.
+            "releases": ["5254d82"],
+            "whisperkit_version": "0.9.4",
         },
         f,
     )
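For context, a minimal sketch of the version record this hunk ends up writing. The keys and the new values ("5254d82", "0.9.4") come from the diff above; the output path, the indent argument, and the placeholder last_modified/sha values are assumptions, since check_dataset_update.py computes them at run time and the rest of that script is not shown here.

# Sketch only -- not part of the commit. Approximate shape of the version record
# written by check_dataset_update.py after this change; placeholders are marked.
import json

version_record = {
    "last_modified": "2024-12-01T00:00:00+00:00",  # placeholder; taken from the dataset metadata
    "sha": "abc123",                               # placeholder; current dataset SHA
    "releases": ["5254d82"],                       # value from the hunk above
    "whisperkit_version": "0.9.4",                 # value from the hunk above
}

with open("dashboard_data/version.json", "w") as f:  # path assumed from process_report.py below
    json.dump(version_record, f, indent=4)            # indent level assumed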
.github/scripts/process_report.py
ADDED
@@ -0,0 +1,503 @@
import json
import os
import re
from datetime import datetime
from typing import Tuple

import pandas as pd
from bs4 import BeautifulSoup


def format_datetime(dt_str: str) -> str:
    """
    Format a datetime string for display.

    :param dt_str: String representing a datetime in ISO format
    :return: Formatted datetime string
    """
    return dt_str.replace("T", " ").split("+")[0]


def read_json_line_by_line(file_path):
    """
    Read a JSON file line by line, parsing each line as a separate JSON object.

    :param file_path: Path to the JSON file
    :return: List of parsed JSON objects

    This function is useful for reading large JSON files that contain one JSON object
    per line. It handles JSON parsing errors gracefully, skipping invalid lines.
    """
    data = []
    with open(file_path, "r") as f:
        for line in f:
            try:
                item = json.loads(line.strip())
                data.append(item)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON in {file_path}: {line}")
    return data


def calculate_change(new: float, old: float, metric_name: str) -> Tuple[float, str]:
    """Calculate percentage change and return with appropriate emoji."""
    pct_change = new - old
    if abs(pct_change) < 1:
        emoji = "↔️"
    elif pct_change > 0:
        emoji = "🟢" if "wer" not in metric_name.lower() else "❌"
    else:
        emoji = "❌" if "wer" not in metric_name.lower() else "🟢"

    return (pct_change, emoji)


def has_changes(config, prev_dict, curr_dict):
    """Check if any metrics have changed."""
    curr = curr_dict[config]
    prev = prev_dict[config]

    metrics = ["speed", "tokens_per_second", "average_wer", "qoi"]
    for key in metrics:
        if key in curr and key in prev:
            curr_val = curr[key]
            prev_val = prev[key]
            if abs(curr_val - prev_val) >= 1:  # 1% threshold
                return True
    return False


def format_metrics_table(config, prev_dict, curr_dict):
    """Format metrics into a table string."""
    curr = curr_dict[config]
    prev = prev_dict[config]

    metrics = [
        ("Speed", "speed"),
        ("Tok/s", "tokens_per_second"),
        ("WER", "average_wer"),
        ("QoI", "qoi"),
    ]

    table = "```\nMetric Previous Current Change\n--------------------------------\n"
    for metric_name, key in metrics:
        if key in curr and key in prev:
            curr_val = curr[key]
            prev_val = prev[key]
            pct_change, _ = calculate_change(curr_val, prev_val, metric_name)
            if abs(pct_change) >= 1:  # Only show metrics with changes
                table += f"{metric_name:<9} {prev_val:<11.2f} {curr_val:<10.2f} {pct_change:.2f}\n"
    table += "```"
    return table


def extract_status_and_os(cell_value):
    """
    Extract status and OS versions from a cell, handling both HTML and plain text.
    Returns list of tuples: [(status, os_version), ...]
    """
    results = []
    cell_value = str(cell_value)

    # First, handle the case where there's no HTML tags
    if cell_value == "Not Supported":
        return results

    # Split the cell into parts (first element and subsequent <p> elements)
    parts = cell_value.split("<p>")

    for part in parts:
        part = part.strip("</p>")
        if not part:
            continue

        # Check if part contains warning symbol
        if "⚠️" in part:
            # Parse HTML to extract OS version from anchor tag
            soup = BeautifulSoup(part, "html.parser")
            # Find text after href that contains OS version
            text = soup.get_text()
            os_match = re.search(r"(iOS|iPadOS|macOS)\s+[\d.]+", text)
            if os_match:
                os_version = os_match.group(0)
                results.append(("⚠️", os_version))
        else:
            # For success cases, OS version is directly in the text
            os_match = re.search(r"(iOS|iPadOS|macOS)\s+[\d.]+", part)
            if os_match:
                os_version = os_match.group(0)
                results.append(("✅", os_version))

    return results


def escape_string(s: str) -> str:
    """Escape a string to be used as a value in JSON."""
    return (
        s.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )


def analyze_support_changes(prev_csv, curr_csv):
    """Analyze support changes between CSV files."""
    # Read CSV files
    prev_df = pd.read_csv(prev_csv)
    prev_df.set_index(prev_df.columns[0], inplace=True)

    curr_df = pd.read_csv(curr_csv)
    curr_df.set_index(curr_df.columns[0], inplace=True)

    # Get device lists (excluding first column which is the index)
    prev_devices = sorted(prev_df.columns[1:])
    curr_devices = sorted(curr_df.columns[1:])

    # Calculate device ratio
    device_ratio = len(curr_devices) / len(prev_devices) if prev_devices else 1
    needs_alert = device_ratio < 0.9  # Alert if less than 90% of previous devices

    # Convert to dictionary for easier comparison
    prev_status = {}
    curr_status = {}

    # Process previous data
    for idx in range(len(prev_df)):
        model = prev_df.index[idx]
        for col_idx in range(1, len(prev_df.columns)):
            cell_value = prev_df.iloc[idx, col_idx]
            device = prev_df.columns[col_idx]
            statuses = extract_status_and_os(cell_value)
            for status, os_version in statuses:
                prev_status[(model, device, os_version)] = status

    # Process current data and track new configurations
    new_configs = []
    for idx in range(len(curr_df)):
        model = curr_df.index[idx]
        for col_idx in range(1, len(curr_df.columns)):
            cell_value = curr_df.iloc[idx, col_idx]
            device = curr_df.columns[col_idx]
            statuses = extract_status_and_os(cell_value)
            for status, os_version in statuses:
                curr_status[(model, device, os_version)] = status
                # Check if this is a new configuration
                if (model, device, os_version) not in prev_status:
                    new_configs.append((model, device, os_version))

    # Find changes
    fixed_errors = []
    new_errors = []

    # Check all configurations that exist in both datasets
    common_configs = set(prev_status.keys()) & set(curr_status.keys())
    for config in common_configs:
        model, device, os_version = config
        if prev_status[config] == "⚠️" and curr_status[config] == "✅":
            fixed_errors.append((model, device, os_version))
        elif prev_status[config] == "✅" and curr_status[config] == "⚠️":
            new_errors.append((model, device, os_version))

    return fixed_errors, new_errors, new_configs, needs_alert


def generate_report():
    # Load current and previous data
    prev_perf_data = read_json_line_by_line("report_data/performance_data.json")
    curr_perf_data = read_json_line_by_line("dashboard_data/performance_data.json")

    prev_dict = {(d["model"], d["device"], d["os"]): d for d in prev_perf_data}
    curr_dict = {(d["model"], d["device"], d["os"]): d for d in curr_perf_data}
    common_configs = set(curr_dict.keys()) & set(prev_dict.keys())

    # Load version data
    with open("report_data/version.json", "r") as f:
        prev_version = json.load(f)
    with open("dashboard_data/version.json", "r") as f:
        curr_version = json.load(f)

    prev_releases = set(prev_version.get("releases", []))
    curr_releases = set(curr_version.get("releases", []))
    new_releases = curr_releases - prev_releases
    removed_releases = prev_releases - curr_releases

    # Track metrics
    total_configs = len(common_configs)
    improved_metrics = {"speed": 0, "tokens_per_second": 0, "average_wer": 0, "qoi": 0}
    regressed_metrics = {"speed": 0, "tokens_per_second": 0, "average_wer": 0, "qoi": 0}
    new_data_points = len(set(curr_dict.keys()) - set(prev_dict.keys()))

    # Analyze support changes
    fixed_errors, new_errors, new_configs, needs_alert = analyze_support_changes(
        "report_data/support_data.csv", "dashboard_data/support_data.csv"
    )

    # Create Slack blocks
    current_time = datetime.now().strftime("%B %-d, %Y %H:%M:%S")
    prev_release_tag, curr_release_tag = (
        prev_version["whisperkit_version"],
        curr_version["whisperkit_version"],
    )
    slack_blocks = {
        "blocks": [
            {
                "type": "header",
                "text": {
                    "type": "plain_text",
                    "text": "🔔 WhisperKit Dataset Update Report 🔔",
                    "emoji": True,
                },
            },
            {
                "type": "context",
                "elements": [{"text": f"*{current_time}*", "type": "mrkdwn"}],
            },
            {"type": "divider"},
            {
                "type": "section",
                "text": {"type": "mrkdwn", "text": "ℹ️ *CURRENT VERSION INFO* ℹ️"},
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"• *Last Modified:* `{format_datetime(curr_version['last_modified'])}`",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"• *Dataset SHA:* `{curr_version['sha']}`",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"• *Current Releases:* {', '.join(f'`{r}`' for r in curr_version['releases'])}",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"• *Current Release Tag:* `{curr_release_tag}`",
                },
            },
            {"type": "divider"},
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "🔄 *SUMMARY OF PERFORMANCE UPDATES* 🔄",
                },
            },
        ]
    }

    # Add release information
    slack_blocks["blocks"].extend(
        [
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"• *Added Releases:* {', '.join(sorted(new_releases)) if new_releases else 'None'}",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"• *Removed Releases:* {', '.join(sorted(removed_releases)) if removed_releases else 'None'}",
                },
            },
        ]
    )
    if prev_release_tag != curr_release_tag:
        slack_blocks["blocks"].append(
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"• *Release Tag Change:* `{prev_release_tag}` → `{curr_release_tag}`",
                },
            }
        )
    slack_blocks["blocks"].extend(
        [
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "\n",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"• *New Data Points:* `{new_data_points}` new configurations",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "\n",
                },
            },
        ]
    )

    # Add metrics summary
    for metric_name, key in [
        ("Speed", "speed"),
        ("Tok/s", "tokens_per_second"),
        ("WER", "average_wer"),
        ("QoI", "qoi"),
    ]:
        slack_blocks["blocks"].append(
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"• *{metric_name}:* `{improved_metrics[key]}` improved, `{regressed_metrics[key]}` regressed",
                },
            }
        )

    # Add support changes section
    if fixed_errors or new_errors or new_configs:
        slack_blocks["blocks"].extend(
            [
                {"type": "divider"},
                {
                    "type": "section",
                    "text": {"type": "mrkdwn", "text": "📱 *DEVICE SUPPORT CHANGES* 📱"},
                },
            ]
        )

        if fixed_errors:
            slack_blocks["blocks"].extend(
                [
                    {
                        "type": "section",
                        "text": {
                            "type": "mrkdwn",
                            "text": "*Successful Configurations That Override Previous Failures*",
                        },
                    }
                ]
            )
            for model, device, os_version in sorted(fixed_errors):
                slack_blocks["blocks"].append(
                    {
                        "type": "section",
                        "text": {
                            "type": "mrkdwn",
                            "text": f"• {model} on {device} ({os_version})",
                        },
                    }
                )

        if new_errors:
            slack_blocks["blocks"].extend(
                [
                    {
                        "type": "section",
                        "text": {
                            "type": "mrkdwn",
                            "text": "*Failed Configurations That Override Previous Successes*",
                        },
                    }
                ]
            )
            for model, device, os_version in sorted(new_errors):
                slack_blocks["blocks"].append(
                    {
                        "type": "section",
                        "text": {
                            "type": "mrkdwn",
                            "text": f"• {model} on {device} ({os_version})",
                        },
                    }
                )

        if new_configs:
            slack_blocks["blocks"].extend(
                [
                    {
                        "type": "section",
                        "text": {
                            "type": "mrkdwn",
                            "text": "*Newly Tested Configurations*",
                        },
                    }
                ]
            )
            for model, device, os_version in sorted(new_configs):
                slack_blocks["blocks"].append(
                    {
                        "type": "section",
                        "text": {
                            "type": "mrkdwn",
                            "text": f"• {model} on {device} ({os_version})",
                        },
                    }
                )

    # Add alert if significant decrease in device count
    if needs_alert:
        slack_blocks["blocks"].append(
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "⚠️ *ALERT:* Current device count is less than 90% of previous version's device count, test on more devices before updating the benchmark website!",
                },
            }
        )

    # Create performance text as a single mrkdwn string
    if common_configs:
        performance_text = "💡 *Performance Updates* 💡\n\n"

        # Group by model for better organization
        models = sorted(set(model for model, _, _ in common_configs))

        for model in models:
            model_configs = sorted([cfg for cfg in common_configs if cfg[0] == model])

            for config in model_configs:
                device_info = f"*{model}* ({config[2]})"

                if not has_changes(config, prev_dict, curr_dict):
                    # If no changes, just add the model with a checkmark
                    performance_text += f"{device_info} ✅\n\n"
                else:
                    # If there are changes, show the metrics
                    performance_text += f"{device_info}\n"
                    performance_text += format_metrics_table(
                        config, prev_dict, curr_dict
                    )
                    performance_text += "\n\n"

    # Write to GITHUB_OUTPUT
    github_output = os.getenv("GITHUB_OUTPUT")
    if github_output:
        with open(github_output, "a") as f:
            f.write("slack_message_payload<<EOF\n")
            json.dump(slack_blocks, f, indent=2)
            f.write("\nEOF\n")

        with open(github_output, "a") as f:
            escaped_text = escape_string(performance_text)
            print(f"performance_message={escaped_text}", file=f)


if __name__ == "__main__":
    generate_report()
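To see what this script emits without running the workflow, a hedged local dry-run sketch follows. It assumes the previous snapshot sits under report_data/ and the fresh export under dashboard_data/ (the paths hard-coded above), and points GITHUB_OUTPUT at a scratch file instead of the runner-provided one; the scratch file name is made up for illustration.

# Hypothetical local dry run of process_report.py (not part of the commit).
# Assumes report_data/ and dashboard_data/ are populated as in the workflow.
import os
import runpy

os.environ["GITHUB_OUTPUT"] = "local_outputs.txt"  # scratch file standing in for the runner's output file

# Execute the script the same way the workflow does: python .github/scripts/process_report.py
runpy.run_path(".github/scripts/process_report.py", run_name="__main__")

# local_outputs.txt now holds the two step outputs the workflow forwards to Slack:
#   slack_message_payload<<EOF ... EOF   (Block Kit JSON for the channel message)
#   performance_message=...              (escaped mrkdwn for the thread reply)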
.github/workflows/dataset_update.yml
CHANGED
@@ -1,4 +1,4 @@
-name: WhisperKit
+name: WhisperKit Evals Dataset Update Workflow
 
 on:
   schedule:

@@ -29,6 +29,28 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: python .github/scripts/check_dataset_update.py
 
+      - name: Save workflow data
+        run: |
+          mkdir -p ./workflow_data
+          echo "${{ steps.check_updates.outputs.has_updates }}" > ./workflow_data/has_updates.txt
+
+      - name: Upload workflow data
+        uses: actions/upload-artifact@v4
+        with:
+          name: workflow_data
+          path: workflow_data/
+          overwrite: true
+
+      - name: Upload relevant dashboard data for report generation
+        if: steps.check_updates.outputs.has_updates == 'true'
+        uses: actions/upload-artifact@v4
+        with:
+          name: report_data
+          path: |
+            dashboard_data/performance_data.json
+            dashboard_data/support_data.csv
+            dashboard_data/version.json
+
       - name: Install full requirements
         if: steps.check_updates.outputs.has_updates == 'true'
         run: |

@@ -54,4 +76,68 @@
           git add .
           git commit -m "update dataset files" || echo "No changes to commit"
           git push
-          git push https://HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/argmaxinc/whisperkit-benchmarks
+          git push https://HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/argmaxinc/whisperkit-benchmarks-internal
+
+  generate-report:
+    needs: update-datasets
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Download workflow data
+        uses: actions/download-artifact@v4
+        with:
+          name: workflow_data
+          path: workflow_data
+
+      - name: Check updates status
+        id: check
+        run: |
+          HAS_UPDATES=$(cat workflow_data/has_updates.txt)
+          echo "has_updates=$HAS_UPDATES" >> $GITHUB_OUTPUT
+
+      - name: Download report data
+        if: steps.check.outputs.has_updates == 'true'
+        uses: actions/download-artifact@v4
+        with:
+          name: report_data
+          path: report_data
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install pandas beautifulsoup4
+
+      - name: Process report
+        if: steps.check.outputs.has_updates == 'true'
+        id: report
+        run: python .github/scripts/process_report.py
+
+      - name: Post to a Slack Channel
+        if: steps.check.outputs.has_updates == 'true'
+        id: slack_message
+        uses: slackapi/slack-github-action@v1.27.0
+        with:
+          channel-id: ${{ secrets.SLACK_CHANNEL_ID }}
+          payload: |
+            ${{ steps.report.outputs.slack_message_payload }}
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
+      - name: Send Thread Message
+        if: steps.check.outputs.has_updates == 'true'
+        uses: slackapi/slack-github-action@v1.27.0
+        with:
+          channel-id: ${{ secrets.SLACK_CHANNEL_ID }}
+          payload: |
+            {
+              "thread_ts": "${{ steps.slack_message.outputs.ts }}",
+              "text": "${{ steps.report.outputs.performance_message }}"
+            }
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
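For debugging the hand-off between the "Process report" step and the slackapi/slack-github-action steps, a hedged sketch of how the multi-line slack_message_payload output can be parsed back out of a local dry run. The scratch file name matches the dry-run sketch above and is an assumption; on the runner, GitHub Actions itself performs this parsing.

# Optional local check (not part of the workflow): confirm that the payload written
# to the GITHUB_OUTPUT scratch file is valid JSON before it would reach Slack.
import json

with open("local_outputs.txt") as f:  # scratch GITHUB_OUTPUT from the dry run above
    raw = f.read()

marker = "slack_message_payload<<EOF\n"
start = raw.index(marker) + len(marker)
end = raw.index("\nEOF\n", start)
payload = json.loads(raw[start:end])
print(f"{len(payload['blocks'])} Slack blocks parsed successfully")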