#!/usr/bin/env python3
"""
Parse the Awesome Whisper Apps markdown file and generate a JSON data file.
This script extracts project information from projects.md and creates projects.json
"""
import json
import re
from pathlib import Path
def extract_github_repo(url):
    """Extract the ``owner/repo`` slug from a GitHub URL.

    Args:
        url: Any URL string; it may or may not point at github.com.

    Returns:
        The ``owner/repo`` slug if *url* is a GitHub repository link,
        otherwise ``None``.  A trailing ``.git`` (clone-style URLs) is
        stripped so the slug can be used with the GitHub API directly.
    """
    github_match = re.search(r'github\.com/([^/]+/[^/\s)]+)', url)
    if not github_match:
        return None
    repo = github_match.group(1)
    # Clone URLs often end in ".git", which is not part of the repo slug.
    return repo[:-4] if repo.endswith('.git') else repo
def parse_table_row(line, context):
    """Parse one markdown table row into a project record.

    Recognized layouts:
        | [Name](url) | stars_badge | Description |
        | [Name](url) | Platform | stars_badge | Description |

    Args:
        line: A single table line, delimited by ``|`` characters.
        context: Dict carrying the current ``platforms``/``usecases`` lists.

    Returns:
        A project dict, or ``None`` when the row has fewer than two columns
        or its first column holds no markdown link.
    """
    # Splitting on '|' yields empty fragments for the outer pipes; drop them.
    cells = [cell.strip() for cell in line.split('|')[1:-1]]
    if len(cells) < 2:
        return None
    # The first column must contain a [Name](url) markdown link.
    link = re.search(r'\[([^\]]+)\]\(([^)]+)\)', cells[0])
    if link is None:
        return None
    name, url = link.group(1), link.group(2)
    repo = extract_github_repo(url)
    # Description = right-most cell that is not an image badge; the link
    # column (index 0) is never considered.
    description = next(
        (cell for cell in reversed(cells[1:]) if '![' not in cell), "")
    return {
        'name': name,
        'url': url,
        'description': description,
        'platforms': context.get('platforms', []),
        'usecases': context.get('usecases', []),
        'github_repo': repo,
        'has_stars': repo is not None,
    }
def parse_bullet_item(line, context):
    """Parse a bold bullet list entry into a project record.

    Recognized shapes:
        - **[name](url)** description
        - **[name](url)** ![GitHub badge] - description

    Args:
        line: One markdown bullet line.
        context: Dict carrying the current ``platforms``/``usecases`` lists.

    Returns:
        A project dict, or ``None`` if the line does not match the pattern.
    """
    # Optional non-capturing group skips a GitHub badge image before the
    # final description text.
    pattern = r'-\s+\*\*\[([^\]]+)\]\(([^)]+)\)\*\*\s+(?:!\[GitHub[^\]]*\][^-]*-\s+)?(.+)'
    hit = re.search(pattern, line)
    if hit is None:
        return None
    name, url = hit.group(1), hit.group(2)
    repo = extract_github_repo(url)
    return {
        'name': name,
        'url': url,
        'description': hit.group(3).strip(),
        'platforms': context.get('platforms', []),
        'usecases': context.get('usecases', []),
        'github_repo': repo,
        'has_stars': repo is not None,
    }
def determine_platform_from_subsection(line, current_platforms):
    """Map an inline ``**Platform:**`` marker to a platform list.

    Args:
        line: A markdown line that may contain a bold platform marker.
        current_platforms: Platform list to fall back on when no marker
            is recognized.

    Returns:
        The platform list for the first marker found (checked in a fixed
        order, cross-platform first), else *current_platforms* unchanged.
    """
    markers = (
        ('**cross-platform:**', ['cross-platform']),
        ('**linux:**', ['linux']),
        ('**macos:**', ['macos']),
        ('**windows:**', ['windows']),
        ('**mobile:**', ['android', 'ios']),
    )
    lowered = line.lower()
    for marker, platforms in markers:
        if marker in lowered:
            return platforms
    return current_platforms
def parse_markdown(md_path):
    """Parse the awesome-list markdown file and extract all projects.

    Scans the file line by line, maintaining a section context (current
    platforms and use cases) derived from headers, and collects project
    entries from table rows, bold bullet items, and plain link bullets.
    Projects are de-duplicated by name; the first occurrence wins.

    Args:
        md_path: Path to the markdown file (e.g. ``projects.md``).

    Returns:
        List of project dicts as produced by the row/bullet parsers.
    """
    projects = []
    current_context = {'platforms': [], 'usecases': []}
    in_content_section = False
    with open(md_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    def add(project):
        # De-duplicate by project name; keep the first occurrence.
        if project and not any(p['name'] == project['name'] for p in projects):
            projects.append(project)

    i = 0
    while i < len(lines):
        line = lines[i].strip()
        # Toggle extraction at the major sections.  '## By Use Case' holds
        # reference-style links only, so it is skipped entirely; '## For
        # Developers' re-enables extraction afterwards.  (Previously the
        # For-Developers handling was itself gated on in_content_section
        # already being True, so it could never turn extraction back on.)
        if '## By Platform' in line:
            in_content_section = True
            current_context = {'platforms': [], 'usecases': []}
        elif '## For Developers' in line:
            in_content_section = True
            current_context = {'platforms': [], 'usecases': ['developer']}
        elif '## By Use Case' in line:
            in_content_section = False

        # Header branches are ordered most-specific first: a '####' line
        # also passes startswith('###') and startswith('##'), so checking
        # the generic prefix first would shadow the deeper levels (which is
        # exactly the bug the original generic-first chain had).
        if line.startswith('####') and in_content_section:
            # Sub-subsection headers.
            if 'Desktop Applications' in line:
                pass  # keep the current platform context
            elif 'System Integration' in line:
                if not current_context.get('usecases'):
                    current_context['usecases'] = ['voice-typing']
            elif 'CLI Tools' in line:
                current_context['usecases'] = ['developer']
        elif line.startswith('###') and in_content_section:
            # Platform subsections.
            if 'Linux' in line:
                current_context['platforms'] = ['linux']
            elif 'macOS' in line:
                current_context['platforms'] = ['macos']
            elif 'Windows' in line:
                current_context['platforms'] = ['windows']
            elif 'Android' in line:
                current_context['platforms'] = ['android']
            elif 'iOS' in line:
                current_context['platforms'] = ['ios']
            elif 'Cross-Platform' in line:
                current_context['platforms'] = ['cross-platform']
            elif 'Embedded' in line or 'Raspberry' in line:
                current_context['platforms'] = ['embedded']
            # Use-case subsections (independent chain: a header such as
            # 'Web UI' sets both a use case and a platform).
            if 'Voice Typing' in line or 'Dictation' in line:
                current_context['usecases'] = ['voice-typing']
            elif 'SaaS' in line or 'Cloud' in line:
                current_context['usecases'] = ['saas']
            elif 'Subtitles' in line or 'Captioning' in line:
                current_context['usecases'] = ['subtitles']
            elif 'Meeting' in line or 'Productivity' in line:
                current_context['usecases'] = ['meetings']
            elif 'Web Interface' in line or 'Web UI' in line:
                current_context['usecases'] = ['web']
                current_context['platforms'] = ['web']
            elif 'Real-Time' in line or 'Streaming' in line:
                current_context['usecases'] = ['real-time']
            elif 'Model Variants' in line or 'Performance' in line:
                current_context['usecases'] = ['model-variants']
            elif 'Fine-Tuning' in line or 'Diarization' in line or 'Timestamps' in line:
                current_context['usecases'] = ['developer']
        elif line.startswith('##') and in_content_section:
            # Remaining major section headers ('By Platform' and 'For
            # Developers' were already handled by the toggle above).
            if 'SRT' in line or 'Subtitles' in line:
                current_context['usecases'] = ['subtitles']
            elif 'Meeting' in line:
                current_context['usecases'] = ['meetings']
            elif 'Real-Time' in line or 'Streaming' in line:
                current_context['usecases'] = ['real-time']
        elif line.startswith('**') and ':' in line and '**' in line[:20]:
            # Inline '**Platform:**' style markers.
            current_context['platforms'] = determine_platform_from_subsection(
                line, current_context.get('platforms', []))
        elif in_content_section and line.startswith('|') and '[' in line and '](' in line:
            # Table rows.
            add(parse_table_row(line, current_context))
        elif in_content_section and line.startswith('- **['):
            # Bold bullet items.
            add(parse_bullet_item(line, current_context))
        elif in_content_section and line.startswith('- [') and not line.startswith('- **['):
            # Plain link bullets (e.g. the SaaS section).
            match = re.search(r'-\s+\[([^\]]+)\]\(([^)]+)\)\s+-\s+(.+)', line)
            if match:
                github_repo = extract_github_repo(match.group(2))
                add({
                    'name': match.group(1),
                    'url': match.group(2),
                    'description': match.group(3).strip(),
                    'platforms': current_context.get('platforms', ['web']),
                    'usecases': current_context.get('usecases', ['saas']),
                    'github_repo': github_repo,
                    'has_stars': github_repo is not None,
                })
        i += 1
    return projects
def enhance_projects(projects):
    """Backfill missing classification metadata on parsed projects.

    Any project without ``usecases`` gets one inferred from keywords in its
    description (first match in a fixed priority order wins); any project
    without ``platforms`` gets one inferred from its description and hosting
    (non-GitHub links are assumed to be hosted web services).

    Args:
        projects: List of project dicts produced by the parsing helpers.

    Returns:
        The same list, with each dict updated in place.
    """
    for project in projects:
        # Hoisted once per project; the original recomputed it per branch
        # and also built an unused url_lower local.
        desc_lower = project['description'].lower()
        # Ensure we have at least one use case.
        if not project.get('usecases'):
            if 'subtitle' in desc_lower or 'caption' in desc_lower or 'srt' in desc_lower:
                project['usecases'] = ['subtitles']
            elif 'real-time' in desc_lower or 'streaming' in desc_lower or 'live' in desc_lower:
                project['usecases'] = ['real-time']
            elif 'meeting' in desc_lower or 'note' in desc_lower or 'minutes' in desc_lower:
                project['usecases'] = ['meetings']
            elif 'dictation' in desc_lower or 'voice' in desc_lower or 'typing' in desc_lower:
                project['usecases'] = ['voice-typing']
            elif 'model' in desc_lower or 'implementation' in desc_lower or 'whisper' in desc_lower:
                project['usecases'] = ['model-variants']
            else:
                project['usecases'] = ['developer']
        # Ensure we have at least one platform.
        if not project.get('platforms'):
            if 'web' in desc_lower or 'browser' in desc_lower or 'online' in desc_lower:
                project['platforms'] = ['web']
            elif not project.get('github_repo'):
                # Non-GitHub links are likely hosted web services.
                project['platforms'] = ['web']
            else:
                project['platforms'] = ['cross-platform']
    return projects
def main():
    """Convert ``projects.md`` (next to this script) into ``projects.json``."""
    here = Path(__file__).parent
    md_path = here / 'projects.md'
    json_path = here / 'projects.json'

    print(f"Parsing {md_path}...")
    projects = parse_markdown(md_path)
    print(f"Extracted {len(projects)} projects")

    # Backfill any missing platform/use-case tags before serializing.
    projects = enhance_projects(projects)

    with open(json_path, 'w', encoding='utf-8') as out:
        json.dump(projects, out, indent=2, ensure_ascii=False)
    print(f"Created {json_path} with {len(projects)} projects")

    # Report the distinct classification tags that were emitted.
    platforms = {tag for proj in projects for tag in proj['platforms']}
    usecases = {tag for proj in projects for tag in proj['usecases']}
    print(f"\nStatistics:")
    print(f" Platforms: {', '.join(sorted(platforms))}")
    print(f" Use cases: {', '.join(sorted(usecases))}")

    # Show a handful of entries as a quick sanity check.
    print(f"\nSample projects:")
    for proj in projects[:5]:
        print(f" - {proj['name']} ({', '.join(proj['platforms'])}) - {proj['description'][:50]}...")
# Run the generator only when executed as a script, not on import.
if __name__ == '__main__':
    main()