#!/usr/bin/env python3
"""
Parse the Awesome Whisper Apps markdown file and generate a JSON data file.
This script extracts project information from projects.md and creates projects.json
"""

import json
import re
from pathlib import Path

# Captures "owner/repo" from a GitHub URL; the repo part stops at '/', whitespace,
# ')', '#' or '?' so fragments and query strings never leak into the slug.
_GITHUB_REPO_RE = re.compile(r'github\.com/([^/]+/[^/\s)#?]+)')

# First markdown link "[text](target)" in a string.
_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')

# "- **[name](url)** [![GitHub ...](badge) - ]description".
# The badge target is consumed as a whole "(...)" group because badge URLs
# routinely contain hyphens (e.g. ".../stars/owner/foo-bar").
_BULLET_RE = re.compile(
    r'-\s+\*\*\[([^\]]+)\]\(([^)]+)\)\*\*\s+'
    r'(?:!\[GitHub[^\]]*\](?:\([^)]*\))?[^-]*-\s+)?(.+)'
)

# "- [name](url) - description".
_SIMPLE_LINK_RE = re.compile(r'-\s+\[([^\]]+)\]\(([^)]+)\)\s+-\s+(.+)')

# Ordered (keywords, value) tables: the first entry with any keyword present wins.
_MAJOR_USECASES = (
    (('SRT', 'Subtitles'), 'subtitles'),
    (('Meeting',), 'meetings'),
    (('Real-Time', 'Streaming'), 'real-time'),
)

_SUBSECTION_PLATFORMS = (
    (('Linux',), 'linux'),
    (('macOS',), 'macos'),
    (('Windows',), 'windows'),
    (('Android',), 'android'),
    (('iOS',), 'ios'),
    (('Cross-Platform',), 'cross-platform'),
    (('Embedded', 'Raspberry'), 'embedded'),
)

_SUBSECTION_USECASES = (
    (('Voice Typing', 'Dictation'), 'voice-typing'),
    (('SaaS', 'Cloud'), 'saas'),
    (('SRT', 'Subtitles', 'Captioning'), 'subtitles'),
    (('Meeting', 'Productivity'), 'meetings'),
    (('Web Interface', 'Web UI'), 'web'),
    (('Real-Time', 'Streaming'), 'real-time'),
    (('Model Variants', 'Performance'), 'model-variants'),
    (('Fine-Tuning', 'Diarization', 'Timestamps'), 'developer'),
)

_PLATFORM_MARKERS = (
    ('**cross-platform:**', ['cross-platform']),
    ('**linux:**', ['linux']),
    ('**macos:**', ['macos']),
    ('**windows:**', ['windows']),
    ('**mobile:**', ['android', 'ios']),
)

_DESCRIPTION_USECASES = (
    (('subtitle', 'caption', 'srt'), 'subtitles'),
    (('real-time', 'streaming', 'live'), 'real-time'),
    (('meeting', 'note', 'minutes'), 'meetings'),
    (('dictation', 'voice', 'typing'), 'voice-typing'),
    (('model', 'implementation', 'whisper'), 'model-variants'),
)


def _first_match(text, table):
    """Return the value of the first (keywords, value) entry with a keyword in text."""
    for keywords, value in table:
        if any(keyword in text for keyword in keywords):
            return value
    return None


def _header_level(line):
    """Return the number of leading '#' characters of line (0 if none)."""
    return len(line) - len(line.lstrip('#'))


def _build_project(name, url, description, context,
                   default_platforms=(), default_usecases=()):
    """Assemble a project record; context lists are copied, not shared."""
    github_repo = extract_github_repo(url)
    return {
        'name': name,
        'url': url,
        'description': description,
        'platforms': list(context.get('platforms', default_platforms)),
        'usecases': list(context.get('usecases', default_usecases)),
        'github_repo': github_repo,
        'has_stars': github_repo is not None
    }


def extract_github_repo(url):
    """Extract GitHub repository from URL.

    Returns the "owner/repo" slug, without a trailing ".git", URL fragment
    or query string, or None when url does not point into a GitHub repository.
    """
    github_match = _GITHUB_REPO_RE.search(url)
    if not github_match:
        return None
    repo = github_match.group(1)
    if repo.endswith('.git'):
        repo = repo[:-len('.git')]
    return repo


def parse_table_row(line, context):
    """Parse a markdown table row.

    Accepted shapes:
        | [Name](url) | stars_badge | Description |
        | [Name](url) | Platform | stars_badge | Description |

    Returns a project dict, or None when the row has fewer than two cells or
    its first cell holds no markdown link (header and separator rows).
    """
    parts = [p.strip() for p in line.split('|')[1:-1]]  # Remove empty first/last
    if len(parts) < 2:
        return None

    # Extract name and URL from first column
    match = _LINK_RE.search(parts[0])
    if not match:
        return None

    # The description is the right-most non-first cell that is not an image badge.
    description = next(
        (cell for cell in reversed(parts[1:]) if '![' not in cell), ""
    )
    return _build_project(match.group(1), match.group(2), description, context)


def parse_bullet_item(line, context):
    """Parse a bullet list item.

    Accepted shapes:
        - **[name](url)** description
        - **[name](url)** ![GitHub stars](badge) - description

    Returns a project dict, or None when the line does not match.
    """
    match = _BULLET_RE.search(line)
    if not match:
        return None
    name, url, description = match.groups()
    return _build_project(name, url, description.strip(), context)


def _parse_simple_link(line, context):
    """Parse a "- [name](url) - description" item (used by the SaaS listing)."""
    match = _SIMPLE_LINK_RE.search(line)
    if not match:
        return None
    name, url, description = match.groups()
    return _build_project(name, url, description.strip(), context,
                          default_platforms=['web'], default_usecases=['saas'])


def determine_platform_from_subsection(line, current_platforms):
    """Determine platform from **Platform:** style markers.

    Returns the platform list for the first marker found in line
    (case-insensitive), or current_platforms when no marker is present.
    """
    line_lower = line.lower()
    for marker, platforms in _PLATFORM_MARKERS:
        if marker in line_lower:
            return list(platforms)
    return current_platforms


def _enter_major_section(line, in_content_section, context):
    """Handle a '##' header; return the updated (in_content_section, context)."""
    if 'By Platform' in line:
        return True, {'platforms': [], 'usecases': []}
    if 'By Use Case' in line:
        # Skip this section (has reference links only)
        return False, context
    if 'For Developers' in line:
        # Must be honoured even while skipping, otherwise parsing could never
        # resume once "By Use Case" has switched it off.
        return True, {'platforms': [], 'usecases': ['developer']}
    if in_content_section:
        usecase = _first_match(line, _MAJOR_USECASES)
        if usecase:
            context['usecases'] = [usecase]
    return in_content_section, context


def _apply_subsection(line, context):
    """Update context in place from a '###' header."""
    platform = _first_match(line, _SUBSECTION_PLATFORMS)
    if platform:
        context['platforms'] = [platform]

    usecase = _first_match(line, _SUBSECTION_USECASES)
    if usecase:
        context['usecases'] = [usecase]
        if usecase == 'web':
            # Web interface sections describe the platform as well.
            context['platforms'] = ['web']


def _apply_sub_subsection(line, context):
    """Update context in place from a '####' (or deeper) header."""
    if 'Desktop Applications' in line:
        return  # Keep current platform
    if 'System Integration' in line:
        if not context.get('usecases'):
            context['usecases'] = ['voice-typing']
    elif 'CLI Tools' in line:
        context['usecases'] = ['developer']


def parse_markdown(md_path):
    """Parse the markdown file and extract all projects.

    Walks the file line by line, tracking the current platform/use-case
    context from headers and **Platform:** markers, and collects every table
    row, bold bullet and simple link found inside a content section.
    Projects are de-duplicated by name; the first occurrence wins.

    Args:
        md_path: path of the markdown file to read (UTF-8).

    Returns:
        A list of project dicts in order of first appearance.
    """
    projects = []
    seen_names = set()
    context = {'platforms': [], 'usecases': []}
    in_content_section = False

    def add(project):
        if project and project['name'] not in seen_names:
            seen_names.add(project['name'])
            projects.append(project)

    with open(md_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            # Dispatch on the exact header depth: a plain startswith('##')
            # test would also swallow '###' and '####' headers.
            level = _header_level(line)

            if level == 2:
                in_content_section, context = _enter_major_section(
                    line, in_content_section, context
                )
            elif level == 3:
                if in_content_section:
                    _apply_subsection(line, context)
            elif level >= 4:
                if in_content_section:
                    _apply_sub_subsection(line, context)
            elif line.startswith('**') and ':' in line:
                # Check for **Platform:** style subsections
                context['platforms'] = determine_platform_from_subsection(
                    line, context.get('platforms', [])
                )
            elif not in_content_section:
                continue
            elif line.startswith('|') and '[' in line and '](' in line:
                add(parse_table_row(line, context))
            elif line.startswith('- **['):
                add(parse_bullet_item(line, context))
            elif line.startswith('- ['):
                add(_parse_simple_link(line, context))

    return projects


def enhance_projects(projects):
    """Add additional metadata and clean up project data.

    Fills in a use case and a platform for every project that has none,
    guessing from the description and from whether the link is on GitHub.
    The project dicts are updated in place and the same list is returned.
    """
    for project in projects:
        desc_lower = project['description'].lower()

        # Ensure we have at least one use case
        if not project.get('usecases'):
            usecase = _first_match(desc_lower, _DESCRIPTION_USECASES)
            project['usecases'] = [usecase or 'developer']

        # Ensure we have at least one platform
        if not project.get('platforms'):
            mentions_web = any(
                word in desc_lower for word in ('web', 'browser', 'online')
            )
            if mentions_web or not project.get('github_repo'):
                # Non-GitHub links are likely web services
                project['platforms'] = ['web']
            else:
                project['platforms'] = ['cross-platform']

    return projects


def main():
    """Read projects.md next to this script and write projects.json beside it."""
    script_dir = Path(__file__).parent
    md_path = script_dir / 'projects.md'
    json_path = script_dir / 'projects.json'

    print(f"Parsing {md_path}...")
    projects = parse_markdown(md_path)
    print(f"Extracted {len(projects)} projects")

    # Enhance with additional metadata
    projects = enhance_projects(projects)

    # Write to JSON
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(projects, f, indent=2, ensure_ascii=False)
    print(f"Created {json_path} with {len(projects)} projects")

    # Print statistics
    platforms = set()
    usecases = set()
    for p in projects:
        platforms.update(p['platforms'])
        usecases.update(p['usecases'])

    print("\nStatistics:")
    print(f"  Platforms: {', '.join(sorted(platforms))}")
    print(f"  Use cases: {', '.join(sorted(usecases))}")

    # Show sample projects
    print("\nSample projects:")
    for p in projects[:5]:
        print(f"  - {p['name']} ({', '.join(p['platforms'])}) - {p['description'][:50]}...")


if __name__ == '__main__':
    main()