#!/usr/bin/env python3
"""
Parse the Awesome Whisper Apps markdown file and generate a JSON data file.
This script extracts project information from projects.md and creates projects.json
"""
import json
import re
from pathlib import Path
def extract_github_repo(url):
    """Extract the ``owner/repo`` slug from a GitHub URL.

    Args:
        url: Any URL string; it may or may not point at github.com.

    Returns:
        The ``owner/repo`` slug if *url* is a GitHub repository link,
        otherwise ``None``.  A trailing ``.git`` (clone-style URLs) is
        stripped so the slug can be used with the GitHub API directly.
    """
    github_match = re.search(r'github\.com/([^/]+/[^/\s)]+)', url)
    if not github_match:
        return None
    repo = github_match.group(1)
    # Clone URLs often end in ".git", which is not part of the repo slug.
    return repo[:-4] if repo.endswith('.git') else repo
def parse_table_row(line, context):
    """Parse one markdown table row into a project record.

    Recognized layouts:
        | [Name](url) | stars_badge | Description |
        | [Name](url) | Platform | stars_badge | Description |

    Args:
        line: A single table line, delimited by ``|`` characters.
        context: Dict carrying the current ``platforms``/``usecases`` lists.

    Returns:
        A project dict, or ``None`` when the row has fewer than two columns
        or its first column holds no markdown link.
    """
    # Splitting on '|' yields empty fragments for the outer pipes; drop them.
    cells = [cell.strip() for cell in line.split('|')[1:-1]]
    if len(cells) < 2:
        return None
    # The first column must contain a [Name](url) markdown link.
    link = re.search(r'\[([^\]]+)\]\(([^)]+)\)', cells[0])
    if link is None:
        return None
    name, url = link.group(1), link.group(2)
    repo = extract_github_repo(url)
    # Description = right-most cell that is not an image badge; the link
    # column (index 0) is never considered.
    description = next(
        (cell for cell in reversed(cells[1:]) if '![' not in cell), "")
    return {
        'name': name,
        'url': url,
        'description': description,
        'platforms': context.get('platforms', []),
        'usecases': context.get('usecases', []),
        'github_repo': repo,
        'has_stars': repo is not None,
    }
def parse_bullet_item(line, context):
    """Parse a bold bullet list entry into a project record.

    Recognized shapes:
        - **[name](url)** description
        - **[name](url)** ![GitHub badge] - description

    Args:
        line: One markdown bullet line.
        context: Dict carrying the current ``platforms``/``usecases`` lists.

    Returns:
        A project dict, or ``None`` if the line does not match the pattern.
    """
    # Optional non-capturing group skips a GitHub badge image before the
    # final description text.
    pattern = r'-\s+\*\*\[([^\]]+)\]\(([^)]+)\)\*\*\s+(?:!\[GitHub[^\]]*\][^-]*-\s+)?(.+)'
    hit = re.search(pattern, line)
    if hit is None:
        return None
    name, url = hit.group(1), hit.group(2)
    repo = extract_github_repo(url)
    return {
        'name': name,
        'url': url,
        'description': hit.group(3).strip(),
        'platforms': context.get('platforms', []),
        'usecases': context.get('usecases', []),
        'github_repo': repo,
        'has_stars': repo is not None,
    }
def determine_platform_from_subsection(line, current_platforms):
    """Map an inline ``**Platform:**`` marker to a platform list.

    Args:
        line: A markdown line that may contain a bold platform marker.
        current_platforms: Platform list to fall back on when no marker
            is recognized.

    Returns:
        The platform list for the first marker found (checked in a fixed
        order, cross-platform first), else *current_platforms* unchanged.
    """
    markers = (
        ('**cross-platform:**', ['cross-platform']),
        ('**linux:**', ['linux']),
        ('**macos:**', ['macos']),
        ('**windows:**', ['windows']),
        ('**mobile:**', ['android', 'ios']),
    )
    lowered = line.lower()
    for marker, platforms in markers:
        if marker in lowered:
            return platforms
    return current_platforms
def parse_markdown(md_path):
    """Parse the awesome-list markdown file and extract all projects.

    Scans the file line by line, maintaining a section context (current
    platforms and use cases) derived from headers, and collects project
    entries from table rows, bold bullet items, and plain link bullets.
    Projects are de-duplicated by name; the first occurrence wins.

    Args:
        md_path: Path to the markdown file (e.g. ``projects.md``).

    Returns:
        List of project dicts as produced by the row/bullet parsers.
    """
    projects = []
    current_context = {'platforms': [], 'usecases': []}
    in_content_section = False
    with open(md_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    def add(project):
        # De-duplicate by project name; keep the first occurrence.
        if project and not any(p['name'] == project['name'] for p in projects):
            projects.append(project)

    i = 0
    while i < len(lines):
        line = lines[i].strip()
        # Toggle extraction at the major sections.  '## By Use Case' holds
        # reference-style links only, so it is skipped entirely; '## For
        # Developers' re-enables extraction afterwards.  (Previously the
        # For-Developers handling was itself gated on in_content_section
        # already being True, so it could never turn extraction back on.)
        if '## By Platform' in line:
            in_content_section = True
            current_context = {'platforms': [], 'usecases': []}
        elif '## For Developers' in line:
            in_content_section = True
            current_context = {'platforms': [], 'usecases': ['developer']}
        elif '## By Use Case' in line:
            in_content_section = False

        # Header branches are ordered most-specific first: a '####' line
        # also passes startswith('###') and startswith('##'), so checking
        # the generic prefix first would shadow the deeper levels (which is
        # exactly the bug the original generic-first chain had).
        if line.startswith('####') and in_content_section:
            # Sub-subsection headers.
            if 'Desktop Applications' in line:
                pass  # keep the current platform context
            elif 'System Integration' in line:
                if not current_context.get('usecases'):
                    current_context['usecases'] = ['voice-typing']
            elif 'CLI Tools' in line:
                current_context['usecases'] = ['developer']
        elif line.startswith('###') and in_content_section:
            # Platform subsections.
            if 'Linux' in line:
                current_context['platforms'] = ['linux']
            elif 'macOS' in line:
                current_context['platforms'] = ['macos']
            elif 'Windows' in line:
                current_context['platforms'] = ['windows']
            elif 'Android' in line:
                current_context['platforms'] = ['android']
            elif 'iOS' in line:
                current_context['platforms'] = ['ios']
            elif 'Cross-Platform' in line:
                current_context['platforms'] = ['cross-platform']
            elif 'Embedded' in line or 'Raspberry' in line:
                current_context['platforms'] = ['embedded']
            # Use-case subsections (independent chain: a header such as
            # 'Web UI' sets both a use case and a platform).
            if 'Voice Typing' in line or 'Dictation' in line:
                current_context['usecases'] = ['voice-typing']
            elif 'SaaS' in line or 'Cloud' in line:
                current_context['usecases'] = ['saas']
            elif 'Subtitles' in line or 'Captioning' in line:
                current_context['usecases'] = ['subtitles']
            elif 'Meeting' in line or 'Productivity' in line:
                current_context['usecases'] = ['meetings']
            elif 'Web Interface' in line or 'Web UI' in line:
                current_context['usecases'] = ['web']
                current_context['platforms'] = ['web']
            elif 'Real-Time' in line or 'Streaming' in line:
                current_context['usecases'] = ['real-time']
            elif 'Model Variants' in line or 'Performance' in line:
                current_context['usecases'] = ['model-variants']
            elif 'Fine-Tuning' in line or 'Diarization' in line or 'Timestamps' in line:
                current_context['usecases'] = ['developer']
        elif line.startswith('##') and in_content_section:
            # Remaining major section headers ('By Platform' and 'For
            # Developers' were already handled by the toggle above).
            if 'SRT' in line or 'Subtitles' in line:
                current_context['usecases'] = ['subtitles']
            elif 'Meeting' in line:
                current_context['usecases'] = ['meetings']
            elif 'Real-Time' in line or 'Streaming' in line:
                current_context['usecases'] = ['real-time']
        elif line.startswith('**') and ':' in line and '**' in line[:20]:
            # Inline '**Platform:**' style markers.
            current_context['platforms'] = determine_platform_from_subsection(
                line, current_context.get('platforms', []))
        elif in_content_section and line.startswith('|') and '[' in line and '](' in line:
            # Table rows.
            add(parse_table_row(line, current_context))
        elif in_content_section and line.startswith('- **['):
            # Bold bullet items.
            add(parse_bullet_item(line, current_context))
        elif in_content_section and line.startswith('- [') and not line.startswith('- **['):
            # Plain link bullets (e.g. the SaaS section).
            match = re.search(r'-\s+\[([^\]]+)\]\(([^)]+)\)\s+-\s+(.+)', line)
            if match:
                github_repo = extract_github_repo(match.group(2))
                add({
                    'name': match.group(1),
                    'url': match.group(2),
                    'description': match.group(3).strip(),
                    'platforms': current_context.get('platforms', ['web']),
                    'usecases': current_context.get('usecases', ['saas']),
                    'github_repo': github_repo,
                    'has_stars': github_repo is not None,
                })
        i += 1
    return projects
def enhance_projects(projects):
    """Backfill missing classification metadata on parsed projects.

    Any project without ``usecases`` gets one inferred from keywords in its
    description (first match in a fixed priority order wins); any project
    without ``platforms`` gets one inferred from its description and hosting
    (non-GitHub links are assumed to be hosted web services).

    Args:
        projects: List of project dicts produced by the parsing helpers.

    Returns:
        The same list, with each dict updated in place.
    """
    for project in projects:
        # Hoisted once per project; the original recomputed it per branch
        # and also built an unused url_lower local.
        desc_lower = project['description'].lower()
        # Ensure we have at least one use case.
        if not project.get('usecases'):
            if 'subtitle' in desc_lower or 'caption' in desc_lower or 'srt' in desc_lower:
                project['usecases'] = ['subtitles']
            elif 'real-time' in desc_lower or 'streaming' in desc_lower or 'live' in desc_lower:
                project['usecases'] = ['real-time']
            elif 'meeting' in desc_lower or 'note' in desc_lower or 'minutes' in desc_lower:
                project['usecases'] = ['meetings']
            elif 'dictation' in desc_lower or 'voice' in desc_lower or 'typing' in desc_lower:
                project['usecases'] = ['voice-typing']
            elif 'model' in desc_lower or 'implementation' in desc_lower or 'whisper' in desc_lower:
                project['usecases'] = ['model-variants']
            else:
                project['usecases'] = ['developer']
        # Ensure we have at least one platform.
        if not project.get('platforms'):
            if 'web' in desc_lower or 'browser' in desc_lower or 'online' in desc_lower:
                project['platforms'] = ['web']
            elif not project.get('github_repo'):
                # Non-GitHub links are likely hosted web services.
                project['platforms'] = ['web']
            else:
                project['platforms'] = ['cross-platform']
    return projects
def main():
    """Convert ``projects.md`` (next to this script) into ``projects.json``."""
    here = Path(__file__).parent
    md_path = here / 'projects.md'
    json_path = here / 'projects.json'

    print(f"Parsing {md_path}...")
    projects = parse_markdown(md_path)
    print(f"Extracted {len(projects)} projects")

    # Backfill any missing platform/use-case tags before serializing.
    projects = enhance_projects(projects)

    with open(json_path, 'w', encoding='utf-8') as out:
        json.dump(projects, out, indent=2, ensure_ascii=False)
    print(f"Created {json_path} with {len(projects)} projects")

    # Report the distinct classification tags that were emitted.
    platforms = {tag for proj in projects for tag in proj['platforms']}
    usecases = {tag for proj in projects for tag in proj['usecases']}
    print(f"\nStatistics:")
    print(f" Platforms: {', '.join(sorted(platforms))}")
    print(f" Use cases: {', '.join(sorted(usecases))}")

    # Show a handful of entries as a quick sanity check.
    print(f"\nSample projects:")
    for proj in projects[:5]:
        print(f" - {proj['name']} ({', '.join(proj['platforms'])}) - {proj['description'][:50]}...")
# Run the generator only when executed as a script, not on import.
if __name__ == '__main__':
    main()