|
|
|
|
|
""" |
|
|
Parse the Awesome Whisper Apps markdown file and generate a JSON data file. |
|
|
This script extracts project information from projects.md and creates projects.json |
|
|
""" |
|
|
|
|
|
import json |
|
|
import re |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
# Matches "github.com/<owner>/<repo>" where github.com is the actual host
# (optionally prefixed by "www."), so sub-domains such as gist.github.com or
# api.github.com are not mistaken for repository URLs. The owner and repo
# segments stop at "/", whitespace, "?", "#" and ")" so that query strings,
# fragments and a closing markdown parenthesis never leak into the slug.
_GITHUB_REPO_RE = re.compile(
    r'(?<![\w.-])(?:www\.)?github\.com/([^/\s?#)]+)/([^/\s?#)]+)',
    re.IGNORECASE,
)


def extract_github_repo(url):
    """Extract the ``owner/repo`` slug from a GitHub URL.

    Args:
        url: A URL (or any text containing one). Empty or ``None`` values
            are accepted.

    Returns:
        The ``"owner/repo"`` string, or ``None`` when *url* does not point
        at a repository on github.com. Trailing path segments, query
        strings, fragments and a ``.git`` suffix are not part of the result.
    """
    if not url:
        return None

    found = _GITHUB_REPO_RE.search(url)
    if found is None:
        return None

    owner, repo = found.groups()
    # Clone URLs end in ".git"; the suffix is not part of the repository name.
    if repo.endswith('.git') and len(repo) > len('.git'):
        repo = repo[:-len('.git')]
    return f"{owner}/{repo}"
|
|
|
|
|
|
|
|
def parse_table_row(line, context):
    """Parse one markdown table row into a project dict.

    The first cell must contain a ``[name](url)`` link. The description is
    the right-most remaining cell that does not contain an image/badge
    (``![``); it is an empty string when every other cell holds a badge.

    Args:
        line: The table row. The outer pipes are optional, so a row written
            without a closing ``|`` keeps its last cell.
        context: Dict that may carry ``'platforms'`` and ``'usecases'``
            lists describing the section the row belongs to.

    Returns:
        A project dict, or ``None`` when the row has fewer than two cells
        or its first cell holds no link (e.g. header and separator rows).
    """
    row = line.strip()
    # Remove at most one delimiter pipe per side so that genuinely empty
    # outer cells ("| a | b ||") are still preserved.
    if row.startswith('|'):
        row = row[1:]
    if row.endswith('|'):
        row = row[:-1]

    cells = [cell.strip() for cell in row.split('|')]
    if len(cells) < 2:
        return None

    # The look-behind skips images ("![alt](src)") placed before the link.
    link = re.search(r'(?<!!)\[([^\]]+)\]\(([^)]+)\)', cells[0])
    if not link:
        return None

    name, url = link.groups()
    github_repo = extract_github_repo(url)

    # Scan from the right: the description is conventionally the last column.
    description = next(
        (cell for cell in reversed(cells[1:]) if '![' not in cell),
        "",
    )

    return {
        'name': name,
        'url': url,
        'description': description,
        # Copy so the project does not alias the caller's context lists.
        'platforms': list(context.get('platforms', [])),
        'usecases': list(context.get('usecases', [])),
        'github_repo': github_repo,
        'has_stars': github_repo is not None,
    }
|
|
|
|
|
|
|
|
# "- **[name](url)** <badges> - description"
#   group 1: link text, group 2: link target, group 3: description.
# Badges are matched as complete markdown images (inline or reference
# style) so hyphens inside a badge URL cannot derail the match, and the
# " - " separator is consumed whether or not a badge precedes it.
_BOLD_BULLET_RE = re.compile(
    r'-\s+\*\*\[([^\]]+)\]\(([^)]+)\)\*\*\s+'
    r'(?:!\[[^\]]*\](?:\([^)]*\)|\[[^\]]*\])?\s*)*'
    r'(?:-\s+)?'
    r'(.+)'
)


def parse_bullet_item(line, context):
    """Parse a ``- **[name](url)** ... - description`` bullet into a project dict.

    Args:
        line: The bullet line. Any badges between the bold link and the
            description are skipped.
        context: Dict that may carry ``'platforms'`` and ``'usecases'``
            lists describing the section the bullet belongs to.

    Returns:
        A project dict, or ``None`` when the line is not a bold-link bullet
        followed by some text.
    """
    found = _BOLD_BULLET_RE.search(line)
    if not found:
        return None

    name, url, raw_description = found.groups()
    github_repo = extract_github_repo(url)

    return {
        'name': name,
        'url': url,
        'description': raw_description.strip(),
        # Copy so the project does not alias the caller's context lists.
        'platforms': list(context.get('platforms', [])),
        'usecases': list(context.get('usecases', [])),
        'github_repo': github_repo,
        'has_stars': github_repo is not None,
    }
|
|
|
|
|
|
|
|
def determine_platform_from_subsection(line, current_platforms):
    """Map a bold ``**Platform:**`` marker to its platform list.

    The comparison is case-insensitive. Markers are tried in a fixed order
    and the first one found in *line* wins; when none is present the
    *current_platforms* argument is handed back unchanged.
    """
    marker_table = (
        ('**cross-platform:**', ('cross-platform',)),
        ('**linux:**', ('linux',)),
        ('**macos:**', ('macos',)),
        ('**windows:**', ('windows',)),
        ('**mobile:**', ('android', 'ios')),
    )

    lowered = line.lower()
    for marker, platform_names in marker_table:
        if marker in lowered:
            # Hand out a fresh list on every call.
            return list(platform_names)

    return current_platforms
|
|
|
|
|
|
|
|
# Ordered (keywords, platform) rules for level-3 headings; first match wins.
_PLATFORM_HEADING_RULES = (
    (('Linux',), 'linux'),
    (('macOS',), 'macos'),
    (('Windows',), 'windows'),
    (('Android',), 'android'),
    (('iOS',), 'ios'),
    (('Cross-Platform',), 'cross-platform'),
    (('Embedded', 'Raspberry'), 'embedded'),
)

# Ordered (keywords, use case) rules for level-3 headings; first match wins.
_USECASE_HEADING_RULES = (
    (('Voice Typing', 'Dictation'), 'voice-typing'),
    (('SaaS', 'Cloud'), 'saas'),
    (('Subtitles', 'Captioning'), 'subtitles'),
    (('Meeting', 'Productivity'), 'meetings'),
    (('Web Interface', 'Web UI'), 'web'),
    (('Real-Time', 'Streaming'), 'real-time'),
    (('Model Variants', 'Performance'), 'model-variants'),
    (('Fine-Tuning', 'Diarization', 'Timestamps'), 'developer'),
)

# "- [name](url) - description" (a plain, non-bold bullet).
_PLAIN_BULLET_RE = re.compile(r'-\s+\[([^\]]+)\]\(([^)]+)\)\s+-\s+(.+)')


def _heading_level(line):
    """Return the number of leading '#' characters of a stripped line."""
    return len(line) - len(line.lstrip('#'))


def _match_heading_rule(line, rules):
    """Return the value of the first rule with a keyword in *line*, else None."""
    for keywords, value in rules:
        if any(keyword in line for keyword in keywords):
            return value
    return None


def _apply_heading(level, line, context):
    """Return the section context that applies after a heading of *level*."""
    if level == 2:
        if 'By Platform' in line:
            return {'platforms': [], 'usecases': []}
        if 'For Developers' in line:
            return {'platforms': [], 'usecases': ['developer']}
        if 'SRT' in line or 'Subtitles' in line:
            context['usecases'] = ['subtitles']
        elif 'Meeting' in line:
            context['usecases'] = ['meetings']
        elif 'Real-Time' in line or 'Streaming' in line:
            context['usecases'] = ['real-time']
        return context

    if level == 3:
        platform = _match_heading_rule(line, _PLATFORM_HEADING_RULES)
        if platform is not None:
            context['platforms'] = [platform]
        usecase = _match_heading_rule(line, _USECASE_HEADING_RULES)
        if usecase is not None:
            context['usecases'] = [usecase]
            if usecase == 'web':
                # Web-interface sections also pin the platform to "web".
                context['platforms'] = ['web']
        return context

    # Level 4 and deeper: sub-groups inside a platform section.
    if 'Desktop Applications' in line:
        pass  # keeps whatever context the enclosing section set
    elif 'System Integration' in line:
        if not context.get('usecases'):
            context['usecases'] = ['voice-typing']
    elif 'CLI Tools' in line:
        context['usecases'] = ['developer']
    return context


def _parse_plain_bullet(line, context):
    """Parse a ``- [name](url) - description`` bullet; None if it does not match."""
    found = _PLAIN_BULLET_RE.search(line)
    if not found:
        return None
    name, url, raw_description = found.groups()
    github_repo = extract_github_repo(url)
    return {
        'name': name,
        'url': url,
        'description': raw_description.strip(),
        'platforms': list(context.get('platforms', ['web'])),
        'usecases': list(context.get('usecases', ['saas'])),
        'github_repo': github_repo,
        'has_stars': github_repo is not None,
    }


def parse_markdown(md_path):
    """Parse the markdown file and extract all projects.

    The file is scanned line by line. Headings maintain a "current section"
    context (platforms / use cases) that is attached to every project found
    in a table row or bullet beneath them. Parsing is switched on by a
    ``## By Platform`` heading, switched off by ``## By Use Case`` and
    switched on again by a level-2 ``For Developers`` heading. Projects are
    de-duplicated by name; the first occurrence wins.

    Headings are dispatched on their exact level. Testing
    ``startswith('##')`` first would also capture ``###`` and ``####``
    headings, so the level-3 platform/use-case rules and the level-4 group
    rules would never run.

    Args:
        md_path: Path of the markdown file (read as UTF-8).

    Returns:
        A list of project dicts in document order.
    """
    projects = []
    seen_names = set()
    context = {'platforms': [], 'usecases': []}
    in_content_section = False

    with open(md_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            line = raw_line.strip()

            # Section switches.
            if '## By Platform' in line:
                in_content_section = True
                context = {'platforms': [], 'usecases': []}
            elif '## By Use Case' in line:
                in_content_section = False

            level = _heading_level(line)
            if level >= 2:
                if level == 2 and 'For Developers' in line:
                    # Reopens parsing after the "By Use Case" section closed it.
                    in_content_section = True
                if in_content_section:
                    context = _apply_heading(level, line, context)
                continue

            # "**Platform:**" markers refine the platform inside a section.
            if line.startswith('**') and ':' in line:
                context['platforms'] = determine_platform_from_subsection(
                    line, context.get('platforms', [])
                )
                continue

            if not in_content_section:
                continue

            if line.startswith('|') and '[' in line and '](' in line:
                project = parse_table_row(line, context)
            elif line.startswith('- **['):
                project = parse_bullet_item(line, context)
            elif line.startswith('- ['):
                project = _parse_plain_bullet(line, context)
            else:
                project = None

            if project and project['name'] not in seen_names:
                seen_names.add(project['name'])
                projects.append(project)

    return projects
|
|
|
|
|
|
|
|
def _keyword_pattern(*keywords):
    """Compile a regex that matches any keyword at the start of a word."""
    alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
    return re.compile(r'\b(?:' + alternatives + ')')


# Ordered (pattern, use case) rules applied to the lower-cased description;
# the first match wins. Keywords are anchored at a word start so inflected
# forms still match ("captions", "notes") while unrelated words that merely
# contain a keyword do not ("delivers", "invoices", "prototyping").
_USECASE_KEYWORD_RULES = (
    (_keyword_pattern('subtitle', 'caption', 'srt'), 'subtitles'),
    (_keyword_pattern('real-time', 'streaming', 'live'), 'real-time'),
    (_keyword_pattern('meeting', 'note', 'minutes'), 'meetings'),
    (_keyword_pattern('dictation', 'voice', 'typing'), 'voice-typing'),
    (_keyword_pattern('model', 'implementation', 'whisper'), 'model-variants'),
)

_WEB_KEYWORDS = _keyword_pattern('web', 'browser', 'online')


def enhance_projects(projects):
    """Fill in missing use cases and platforms from each project's description.

    Projects that already have a non-empty ``'usecases'`` / ``'platforms'``
    value keep it. Otherwise:

    * use case: first matching keyword rule, falling back to ``'developer'``;
    * platform: ``'web'`` when the description mentions web/browser/online
      or the project has no GitHub repository, else ``'cross-platform'``.

    Args:
        projects: List of project dicts; modified in place.

    Returns:
        The same list, for call chaining.
    """
    for project in projects:
        # Tolerate a missing or None description instead of crashing.
        description = (project.get('description') or '').lower()

        if not project.get('usecases'):
            inferred = next(
                (
                    usecase
                    for pattern, usecase in _USECASE_KEYWORD_RULES
                    if pattern.search(description)
                ),
                'developer',
            )
            project['usecases'] = [inferred]

        if not project.get('platforms'):
            is_web = bool(_WEB_KEYWORDS.search(description)) or not project.get('github_repo')
            project['platforms'] = ['web'] if is_web else ['cross-platform']

    return projects
|
|
|
|
|
|
|
|
def main():
    """Convert projects.md into projects.json and print a short summary.

    Both files live in the directory that contains this script.
    """
    base_dir = Path(__file__).parent
    md_path = base_dir / 'projects.md'
    json_path = base_dir / 'projects.json'

    print(f"Parsing {md_path}...")
    parsed = parse_markdown(md_path)
    print(f"Extracted {len(parsed)} projects")

    # Fill in any use cases / platforms the section headings did not supply.
    projects = enhance_projects(parsed)

    with open(json_path, 'w', encoding='utf-8') as out_file:
        json.dump(projects, out_file, indent=2, ensure_ascii=False)
    print(f"Created {json_path} with {len(projects)} projects")

    # Distinct platform and use-case labels across all projects.
    platforms = {label for entry in projects for label in entry['platforms']}
    usecases = {label for entry in projects for label in entry['usecases']}

    print("\nStatistics:")
    print(f" Platforms: {', '.join(sorted(platforms))}")
    print(f" Use cases: {', '.join(sorted(usecases))}")

    print("\nSample projects:")
    for entry in projects[:5]:
        platform_text = ', '.join(entry['platforms'])
        preview = entry['description'][:50]
        print(f" - {entry['name']} ({platform_text}) - {preview}...")


# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
|
|