Interesting-Whisper-Projects / parse_markdown.py
danielrosehill's picture
commit
323ad41
#!/usr/bin/env python3
"""
Parse the Awesome Whisper Apps markdown file and generate a JSON data file.
This script extracts project information from projects.md and writes it to projects.json.
"""
import json
import re
from pathlib import Path
def extract_github_repo(url):
    """Return the ``owner/repo`` slug found in a GitHub URL, or ``None``.

    The repository name ends at the first ``/``, whitespace, ``)``, ``?`` or
    ``#`` that follows it, and a trailing ``.git`` (clone URLs) is removed,
    so deep links, query strings and anchors of one repository all give the
    same slug.

    Args:
        url: Text containing the URL to inspect.

    Returns:
        The ``owner/repo`` string, or ``None`` when ``url`` contains no
        ``github.com/<owner>/<repo>`` part.
    """
    found = re.search(r'github\.com/([^/]+/[^/\s)?#]+)', url)
    if found is None:
        return None

    owner, _, repo = found.group(1).partition('/')
    # Clone URLs end in ".git"; keep the suffix only if it is the whole name.
    if repo.endswith('.git') and len(repo) > len('.git'):
        repo = repo[:-len('.git')]
    return f'{owner}/{repo}'
def parse_table_row(line, context):
    """Parse one markdown table row into a project record.

    Expected layouts::

        | [Name](url) | stars_badge | Description |
        | [Name](url) | Platform | stars_badge | Description |

    Args:
        line: The table row text.
        context: Mapping whose optional ``'platforms'`` and ``'usecases'``
            lists are copied into the record.

    Returns:
        A dict with the keys ``name``, ``url``, ``description``,
        ``platforms``, ``usecases``, ``github_repo`` and ``has_stars``, or
        ``None`` when the row has fewer than two cells or its first cell
        contains no ``[name](url)`` link (e.g. header and separator rows).
    """
    # Leading and trailing pipes are optional in markdown tables, so remove
    # them explicitly instead of always discarding the first and last cell
    # (which loses the description of a row written without a closing pipe).
    row = line.strip()
    if row.startswith('|'):
        row = row[1:]
    if row.endswith('|') and not row.endswith('\\|'):
        row = row[:-1]

    # Split on unescaped pipes only; "\|" is a literal pipe inside a cell.
    cells = [
        cell.strip().replace('\\|', '|')
        for cell in re.split(r'(?<!\\)\|', row)
    ]
    if len(cells) < 2:
        return None

    # Extract name and URL from the first column
    link = re.search(r'\[([^\]]+)\]\(([^)]+)\)', cells[0])
    if link is None:
        return None
    name, url = link.group(1), link.group(2)
    github_repo = extract_github_repo(url)

    # The description is the last column (after the first) that does not
    # contain an image badge; empty when every such column is a badge.
    description = next(
        (cell for cell in reversed(cells[1:]) if '![' not in cell),
        "",
    )

    return {
        'name': name,
        'url': url,
        'description': description,
        # Copy the lists so records never share state with the context.
        'platforms': list(context.get('platforms', [])),
        'usecases': list(context.get('usecases', [])),
        'github_repo': github_repo,
        'has_stars': github_repo is not None,
    }
def parse_bullet_item(line, context):
    """Parse one bold-link bullet item into a project record.

    Expected layouts::

        - **[name](url)** description
        - **[name](url)** ![stars](badge) - description

    Leading image badges (one or more ``![alt](url)``) and the dash that
    separates them from the text are removed from the description. The badge
    is matched as a whole, so hyphens inside its URL (for example
    ``style=flat-square``) do not leak badge markdown into the description.

    Args:
        line: The bullet line text.
        context: Mapping whose optional ``'platforms'`` and ``'usecases'``
            lists are copied into the record.

    Returns:
        A dict with the keys ``name``, ``url``, ``description``,
        ``platforms``, ``usecases``, ``github_repo`` and ``has_stars``, or
        ``None`` when the line is not a ``- **[name](url)** text`` item.
        The description is empty when only badges follow the link.
    """
    item = re.search(r'-\s+\*\*\[([^\]]+)\]\(([^)]+)\)\*\*\s+(.+)', line)
    if item is None:
        return None
    name, url, remainder = item.groups()

    # Strip leading badges plus an optional "- " separator before the text.
    description = re.sub(
        r'^(?:!\[[^\]]*\]\([^)]*\)\s*)+(?:[-–—]\s+)?',
        '',
        remainder.strip(),
    ).strip()
    github_repo = extract_github_repo(url)

    return {
        'name': name,
        'url': url,
        'description': description,
        # Copy the lists so records never share state with the context.
        'platforms': list(context.get('platforms', [])),
        'usecases': list(context.get('usecases', [])),
        'github_repo': github_repo,
        'has_stars': github_repo is not None,
    }
def determine_platform_from_subsection(line, current_platforms):
    """Map a ``**Platform:**`` style marker in ``line`` to a platform list.

    The comparison is case-insensitive. Markers are tried in the order
    listed below and the first one found in the line wins.

    Args:
        line: The markdown line to inspect.
        current_platforms: Value returned unchanged when no marker matches.

    Returns:
        A new list of platform names for the matched marker, otherwise
        ``current_platforms`` itself.
    """
    marker_table = (
        ('**cross-platform:**', ('cross-platform',)),
        ('**linux:**', ('linux',)),
        ('**macos:**', ('macos',)),
        ('**windows:**', ('windows',)),
        ('**mobile:**', ('android', 'ios')),
    )
    lowered = line.lower()
    for marker, platform_names in marker_table:
        if marker in lowered:
            return list(platform_names)
    return current_platforms
def parse_markdown(md_path):
    """Parse the markdown file and extract all projects.

    The file is scanned line by line. Headers and ``**Platform:**`` markers
    update the current platform/use-case context; table rows, bold-link
    bullets and plain ``- [name](url) - description`` bullets found inside a
    content section become project records carrying that context.

    Headers are dispatched on their exact level (number of leading ``#``).
    Testing ``startswith('##')`` first would also match ``###`` and ``####``
    lines, so the deeper levels would never get their own handling.

    Args:
        md_path: Path of the markdown file to read (UTF-8).

    Returns:
        A list of project dicts in order of first appearance. Projects are
        de-duplicated by ``name``; later duplicates are dropped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # (keywords, platform) pairs for "###" headers; the first match wins.
    platform_headers = (
        (('Linux',), 'linux'),
        (('macOS',), 'macos'),
        (('Windows',), 'windows'),
        (('Android',), 'android'),
        (('iOS',), 'ios'),
        (('Cross-Platform',), 'cross-platform'),
        (('Embedded', 'Raspberry'), 'embedded'),
    )
    # (keywords, use case) pairs for "###" headers; the first match wins.
    usecase_headers = (
        (('Voice Typing', 'Dictation'), 'voice-typing'),
        (('SaaS', 'Cloud'), 'saas'),
        (('SRT', 'Subtitles', 'Captioning'), 'subtitles'),
        (('Meeting', 'Productivity'), 'meetings'),
        (('Web Interface', 'Web UI'), 'web'),
        (('Real-Time', 'Streaming'), 'real-time'),
        (('Model Variants', 'Performance'), 'model-variants'),
        (('Fine-Tuning', 'Diarization', 'Timestamps'), 'developer'),
    )

    projects = []
    seen_names = set()  # names already emitted, for O(1) duplicate checks
    current_context = {'platforms': [], 'usecases': []}
    in_content_section = False

    def add_project(project):
        """Append ``project`` unless it is empty or its name was seen."""
        if project and project['name'] not in seen_names:
            seen_names.add(project['name'])
            projects.append(project)

    with open(md_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    for raw_line in lines:
        line = raw_line.strip()
        # Number of leading '#' characters; 0 for non-header lines.
        level = len(line) - len(line.lstrip('#'))

        # Start tracking only in sections with real URLs (not reference links)
        if '## By Platform' in line:
            in_content_section = True
            current_context = {'platforms': [], 'usecases': []}
        elif '## By Use Case' in line:
            in_content_section = False  # Skip this section (reference links only)
        elif level == 2 and 'For Developers' in line:
            # Re-enable parsing: this section may follow the skipped one.
            in_content_section = True
            current_context = {'platforms': [], 'usecases': ['developer']}

        if level >= 2:
            if not in_content_section:
                continue

            if level == 2:
                # Major section headers (##)
                if 'By Platform' in line or 'For Developers' in line:
                    pass  # Context was already reset above
                elif 'SRT' in line or 'Subtitles' in line:
                    current_context['usecases'] = ['subtitles']
                elif 'Meeting' in line:
                    current_context['usecases'] = ['meetings']
                elif 'Real-Time' in line or 'Streaming' in line:
                    current_context['usecases'] = ['real-time']
            elif level == 3:
                # Subsection headers (###): platform first, then use case
                for keywords, platform in platform_headers:
                    if any(keyword in line for keyword in keywords):
                        current_context['platforms'] = [platform]
                        break
                for keywords, usecase in usecase_headers:
                    if any(keyword in line for keyword in keywords):
                        current_context['usecases'] = [usecase]
                        if usecase == 'web':
                            # Web interface sections also fix the platform
                            current_context['platforms'] = ['web']
                        break
            else:
                # Sub-subsection headers (#### and deeper)
                if 'Desktop Applications' in line:
                    pass  # Keep current platform
                elif 'System Integration' in line:
                    if not current_context.get('usecases'):
                        current_context['usecases'] = ['voice-typing']
                elif 'CLI Tools' in line:
                    current_context['usecases'] = ['developer']
            continue

        # Check for **Platform:** style subsections
        if line.startswith('**') and ':' in line:
            current_context['platforms'] = determine_platform_from_subsection(
                line, current_context.get('platforms', [])
            )
            continue

        if not in_content_section:
            continue

        if line.startswith('|') and '[' in line and '](' in line:
            # Parse table rows
            add_project(parse_table_row(line, current_context))
        elif line.startswith('- **['):
            # Parse bullet items
            add_project(parse_bullet_item(line, current_context))
        elif line.startswith('- ['):
            # Parse simple links (for SaaS section)
            match = re.search(r'-\s+\[([^\]]+)\]\(([^)]+)\)\s+-\s+(.+)', line)
            if match:
                url = match.group(2)
                github_repo = extract_github_repo(url)
                add_project({
                    'name': match.group(1),
                    'url': url,
                    'description': match.group(3).strip(),
                    # Both keys always exist in the context, so a .get()
                    # default would never apply; fall back on empty lists.
                    'platforms': list(current_context.get('platforms') or ['web']),
                    'usecases': list(current_context.get('usecases') or ['saas']),
                    'github_repo': github_repo,
                    'has_stars': github_repo is not None,
                })

    return projects
def enhance_projects(projects):
    """Fill in missing use cases and platforms from each project's text.

    Projects are modified in place. A project that already has a non-empty
    ``usecases`` or ``platforms`` list keeps it.

    Use cases are guessed from the lowercased description with the rules
    below (first matching rule wins, ``developer`` when none matches).
    ``live`` is matched as a whole word (or ``livestream...``) and ``note``
    only at the start of a word, so words such as "delivers" or "denote" do
    not trigger the real-time or meetings rule.

    Platforms default to ``web`` when the description mentions web, browser
    or online, or when the project has no GitHub repository; otherwise to
    ``cross-platform``.

    Args:
        projects: List of project dicts as produced by the parsers.

    Returns:
        The same ``projects`` list.
    """
    # (use case, pattern) pairs, tried in order against the description.
    usecase_rules = (
        ('subtitles', re.compile(r'subtitle|caption|srt')),
        ('real-time', re.compile(r'real-time|streaming|\blive(?:stream\w*)?\b')),
        ('meetings', re.compile(r'meeting|\bnote|minutes')),
        ('voice-typing', re.compile(r'dictation|voice|typing')),
        ('model-variants', re.compile(r'model|implementation|whisper')),
    )
    web_hint = re.compile(r'web|browser|online')

    for project in projects:
        description = (project.get('description') or '').lower()

        # Ensure we have at least one use case
        if not project.get('usecases'):
            for usecase, pattern in usecase_rules:
                if pattern.search(description):
                    project['usecases'] = [usecase]
                    break
            else:
                project['usecases'] = ['developer']

        # Ensure we have at least one platform
        if not project.get('platforms'):
            # Non-GitHub links are likely web services
            if web_hint.search(description) or not project.get('github_repo'):
                project['platforms'] = ['web']
            else:
                project['platforms'] = ['cross-platform']

    return projects
def main():
    """Convert ``projects.md`` into ``projects.json`` and print a summary.

    Both files live in the directory that contains this script. After the
    JSON file is written, the distinct platforms and use cases and the first
    five projects are printed.
    """
    base_dir = Path(__file__).parent
    md_path = base_dir / 'projects.md'
    json_path = base_dir / 'projects.json'

    print(f"Parsing {md_path}...")
    extracted = parse_markdown(md_path)
    print(f"Extracted {len(extracted)} projects")

    # Fill in missing platforms / use cases before serialising
    projects = enhance_projects(extracted)

    # Write to JSON (non-ASCII characters are kept as-is)
    serialized = json.dumps(projects, indent=2, ensure_ascii=False)
    with open(json_path, 'w', encoding='utf-8') as out_file:
        out_file.write(serialized)
    print(f"Created {json_path} with {len(projects)} projects")

    # Collect the distinct values for the statistics output
    all_platforms = {platform for entry in projects for platform in entry['platforms']}
    all_usecases = {usecase for entry in projects for usecase in entry['usecases']}
    print("\nStatistics:")
    print(f" Platforms: {', '.join(sorted(all_platforms))}")
    print(f" Use cases: {', '.join(sorted(all_usecases))}")

    # Show the first few projects as a sanity check
    print("\nSample projects:")
    for entry in projects[:5]:
        platform_text = ', '.join(entry['platforms'])
        preview = entry['description'][:50]
        print(f" - {entry['name']} ({platform_text}) - {preview}...")
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()