Spaces:

danielrosehill
/

Interesting-Whisper-Projects

Running

App Files Files Community

Interesting-Whisper-Projects / parse_markdown.py

danielrosehill

commit

323ad41 about 2 months ago

raw

history blame

11.1 kB

	#!/usr/bin/env python3
	"""
	Parse the Awesome Whisper Apps markdown file and generate a JSON data file.
	This script extracts project information from projects.md and creates projects.json
	"""

	import json
	import re
	from pathlib import Path


	def extract_github_repo(url):
	    """Return the ``owner/repo`` slug found in a GitHub URL.

	    Args:
	        url: Any string that may contain a ``github.com/<owner>/<repo>`` link.
	            ``None`` and the empty string are accepted.

	    Returns:
	        The ``owner/repo`` string, or ``None`` when ``url`` is empty or does
	        not contain a GitHub repository path.
	    """
	    if not url:
	        return None

	    # The repo segment stops at '/', whitespace, ')' (markdown link end) and
	    # at '?' / '#', so query strings and fragments are not captured.
	    github_match = re.search(r'github\.com/([^/]+)/([^/\s)?#]+)', url)
	    if not github_match:
	        return None

	    owner, repo = github_match.groups()
	    # Clone URLs end in ".git"; the slug itself does not.
	    if repo.endswith('.git'):
	        repo = repo[:-len('.git')]
	    if not repo:
	        return None
	    return f'{owner}/{repo}'


	def parse_table_row(line, context):
	    """Parse one markdown table row into a project record.

	    Supported row layouts::

	        | [Name](url) | stars_badge | Description |
	        | [Name](url) | Platform | stars_badge | Description |

	    Args:
	        line: The table row text.
	        context: Dict that may hold ``'platforms'`` and ``'usecases'`` lists
	            to attach to the project.

	    Returns:
	        A project dict, or ``None`` when the row has fewer than two cells or
	        its first cell holds no ``[name](url)`` link.
	    """
	    # Split on the plain pipe character. Dropping the outer pipes first means
	    # a row without a trailing pipe does not lose its last cell.
	    cells = [cell.strip() for cell in line.strip().strip('|').split('|')]
	    if len(cells) < 2:
	        return None

	    # The first column carries the project link.
	    link = re.search(r'\[([^\]]+)\]\(([^)]+)\)', cells[0])
	    if not link:
	        return None

	    name, url = link.groups()
	    github_repo = extract_github_repo(url)

	    # The description is the right-most cell (excluding the link column) that
	    # is not an image badge.
	    description = next(
	        (cell for cell in reversed(cells[1:]) if '![' not in cell),
	        "",
	    )

	    return {
	        'name': name,
	        'url': url,
	        'description': description,
	        # Copy the lists so projects never share list objects with the
	        # caller's context or with one another.
	        'platforms': list(context.get('platforms', [])),
	        'usecases': list(context.get('usecases', [])),
	        'github_repo': github_repo,
	        'has_stars': github_repo is not None
	    }


	def parse_bullet_item(line, context):
	    """Parse a bullet list item into a project record.

	    Supported layouts (the bold markers around the link are optional)::

	        - **[name](url)** description
	        - **[name](url)** ![stars](badge) - description

	    Args:
	        line: The bullet line text.
	        context: Dict that may hold ``'platforms'`` and ``'usecases'`` lists
	            to attach to the project.

	    Returns:
	        A project dict, or ``None`` when the line does not match.
	    """
	    pattern = (
	        r'-\s+(?:\*\*)?\[([^\]]+)\]\(([^)]+)\)(?:\*\*)?\s+'  # - **[name](url)**
	        r'(?:!\[[^\]]*\]\([^)]*\)\s*)?'                      # optional image badge
	        r'(?:-\s+)?'                                         # optional " - " separator
	        r'(.+)'                                              # description
	    )
	    match = re.search(pattern, line)
	    if not match:
	        return None

	    name, url, description = match.groups()
	    github_repo = extract_github_repo(url)

	    return {
	        'name': name,
	        'url': url,
	        'description': description.strip(),
	        # Copy the lists so projects never share list objects with the
	        # caller's context or with one another.
	        'platforms': list(context.get('platforms', [])),
	        'usecases': list(context.get('usecases', [])),
	        'github_repo': github_repo,
	        'has_stars': github_repo is not None
	    }


	def determine_platform_from_subsection(line, current_platforms):
	    """Map a ``Platform:`` style marker line to a list of platform names.

	    The line is matched case-insensitively against the known markers, in
	    order; the first marker found decides the result. When no marker is
	    present, ``current_platforms`` is returned unchanged.
	    """
	    # Order matters: 'cross-platform:' is tested before the single platforms.
	    marker_table = (
	        ('cross-platform:', ('cross-platform',)),
	        ('linux:', ('linux',)),
	        ('macos:', ('macos',)),
	        ('windows:', ('windows',)),
	        ('mobile:', ('android', 'ios')),
	    )

	    lowered = line.lower()
	    for marker, platform_names in marker_table:
	        if marker in lowered:
	            # Hand out a fresh list on every call.
	            return list(platform_names)

	    return current_platforms


	def _heading_level(line):
	    """Return the number of leading '#' characters of a stripped line."""
	    return len(line) - len(line.lstrip('#'))


	def _apply_heading(line, context):
	    """Update ``context`` in place from a heading inside a content section."""
	    if 'By Platform' in line:
	        context['platforms'] = []
	        context['usecases'] = []
	        return

	    # Section-level keywords. They are honoured on deeper headings too,
	    # because the previous prefix test ('##') also matched '###' and '####'.
	    if 'SRT' in line or 'Subtitles' in line:
	        context['usecases'] = ['subtitles']
	    elif 'Meeting' in line:
	        context['usecases'] = ['meetings']
	    elif 'Real-Time' in line or 'Streaming' in line:
	        context['usecases'] = ['real-time']

	    level = _heading_level(line)

	    if level == 3:
	        # Platform subsections.
	        if 'Linux' in line:
	            context['platforms'] = ['linux']
	        elif 'macOS' in line:
	            context['platforms'] = ['macos']
	        elif 'Windows' in line:
	            context['platforms'] = ['windows']
	        elif 'Android' in line:
	            context['platforms'] = ['android']
	        elif 'iOS' in line:
	            context['platforms'] = ['ios']
	        elif 'Cross-Platform' in line:
	            context['platforms'] = ['cross-platform']
	        elif 'Embedded' in line or 'Raspberry' in line:
	            context['platforms'] = ['embedded']

	        # Use-case subsections.
	        if 'Voice Typing' in line or 'Dictation' in line:
	            context['usecases'] = ['voice-typing']
	        elif 'SaaS' in line or 'Cloud' in line:
	            context['usecases'] = ['saas']
	        elif 'Subtitles' in line or 'Captioning' in line:
	            context['usecases'] = ['subtitles']
	        elif 'Meeting' in line or 'Productivity' in line:
	            context['usecases'] = ['meetings']
	        elif 'Web Interface' in line or 'Web UI' in line:
	            context['usecases'] = ['web']
	            context['platforms'] = ['web']
	        elif 'Real-Time' in line or 'Streaming' in line:
	            context['usecases'] = ['real-time']
	        elif 'Model Variants' in line or 'Performance' in line:
	            context['usecases'] = ['model-variants']
	        elif 'Fine-Tuning' in line or 'Diarization' in line or 'Timestamps' in line:
	            context['usecases'] = ['developer']

	    elif level >= 4:
	        # 'Desktop Applications' deliberately keeps the current context.
	        if 'System Integration' in line:
	            if not context.get('usecases'):
	                context['usecases'] = ['voice-typing']
	        elif 'CLI Tools' in line:
	            context['usecases'] = ['developer']


	def _parse_simple_link(line, context):
	    """Parse a ``- [name](url) - description`` line, or return ``None``."""
	    match = re.search(r'-\s+\[([^\]]+)\]\(([^)]+)\)\s+-\s+(.+)', line)
	    if not match:
	        return None

	    name, url, description = match.groups()
	    github_repo = extract_github_repo(url)
	    return {
	        'name': name,
	        'url': url,
	        'description': description.strip(),
	        'platforms': list(context.get('platforms', ['web'])),
	        'usecases': list(context.get('usecases', ['saas'])),
	        'github_repo': github_repo,
	        'has_stars': github_repo is not None
	    }


	def parse_markdown(md_path):
	    """Parse the markdown file and extract all projects.

	    Headings maintain a running context (platforms and use cases) that is
	    attached to every project found below them. Projects are read from table
	    rows, bold bullet links and plain bullet links, and only while inside a
	    content section ("By Platform" or "For Developers"). The "By Use Case"
	    section is skipped. A project name is recorded once; later duplicates are
	    ignored.

	    Args:
	        md_path: Path of the markdown file (read as UTF-8).

	    Returns:
	        List of project dicts in order of first appearance.
	    """
	    projects = []
	    seen_names = set()  # O(1) duplicate check instead of rescanning the list
	    current_context = {'platforms': [], 'usecases': []}
	    in_content_section = False

	    def add_project(project):
	        if project and project['name'] not in seen_names:
	            seen_names.add(project['name'])
	            projects.append(project)

	    with open(md_path, 'r', encoding='utf-8') as f:
	        for raw_line in f:
	            line = raw_line.strip()

	            # Section switches: decide whether the following lines are parsed.
	            if '## By Platform' in line:
	                in_content_section = True
	                current_context = {'platforms': [], 'usecases': []}
	            elif '## By Use Case' in line:
	                in_content_section = False  # Skip this section (has reference links only)
	            elif line.startswith('##') and 'For Developers' in line:
	                # Must be able to re-enable parsing after a skipped section.
	                in_content_section = True
	                current_context = {'platforms': [], 'usecases': ['developer']}

	            # Any other heading only adjusts the running context.
	            elif line.startswith('##'):
	                if in_content_section:
	                    _apply_heading(line, current_context)

	            # "**Platform:**" style markers.
	            elif line.startswith('**') and ':' in line:
	                current_context['platforms'] = determine_platform_from_subsection(
	                    line, current_context.get('platforms', []))

	            elif not in_content_section:
	                continue

	            # Table rows.
	            elif line.startswith('|') and '[' in line and '](' in line:
	                add_project(parse_table_row(line, current_context))

	            # Bold bullet items.
	            elif line.startswith('- **['):
	                add_project(parse_bullet_item(line, current_context))

	            # Simple links (for SaaS section).
	            elif line.startswith('- ['):
	                add_project(_parse_simple_link(line, current_context))

	    return projects


	def enhance_projects(projects):
	    """Fill in a default use case and platform for projects that lack one.

	    Projects are modified in place. Existing non-empty ``'usecases'`` and
	    ``'platforms'`` values are left untouched.

	    Args:
	        projects: List of project dicts. ``'description'`` and
	            ``'github_repo'`` are read when present; a missing or ``None``
	            description is treated as empty.

	    Returns:
	        The same ``projects`` list.
	    """
	    # Checked in order; the first group with a keyword in the description wins.
	    usecase_keywords = (
	        ('subtitles', ('subtitle', 'caption', 'srt')),
	        ('real-time', ('real-time', 'streaming', 'live')),
	        ('meetings', ('meeting', 'note', 'minutes')),
	        ('voice-typing', ('dictation', 'voice', 'typing')),
	        ('model-variants', ('model', 'implementation', 'whisper')),
	    )
	    web_keywords = ('web', 'browser', 'online')

	    for project in projects:
	        desc_lower = (project.get('description') or '').lower()

	        # Ensure we have at least one use case.
	        if not project.get('usecases'):
	            guessed = next(
	                (
	                    usecase
	                    for usecase, keywords in usecase_keywords
	                    if any(keyword in desc_lower for keyword in keywords)
	                ),
	                'developer',
	            )
	            project['usecases'] = [guessed]

	        # Ensure we have at least one platform.
	        if not project.get('platforms'):
	            mentions_web = any(keyword in desc_lower for keyword in web_keywords)
	            # Non-GitHub links are likely web services.
	            if mentions_web or not project.get('github_repo'):
	                project['platforms'] = ['web']
	            else:
	                project['platforms'] = ['cross-platform']

	    return projects


	def main():
	    """Convert projects.md (next to this script) into projects.json.

	    Parses the markdown, fills in missing metadata, writes the JSON file and
	    prints a short summary: the platforms and use cases seen, plus the first
	    five projects.
	    """
	    base_dir = Path(__file__).parent
	    source_md = base_dir / 'projects.md'
	    target_json = base_dir / 'projects.json'

	    print(f"Parsing {source_md}...")
	    entries = parse_markdown(source_md)
	    print(f"Extracted {len(entries)} projects")

	    # Fill in default platforms / use cases where the markdown gave none.
	    entries = enhance_projects(entries)

	    # Persist the result as pretty-printed UTF-8 JSON.
	    with target_json.open('w', encoding='utf-8') as out:
	        json.dump(entries, out, indent=2, ensure_ascii=False)
	    print(f"Created {target_json} with {len(entries)} projects")

	    # Summary: every distinct platform and use case that occurs.
	    platform_names = {name for entry in entries for name in entry['platforms']}
	    usecase_names = {name for entry in entries for name in entry['usecases']}

	    print(f"\nStatistics:")
	    print(f" Platforms: {', '.join(sorted(platform_names))}")
	    print(f" Use cases: {', '.join(sorted(usecase_names))}")

	    # Preview of the first few records.
	    print(f"\nSample projects:")
	    for entry in entries[:5]:
	        platform_label = ', '.join(entry['platforms'])
	        snippet = entry['description'][:50]
	        print(f" - {entry['name']} ({platform_label}) - {snippet}...")


	# Run the conversion only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()