#!/usr/bin/env python3
"""
Subtitle Detection Script

This script checks which videos on JW.org have subtitles available,
using the same logic as the Search UI app. Results are output to a CSV file.

Detects TWO types of subtitles:
1. VTT files (files[].subtitles.url) - Downloadable subtitle files
2. Burned-in (files[].subtitled=true) - Video versions with embedded subtitles

Usage:
    python3 subtitle-detection.py [--language LANG] [--output FILE]

Example:
    python3 subtitle-detection.py --language E
"""

import argparse
import csv
import json  # not used directly in this file; kept so existing imports stay intact
from datetime import datetime
from typing import Any, Dict, List, Optional

try:
    import requests
except ImportError:  # reported with an actionable message by fetch_json()
    requests = None


# Column order of the CSV report. analyze_subtitles() emits rows with exactly
# these keys, and write_csv() uses the list as the header.
CSV_FIELDNAMES = [
    'natural_key',
    'title',
    'category',
    'subcategory',
    'has_any_subtitles',
    'has_vtt_subtitles',
    'has_burned_in_subtitles',
    'vtt_url',
    'vtt_resolutions',
    'burned_in_resolutions',
    'duration_seconds',
    'duration_formatted',
    'first_published',
    'jw_org_url',
]


def fetch_json(url: str) -> Dict[str, Any]:
    """
    Fetch JSON data from URL.

    Args:
        url: Address to request with HTTP GET (60 second timeout)

    Returns:
        The decoded JSON body

    Raises:
        ImportError: if the 'requests' package is not installed
        requests.HTTPError: if the server answers with an error status
    """
    if requests is None:
        raise ImportError(
            "The 'requests' package is required to download data "
            "(install it with: pip install requests)"
        )

    print(f"Fetching: {url}")
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    return response.json()


def _iter_files(media_item: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Return the item's 'files' list, treating a missing or null value as empty."""
    return media_item.get('files') or []


def _vtt_url(file: Dict[str, Any]) -> Optional[str]:
    """Return file['subtitles']['url'] if present and non-empty, else None."""
    # 'subtitles' may be absent or null; both mean "no VTT file".
    return (file.get('subtitles') or {}).get('url') or None


def find_subtitles(media_item: Dict[str, Any]) -> Optional[str]:
    """
    Find subtitle URL from media item files.

    This uses the exact same logic as the Search UI app (subtitles_download.py).

    Args:
        media_item: Media item dictionary with 'files' array

    Returns:
        Subtitle URL string of the first file that has one, or None if not found
    """
    for file in _iter_files(media_item):
        subtitle_url = _vtt_url(file)
        if subtitle_url:
            return subtitle_url
    return None


def has_burned_in_subtitles(media_item: Dict[str, Any]) -> bool:
    """
    Check if video has a burned-in subtitle version available.

    Burned-in subtitles are embedded in the video stream itself (not a separate file).
    The JW.org website lets you switch between subtitled and non-subtitled versions.

    Args:
        media_item: Media item dictionary with 'files' array

    Returns:
        True if any file has subtitled=True, False otherwise
    """
    # Only a literal boolean True counts (JSON `true` decodes to Python True).
    return any(file.get('subtitled') is True for file in _iter_files(media_item))


def get_file_labels_with_subtitles(media_item: Dict[str, Any]) -> Dict[str, Any]:
    """
    Get detailed information about which file versions have subtitles.

    Args:
        media_item: Media item dictionary with 'files' array

    Returns:
        Dict with:
        - burned_in_labels: list of resolution labels with burned-in subtitles
        - vtt_labels: list of resolution labels with VTT subtitle files
    """
    burned_in_labels = []
    vtt_labels = []

    for file in _iter_files(media_item):
        # Labels are joined into a string later, so coerce to str and fall
        # back to 'unknown' when the label is missing or null.
        label = str(file.get('label') or 'unknown')
        if file.get('subtitled') is True:
            burned_in_labels.append(label)
        if _vtt_url(file):
            vtt_labels.append(label)

    return {
        'burned_in_labels': burned_in_labels,
        'vtt_labels': vtt_labels,
    }


def get_jw_org_url(natural_key: str, language: str = "E") -> str:
    """
    Generate the JW.org video URL for a given natural key.

    Args:
        natural_key: The languageAgnosticNaturalKey (e.g., 'pub-osg_108_VIDEO')
        language: Language code (e.g., 'E' for English)

    Returns:
        Full JW.org URL for the video
    """
    return f"https://www.jw.org/finder?srcid=share&wtlocale={language}&lank={natural_key}"


def get_result_file_path(output_file: str) -> str:
    """
    Derive the summary file path that accompanies a CSV output path.

    A trailing '.csv' extension (any letter case) is dropped and '.result.txt'
    is appended. The returned path therefore always differs from output_file,
    so writing the summary can never overwrite the CSV.

    Args:
        output_file: Path of the CSV output file

    Returns:
        Path of the summary text file
    """
    suffix = '.csv'
    if output_file.lower().endswith(suffix):
        base = output_file[:-len(suffix)]
    else:
        base = output_file
    return f"{base}.result.txt"


def download_vod_categories(language: str) -> Dict[str, Any]:
    """
    Download the VideoOnDemand metadata to get list of categories.

    Args:
        language: Language code (e.g., 'E' for English)

    Returns:
        VOD data dictionary with category information
    """
    url = f"https://b.jw-cdn.org/apis/mediator/v1/categories/{language}/VideoOnDemand?detailed=1&mediaLimit=0&clientType=www"
    return fetch_json(url)


def download_category(category_key: str, language: str) -> Dict[str, Any]:
    """
    Download detailed category data from JW.org API.

    Args:
        category_key: Category key (e.g., 'VODStudio')
        language: Language code (e.g., 'E' for English)

    Returns:
        Category data dictionary with media items
    """
    url = f"https://b.jw-cdn.org/apis/mediator/v1/categories/{language}/{category_key}?detailed=1&clientType=www"
    return fetch_json(url)


def collect_all_media(language: str) -> Dict[str, Dict[str, Any]]:
    """
    Collect all media items from all VOD categories.

    Uses the same approach as the Search UI app (jw_api.py).

    Items are keyed by languageAgnosticNaturalKey; if the same key appears in
    several subcategories, the last occurrence wins. A category whose download
    or processing fails is reported and skipped.

    Args:
        language: Language code

    Returns:
        Dictionary mapping naturalKey -> media_item (with category context)
    """
    print(f"\n=== Collecting all media items for language: {language} ===\n")

    # Get the main VOD listing
    vod_data = download_vod_categories(language)

    # Get list of top-level categories (missing or null values count as empty)
    categories = (vod_data.get('category') or {}).get('subcategories') or []
    print(f"Found {len(categories)} top-level categories\n")

    all_media: Dict[str, Dict[str, Any]] = {}

    for category in categories:
        category_key = category.get('key', '')
        category_name = category.get('name', '')
        print(f"Processing category: {category_name} ({category_key})")

        try:
            category_data = download_category(category_key, language)
            subcategories = (category_data.get('category') or {}).get('subcategories') or []

            for subcategory in subcategories:
                subcategory_name = subcategory.get('name', '')

                for media_item in subcategory.get('media') or []:
                    natural_key = media_item.get('languageAgnosticNaturalKey', '')
                    if not natural_key:
                        continue

                    # Add category context (same as combine_media_info in jw_api.py)
                    all_media[natural_key] = {
                        **media_item,
                        '_category': category_name,
                        '_category_key': category_key,
                        '_subcategory': subcategory_name,
                    }

            print(f" -> Found {len(subcategories)} subcategories")
        except Exception as e:
            # Deliberate best-effort: one broken category must not abort the
            # whole scan, so report the problem and move on to the next one.
            print(f" -> Error: {e}")
            continue

    print(f"\n=== Total unique media items collected: {len(all_media)} ===\n")
    return all_media


def _yes_no(flag: bool) -> str:
    """Render a boolean as the 'Yes'/'No' text used in the CSV report."""
    return 'Yes' if flag else 'No'


def analyze_subtitles(all_media: Dict[str, Dict[str, Any]], language: str) -> List[Dict[str, Any]]:
    """
    Analyze all media items for subtitle availability.

    Checks for TWO types of subtitles:
    1. VTT files (files[].subtitles.url) - Downloadable subtitle files
    2. Burned-in (files[].subtitled=true) - Video versions with embedded subtitles

    Args:
        all_media: Dictionary of all media items
        language: Language code

    Returns:
        List of dictionaries with analysis results, one per media item, with
        the keys listed in CSV_FIELDNAMES
    """
    results = []

    for natural_key, media_item in all_media.items():
        # Check for VTT subtitle file
        subtitle_url = find_subtitles(media_item)
        has_vtt = subtitle_url is not None

        # Check for burned-in subtitle versions
        has_burned_in = has_burned_in_subtitles(media_item)

        # Get detailed info about which resolutions have subtitles
        subtitle_info = get_file_labels_with_subtitles(media_item)

        results.append({
            'natural_key': natural_key,
            # title/category are used as sort keys by main(); a null value
            # would make that sort fail, so normalise to an empty string.
            'title': media_item.get('title') or '',
            'category': media_item.get('_category') or '',
            'subcategory': media_item.get('_subcategory', ''),
            'has_any_subtitles': _yes_no(has_vtt or has_burned_in),
            'has_vtt_subtitles': _yes_no(has_vtt),
            'has_burned_in_subtitles': _yes_no(has_burned_in),
            'vtt_url': subtitle_url or '',
            'vtt_resolutions': ', '.join(subtitle_info['vtt_labels']),
            'burned_in_resolutions': ', '.join(subtitle_info['burned_in_labels']),
            'duration_seconds': media_item.get('duration', 0),
            'duration_formatted': media_item.get('durationFormattedHHMM', ''),
            'first_published': media_item.get('firstPublished', ''),
            'jw_org_url': get_jw_org_url(natural_key, language),
        })

    return results


def write_csv(results: list, output_file: str) -> None:
    """
    Write results to CSV file.

    Nothing is written (and a notice is printed) when results is empty.

    Args:
        results: List of result dictionaries
        output_file: Path to output CSV file
    """
    if not results:
        print("No results to write!")
        return

    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=CSV_FIELDNAMES)
        writer.writeheader()
        writer.writerows(results)

    print(f"Results written to: {output_file}")


def print_summary(results: list) -> str:
    """
    Print and return a summary of the results.

    Args:
        results: List of result dictionaries

    Returns:
        Summary string
    """
    total = len(results)

    def pct(n: int) -> str:
        """Format n as a percentage of all analyzed videos."""
        return f"{100*n/total:.1f}%" if total > 0 else "0%"

    def has_vtt(r: Dict[str, Any]) -> bool:
        return r['has_vtt_subtitles'] == 'Yes'

    def has_burned_in(r: Dict[str, Any]) -> bool:
        return r['has_burned_in_subtitles'] == 'Yes'

    # Count different subtitle types
    with_any = sum(1 for r in results if r['has_any_subtitles'] == 'Yes')
    with_vtt = sum(1 for r in results if has_vtt(r))
    with_burned_in = sum(1 for r in results if has_burned_in(r))
    with_both = sum(1 for r in results if has_vtt(r) and has_burned_in(r))
    vtt_only = sum(1 for r in results if has_vtt(r) and r['has_burned_in_subtitles'] == 'No')
    burned_in_only = sum(1 for r in results if r['has_vtt_subtitles'] == 'No' and has_burned_in(r))
    without_any = total - with_any

    # Count by category
    categories: Dict[str, Dict[str, int]] = {}
    for r in results:
        counts = categories.setdefault(
            r['category'],
            {'total': 0, 'with_any': 0, 'with_vtt': 0, 'with_burned_in': 0},
        )
        counts['total'] += 1
        if r['has_any_subtitles'] == 'Yes':
            counts['with_any'] += 1
        if has_vtt(r):
            counts['with_vtt'] += 1
        if has_burned_in(r):
            counts['with_burned_in'] += 1

    lines = [
        "",
        "=" * 70,
        "SUBTITLE AVAILABILITY SUMMARY",
        "=" * 70,
        f"Total videos analyzed: {total}",
        "",
        "OVERALL SUBTITLE AVAILABILITY:",
        "-" * 40,
        f" Videos WITH any subtitles: {with_any:5d} ({pct(with_any)})",
        f" Videos WITHOUT any subtitles: {without_any:5d} ({pct(without_any)})",
        "",
        "SUBTITLE TYPE BREAKDOWN:",
        "-" * 40,
        f" VTT file subtitles: {with_vtt:5d} ({pct(with_vtt)})",
        f" Burned-in subtitles: {with_burned_in:5d} ({pct(with_burned_in)})",
        f" Both types available: {with_both:5d} ({pct(with_both)})",
        f" VTT only (no burned-in): {vtt_only:5d} ({pct(vtt_only)})",
        f" Burned-in only (no VTT): {burned_in_only:5d} ({pct(burned_in_only)})",
        "",
        "BY CATEGORY (any subtitles / VTT / burned-in):",
        "-" * 70,
    ]

    for cat, counts in sorted(categories.items()):
        pct_any = 100 * counts['with_any'] / counts['total'] if counts['total'] > 0 else 0
        lines.append(f" {cat}:")
        lines.append(f" Any: {counts['with_any']}/{counts['total']} ({pct_any:.0f}%) | VTT: {counts['with_vtt']} | Burned-in: {counts['with_burned_in']}")

    lines.append("=" * 70)

    summary = "\n".join(lines)
    print(summary)
    return summary


def main():
    """
    Command-line entry point.

    Downloads the media catalogue for the requested language, analyzes subtitle
    availability, writes the CSV report, prints the summary and stores the
    summary in a '.result.txt' file next to the CSV.
    """
    parser = argparse.ArgumentParser(
        description='Check which JW.org videos have subtitles available for download'
    )
    parser.add_argument(
        '--language', '-l',
        default='E',
        help='Language code (default: E for English)'
    )
    parser.add_argument(
        '--output', '-o',
        default=None,
        help='Output CSV file path (default: subtitle-detection-{language}-{timestamp}.csv)'
    )
    args = parser.parse_args()

    language = args.language
    timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')

    # Default output file in current working directory
    output_file = args.output or f'subtitle-detection-{language}-{timestamp}.csv'

    print(f"\n{'='*60}")
    print("JW.ORG SUBTITLE DETECTION SCRIPT")
    print(f"{'='*60}")
    print(f"Language: {language}")
    print(f"Output file: {output_file}")
    print(f"{'='*60}\n")

    # Collect all media items
    all_media = collect_all_media(language)

    # Analyze for subtitles
    print("Analyzing subtitle availability...")
    results = analyze_subtitles(all_media, language)

    # Sort results by category, then title
    results.sort(key=lambda x: (x['category'], x['title']))

    # Write CSV
    write_csv(results, output_file)

    # Print summary
    summary = print_summary(results)

    # Write summary to result file. The path is derived so that it can never
    # equal output_file, even when --output does not end in '.csv'.
    result_file = get_result_file_path(output_file)
    with open(result_file, 'w', encoding='utf-8') as f:
        f.write("Subtitle Detection Results\n")
        f.write(f"Generated: {datetime.now().isoformat()}\n")
        f.write(f"Language: {language}\n")
        f.write(f"CSV Output: {output_file}\n")
        f.write(summary)

    print(f"\nSummary written to: {result_file}")
    print("\nDone!")


if __name__ == '__main__':
    main()