Spaces:
Sleeping
Sleeping
| """Transcript statistics calculator module.""" | |
| from typing import Dict, Any | |
| from datetime import timedelta | |
| class TranscriptStatistics: | |
| """Calculate and manage transcript statistics.""" | |
| # Average reading speed (words per minute) | |
| READING_SPEED_WPM = 200 | |
| def calculate(cls, text: str, video_duration_seconds: float = None) -> Dict[str, Any]: | |
| """ | |
| Calculate comprehensive statistics for a transcript. | |
| Args: | |
| text: The transcript text | |
| video_duration_seconds: Optional video duration in seconds | |
| Returns: | |
| Dictionary containing all statistics | |
| """ | |
| if not text: | |
| return cls._empty_stats() | |
| # Basic counts | |
| word_count = cls._count_words(text) | |
| char_count = len(text) | |
| char_count_no_spaces = len(text.replace(' ', '')) | |
| paragraph_count = cls._count_paragraphs(text) | |
| sentence_count = cls._count_sentences(text) | |
| # Time calculations | |
| reading_time_minutes = cls._calculate_reading_time(word_count) | |
| # Speaking rate (if video duration provided) | |
| speaking_rate = None | |
| if video_duration_seconds and video_duration_seconds > 0: | |
| speaking_rate = cls._calculate_speaking_rate( | |
| word_count, | |
| video_duration_seconds | |
| ) | |
| return { | |
| 'word_count': word_count, | |
| 'character_count': char_count, | |
| 'character_count_no_spaces': char_count_no_spaces, | |
| 'paragraph_count': paragraph_count, | |
| 'sentence_count': sentence_count, | |
| 'reading_time_minutes': reading_time_minutes, | |
| 'reading_time_formatted': cls._format_time(reading_time_minutes), | |
| 'video_duration_seconds': video_duration_seconds, | |
| 'video_duration_formatted': cls._format_duration(video_duration_seconds) if video_duration_seconds else None, | |
| 'speaking_rate_wpm': speaking_rate, | |
| 'average_word_length': cls._average_word_length(text, word_count), | |
| 'average_sentence_length': round(word_count / sentence_count, 1) if sentence_count > 0 else 0, | |
| } | |
| def _count_words(text: str) -> int: | |
| """Count words in text.""" | |
| # Split by whitespace and filter empty strings | |
| words = [w for w in text.split() if w] | |
| return len(words) | |
| def _count_paragraphs(text: str) -> int: | |
| """Count paragraphs (separated by double newlines).""" | |
| if not text.strip(): | |
| return 0 | |
| # Split by double newlines and filter empty | |
| paragraphs = [p for p in text.split('\n\n') if p.strip()] | |
| return max(1, len(paragraphs)) | |
| def _count_sentences(text: str) -> int: | |
| """Count sentences (ending with . ! ?).""" | |
| import re | |
| # Match sentence endings | |
| sentences = re.split(r'[.!?]+', text) | |
| # Filter out empty strings | |
| sentences = [s for s in sentences if s.strip()] | |
| return max(1, len(sentences)) | |
| def _calculate_reading_time(cls, word_count: int) -> float: | |
| """Calculate reading time in minutes.""" | |
| return round(word_count / cls.READING_SPEED_WPM, 1) | |
| def _calculate_speaking_rate(cls, word_count: int, duration_seconds: float) -> int: | |
| """Calculate speaking rate in words per minute.""" | |
| if duration_seconds <= 0: | |
| return 0 | |
| duration_minutes = duration_seconds / 60 | |
| return round(word_count / duration_minutes) | |
| def _format_time(minutes: float) -> str: | |
| """Format time in minutes to human-readable string.""" | |
| if minutes < 1: | |
| seconds = int(minutes * 60) | |
| return f"{seconds} seconds" | |
| elif minutes < 60: | |
| return f"{minutes:.1f} minutes" | |
| else: | |
| hours = int(minutes // 60) | |
| remaining_minutes = int(minutes % 60) | |
| return f"{hours}h {remaining_minutes}m" | |
| def _format_duration(seconds: float) -> str: | |
| """Format duration in seconds to HH:MM:SS format.""" | |
| if seconds is None: | |
| return "N/A" | |
| td = timedelta(seconds=int(seconds)) | |
| total_seconds = int(td.total_seconds()) | |
| hours, remainder = divmod(total_seconds, 3600) | |
| minutes, secs = divmod(remainder, 60) | |
| if hours > 0: | |
| return f"{hours:02d}:{minutes:02d}:{secs:02d}" | |
| else: | |
| return f"{minutes:02d}:{secs:02d}" | |
| def _average_word_length(text: str, word_count: int) -> float: | |
| """Calculate average word length.""" | |
| if word_count == 0: | |
| return 0 | |
| # Remove punctuation for accurate measurement | |
| import re | |
| words = re.findall(r'\b\w+\b', text) | |
| if not words: | |
| return 0 | |
| total_chars = sum(len(w) for w in words) | |
| return round(total_chars / len(words), 1) | |
| def _empty_stats() -> Dict[str, Any]: | |
| """Return empty statistics dictionary.""" | |
| return { | |
| 'word_count': 0, | |
| 'character_count': 0, | |
| 'character_count_no_spaces': 0, | |
| 'paragraph_count': 0, | |
| 'sentence_count': 0, | |
| 'reading_time_minutes': 0, | |
| 'reading_time_formatted': '0 seconds', | |
| 'video_duration_seconds': None, | |
| 'video_duration_formatted': None, | |
| 'speaking_rate_wpm': None, | |
| 'average_word_length': 0, | |
| 'average_sentence_length': 0, | |
| } | |
| def calculate_transcript_stats(text: str, video_duration_seconds: float = None) -> Dict[str, Any]: | |
| """Convenience function to calculate transcript statistics.""" | |
| return TranscriptStatistics.calculate(text, video_duration_seconds) | |
| if __name__ == "__main__": | |
| # Test the statistics calculator | |
| sample_text = """ | |
| Welcome everyone to this video. Today we are going to discuss the importance of | |
| artificial intelligence in modern software development. | |
| AI has become an integral part of our daily lives. From voice assistants to | |
| recommendation systems, we interact with AI on a regular basis. | |
| In this video, we will explore how developers can leverage AI tools to improve | |
| their productivity and create better software solutions. | |
| """ | |
| stats = calculate_transcript_stats(sample_text, video_duration_seconds=180) | |
| print("Transcript Statistics:") | |
| print(f" Word Count: {stats['word_count']}") | |
| print(f" Character Count: {stats['character_count']}") | |
| print(f" Reading Time: {stats['reading_time_formatted']}") | |
| print(f" Video Duration: {stats['video_duration_formatted']}") | |
| print(f" Speaking Rate: {stats['speaking_rate_wpm']} WPM") | |
| print(f" Paragraphs: {stats['paragraph_count']}") | |
| print(f" Sentences: {stats['sentence_count']}") | |