# Youtube-sum / utils / statistics.py
# simplytaps: "Upload folder using huggingface_hub" (commit bf00853, verified)
"""Transcript statistics calculator module."""
import re
from datetime import timedelta
from typing import Dict, Any
class TranscriptStatistics:
    """Calculate and manage transcript statistics.

    The class is used purely as a namespace: every method is a class or static
    method and no per-instance state is kept.
    """

    # Average reading speed (words per minute)
    READING_SPEED_WPM = 200

    # A run of terminators ("...", "?!") counts as a single sentence break.
    _SENTENCE_END_RE = re.compile(r'[.!?]+')
    # A blank line separates paragraphs; the blank line may hold stray
    # whitespace or the "\r" of a Windows line ending.
    _PARAGRAPH_BREAK_RE = re.compile(r'\n\s*\n')
    # Runs of word characters, so punctuation does not inflate word length.
    _WORD_RE = re.compile(r'\b\w+\b')

    @classmethod
    def calculate(cls, text: str, video_duration_seconds: float = None) -> Dict[str, Any]:
        """
        Calculate comprehensive statistics for a transcript.

        Args:
            text: The transcript text. ``None`` or an empty string yields the
                zeroed statistics from ``_empty_stats``.
            video_duration_seconds: Optional video duration in seconds. It is
                always echoed back unchanged, but the formatted duration and
                the speaking rate are only derived from a finite, strictly
                positive value.

        Returns:
            Dictionary with the keys ``word_count``, ``character_count``,
            ``character_count_no_spaces``, ``paragraph_count``,
            ``sentence_count``, ``reading_time_minutes``,
            ``reading_time_formatted``, ``video_duration_seconds``,
            ``video_duration_formatted``, ``speaking_rate_wpm``,
            ``average_word_length`` and ``average_sentence_length``.
        """
        # The chained comparison is False for NaN, so NaN, infinity, zero and
        # negative durations all leave the derived duration fields as None
        # instead of producing a nonsensical string or raising inside int().
        has_duration = (
            video_duration_seconds is not None
            and 0 < video_duration_seconds < float('inf')
        )
        duration_formatted = (
            cls._format_duration(video_duration_seconds) if has_duration else None
        )

        if not text:
            # Keep the caller-supplied duration even when there is no text.
            stats = cls._empty_stats()
            stats['video_duration_seconds'] = video_duration_seconds
            stats['video_duration_formatted'] = duration_formatted
            return stats

        # Basic counts
        word_count = cls._count_words(text)
        char_count = len(text)
        # str.split() with no separator drops every kind of whitespace
        # (spaces, tabs, newlines), not just the ' ' character.
        char_count_no_spaces = len(''.join(text.split()))
        paragraph_count = cls._count_paragraphs(text)
        sentence_count = cls._count_sentences(text)

        # Time calculations
        reading_time_minutes = cls._calculate_reading_time(word_count)

        # Speaking rate (only when a usable video duration was provided)
        speaking_rate = None
        if has_duration:
            speaking_rate = cls._calculate_speaking_rate(
                word_count,
                video_duration_seconds,
            )

        if sentence_count > 0:
            average_sentence_length = round(word_count / sentence_count, 1)
        else:
            average_sentence_length = 0

        return {
            'word_count': word_count,
            'character_count': char_count,
            'character_count_no_spaces': char_count_no_spaces,
            'paragraph_count': paragraph_count,
            'sentence_count': sentence_count,
            'reading_time_minutes': reading_time_minutes,
            'reading_time_formatted': cls._format_time(reading_time_minutes),
            'video_duration_seconds': video_duration_seconds,
            'video_duration_formatted': duration_formatted,
            'speaking_rate_wpm': speaking_rate,
            'average_word_length': cls._average_word_length(text, word_count),
            'average_sentence_length': average_sentence_length,
        }

    @staticmethod
    def _count_words(text: str) -> int:
        """Count whitespace-separated tokens in text."""
        # split() with no separator never yields empty strings.
        return len(text.split())

    @staticmethod
    def _count_paragraphs(text: str) -> int:
        """Count paragraphs (blocks of text separated by a blank line).

        Returns 0 for whitespace-only text, otherwise at least 1.
        """
        if not text.strip():
            return 0
        paragraphs = [
            p for p in TranscriptStatistics._PARAGRAPH_BREAK_RE.split(text)
            if p.strip()
        ]
        return max(1, len(paragraphs))

    @staticmethod
    def _count_sentences(text: str) -> int:
        """Count sentences (ending with . ! ?).

        Returns 0 for whitespace-only text, matching ``_count_paragraphs``.
        Any other text counts as at least one sentence, even when it has no
        terminator.
        """
        if not text.strip():
            return 0
        sentences = [
            s for s in TranscriptStatistics._SENTENCE_END_RE.split(text)
            if s.strip()
        ]
        return max(1, len(sentences))

    @classmethod
    def _calculate_reading_time(cls, word_count: int) -> float:
        """Calculate reading time in minutes, rounded to one decimal place."""
        return round(word_count / cls.READING_SPEED_WPM, 1)

    @classmethod
    def _calculate_speaking_rate(cls, word_count: int, duration_seconds: float) -> int:
        """Calculate speaking rate in words per minute.

        Returns 0 when the duration is zero or negative.
        """
        if duration_seconds <= 0:
            return 0
        duration_minutes = duration_seconds / 60
        return round(word_count / duration_minutes)

    @staticmethod
    def _format_time(minutes: float) -> str:
        """Format time in minutes to human-readable string.

        Under one minute gives "N seconds", under one hour gives "N.N minutes",
        and anything longer gives "Hh Mm".
        """
        if minutes < 1:
            seconds = int(minutes * 60)
            return f"{seconds} seconds"
        if minutes < 60:
            return f"{minutes:.1f} minutes"
        hours = int(minutes // 60)
        remaining_minutes = int(minutes % 60)
        return f"{hours}h {remaining_minutes}m"

    @staticmethod
    def _format_duration(seconds: float) -> str:
        """Format duration in seconds to MM:SS, or HH:MM:SS from one hour up.

        Returns "N/A" for ``None``. Fractional seconds are truncated and
        negative values are clamped to zero.
        """
        if seconds is None:
            return "N/A"
        # Clamp first: divmod on a negative total would yield a negative hour
        # and a wrapped-around minute/second pair.
        total_seconds = max(0, int(seconds))
        hours, remainder = divmod(total_seconds, 3600)
        minutes, secs = divmod(remainder, 60)
        if hours > 0:
            return f"{hours:02d}:{minutes:02d}:{secs:02d}"
        return f"{minutes:02d}:{secs:02d}"

    @staticmethod
    def _average_word_length(text: str, word_count: int) -> float:
        """Calculate average word length, rounded to one decimal place.

        ``word_count`` is only used as an early-exit check. The average itself
        is taken over runs of word characters so that punctuation attached to
        a word is not measured. Returns 0 when there are no words.
        """
        if word_count == 0:
            return 0
        words = TranscriptStatistics._WORD_RE.findall(text)
        if not words:
            return 0
        total_chars = sum(len(w) for w in words)
        return round(total_chars / len(words), 1)

    @staticmethod
    def _empty_stats() -> Dict[str, Any]:
        """Return a fresh statistics dictionary for a transcript with no text."""
        return {
            'word_count': 0,
            'character_count': 0,
            'character_count_no_spaces': 0,
            'paragraph_count': 0,
            'sentence_count': 0,
            'reading_time_minutes': 0,
            'reading_time_formatted': '0 seconds',
            'video_duration_seconds': None,
            'video_duration_formatted': None,
            'speaking_rate_wpm': None,
            'average_word_length': 0,
            'average_sentence_length': 0,
        }
def calculate_transcript_stats(text: str, video_duration_seconds: float = None) -> Dict[str, Any]:
    """Compute transcript statistics through a module-level shortcut.

    Forwards both arguments unchanged to ``TranscriptStatistics.calculate``
    and returns the dictionary it produces.
    """
    stats = TranscriptStatistics.calculate(
        text,
        video_duration_seconds=video_duration_seconds,
    )
    return stats
if __name__ == "__main__":
    # Manual smoke test: analyse a short sample transcript and print a summary.
    # The literal's continuation lines stay at column 0 so the text passed to
    # the calculator carries no extra leading whitespace.
    sample_text = """
Welcome everyone to this video. Today we are going to discuss the importance of
artificial intelligence in modern software development.
AI has become an integral part of our daily lives. From voice assistants to
recommendation systems, we interact with AI on a regular basis.
In this video, we will explore how developers can leverage AI tools to improve
their productivity and create better software solutions.
"""
    stats = calculate_transcript_stats(sample_text, video_duration_seconds=180)

    # Assemble the whole report first, then emit it with a single print call.
    report_lines = [
        "Transcript Statistics:",
        f" Word Count: {stats['word_count']}",
        f" Character Count: {stats['character_count']}",
        f" Reading Time: {stats['reading_time_formatted']}",
        f" Video Duration: {stats['video_duration_formatted']}",
        f" Speaking Rate: {stats['speaking_rate_wpm']} WPM",
        f" Paragraphs: {stats['paragraph_count']}",
        f" Sentences: {stats['sentence_count']}",
    ]
    print("\n".join(report_lines))