File size: 7,242 Bytes
bf00853
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
"""Transcript statistics calculator module."""
import re
from datetime import timedelta
from typing import Any, Dict, Optional


class TranscriptStatistics:
    """Calculate descriptive statistics for a transcript.

    The class holds no instance state: every method is a class or static
    method, so it is used as a namespace.  :meth:`calculate` is the public
    entry point; the underscore-prefixed methods are helpers.
    """

    # Average reading speed (words per minute)
    READING_SPEED_WPM = 200

    # Paragraphs are separated by one or more blank lines.  ``\s*`` lets a
    # "blank" line contain spaces/tabs and also absorbs the ``\r`` of CRLF
    # line endings.
    _PARAGRAPH_BREAK_RE = re.compile(r'\n\s*\n')

    # A sentence ends at a run of . ! ? (optionally followed by closing
    # quotes/brackets) that is followed by whitespace or the end of the text.
    # Requiring the lookahead keeps decimals such as "3.14" in one sentence.
    # Abbreviations followed by a space (e.g. "etc. ") still count as an end.
    _SENTENCE_END_RE = re.compile(r"""[.!?]+["')\]]*(?=\s|$)""")

    # Runs of word characters; punctuation is excluded from word lengths.
    _WORD_RE = re.compile(r'\w+')

    @classmethod
    def calculate(cls, text: str, video_duration_seconds: Optional[float] = None) -> Dict[str, Any]:
        """Calculate comprehensive statistics for a transcript.

        Args:
            text: The transcript text.  Empty, ``None`` or whitespace-only
                text yields the all-zero result of :meth:`_empty_stats`.
            video_duration_seconds: Optional video duration in seconds.  Only
                a positive, finite value is used for the speaking rate and
                the formatted duration; any other value leaves both ``None``.

        Returns:
            Dictionary with the keys ``word_count``, ``character_count``,
            ``character_count_no_spaces``, ``paragraph_count``,
            ``sentence_count``, ``reading_time_minutes``,
            ``reading_time_formatted``, ``video_duration_seconds`` (the
            argument echoed back), ``video_duration_formatted``,
            ``speaking_rate_wpm``, ``average_word_length`` and
            ``average_sentence_length``.
        """
        # Whitespace-only text has no words, so treat it like empty text
        # instead of reporting one sentence with zero words.
        if not text or not text.strip():
            return cls._empty_stats()

        # Basic counts
        word_count = cls._count_words(text)
        char_count = len(text)
        # Only the space character is removed; tabs and newlines still count.
        char_count_no_spaces = len(text.replace(' ', ''))
        paragraph_count = cls._count_paragraphs(text)
        sentence_count = cls._count_sentences(text)

        # Time calculations.  The human-readable string is built from the
        # exact value: formatting the 0.1-minute rounded figure would be off
        # by up to several seconds (and show "0 seconds" for short texts).
        reading_time_minutes = cls._calculate_reading_time(word_count)
        exact_reading_minutes = word_count / cls.READING_SPEED_WPM

        # Speaking rate and formatted duration need a usable duration.  The
        # chained comparison is False for negatives, zero, NaN and infinity.
        has_duration = (
            video_duration_seconds is not None
            and 0 < video_duration_seconds < float("inf")
        )
        speaking_rate = None
        duration_formatted = None
        if has_duration:
            speaking_rate = cls._calculate_speaking_rate(word_count, video_duration_seconds)
            duration_formatted = cls._format_duration(video_duration_seconds)

        return {
            'word_count': word_count,
            'character_count': char_count,
            'character_count_no_spaces': char_count_no_spaces,
            'paragraph_count': paragraph_count,
            'sentence_count': sentence_count,
            'reading_time_minutes': reading_time_minutes,
            'reading_time_formatted': cls._format_time(exact_reading_minutes),
            'video_duration_seconds': video_duration_seconds,
            'video_duration_formatted': duration_formatted,
            'speaking_rate_wpm': speaking_rate,
            'average_word_length': cls._average_word_length(text, word_count),
            'average_sentence_length': round(word_count / sentence_count, 1) if sentence_count > 0 else 0,
        }

    @staticmethod
    def _count_words(text: str) -> int:
        """Count whitespace-separated tokens in ``text``."""
        # str.split() with no argument never yields empty strings.
        return len(text.split())

    @classmethod
    def _count_paragraphs(cls, text: str) -> int:
        """Count paragraphs, i.e. non-blank chunks separated by blank lines.

        Returns 0 for empty or whitespace-only text.
        """
        if not text.strip():
            return 0
        chunks = cls._PARAGRAPH_BREAK_RE.split(text)
        # Separators are pure whitespace, so non-blank text always leaves at
        # least one non-blank chunk.
        return len([chunk for chunk in chunks if chunk.strip()])

    @classmethod
    def _count_sentences(cls, text: str) -> int:
        """Count sentences (ending with . ! ?).

        Returns 0 for empty or whitespace-only text, otherwise at least 1
        (text without any terminator is counted as a single sentence).
        """
        if not text.strip():
            return 0
        fragments = cls._SENTENCE_END_RE.split(text)
        return max(1, len([frag for frag in fragments if frag.strip()]))

    @classmethod
    def _calculate_reading_time(cls, word_count: int) -> float:
        """Return the reading time in minutes, rounded to one decimal."""
        return round(word_count / cls.READING_SPEED_WPM, 1)

    @classmethod
    def _calculate_speaking_rate(cls, word_count: int, duration_seconds: float) -> int:
        """Return the speaking rate in words per minute (0 if no duration)."""
        if duration_seconds <= 0:
            return 0
        duration_minutes = duration_seconds / 60
        return round(word_count / duration_minutes)

    @staticmethod
    def _format_time(minutes: float) -> str:
        """Format a time in minutes as a human-readable string.

        Returns ``"N seconds"`` below one minute, ``"M.M minutes"`` below one
        hour and ``"Hh Mm"`` otherwise.
        """
        if minutes < 1:
            # round(..., 6) strips binary floating-point noise (8.9999999 ->
            # 9.0) before truncating; the cap keeps the result below a minute.
            seconds = min(int(round(minutes * 60, 6)), 59)
            return f"{seconds} seconds"
        # Compare the value as it would be displayed, so 59.97 is not shown
        # as "60.0 minutes".
        if round(minutes, 1) < 60:
            return f"{minutes:.1f} minutes"
        # max(..., 60) covers values just under 60 that round up to the hour.
        hours, remaining_minutes = divmod(max(int(minutes), 60), 60)
        return f"{hours}h {remaining_minutes}m"

    @staticmethod
    def _format_duration(seconds: Optional[float]) -> str:
        """Format a duration in seconds as ``MM:SS`` or ``HH:MM:SS``.

        Fractions of a second are truncated.  Returns ``"N/A"`` when
        ``seconds`` is ``None``, negative, NaN or infinite.
        """
        # The chained comparison is False for negatives, NaN and infinity.
        if seconds is None or not (0 <= seconds < float("inf")):
            return "N/A"

        hours, remainder = divmod(int(seconds), 3600)
        minutes, secs = divmod(remainder, 60)

        if hours > 0:
            return f"{hours:02d}:{minutes:02d}:{secs:02d}"
        return f"{minutes:02d}:{secs:02d}"

    @classmethod
    def _average_word_length(cls, text: str, word_count: int) -> float:
        """Return the mean length of the words in ``text``, to one decimal.

        Words are runs of word characters, so punctuation is not counted.
        Returns 0.0 when ``word_count`` is 0 or no such run exists.
        """
        if word_count == 0:
            return 0.0
        words = cls._WORD_RE.findall(text)
        if not words:
            return 0.0
        total_chars = sum(len(word) for word in words)
        return round(total_chars / len(words), 1)

    @staticmethod
    def _empty_stats() -> Dict[str, Any]:
        """Return the statistics dictionary used for empty input."""
        return {
            'word_count': 0,
            'character_count': 0,
            'character_count_no_spaces': 0,
            'paragraph_count': 0,
            'sentence_count': 0,
            'reading_time_minutes': 0,
            'reading_time_formatted': '0 seconds',
            'video_duration_seconds': None,
            'video_duration_formatted': None,
            'speaking_rate_wpm': None,
            'average_word_length': 0,
            'average_sentence_length': 0,
        }


def calculate_transcript_stats(text: str, video_duration_seconds: float = None) -> Dict[str, Any]:
    """Module-level shortcut for ``TranscriptStatistics.calculate``.

    Args:
        text: The transcript text.
        video_duration_seconds: Optional video duration in seconds.

    Returns:
        The statistics dictionary produced by
        ``TranscriptStatistics.calculate`` for the same arguments.
    """
    stats = TranscriptStatistics.calculate(
        text,
        video_duration_seconds=video_duration_seconds,
    )
    return stats


if __name__ == "__main__":
    # Manual smoke test: run the calculator on a short sample transcript
    # with a three-minute video duration and print a selection of the results.
    sample_text = """

    Welcome everyone to this video. Today we are going to discuss the importance of 

    artificial intelligence in modern software development.

    

    AI has become an integral part of our daily lives. From voice assistants to 

    recommendation systems, we interact with AI on a regular basis.

    

    In this video, we will explore how developers can leverage AI tools to improve 

    their productivity and create better software solutions.

    """

    stats = calculate_transcript_stats(sample_text, video_duration_seconds=180)

    # Build the whole report first, then emit it with a single print call.
    report_lines = [
        "Transcript Statistics:",
        f"  Word Count: {stats['word_count']}",
        f"  Character Count: {stats['character_count']}",
        f"  Reading Time: {stats['reading_time_formatted']}",
        f"  Video Duration: {stats['video_duration_formatted']}",
        f"  Speaking Rate: {stats['speaking_rate_wpm']} WPM",
        f"  Paragraphs: {stats['paragraph_count']}",
        f"  Sentences: {stats['sentence_count']}",
    ]
    print("\n".join(report_lines))