File size: 2,776 Bytes
60be371
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""
Filler Words Detection Service
Deteksi kata-kata pengisi (ehm, anu, itu, dll)
"""

import re
from typing import Dict, List


class FillerWordsService:
    """Detect filler words in a transcript.

    The transcript is lower-cased and split on whitespace; each token has its
    punctuation stripped and is then compared against ``FILLER_WORDS``.
    Matching is per single token, so multi-word fillers are not detected.
    """

    # Indonesian filler words, grouped by kind. Kept as a list so existing
    # callers that index or extend it keep working.
    FILLER_WORDS = [
        # Filler sounds
        'eh', 'ehm', 'em', 'aa', 'ah', 'mm', 'hmm', 'uhh', 'umm',

        # Common filler words
        'anu', 'ini', 'itu', 'gitu', 'kayak', 'seperti',

        # Hesitation particles
        'ya', 'kan', 'sih', 'deh', 'lah',

        # Repetitive connectors
        'jadi', 'terus', 'nah', 'yaudah', 'gimana'
    ]

    # Removes every character that is neither a word character nor whitespace.
    _PUNCTUATION_RE = re.compile(r'[^\w\s]')

    # Upper bound on the number of example occurrences returned by detect().
    _MAX_POSITIONS = 5

    def __init__(self):
        """Announce on stdout that the service is ready."""
        print("🗣️ Initializing Filler Words Service")
        print(f"📝 Monitoring {len(self.FILLER_WORDS)} filler words")
        print("✅ Filler Words Service ready!\n")

    def detect(self, transcript: str) -> Dict:
        """Find filler words in ``transcript``.

        Args:
            transcript: Transcript text. ``None``, empty, or whitespace-only
                input yields an all-zero result.

        Returns:
            A dict with the keys:

            * ``has_filler``: ``True`` when at least one filler was found.
            * ``count``: total number of filler occurrences.
            * ``ratio``: ``count / total_words`` rounded to 3 decimals.
            * ``words_found``: the distinct fillers found, in order of first
              appearance.
            * ``total_words``: number of whitespace-separated tokens.
            * ``positions``: up to 5 example occurrences, each a dict with
              ``word``, ``position`` (token index) and ``context`` (the
              lower-cased tokens from two before to two after the match).
        """
        print("🔍 Detecting filler words...")

        if not transcript or not transcript.strip():
            return {
                'has_filler': False,
                'count': 0,
                'ratio': 0.0,
                'words_found': [],
                'total_words': 0,
                'positions': []
            }

        words = transcript.lower().split()
        total_words = len(words)

        # Build the lookup set once per call: O(1) membership tests while
        # still honouring any override or mutation of FILLER_WORDS.
        fillers = frozenset(self.FILLER_WORDS)

        filler_found = []
        filler_positions = []

        for i, word in enumerate(words):
            # Strip punctuation so that "ehm," still matches "ehm".
            clean_word = self._PUNCTUATION_RE.sub('', word)
            if clean_word not in fillers:
                continue

            filler_found.append(clean_word)

            # Only the first few occurrences are reported, so skip building
            # context strings for the rest.
            if len(filler_positions) < self._MAX_POSITIONS:
                filler_positions.append({
                    'word': clean_word,
                    'position': i,
                    'context': ' '.join(words[max(0, i - 2):i + 3])
                })

        filler_count = len(filler_found)

        # total_words is at least 1 here: blank input returned early above.
        filler_ratio = filler_count / total_words

        print(f"✅ Found {filler_count} filler words\n")

        return {
            'has_filler': filler_count > 0,
            'count': filler_count,
            'ratio': round(filler_ratio, 3),
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the result is stable across runs (a set's order is not).
            'words_found': list(dict.fromkeys(filler_found)),
            'total_words': total_words,
            'positions': filler_positions
        }