File size: 2,776 Bytes
60be371 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
"""
Filler Words Detection Service
Detects filler words (ehm, anu, itu, etc.) in transcripts.
"""
import re
from typing import Dict, List
class FillerWordsService:
    """Detect Indonesian filler words in a transcript.

    Matching is case-insensitive and token based: the transcript is
    lowercased, split on whitespace, punctuation is stripped from each token,
    and the remainder is compared against ``FILLER_WORDS``.
    """

    # Indonesian filler words
    FILLER_WORDS = [
        # Filler sounds
        'eh', 'ehm', 'em', 'aa', 'ah', 'mm', 'hmm', 'uhh', 'umm',
        # Common filler words
        'anu', 'ini', 'itu', 'gitu', 'kayak', 'seperti',
        # Hesitation words
        'ya', 'kan', 'sih', 'deh', 'lah',
        # Repetitive words
        'jadi', 'terus', 'nah', 'yaudah', 'gimana',
    ]

    # Maximum number of occurrences reported under the 'positions' key.
    MAX_POSITIONS = 5

    # Matches any character that is neither a word character nor whitespace;
    # compiled once so it is not looked up again for every token.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(self):
        """Announce on stdout that the service is ready; holds no state."""
        # Status messages are plain ASCII so printing cannot raise
        # UnicodeEncodeError on consoles that are not UTF-8.
        print("Initializing Filler Words Service")
        print(f"Monitoring {len(self.FILLER_WORDS)} filler words")
        print("Filler Words Service ready!\n")

    def detect(self, transcript: str) -> Dict:
        """Find filler words in ``transcript``.

        Args:
            transcript: Transcript text. ``None``, an empty string, or a
                whitespace-only string yields an all-zero result.

        Returns:
            Dict with the keys:
                has_filler: True when at least one filler word was found.
                count: Total number of filler occurrences.
                ratio: ``count / total_words`` rounded to 3 decimals.
                words_found: Distinct filler words, in first-seen order.
                total_words: Number of whitespace-separated tokens.
                positions: Up to ``MAX_POSITIONS`` dicts, one per occurrence
                    in order of appearance, each holding the matched
                    ``word``, its token index ``position`` and a ``context``
                    string of up to two tokens on either side.
        """
        print("Detecting filler words...")

        if not transcript or not transcript.strip():
            return {
                'has_filler': False,
                'count': 0,
                'ratio': 0.0,
                'words_found': [],
                'total_words': 0,
                'positions': [],
            }

        words = transcript.lower().split()
        # Non-zero here: the guard above rejected blank transcripts.
        total_words = len(words)

        # Built per call (not cached on the class) so a subclass or caller
        # that overrides FILLER_WORDS is still honoured.
        fillers = set(self.FILLER_WORDS)

        filler_found = []
        filler_positions = []
        for i, word in enumerate(words):
            # Strip punctuation so tokens such as "ehm," still match.
            clean_word = self._PUNCTUATION.sub('', word)
            if clean_word not in fillers:
                continue
            filler_found.append(clean_word)
            # Only the first MAX_POSITIONS occurrences are reported, so stop
            # building context strings once that many have been collected.
            if len(filler_positions) < self.MAX_POSITIONS:
                filler_positions.append({
                    'word': clean_word,
                    'position': i,
                    # Slicing clamps at the end of the list by itself.
                    'context': ' '.join(words[max(0, i - 2):i + 3]),
                })

        filler_count = len(filler_found)
        print(f"Found {filler_count} filler words\n")

        return {
            'has_filler': filler_count > 0,
            'count': filler_count,
            'ratio': round(filler_count / total_words, 3),
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the result is the same on every run (a set's order is not).
            'words_found': list(dict.fromkeys(filler_found)),
            'total_words': total_words,
            'positions': filler_positions,
        }
|