| |
| |
# Maps every recognised filler spelling (native script or romanised) to the
# canonical label under which it is reported. Several spellings may share one
# label. Key order is significant: it determines the order in which labels
# are first reported when the table is iterated.
FILLER_VARIANTS = {
    # English hesitation sounds
    "um": "um",
    "uh": "uh",
    "hmm": "hmm",
    "ah": "ah",
    "er": "er",
    # Elongated spellings folded into their base label
    "umm": "um",
    "uhh": "uh",
    "mmm": "hmm",
    # English discourse fillers
    "like": "like",
    "you know": "you know",
    "so": "so",
    "well": "well",
    # Hindi fillers (Devanagari and romanised spellings)
    "मतलब": "matlab",
    "matlab": "matlab",
    "क्या कहते हैं": "kya kehte hain",
    "kya kehte hain": "kya kehte hain",
    "वो ना": "wo na",
    "woh na": "wo na",
    "wo na": "wo na",
    "ऐसा है": "aisa hai",
    "aisa hai": "aisa hai",
    "हाँ": "haan",
    "haan": "haan",
    "हा": "haan",
    "अच्छा": "acha",
    "acha": "acha",
    # Tamil fillers (Tamil script and romanised spellings)
    "பாத்தீங்கனா": "paatheenga-na",
    "paatheenga na": "paatheenga-na",
    "paatheenga-na": "paatheenga-na",
    "அப்பரம்": "apparam",
    "apparam": "apparam",
    "என்ன": "enna",
    "enna": "enna",
}
|
|
def detect_fillers(transcript):
    """
    Detects filler words in the transcript.

    The lower-cased transcript is scanned once from left to right. At every
    position that starts a word, the longest entry of FILLER_VARIANTS that
    matches there as a whole word/phrase is counted under its canonical
    label. Matching whole words avoids counting "so" inside "also" or "er"
    inside "there", and preferring the longest variant avoids counting
    "umm" both as "umm" and as "um".

    Args:
        transcript: Full transcript text. An empty or None transcript
            yields no fillers.

    Returns:
        tuple: (filler_count, filler_occurrences) where filler_occurrences
        maps each canonical label to its number of occurrences. Labels
        appear in the order they are first listed in FILLER_VARIANTS.
    """
    # Function-scope import so this block does not depend on the module's
    # import section.
    import unicodedata

    if not transcript:
        return 0, {}

    def is_word_char(ch):
        # Combining marks (Unicode category M*) are treated as word
        # characters: Devanagari and Tamil vowel signs are not alphanumeric,
        # yet they belong to the word they are attached to.
        return ch.isalnum() or ch == "_" or unicodedata.category(ch).startswith("M")

    text = transcript.lower()
    length = len(text)

    # Longest variants first so that the most specific spelling wins when
    # several variants match at the same position.
    variants = sorted(FILLER_VARIANTS, key=len, reverse=True)

    counts = {}
    pos = 0
    while pos < length:
        # A filler can only begin at the start of a word.
        if pos > 0 and is_word_char(text[pos - 1]):
            pos += 1
            continue

        for variant in variants:
            end = pos + len(variant)
            if (
                variant  # an empty key would never advance the scan
                and text.startswith(variant, pos)
                and (end >= length or not is_word_char(text[end]))
            ):
                label = FILLER_VARIANTS[variant]
                counts[label] = counts.get(label, 0) + 1
                pos = end  # consume the match so it is not counted again
                break
        else:
            pos += 1

    # Report labels in table order (not order of appearance) so that callers
    # breaking ties by dictionary order see a stable, text-independent order.
    filler_occurrences = {
        label: counts[label]
        for label in FILLER_VARIANTS.values()
        if label in counts
    }
    filler_count = sum(filler_occurrences.values())

    return filler_count, filler_occurrences
|
|
def analyze_filler_words(filler_count, filler_occurrences, duration):
    """
    Analyzes filler word usage in speech.

    The score is derived from the filler rate (fillers per minute), while
    the insight text is chosen from the absolute filler count and names up
    to two of the most frequent fillers as examples.

    Args:
        filler_count: Total count of filler words
        filler_occurrences: Dictionary of specific filler words and their
            counts (may be empty or None)
        duration: Duration of the audio in seconds. When it is None, zero
            or negative, the rate is taken as 0.0 fillers per minute.

    Returns:
        dict: Contains the filler words score ("score", an int from 2 to
        10) and the insight text ("insight").
    """
    # Up to two most frequent fillers; sorted() is stable, so ties keep the
    # dictionary's own order.
    filler_examples = []
    if filler_occurrences:
        ranked = sorted(filler_occurrences.items(), key=lambda item: item[1], reverse=True)
        filler_examples = [label for label, _ in ranked[:2]]

    has_duration = duration is not None and duration > 0
    filler_per_min = (filler_count / duration) * 60.0 if has_duration else 0.0

    if filler_count == 0:
        filler_score = 10
    else:
        # (exclusive upper bound on fillers per minute, score); anything at
        # or above the last bound gets the lowest score.
        filler_score = 2
        for rate_limit, score in ((1, 9), (3, 8), (5, 6), (10, 4)):
            if filler_per_min < rate_limit:
                filler_score = score
                break

    quoted_examples = ", ".join(f"'{ex}'" for ex in filler_examples)

    if filler_count == 0:
        insight = "No filler words (um, ah, etc.) were detected, keeping the speech very clear."
    elif filler_count <= 2:
        example = filler_examples[0] if filler_examples else "um"
        insight = f"Only a couple of filler words (e.g., '{example}') were used, which had minimal impact."
    elif filler_count <= 5:
        if filler_examples:
            insight = f"Some filler words {quoted_examples} were used occasionally; reducing them could improve clarity."
        else:
            # Without examples, say "filler words" only once.
            insight = "Some filler words were used occasionally; reducing them could improve clarity."
    else:
        examples = quoted_examples if filler_examples else "'um'"
        insight = f"Frequent filler words such as {examples} were detected, which can distract the audience and suggest uncertainty."

    return {
        "score": int(filler_score),
        "insight": insight
    }