| import re |
| import whisper |
| from pydub import AudioSegment |
|
|
# Single- and multi-word English fillers searched for in the transcript.
_FILLER_WORDS = (
    "um", "uh", "hmm", "ah", "er", "eh",
    "umm", "uhh", "mmm", "ahh", "err",
    "like", "you know", "well", "so", "actually", "basically",
    "right", "okay", "sort of", "kind of",
)

# Compiled once at import time. The lookarounds require that a filler is not
# glued to another word character, so "so" does not match inside "also".
_FILLER_PATTERN = re.compile(
    r"(?<!\w)(" + "|".join(map(re.escape, _FILLER_WORDS)) + r")(?!\w)",
    re.IGNORECASE,
)

# (exclusive upper bound on fillers per minute, score) pairs, checked in order.
_SCORE_BANDS = ((1, 90), (3, 80), (5, 60), (10, 40))
# Score used when the rate is at or above the last band's bound.
_LOWEST_SCORE = 20


def _count_fillers(transcript: str) -> dict:
    """Return lower-cased filler -> occurrence count, in first-seen order."""
    from collections import Counter

    return dict(Counter(hit.lower() for hit in _FILLER_PATTERN.findall(transcript)))


def _score_for_rate(total_fillers: int, filler_per_min: float) -> int:
    """Map a filler count and per-minute rate to a score between 20 and 100."""
    if total_fillers == 0:
        return 100
    for upper_bound, score in _SCORE_BANDS:
        if filler_per_min < upper_bound:
            return score
    return _LOWEST_SCORE


def _build_insight(filler_counts: dict, total_fillers: int) -> str:
    """Build a one-sentence summary naming the (up to two) most frequent fillers."""
    if total_fillers == 0:
        return "Excellent! No filler words detected."

    # sorted() is stable, so ties keep the order in which fillers first appeared.
    top_fillers = sorted(filler_counts.items(), key=lambda kv: kv[1], reverse=True)[:2]
    if total_fillers <= 2:
        return f"Minimal fillers ({total_fillers} total), mostly '{top_fillers[0][0]}'."

    examples = ", ".join(f"'{word}'" for word, _ in top_fillers)
    if total_fillers <= 5:
        return f"Moderate fillers ({total_fillers} total), mainly {examples}."
    return f"Excessive fillers ({total_fillers} total), dominated by {examples}."


def analyze_fillers(file_path: str, model_size: str = "base", transcript=None) -> dict:
    """
    Analyze English filler words in an audio recording.

    The audio file is always opened to measure its duration. When no
    transcript is supplied, the file is transcribed with a Whisper model of
    the requested size. Fillers are matched case-insensitively and counted
    under their lower-cased form.

    Args:
        file_path: Path to the audio file.
        model_size: Whisper model name to load; only used when ``transcript``
            is ``None``.
        transcript: Optional pre-computed transcript text. When given, the
            transcription step is skipped.

    Returns:
        A dict with the keys:
            ``filler_counts``: mapping of filler -> number of occurrences.
            ``total_fillers``: sum of all occurrences.
            ``filler_score``: 100 when no fillers were found, otherwise
                90/80/60/40/20 for rates below 1/3/5/10 per minute or above.
            ``filler_rate_per_min``: fillers per minute, rounded to 1 decimal.
            ``insight``: one-sentence human-readable summary.

    Raises:
        RuntimeError: If any step fails; the original exception is attached
            as ``__cause__``.
    """
    try:
        audio = AudioSegment.from_file(file_path)
        # The length is divided by 1000 to obtain seconds (pydub reports
        # segment length in milliseconds).
        duration = len(audio) / 1000

        if transcript is None:
            model = whisper.load_model(model_size)
            result = model.transcribe(file_path, word_timestamps=False, fp16=False)
            transcript = result["text"]

        filler_counts = _count_fillers(transcript)
        total_fillers = sum(filler_counts.values())

        # A zero-length clip yields a rate of 0 instead of dividing by zero;
        # if it still has fillers it therefore lands in the "< 1 per minute" band.
        filler_per_min = (total_fillers / duration) * 60 if duration > 0 else 0.0

        return {
            "filler_counts": filler_counts,
            "total_fillers": total_fillers,
            "filler_score": _score_for_rate(total_fillers, filler_per_min),
            "filler_rate_per_min": round(filler_per_min, 1),
            "insight": _build_insight(filler_counts, total_fillers),
        }

    except Exception as e:
        # Callers rely on RuntimeError; chain the cause so the root error
        # (missing file, decoder failure, model error) stays inspectable.
        raise RuntimeError(f"Analysis failed: {e}") from e