Spaces:
Running
Running
| import re | |
def remove_brackets_content(text: str) -> str:
    """
    Remove all content inside square brackets [],
    round brackets () and curly brackets {}.

    Useful for removing sound descriptions, speaker labels,
    stage directions like [applause], (laughs), {music}.
    Bracketed cues that wrap across several lines are removed as well.
    Each match runs from an opening bracket to the nearest closing
    bracket of the same kind; nesting is not tracked.

    Args:
        text: Raw subtitle text.

    Returns:
        The text without bracketed spans, stripped of leading and
        trailing whitespace. Whitespace that surrounded a removed span
        is left in place.
    """
    # Negated character classes (unlike `.*?` without DOTALL) also match
    # newlines, so a cue split over two subtitle lines is removed as a unit.
    text = re.sub(r'\[[^\]]*\]', '', text)  # [applause]
    text = re.sub(r'\([^)]*\)', '', text)   # (laughs)
    text = re.sub(r'\{[^}]*\}', '', text)   # {music}
    return text.strip()
def remove_non_alphabetic(text: str) -> str:
    """
    Drop every character that is neither an ASCII letter nor whitespace.

    Punctuation, digits and symbols are deleted (not replaced), so the
    characters on either side of them become adjacent. The result is
    stripped of leading and trailing whitespace.
    """
    unwanted = re.compile(r'[^a-zA-Z\s]')
    letters_only = unwanted.sub('', text)
    return letters_only.strip()
def remove_newlines(text: str) -> str:
    """
    Flatten multi-line text onto a single line.

    Each line feed and carriage return becomes a space, runs of spaces
    are then squeezed to a single space, and the result is stripped.
    """
    # One translate pass swaps both line-break characters for spaces.
    flattened = text.translate({ord('\n'): ' ', ord('\r'): ' '})
    single_spaced = re.compile(r' +').sub(' ', flattened)
    return single_spaced.strip()
def remove_dialog_punctuation(text: str) -> str:
    """
    Remove dialog-specific punctuation: dashes at line start (- text),
    ellipsis (...), double dashes (--), quotation marks,
    and excessive punctuation used in subtitles.

    An ellipsis or double dash that sits directly between two word
    characters is replaced by a single space so the words are not glued
    together; everywhere else these markers are simply deleted.
    Both straight and typographic (curly) quotes are removed. Apostrophes
    count as quotes, so contractions lose theirs.

    Args:
        text: Subtitle text, possibly multi-line.

    Returns:
        The text without the punctuation listed above, stripped of
        leading and trailing whitespace.
    """
    text = re.sub(r'^\s*-+\s*', '', text, flags=re.MULTILINE)  # leading dashes
    # "wait...what" / "I--no": the marker is the only separator between the
    # two words, so it must become a space rather than vanish.
    text = re.sub(r'(?<=\w)(?:\.{2,}|-{2,})+(?=\w)', ' ', text)
    text = re.sub(r'\.{2,}', '', text)  # remaining ellipsis ...
    text = re.sub(r'-{2,}', '', text)   # remaining double dash --
    # Straight quotes plus U+201C/U+201D/U+2018/U+2019 curly quotes.
    text = re.sub(r'["\'\u201c\u201d\u2018\u2019]+', '', text)
    text = re.sub(r'[!?,;:]+', '', text)  # dialog punctuation
    return text.strip()
def remove_timestamps(text: str) -> str:
    """
    Remove SRT/VTT subtitle timestamps.

    Handles formats like:
    - 00:01:23,456 --> 00:01:25,789 (SRT)
    - 00:01:23.456 --> 00:01:25.789 (VTT)
    - 01:23.456 --> 01:25.789 (VTT with the hours field omitted)

    Also removes lines made up solely of digits (SRT sequence numbers
    and numeric VTT cue identifiers) and any WEBVTT header line.
    A dialogue line that consists only of digits is removed as well.

    Args:
        text: Raw SRT or VTT file content.

    Returns:
        The text without timing lines, numeric cue lines and the WEBVTT
        header, stripped of leading and trailing whitespace.
    """
    # The hours field is optional in WebVTT and may be longer than two digits.
    stamp = r'(?:\d{2,}:)?\d{2}:\d{2}[.,]\d{3}'
    # SRT / VTT timing lines
    text = re.sub(stamp + r'\s*-->\s*' + stamp, '', text)
    # SRT sequence numbers / numeric VTT cue identifiers
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
    # WEBVTT header
    text = re.sub(r'WEBVTT.*\n?', '', text)
    return text.strip()
def remove_speaker_labels(text: str) -> str:
    """
    Remove speaker labels commonly found in subtitles.

    Handles formats like:
    - JOHN: text
    - John: text
    - [JOHN]: text
    - <v John> text (VTT format)

    A label must start at the beginning of a line and stay on that line;
    text on the preceding line is never treated as part of a label.
    Any short line prefix ending in a colon (2 to 21 word characters,
    spaces or tabs) is treated as a label, so a line such as
    "Note: be careful" loses its "Note: " prefix.

    Args:
        text: Subtitle text, possibly multi-line.

    Returns:
        The text without speaker labels, stripped of leading and
        trailing whitespace.
    """
    line_start = re.MULTILINE
    text = re.sub(r'^\[[^\]\n]{1,20}\]:\s*', '', text, flags=line_start)  # [JOHN]:
    # The name classes use space/tab rather than a generic whitespace class:
    # a class that matches newlines lets a "label" begin on the previous
    # line and swallow that line's dialogue.
    text = re.sub(r'^[A-Z][A-Z \t]{1,20}:\s*', '', text, flags=line_start)  # JOHN:
    text = re.sub(r'^\w[\w \t]{1,20}:\s*', '', text, flags=line_start)  # John:
    text = re.sub(r'<v\s+[^>]+>', '', text)  # <v John>
    return text.strip()
def remove_html_tags(text: str) -> str:
    """
    Strip markup tags from subtitle text.

    Anything from a '<' up to the next '>' (with at least one character
    in between) is deleted, which covers styling tags such as <i>, <b>,
    <u>, <font color=""> and <c.colorname> used in SRT and VTT files.
    The result is stripped of leading and trailing whitespace.
    """
    tag = re.compile(r'<[^>]+>')
    without_tags = tag.sub('', text)
    return without_tags.strip()
def normalize_whitespace(text: str) -> str:
    """
    Normalize all whitespace: collapse every run of whitespace
    (spaces, tabs, newlines, carriage returns) into a single space
    and drop leading/trailing whitespace and empty lines.

    Final cleanup step — use after all other tools.

    Args:
        text: Text to normalize.

    Returns:
        A single-line string whose tokens are separated by exactly one
        space; an empty string when the input holds only whitespace.
    """
    # str.split() with no argument splits on any whitespace run and discards
    # empty pieces, so tabs and stray carriage returns are normalized too.
    return ' '.join(text.split())
def lowercase_text(text: str) -> str:
    """
    Return a lowercased copy of the text.

    Folding case gives every token one uniform spelling, which is the
    recommended preprocessing before sentiment analysis.
    """
    lowered = text.lower()
    return lowered
def remove_filler_words(text: str) -> str:
    """
    Delete spoken filler words that add noise for sentiment analysis.

    Matching is case-insensitive and limited to whole words. Covered
    fillers: um, uh, hmm, hm, ah, oh, er, erm (with stretched final
    letters), plus gonna, wanna, gotta, kinda, sorta, like, okay, ok,
    yeah, yep and nope. Runs of spaces left behind are squeezed to one
    space and the result is stripped.
    """
    fillers = r'\b(um+|uh+|hmm+|hm+|ah+|oh+|er+|erm+|gonna|wanna|gotta|kinda|sorta|like|okay|ok|yeah|yep|nope)\b'
    filler_pattern = re.compile(fillers, re.IGNORECASE)
    without_fillers = filler_pattern.sub('', text)
    squeezed = re.compile(r' +').sub(' ', without_fillers)
    return squeezed.strip()
def clean_subtitle(text: str):
    """
    Deterministic cleaning pipeline — no LLM needed.

    Runs the subtitle text through each cleaning step in a fixed order,
    feeding the output of one step into the next, and returns the
    final string.
    """
    # Order matters: structural noise (timestamps, brackets, tags, labels)
    # is removed before punctuation and characters are stripped.
    steps = (
        remove_timestamps,
        remove_brackets_content,
        remove_html_tags,
        remove_speaker_labels,
        remove_dialog_punctuation,
        remove_newlines,
        remove_non_alphabetic,
        remove_filler_words,
        lowercase_text,
        normalize_whitespace,
    )
    for step in steps:
        text = step(text)
    return text