CineGraph / app /cleaner.py
PopovDanil's picture
fix3
d2141dd
import re
def remove_brackets_content(text: str) -> str:
    """
    Strip bracketed annotations from subtitle text.

    Deletes every span enclosed in [], () or {} (brackets included),
    such as sound descriptions or stage directions: [applause],
    (laughs), {music}. A span must open and close on the same line to
    be removed. Bracket kinds are processed in the order square, round,
    curly, and the result is trimmed of surrounding whitespace.
    """
    bracket_patterns = (
        r'\[.*?\]',  # [applause]
        r'\(.*?\)',  # (laughs)
        r'\{.*?\}',  # {music}
    )
    # Sequential passes (not one alternation) so overlapping bracket
    # kinds resolve in a fixed order.
    for pattern in bracket_patterns:
        text = re.sub(pattern, '', text)
    return text.strip()
def remove_non_alphabetic(text: str) -> str:
    """
    Drop every character that is not an ASCII letter or whitespace.

    Punctuation, digits, symbols and non-ASCII letters are all deleted
    (nothing is substituted in their place); A-Z, a-z and whitespace
    survive. The result is trimmed of surrounding whitespace.
    """
    # Splitting on each unwanted character and re-joining the pieces
    # deletes those characters without inserting anything.
    surviving_pieces = re.split(r'[^a-zA-Z\s]', text)
    return ''.join(surviving_pieces).strip()
def remove_newlines(text: str) -> str:
    """
    Flatten multi-line text onto a single line.

    Each line feed and carriage return becomes a space, runs of spaces
    are then squeezed to one, and the result is trimmed. Tabs are left
    untouched.
    """
    for line_break in ('\n', '\r'):
        text = text.replace(line_break, ' ')
    # Squeeze the space runs created by the replacements above.
    return re.sub(r' +', ' ', text).strip()
def remove_dialog_punctuation(text: str) -> str:
    """
    Remove dialog-specific punctuation commonly found in subtitles.

    Steps, in order:
    - leading dashes that mark a speaker turn at line start ("- text");
    - ellipses ("...") and double dashes ("--"), each replaced by a
      single space so the words on either side stay separate
      ("wait--what" -> "wait what");
    - quotation marks, both ASCII (" ') and typographic
      (U+201C, U+201D, U+2018, U+2019);
    - runs of ! ? , ; :

    Returns the text trimmed of surrounding whitespace.
    """
    # Leading dashes at the start of any line.
    text = re.sub(r'^\s*-+\s*', '', text, flags=re.MULTILINE)
    # Ellipses / double dashes usually sit BETWEEN words; substituting a
    # single space (and absorbing adjacent spaces/tabs) avoids both
    # fused words and doubled spaces. Newlines are deliberately not
    # absorbed so line structure is preserved.
    text = re.sub(r'[ \t]*(?:(?:\.{2,}|-{2,})[ \t]*)+', ' ', text)
    # Straight and curly quotation marks.
    text = re.sub(r'["\'\u201c\u201d\u2018\u2019]+', '', text)
    # Remaining dialog punctuation.
    text = re.sub(r'[!?,;:]+', '', text)
    return text.strip()
def remove_timestamps(text: str) -> str:
    """
    Remove SRT/VTT cue timings, numeric cue counters and the WEBVTT header.

    Handles cue timings such as:
    - 00:01:23,456 --> 00:01:25,789  (SRT)
    - 00:01:23.456 --> 00:01:25.789  (VTT)
    - 01:23.456 --> 01:25.789        (VTT with the hour field omitted)

    Also removes lines consisting only of digits (the sequence numbers
    used in SRT files) and any "WEBVTT..." header line. Note that a
    dialogue line that is purely numeric is removed as well.

    Returns the text trimmed of surrounding whitespace.
    """
    # One timestamp: optional hours (two or more digits), then mm:ss and
    # a three-digit fraction separated by "," (SRT) or "." (VTT).
    timestamp = r'(?:\d{2,}:)?\d{2}:\d{2}[.,]\d{3}'
    text = re.sub(timestamp + r'\s*-->\s*' + timestamp, '', text)
    # Bare numeric lines (SRT sequence numbers / numeric cue identifiers).
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
    # WEBVTT header, including anything else on that line.
    text = re.sub(r'WEBVTT.*\n?', '', text)
    return text.strip()
def remove_speaker_labels(text: str) -> str:
    """
    Remove speaker labels commonly found in subtitles.

    Handles formats like:
    - JOHN: text
    - John: text
    - [JOHN]: text
    - <v John> text (VTT voice tag)

    A label must start at the beginning of a line and stay on that
    line; whitespace following the colon (including a line break, for
    labels that sit on their own line) is removed with it. The "John:"
    pattern is broad: any line-initial run of up to ~21 word/space
    characters followed by a colon is treated as a label.

    Returns the text trimmed of surrounding whitespace.
    """
    # [JOHN]: — bracketed label followed by a colon.
    text = re.sub(r'^\[[^\]\n]{1,20}\]:\s*', '', text, flags=re.MULTILINE)
    # The label body allows only spaces/tabs, not "\s": "\s" also matches
    # newlines, which let a label match start on the PREVIOUS line and
    # delete that line's dialogue along with the label.
    text = re.sub(r'^[A-Z][A-Z \t]{1,20}:\s*', '', text, flags=re.MULTILINE)  # JOHN:
    text = re.sub(r'^\w[\w \t]{1,20}:\s*', '', text, flags=re.MULTILINE)  # John:
    text = re.sub(r'<v\s+[^>]+>', '', text)  # <v John>
    return text.strip()
def remove_html_tags(text: str) -> str:
    """
    Strip HTML/XML-style markup from subtitle text.

    Any "<...>" span containing at least one character between the
    angle brackets is deleted, which covers the styling tags seen in
    SRT and VTT files: <i>, <b>, <u>, <font color="">, <c.colorname>
    and their closing forms. The text between tags is kept, and the
    result is trimmed of surrounding whitespace.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    without_tags = tag_pattern.sub('', text)
    return without_tags.strip()
def normalize_whitespace(text: str) -> str:
    """
    Final whitespace cleanup — intended to run after all other tools.

    The text is split on line feeds; within each line, runs of spaces
    are squeezed to one and surrounding whitespace is trimmed. Lines
    that end up empty are discarded and the remaining lines are joined
    with single spaces, so the result is always one line.
    """
    kept_lines = []
    for raw_line in text.split('\n'):
        tidy_line = re.sub(r' +', ' ', raw_line).strip()
        if tidy_line:  # skip lines that are empty after trimming
            kept_lines.append(tidy_line)
    return ' '.join(kept_lines)
def lowercase_text(text: str) -> str:
    """
    Return a lowercased copy of the text.

    Recommended as a preprocessing step for sentiment analysis so that
    tokens differing only in case are represented uniformly.
    """
    lowered = str.lower(text)
    return lowered
def remove_filler_words(text: str) -> str:
    """
    Delete common spoken filler words that add noise for sentiment analysis.

    Matching is case-insensitive and whole-word only. Covered words:
    um, uh, hmm, hm, ah, oh, er, erm (with their final letter repeated
    any number of times), plus gonna, wanna, gotta, kinda, sorta, like,
    okay, ok, yeah, yep and nope. Every occurrence of a listed word is
    removed regardless of how it is used in the sentence. Afterwards
    runs of spaces are squeezed to one and the result is trimmed.
    """
    filler_pattern = re.compile(
        r'\b(um+|uh+|hmm+|hm+|ah+|oh+|er+|erm+|gonna|wanna|gotta|kinda|sorta|like|okay|ok|yeah|yep|nope)\b',
        flags=re.IGNORECASE,
    )
    without_fillers = filler_pattern.sub('', text)
    # Removing a word leaves its neighbouring spaces behind; squeeze them.
    squeezed = re.sub(r' +', ' ', without_fillers)
    return squeezed.strip()
def clean_subtitle(text: str) -> str:
    """
    Deterministic cleaning pipeline — no LLM needed.

    Runs the module's cleaning tools over the raw subtitle text in a
    fixed order and returns the cleaned string. The order matters:
    timestamp, speaker-label and dialog-punctuation removal rely on
    line-start anchors, so they run before newlines are flattened;
    character filtering, filler removal, lowercasing and whitespace
    normalization follow.
    """
    pipeline = (
        remove_timestamps,
        remove_brackets_content,
        remove_html_tags,
        remove_speaker_labels,
        remove_dialog_punctuation,
        remove_newlines,
        remove_non_alphabetic,
        remove_filler_words,
        lowercase_text,
        normalize_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return text