Spaces:

PopovDanil
/

CineGraph

Running

App Files Files Community

CineGraph / app /cleaner.py

PopovDanil

fix3

d2141dd 18 days ago

raw

history blame contribute delete

4.65 kB

	import re


	def remove_brackets_content(text: str) -> str:
	    """
	    Remove bracketed annotations from subtitle text.

	    Deletes everything enclosed in square brackets [], round brackets ()
	    and curly brackets {}, brackets included. Typical targets are sound
	    descriptions and stage directions such as [applause], (laughs), {music}.

	    Matching is non-greedy and does not cross newlines, so a bracket pair
	    that is opened on one line and closed on another is left untouched.
	    Nested brackets are not balanced: removal stops at the first closer.

	    Returns the text with leading/trailing whitespace stripped; inner
	    spacing around removed spans is not collapsed.
	    """
	    text = re.sub(r'\[.*?\]', '', text)  # [applause]
	    # Parentheses must be escaped: an unescaped "$...$" pattern only ever
	    # matches the empty string at end of input and removes nothing.
	    text = re.sub(r'\(.*?\)', '', text)  # (laughs)
	    text = re.sub(r'\{.*?\}', '', text)  # {music}
	    return text.strip()


	def remove_non_alphabetic(text: str) -> str:
	    """
	    Keep only ASCII letters (A-Z, a-z) and whitespace.

	    Every other character -- digits, punctuation, symbols and non-ASCII
	    letters -- is dropped. The result is stripped of leading and
	    trailing whitespace.
	    """
	    # Collect the runs of allowed characters and glue them back together;
	    # this is the same as deleting every disallowed character.
	    kept_runs = re.findall(r'[a-zA-Z\s]+', text)
	    return ''.join(kept_runs).strip()


	def remove_newlines(text: str) -> str:
	    """
	    Flatten multi-line text onto a single line.

	    Each line feed and carriage return becomes a space, runs of spaces
	    are then squeezed to one, and the result is stripped at both ends.
	    Other whitespace (e.g. tabs) inside the text is left as it is.
	    """
	    line_breaks_to_space = {ord('\n'): ' ', ord('\r'): ' '}
	    flattened = text.translate(line_breaks_to_space)
	    # Only runs of two or more spaces need rewriting.
	    squeezed = re.sub(r' {2,}', ' ', flattened)
	    return squeezed.strip()


	def remove_dialog_punctuation(text: str) -> str:
	    """
	    Remove dialog-specific punctuation used in subtitles.

	    Removes, in this order:
	    - dashes that introduce a line of dialog ("- text"), together with
	      the spaces/tabs around them;
	    - ellipses (two or more consecutive dots);
	    - double (or longer) dashes;
	    - straight and typographic quotation marks and apostrophes;
	    - the characters ! ? , ; :

	    Single full stops and hyphens inside a line (e.g. "well-known") are
	    kept. The result is stripped of leading and trailing whitespace.
	    """
	    # Leading dashes. Only spaces/tabs are allowed around the dash (both
	    # optional) so the match never swallows a neighbouring line break.
	    text = re.sub(r'^[ \t]*-+[ \t]*', '', text, flags=re.MULTILINE)
	    text = re.sub(r'\.{2,}', '', text)  # ellipsis ...
	    text = re.sub(r'-{2,}', '', text)  # double dash --
	    # Straight quotes plus their curly variants (U+201C/D, U+2018/9).
	    text = re.sub(r'["\'\u201c\u201d\u2018\u2019]+', '', text)
	    text = re.sub(r'[!?,;:]+', '', text)  # dialog punctuation
	    return text.strip()


	def remove_timestamps(text: str) -> str:
	    """
	    Remove SRT/VTT timing metadata from subtitle text.

	    Handles:
	    - cue timing lines such as
	        00:01:23,456 --> 00:01:25,789   (SRT)
	        00:01:23.456 --> 00:01:25.789   (VTT)
	        01:23.456 --> 01:25.789         (VTT, hours omitted)
	      with any amount of whitespace around the arrow;
	    - lines that consist solely of digits (SRT sequence numbers and
	      numeric VTT cue identifiers);
	    - the "WEBVTT" header together with the rest of its line.

	    Note that a dialog line containing nothing but a number is
	    indistinguishable from a sequence number and is removed as well.
	    Line breaks are left in place; only the outer ends are stripped.
	    """
	    # One timestamp: optional hours, then MM:SS and milliseconds after
	    # either a comma (SRT) or a dot (VTT).
	    stamp = r'(?:\d+:)?\d{2}:\d{2}[.,]\d{3}'
	    text = re.sub(stamp + r'\s*-->\s*' + stamp, '', text)
	    # Digit-only lines. Padding is limited to spaces/tabs (plus a
	    # trailing \r from CRLF files) so a match stays within one line.
	    text = re.sub(r'^[ \t]*\d+[ \t\r]*$', '', text, flags=re.MULTILINE)
	    # WEBVTT header
	    text = re.sub(r'WEBVTT.*\n?', '', text)
	    return text.strip()


	def remove_speaker_labels(text: str) -> str:
	    """
	    Remove speaker labels commonly found in subtitles.

	    Handles these forms at the start of a line:
	    - JOHN: text
	    - John: text
	    - [JOHN]: text
	    and the VTT voice tag anywhere in the text:
	    - <v John> text

	    A label is 2 to 21 word characters, spaces or tabs followed by a
	    colon. This is a heuristic: any short line prefix ending in a colon
	    (e.g. "Note: ...") is treated as a label. The result is stripped of
	    leading and trailing whitespace.
	    """
	    # [JOHN]: -- bracketed label; the colon is required so that plain
	    # bracketed annotations are not affected here.
	    text = re.sub(r'^[ \t]*\[[^\]\n]{1,40}\]:[ \t]*', '', text, flags=re.MULTILINE)
	    # The label body accepts only spaces/tabs as separators. Allowing
	    # newlines there would let a match start on the previous line and
	    # delete that line's dialog along with the label.
	    text = re.sub(r'^[A-Z][A-Z \t]{1,20}:\s*', '', text, flags=re.MULTILINE)  # JOHN:
	    text = re.sub(r'^\w[\w \t]{1,20}:\s*', '', text, flags=re.MULTILINE)  # John:
	    text = re.sub(r'<v\s+[^>]+>', '', text)  # <v John>
	    return text.strip()


	def remove_html_tags(text: str) -> str:
	    """
	    Strip HTML/XML-style markup from subtitle text.

	    Anything of the form <...> (at least one character between the angle
	    brackets, none of them ">") is deleted -- e.g. <i>, </b>,
	    <font color="">, <c.colorname>. The text between tags is kept, and
	    the result is stripped of leading and trailing whitespace.
	    """
	    # Splitting on the tag pattern and re-joining drops exactly the tags.
	    pieces = re.split(r'<[^>]+>', text)
	    return ''.join(pieces).strip()


	def normalize_whitespace(text: str) -> str:
	    """
	    Final whitespace cleanup -- intended to run after all other tools.

	    The text is split on line feeds; within each line, runs of spaces
	    are collapsed to one and the line is stripped at both ends. Lines
	    that end up empty are discarded, and the remaining lines are joined
	    with a single space, so the result is always one line.
	    """
	    tidied = (
	        re.sub(r' +', ' ', raw_line).strip()
	        for raw_line in text.split('\n')
	    )
	    # filter(None, ...) drops the lines that became empty strings.
	    return ' '.join(filter(None, tidied))


	def lowercase_text(text: str) -> str:
	    """
	    Return a lowercase copy of the text.

	    Intended as a preprocessing step for sentiment analysis so that
	    tokens differing only in case are represented uniformly.
	    """
	    lowered = text.lower()
	    return lowered


	def remove_filler_words(text: str) -> str:
	    """
	    Remove common spoken filler words that add noise for sentiment analysis.

	    Removes, case-insensitively and as whole words only: um, uh, hmm, hm,
	    ah, oh, er, erm (each with any number of repeated final letters, e.g.
	    "ummm"), plus gonna, wanna, gotta, kinda, sorta, like, okay, ok, yeah,
	    yep, nope. Words that merely contain a filler ("umbrella", "other")
	    are not affected.

	    Runs of spaces left behind are collapsed to one and the result is
	    stripped of leading and trailing whitespace.
	    """
	    # Alternatives must be separated by a bare "|"; an escaped "\|" is a
	    # literal pipe character and would make the pattern match nothing
	    # in ordinary text.
	    # NOTE(review): "like" is also a sentiment-bearing verb ("I like it");
	    # confirm that dropping it is intended.
	    fillers = (
	        r'\b(um+|uh+|hmm+|hm+|ah+|oh+|er+|erm+|gonna|wanna|gotta|kinda'
	        r'|sorta|like|okay|ok|yeah|yep|nope)\b'
	    )
	    text = re.sub(fillers, '', text, flags=re.IGNORECASE)
	    text = re.sub(r' +', ' ', text)
	    return text.strip()


	def clean_subtitle(text: str):
	    """
	    Deterministic cleaning pipeline -- no LLM needed.

	    Passes the text through each cleaning step in a fixed order and
	    returns the final result. The order matters: each step receives the
	    output of the one before it.
	    """
	    pipeline = (
	        remove_timestamps,
	        remove_brackets_content,
	        remove_html_tags,
	        remove_speaker_labels,
	        remove_dialog_punctuation,
	        remove_newlines,
	        remove_non_alphabetic,
	        remove_filler_words,
	        lowercase_text,
	        normalize_whitespace,
	    )
	    for step in pipeline:
	        text = step(text)
	    return text