Spaces:

cracker0935
/

adtrack-v2

Running

App Files Files Community

adtrack-v2 / models /model_v2 /preprocessing.py

cracker0935

Audit Backend

97ea4f7 3 months ago

raw

history blame contribute delete

2.55 kB

	import re

	class LiveFeatureExtractor:
	    """Turn raw, annotation-coded participant speech into model inputs.

	    Two views of the same raw text are offered: a cleaned string with the
	    annotation codes stripped (``clean_for_bert``) and per-word rates of the
	    markers found in it (``get_features`` / ``get_vector``).
	    """

	    def __init__(self):
	        # One compiled pattern per marker that ``get_features`` counts.
	        self.patterns = {
	            'fillers': re.compile(r'&-([a-z]+)', re.IGNORECASE),
	            # Matches "[/]", "[//]", "[///]", ... so every retracing marker
	            # below is counted here as well.
	            # NOTE(review): confirm the overlap is intended before narrowing
	            # this pattern; consumers may depend on the current counts.
	            'repetition': re.compile(r'\[/+\]'),
	            'retracing': re.compile(r'\[//\]'),
	            'incomplete': re.compile(r'\+[\./]+'),
	            # Matches only "[]" and "[.]"; counted but not used by
	            # ``get_vector``.
	            'errors': re.compile(r'\[\.?\]'),
	            'pauses': re.compile(r'\(\.+\)'),
	        }

	    def clean_for_bert(self, raw_text: str) -> str:
	        """Return ``raw_text`` with annotation markup removed.

	        Strips a leading ``*PAR:`` tag, ``\\x15start_end\\x15`` timestamps,
	        angle brackets (their contents are kept) and ``[...]`` codes, turns
	        pause markers such as ``(..)`` into ``[PAUSE]``, replaces underscores
	        with spaces and collapses runs of whitespace.  Fillers (``&-uh``)
	        are deliberately left in the text.
	        """
	        text = re.sub(r'^\*PAR:\s+', '', raw_text)
	        text = re.sub(r'\x15\d+_\d+\x15', '', text)  # timestamps
	        # A character class is required here: r'<\|>' would only match the
	        # literal three-character string "<|>" and leave brackets in place.
	        text = re.sub(r'[<>]', '', text)
	        text = re.sub(r'\[.*?\]', '', text)  # bracketed codes such as [//]
	        text = re.sub(r'\(\.+\)', '[PAUSE]', text)
	        text = text.replace('_', ' ')
	        return re.sub(r'\s+', ' ', text).strip()

	    def get_features(self, raw_text: str) -> dict:
	        """Count every marker in ``self.patterns`` and the words in ``raw_text``.

	        Returns:
	            A dict with one integer count per key of ``self.patterns`` plus a
	            ``'word_count'`` entry.
	        """
	        stats = {
	            name: len(pattern.findall(raw_text))
	            for name, pattern in self.patterns.items()
	        }

	        # Words are whatever is left once bracketed codes, fillers and
	        # punctuation are removed.
	        # NOTE(review): a "*PAR:" tag or timestamp digits present in
	        # ``raw_text`` survive this cleanup and are counted as words --
	        # confirm whether callers are expected to strip them first.
	        stripped = re.sub(r'\[.*?\]', '', raw_text)
	        # Reuse the case-insensitive filler pattern so that anything counted
	        # as a filler above is also excluded from the word count.
	        stripped = self.patterns['fillers'].sub('', stripped)
	        stripped = re.sub(r'[^\w\s]', '', stripped)
	        stats['word_count'] = len(stripped.split())
	        return stats

	    def get_vector(self, raw_text: str, global_ttr_override=None) -> list:
	        """Return ``[ttr, fillers, repetition, retracing, incomplete, pauses]``.

	        Args:
	            raw_text: Annotated text to analyse.
	            global_ttr_override: Value to use as the first element; ``0.5``
	                is used when it is ``None``.

	        Returns:
	            A six-element list.  Every element after the first is the marker
	            count divided by the word count (treated as 1 when no words are
	            found, to avoid division by zero).
	        """
	        stats = self.get_features(raw_text)
	        n = max(stats['word_count'], 1)
	        ttr = 0.5 if global_ttr_override is None else global_ttr_override
	        rate_keys = ('fillers', 'repetition', 'retracing', 'incomplete', 'pauses')
	        return [ttr] + [stats[key] / n for key in rate_keys]

	def parse_cha_header(content_str):
	    """Extract the participant's age and gender from an ``@ID:`` header line.

	    Looks for the first ``@ID:`` line containing a ``|PAR|`` field and reads
	    the two pipe-separated fields that follow it (age, then gender).

	    Args:
	        content_str: Full text of the transcript file.

	    Returns:
	        ``(age, gender)`` where ``age`` is a float number of years (``65.0``
	        when absent) and ``gender`` is ``1`` for male and ``0`` otherwise.
	    """
	    age = 65.0
	    gender = 0

	    # The pipes must be escaped as r'\|'.  Written as r'\\|' they become
	    # "literal backslash, then alternation", which matches the empty string
	    # everywhere and leaves both groups unset.
	    # The age field may carry months/days after the ';' (e.g. "72;03."), and
	    # either field may be empty, so both captures are optional.
	    id_match = re.search(
	        r'@ID:.*\|PAR\|(\d+)?[^|]*\|([a-z]*)\|',
	        content_str,
	        re.IGNORECASE,
	    )
	    if id_match is None:
	        return age, gender

	    years = id_match.group(1)
	    if years:
	        age = float(years)

	    g_str = id_match.group(2).lower()
	    # "female" contains "male", hence the explicit exclusion.
	    if 'male' in g_str and 'female' not in g_str:
	        gender = 1

	    return age, gender

	def parse_cha_transcript(content_str):
	    """Return all participant (``*PAR:``) speech joined into one string.

	    Each ``*PAR:`` line contributes its text with the speaker tag removed,
	    whether the tag is followed by a tab or by spaces.  Tab-led lines that
	    directly follow a ``*PAR:`` line are treated as the wrapped remainder of
	    that utterance and are included too; tab-led lines that follow any other
	    tier (another speaker, a ``%`` tier, a header) are ignored.

	    Args:
	        content_str: Full text of the transcript file.

	    Returns:
	        The collected pieces joined by single spaces (``""`` if none).
	    """
	    tag = '*PAR:'
	    par_lines = []
	    in_par = False  # True while the current tier belongs to the participant
	    for line in content_str.split('\n'):
	        if line.startswith(tag):
	            in_par = True
	            par_lines.append(line[len(tag):].strip())
	        elif in_par and line.startswith('\t'):
	            par_lines.append(line.strip())
	        else:
	            in_par = False
	    return " ".join(par_lines)