import re

# Speaker tag that opens a participant tier, e.g. "*PAR:\t".
_SPEAKER_TAG_RE = re.compile(r'^\*PAR:\s+')
# Media timestamps: two integers joined by "_" and wrapped in 0x15 bytes.
_TIMESTAMP_RE = re.compile(r'\x15\d+_\d+\x15')
# Any square-bracketed annotation code such as [/], [//] or [* ...].
_BRACKET_CODE_RE = re.compile(r'\[.*?\]')
# Filler tokens such as "&-uh"; case-insensitive so "&-Um" is recognised too.
_FILLER_RE = re.compile(r'&-([a-z]+)', re.IGNORECASE)
# Pause markers: (.), (..), (...)
_PAUSE_RE = re.compile(r'\(\.+\)')
# Everything that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
# Participant @ID header line.  The age field is "<years>;" optionally followed
# by a months/days part made of digits and dots; the sex field may be empty.
_PAR_ID_RE = re.compile(
    r'@ID:.*\|PAR\|(\d+);[\d.]*\|([a-z]*)\|', re.IGNORECASE
)

_PAR_PREFIX = '*PAR:'


class LiveFeatureExtractor:
    """Count disfluency markers in transcript text and build feature vectors."""

    def __init__(self):
        # NOTE(review): 'repetition' matches one or more slashes, so "[//]" is
        # counted under both 'repetition' and 'retracing'.  Kept as-is because
        # changing it would shift existing feature values; confirm the intent.
        self.patterns = {
            'fillers': _FILLER_RE,
            'repetition': re.compile(r'\[/+\]'),
            'retracing': re.compile(r'\[//\]'),
            'incomplete': re.compile(r'\+[\./]+'),
            'errors': re.compile(r'\[\*.*?\]'),
            'pauses': _PAUSE_RE,
        }

    def clean_for_bert(self, raw_text: str) -> str:
        """Return *raw_text* with transcript markup stripped.

        Removes a leading ``*PAR:`` tag, timestamps, angle brackets (keeping
        the text they enclose) and square-bracketed codes.  Pause markers are
        replaced by the literal token ``[PAUSE]``, underscores become spaces
        and runs of whitespace are collapsed.  Filler tokens (``&-uh``) are
        deliberately left in the text.
        """
        text = _SPEAKER_TAG_RE.sub('', raw_text)
        text = _TIMESTAMP_RE.sub('', text)
        text = re.sub(r'<|>', '', text)
        # Bracket codes must go before pauses are rewritten, otherwise the
        # inserted "[PAUSE]" token would be removed as a bracket code.
        text = _BRACKET_CODE_RE.sub('', text)
        text = _PAUSE_RE.sub('[PAUSE]', text)
        text = text.replace('_', ' ')
        return re.sub(r'\s+', ' ', text).strip()

    def get_features(self, raw_text: str) -> dict:
        """Return marker counts plus ``word_count`` for *raw_text*.

        The result has one integer per key of ``self.patterns`` (number of
        matches in the raw text) and a ``word_count`` entry.  Words are
        counted after removing the leading speaker tag, timestamps, bracket
        codes, fillers and punctuation, so none of that markup is counted
        as a word.
        """
        stats = {
            name: len(pattern.findall(raw_text))
            for name, pattern in self.patterns.items()
        }

        text = _SPEAKER_TAG_RE.sub('', raw_text)
        # Without this, stripping the 0x15 delimiters as punctuation would
        # leave "123_456" behind as a countable token.
        text = _TIMESTAMP_RE.sub('', text)
        text = _BRACKET_CODE_RE.sub('', text)
        text = _FILLER_RE.sub('', text)
        text = _PUNCTUATION_RE.sub('', text)

        stats['word_count'] = len(text.split())
        return stats

    def get_vector(self, raw_text: str, global_ttr_override=None) -> list:
        """Return ``[ttr, fillers, repetition, retracing, incomplete, pauses]``.

        Every entry except the first is the marker count divided by the word
        count (a word count of 0 is treated as 1 to avoid division by zero).
        The first entry is *global_ttr_override* when given, otherwise 0.5.
        The ``errors`` count is not part of the vector.
        """
        stats = self.get_features(raw_text)
        n = max(stats['word_count'], 1)
        ttr = global_ttr_override if global_ttr_override is not None else 0.5
        return [
            ttr,
            stats['fillers'] / n,
            stats['repetition'] / n,
            stats['retracing'] / n,
            stats['incomplete'] / n,
            stats['pauses'] / n,
        ]


def parse_cha_header(content_str: str) -> tuple:
    """Extract ``(age, gender)`` from the participant ``@ID`` header.

    Returns the age in whole years as a float and a gender flag that is 1
    for male and 0 otherwise.  A months/days suffix on the age field is
    accepted and ignored.  When no participant ``@ID`` line is found the
    defaults ``(65.0, 0)`` are returned.
    """
    age = 65.0
    gender = 0

    id_match = _PAR_ID_RE.search(content_str)
    if id_match is None:
        return age, gender

    try:
        age = float(id_match.group(1))
    except ValueError:
        # Best effort: keep the default age if the digits cannot be parsed.
        pass

    g_str = id_match.group(2).lower()
    # "female" contains "male", so it has to be excluded explicitly.
    if 'male' in g_str and 'female' not in g_str:
        gender = 1
    return age, gender


def parse_cha_transcript(content_str: str) -> str:
    """Return all participant (``*PAR:``) utterances joined by single spaces.

    The speaker tag is removed whatever whitespace follows it.  Lines that
    start with a tab directly after a ``*PAR:`` line are treated as the
    continuation of that utterance; tab-indented lines following any other
    tier are ignored.  Empty utterances are skipped.
    """
    utterances = []
    in_par_tier = False

    for line in content_str.splitlines():
        if line.startswith(_PAR_PREFIX):
            utterances.append(line[len(_PAR_PREFIX):].strip())
            in_par_tier = True
        elif in_par_tier and line.startswith('\t') and line.strip():
            # Wrapped utterance: glue the continuation onto the last one.
            utterances[-1] = (utterances[-1] + ' ' + line.strip()).strip()
        else:
            # Any other tier, header or blank line ends the participant tier.
            in_par_tier = False

    return " ".join(u for u in utterances if u)