Spaces:
Running
Running
| import re | |
class LiveFeatureExtractor:
    """Derive disfluency counts and a small feature vector from CHAT-coded text.

    The input is raw transcript text that may still contain a leading
    ``*PAR:`` speaker prefix, ``\\x15start_end\\x15`` timestamps, bracketed
    codes such as ``[//]``, ``&-uh`` style fillers and ``(.)`` pause markers.
    """

    # Markup shared by clean_for_bert() and get_features(); compiled once.
    _SPEAKER_PREFIX = re.compile(r'^\*PAR:\s+')   # only at the very start
    _TIMESTAMP = re.compile(r'\x15\d+_\d+\x15')
    _ANGLE_BRACKET = re.compile(r'<|>')
    _BRACKET_CODE = re.compile(r'\[.*?\]')
    _FILLER = re.compile(r'&-([a-z]+)', re.IGNORECASE)
    _PAUSE = re.compile(r'\(\.+\)')
    _NON_WORD_CHAR = re.compile(r'[^\w\s]')
    _WHITESPACE_RUN = re.compile(r'\s+')

    def __init__(self):
        # name -> pattern; get_features() reports one count per entry.
        self.patterns = {
            'fillers': self._FILLER,
            # NOTE(review): '/+' also matches '[//]' and '[///]', so every
            # match of 'retracing' is counted here as well. Left unchanged
            # because consumers of these counts may rely on it - confirm.
            'repetition': re.compile(r'\[/+\]'),
            'retracing': re.compile(r'\[//\]'),
            'incomplete': re.compile(r'\+[\./]+'),
            'errors': re.compile(r'\[\*.*?\]'),
            'pauses': self._PAUSE,
        }

    def clean_for_bert(self, raw_text):
        """Return ``raw_text`` with transcript markup removed.

        Removes the leading ``*PAR:`` prefix, timestamps, angle brackets
        (their content is kept) and bracketed codes; turns pause markers into
        the literal token ``[PAUSE]``; replaces underscores with spaces and
        collapses whitespace. Fillers such as ``&-uh`` are kept on purpose.
        """
        text = self._SPEAKER_PREFIX.sub('', raw_text)
        text = self._TIMESTAMP.sub('', text)
        text = self._ANGLE_BRACKET.sub('', text)
        text = self._BRACKET_CODE.sub('', text)
        # Pauses are substituted only after bracket codes are stripped,
        # otherwise the inserted '[PAUSE]' token would be removed again.
        text = self._PAUSE.sub('[PAUSE]', text)
        text = text.replace('_', ' ')
        return self._WHITESPACE_RUN.sub(' ', text).strip()

    def get_features(self, raw_text):
        """Count each pattern in ``raw_text`` and the number of plain words.

        Returns:
            dict with one integer per key of ``self.patterns`` plus
            ``'word_count'``.
        """
        stats = {
            name: len(pattern.findall(raw_text))
            for name, pattern in self.patterns.items()
        }

        # Strip everything that is markup rather than speech before counting
        # words. The speaker prefix and timestamps must go too: after
        # punctuation removal they would otherwise survive as the pseudo-words
        # 'PAR' and '1234_5678' and inflate the count.
        text = self._SPEAKER_PREFIX.sub('', raw_text)
        text = self._TIMESTAMP.sub('', text)
        text = self._BRACKET_CODE.sub('', text)
        # Same case-insensitive pattern that counts fillers, so a token such
        # as '&-Uh' is never counted as both a filler and a word.
        text = self._FILLER.sub('', text)
        text = self._NON_WORD_CHAR.sub('', text)

        stats['word_count'] = len(text.split())
        return stats

    def get_vector(self, raw_text, global_ttr_override=None):
        """Build the feature vector for ``raw_text``.

        Args:
            raw_text: transcript text, as accepted by ``get_features``.
            global_ttr_override: value used for the first element; when
                ``None`` the constant 0.5 is used (no ratio is computed here).

        Returns:
            list of six numbers: ``[ttr, fillers, repetition, retracing,
            incomplete, pauses]`` where every count is divided by the word
            count (treated as 1 when there are no words). The ``errors``
            count is not part of the vector.
        """
        stats = self.get_features(raw_text)
        # Guard against division by zero on empty / markup-only input.
        n = stats['word_count'] if stats['word_count'] > 0 else 1
        ttr = 0.5 if global_ttr_override is None else global_ttr_override
        rate_keys = ('fillers', 'repetition', 'retracing', 'incomplete', 'pauses')
        return [ttr] + [stats[key] / n for key in rate_keys]
def parse_cha_header(content_str):
    """Read the participant's age and gender from a CHAT ``@ID`` header line.

    Looks for an ``@ID:`` line containing ``|PAR|<age>|<sex>|``. The age may
    be written as bare years followed by ``;`` (``57;``) or with trailing
    month/day digits (``68;06.``); only the leading years are used.

    Args:
        content_str: full text of the transcript file.

    Returns:
        ``(age, gender)`` where ``age`` is a float (65.0 when no age is
        found) and ``gender`` is 1 when the sex field contains ``male`` but
        not ``female``, otherwise 0.
    """
    age = 65.0
    gender = 0

    # The age suffix and the sex field are optional so that a missing or
    # differently formatted field does not discard the other one.
    id_match = re.search(
        r'@ID:.*\|PAR\|(\d+)?(?:;[\d.]*)?\|([a-z]*)\|',
        content_str,
        re.IGNORECASE,
    )
    if id_match is None:
        return age, gender

    years = id_match.group(1)
    if years is not None:
        # The group is digits only, so float() cannot fail here.
        age = float(years)

    sex = id_match.group(2).lower()
    # 'female' contains 'male', hence the explicit exclusion.
    if 'male' in sex and 'female' not in sex:
        gender = 1

    return age, gender
def parse_cha_transcript(content_str):
    """Collect the participant's utterances from a CHAT transcript.

    Every line starting with ``*PAR:`` opens an utterance; directly following
    lines that start with a tab are treated as its continuation. Lines of any
    other tier (``*INV:``, ``%mor:``, ``@...``) and their continuations are
    ignored.

    Args:
        content_str: full text of the transcript file.

    Returns:
        All participant utterances joined by single spaces, with the
        ``*PAR:`` marker removed. Other markup (timestamps, codes) is kept.
    """
    speaker_prefix = '*PAR:'
    utterances = []
    in_par_tier = False

    for line in content_str.split('\n'):
        if line.startswith(speaker_prefix):
            # Slice the marker off instead of replacing '*PAR:\t', so the
            # prefix is removed whatever whitespace follows it.
            utterances.append(line[len(speaker_prefix):].strip())
            in_par_tier = True
        elif in_par_tier and line.startswith('\t'):
            # Wrapped utterance: glue the continuation onto the last one.
            continuation = line.strip()
            if continuation:
                utterances[-1] = (utterances[-1] + ' ' + continuation).strip()
        else:
            # Any other line ends the participant tier, so that tab-indented
            # continuations of e.g. a %mor tier are not picked up.
            in_par_tier = False

    return " ".join(utterances)