Spaces:
Running
Running
File size: 2,552 Bytes
97ea4f7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | import re
class LiveFeatureExtractor:
    """Extract disfluency counts and a small numeric vector from one utterance.

    The input is raw transcript text that still carries annotation markers
    such as ``&-uh``, ``[/]``, ``[//]``, ``+...``, ``[* ...]`` and ``(.)``.
    """

    def __init__(self):
        # One compiled pattern per counted marker; the keys double as the
        # keys of the dict returned by ``get_features``.
        # NOTE(review): ``\[/+\]`` also matches ``[//]`` (and ``[///]``), so
        # every retracing marker is counted under 'repetition' as well --
        # confirm this overlap is intended before changing it.
        self.patterns = {
            'fillers': re.compile(r'&-([a-z]+)', re.IGNORECASE),
            'repetition': re.compile(r'\[/+\]'),
            'retracing': re.compile(r'\[//\]'),
            'incomplete': re.compile(r'\+[\./]+'),
            'errors': re.compile(r'\[\*.*?\]'),
            'pauses': re.compile(r'\(\.+\)'),
        }

    def clean_for_bert(self, raw_text: str) -> str:
        """Return ``raw_text`` reduced to plain text for a language model.

        Removes a leading ``*PAR:`` prefix, ``\\x15start_end\\x15`` timestamp
        bullets, angle brackets (their content is kept) and every
        ``[...]`` code; rewrites pause markers such as ``(.)`` to the literal
        token ``[PAUSE]``; turns underscores into spaces; collapses
        whitespace.
        """
        text = re.sub(r'^\*PAR:\s+', '', raw_text)
        text = re.sub(r'\x15\d+_\d+\x15', '', text)   # timestamp bullets
        text = re.sub(r'[<>]', '', text)              # drop brackets, keep text
        # Bracketed codes must go before pauses are rewritten, otherwise the
        # inserted "[PAUSE]" token would be stripped again.
        text = re.sub(r'\[.*?\]', '', text)
        # Filler tokens (e.g. "&-uh") are deliberately left in the text.
        text = re.sub(r'\(\.+\)', '[PAUSE]', text)
        text = text.replace('_', ' ')
        return re.sub(r'\s+', ' ', text).strip()

    def get_features(self, raw_text: str) -> dict:
        """Count each marker in ``self.patterns`` and the number of words.

        Returns a dict with one integer per pattern key plus ``word_count``.
        Words are counted after removing ``[...]`` codes, filler tokens and
        all punctuation.
        """
        stats = {name: len(rx.findall(raw_text))
                 for name, rx in self.patterns.items()}

        stripped = re.sub(r'\[.*?\]', '', raw_text)
        # Reuse the compiled (case-insensitive) filler pattern so that a
        # token counted as a filler is never also counted as a word.
        stripped = self.patterns['fillers'].sub('', stripped)
        stripped = re.sub(r'[^\w\s]', '', stripped)
        # NOTE(review): a "*PAR:" prefix or timestamp bullet still present in
        # raw_text survives the steps above as a word-like token and is
        # counted -- confirm whether callers pre-clean their input.
        stats['word_count'] = len(stripped.lower().split())
        return stats

    def get_vector(self, raw_text: str, global_ttr_override=None) -> list:
        """Return ``[ttr, fillers, repetition, retracing, incomplete, pauses]``.

        Every count is divided by the word count (1 is used when the text
        has no words, to avoid dividing by zero). ``ttr`` is
        ``global_ttr_override`` when given, otherwise the constant 0.5; it is
        not computed from ``raw_text``. The 'errors' count is not part of
        the vector.
        """
        stats = self.get_features(raw_text)
        n = max(stats['word_count'], 1)
        ttr = 0.5 if global_ttr_override is None else global_ttr_override
        rate_keys = ('fillers', 'repetition', 'retracing', 'incomplete', 'pauses')
        return [ttr] + [stats[key] / n for key in rate_keys]
def parse_cha_header(content_str: str) -> tuple:
    """Read the participant's age and gender from the ``@ID`` header line.

    Looks for an ``@ID:`` line whose speaker code is ``PAR`` and reads the
    two pipe-separated fields that follow it (age, then sex).

    Returns:
        ``(age, gender)`` where ``age`` is the whole number of years as a
        float (65.0 when absent or unparseable) and ``gender`` is 1 for a
        value containing "male" but not "female", otherwise 0.
    """
    age = 65.0
    gender = 0
    # The age field may be "57;", "57;03." or "57;03.12" (years;months.days);
    # only the years are used. Either field may be empty, in which case
    # that field keeps its default instead of discarding the whole line.
    id_match = re.search(
        r'@ID:.*\|PAR\|(\d+)?(?:;[\d.]*)?\|([a-z]*)\|',
        content_str,
        re.IGNORECASE,
    )
    if id_match is None:
        return age, gender

    years = id_match.group(1)
    if years is not None:
        try:
            age = float(years)
        except ValueError:
            # Best effort: keep the default age on an unparseable value.
            pass

    g_str = id_match.group(2).lower()
    if 'male' in g_str and 'female' not in g_str:
        gender = 1
    return age, gender
def parse_cha_transcript(content_str: str) -> str:
    """Join all ``*PAR:`` utterances of a transcript into one string.

    Each line starting with ``*PAR:`` contributes its text with the speaker
    prefix and surrounding whitespace removed. Tab-indented lines directly
    after such a line are treated as the wrapped remainder of the same
    utterance and are included too. Lines of other speakers, ``%`` tiers
    and ``@`` headers are ignored. Fragments are joined with single spaces.
    """
    prefix = '*PAR:'
    par_lines = []
    in_par = False  # True while inside a (possibly wrapped) *PAR utterance
    for line in content_str.split('\n'):
        if line.startswith(prefix):
            in_par = True
            # Slice off only the leading prefix; strip() then removes the
            # separator whether it is a tab or spaces, plus any trailing "\r".
            par_lines.append(line[len(prefix):].strip())
        elif in_par and line.startswith('\t'):
            fragment = line.strip()
            if fragment:
                par_lines.append(fragment)
        else:
            in_par = False
    return " ".join(par_lines)