File size: 2,552 Bytes
97ea4f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import re

class LiveFeatureExtractor:
    """Count transcript annotation markers and build a small feature vector.

    The markers look like CHAT-style transcript codes (``&-uh``, ``[/]``,
    ``(..)``, ``+...``) -- presumably the input is a participant utterance
    taken from a ``.cha`` file; confirm against the caller.
    """

    def __init__(self):
        """Compile one regex per marker category into ``self.patterns``."""
        # (feature name, regex source, flags). Dict insertion order follows
        # this tuple.
        # NOTE(review): the 'repetition' pattern accepts one or more slashes,
        # so every '[//]' is counted under both 'repetition' and 'retracing'.
        marker_specs = (
            ('fillers', r'&-([a-z]+)', re.IGNORECASE),
            ('repetition', r'\[/+\]', 0),
            ('retracing', r'\[//\]', 0),
            ('incomplete', r'\+[\./]+', 0),
            ('errors', r'\[\*.*?\]', 0),
            ('pauses', r'\(\.+\)', 0),
        )
        self.patterns = {
            name: re.compile(source, flags)
            for name, source, flags in marker_specs
        }

    def clean_for_bert(self, raw_text):
        """Return *raw_text* with annotation markup stripped for a text model.

        Steps, in order: drop a leading ``*PAR:`` speaker prefix, drop
        ``\\x15<digits>_<digits>\\x15`` timestamp bullets, drop ``<``/``>``
        while keeping the enclosed words, drop every ``[...]`` code, turn
        ``(.)``-style pauses into the literal token ``[PAUSE]``, replace
        underscores with spaces, and collapse whitespace.

        Filler tokens such as ``&-uh`` are deliberately left in the text.
        """
        # Order matters: bracket codes must be removed *before* pauses are
        # rewritten, otherwise the freshly inserted '[PAUSE]' would be erased.
        substitutions = (
            (r'^\*PAR:\s+', ''),
            (r'\x15\d+_\d+\x15', ''),
            (r'<|>', ''),
            (r'\[.*?\]', ''),
            (r'\(\.+\)', '[PAUSE]'),
        )
        cleaned = raw_text
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        cleaned = cleaned.replace('_', ' ')
        return re.sub(r'\s+', ' ', cleaned).strip()

    def get_features(self, raw_text):
        """Return a dict of marker counts plus ``'word_count'``.

        Each key of ``self.patterns`` maps to the number of matches of that
        pattern in *raw_text*. ``'word_count'`` is the number of
        whitespace-separated tokens left after removing ``[...]`` codes,
        lowercase ``&-filler`` tokens and all non-word, non-space characters.
        """
        counts = {}
        for name, pattern in self.patterns.items():
            counts[name] = sum(1 for _ in pattern.finditer(raw_text))

        # NOTE(review): the filler strip below is case-sensitive although the
        # 'fillers' counting pattern is not, and a '*PAR:' prefix or timestamp
        # digits present in raw_text end up counted as words.
        stripped = raw_text
        for pattern in (r'\[.*?\]', r'&-([a-z]+)', r'[^\w\s]'):
            stripped = re.sub(pattern, '', stripped)

        tokens = stripped.lower().split()
        counts['word_count'] = len(tokens)
        return counts

    def get_vector(self, raw_text, global_ttr_override=None):
        """Return ``[ttr, fillers, repetition, retracing, incomplete, pauses]``.

        The five marker entries are counts divided by the word count (a word
        count of zero is treated as one to avoid dividing by zero). The first
        entry is *global_ttr_override* when given, else the constant ``0.5``;
        no type-token ratio is computed here. The ``'errors'`` count is not
        part of the vector.
        """
        stats = self.get_features(raw_text)
        denominator = max(stats['word_count'], 1)

        if global_ttr_override is None:
            ttr = 0.5
        else:
            ttr = global_ttr_override

        rate_keys = ('fillers', 'repetition', 'retracing', 'incomplete', 'pauses')
        return [ttr] + [stats[key] / denominator for key in rate_keys]

def parse_cha_header(content_str):
    """Extract the participant's age and gender from a CHAT ``@ID`` header.

    Looks for an ``@ID:`` line whose speaker code is ``PAR`` and reads the
    two fields that follow it: age and sex.

    Args:
        content_str: Full text of the transcript file.

    Returns:
        ``(age, gender)`` where *age* is a float of whole years and *gender*
        is ``1`` for male and ``0`` otherwise. When no usable ``@ID`` line is
        found the defaults ``(65.0, 0)`` are returned.
    """
    age = 65.0
    gender = 0

    # The age field is ``years;months.days`` with months/days optional
    # (``57;``, ``57;03.``, ``57;03.12``). Only the years are used, so the
    # result for a bare ``57;`` is unchanged. The sex field may be empty; in
    # that case the age is still taken and gender keeps its default.
    id_match = re.search(
        r'@ID:.*\|PAR\|(\d+);[\d.]*\|([a-z]*)\|',
        content_str,
        re.IGNORECASE,
    )
    if id_match is None:
        return age, gender

    try:
        age = float(id_match.group(1))
    except ValueError:
        # Best effort: keep the default age if the digits cannot be converted.
        pass

    g_str = id_match.group(2).lower()
    # 'female' contains 'male', so it must be excluded explicitly.
    if 'male' in g_str and 'female' not in g_str:
        gender = 1

    return age, gender

def parse_cha_transcript(content_str):
    """Return all participant (``*PAR:``) speech joined into one string.

    Each ``*PAR:`` line contributes its text with the speaker prefix and
    surrounding whitespace removed. Tab-indented lines that directly follow a
    ``*PAR:`` line are treated as the wrapped remainder of that utterance and
    are included as well; continuation lines of any other tier (other
    speakers, ``%`` dependent tiers, ``@`` headers) are ignored.

    Args:
        content_str: Full text of the transcript file.

    Returns:
        The collected pieces joined by single spaces, or ``""`` when the
        transcript has no ``*PAR:`` lines.
    """
    par_prefix = '*PAR:'
    par_lines = []
    in_par_turn = False  # True while the current tier belongs to *PAR

    for line in content_str.split('\n'):
        if line.startswith(par_prefix):
            in_par_turn = True
            # Slice off the prefix, then strip, so any separator after the
            # colon (tab or spaces) and a trailing '\r' are handled.
            par_lines.append(line[len(par_prefix):].strip())
        elif in_par_turn and line.startswith('\t'):
            continuation = line.strip()
            if continuation:
                par_lines.append(continuation)
        else:
            # Any other line starts a new tier/header and ends the PAR turn.
            in_par_turn = False

    return " ".join(par_lines)