File size: 3,382 Bytes
eb48929
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import re
import numpy as np

class LiveFeatureExtractor:
    """Turn one raw transcript line into cleaned text and disfluency features.

    The markers handled here (``&-uh`` fillers, ``[/]``, ``[//]``,
    ``[* ...]``, ``(.)``, ``+...``) look like CHAT (.cha) transcription
    codes -- presumably the input comes from such files; confirm against
    the data source.
    """

    def __init__(self):
        # Compiled once per instance. The keys double as the stat names
        # returned by get_features().
        self.patterns = {
            'fillers': re.compile(r'&-([a-z]+)', re.IGNORECASE),
            # NOTE(review): matches one *or more* slashes, so a '[//]' marker
            # is counted here as well as under 'retracing'. Left unchanged
            # because consumers of the feature vector may rely on it.
            'repetition': re.compile(r'\[/+\]'),
            'retracing': re.compile(r'\[//\]'),
            'incomplete': re.compile(r'\+[\./]+'),
            'errors': re.compile(r'\[\*.*?\]'),
            'pauses': re.compile(r'\(\.+\)'),
        }

    def _tokenize(self, raw_text: str) -> list:
        """Return the lower-cased word tokens of *raw_text*.

        Bracketed codes and filler markers are removed first, then every
        character that is neither a word character nor whitespace.
        """
        text = re.sub(r'\[.*?\]', '', raw_text)
        # Reuse the compiled, case-insensitive filler pattern so that
        # whatever is *counted* as a filler is also *excluded* from the word
        # list (the previous inline regex was case-sensitive).
        text = self.patterns['fillers'].sub('', text)
        text = re.sub(r'[^\w\s]', '', text)
        return text.lower().split()

    def clean_for_bert(self, raw_text: str) -> str:
        """Return a plain-text version of *raw_text*.

        Strips a leading ``*PAR:`` prefix, time marks delimited by the 0x15
        control character, angle brackets and bracketed codes; rewrites
        pause markers such as ``(.)`` to the literal token ``[PAUSE]``;
        turns underscores into spaces and collapses runs of whitespace.
        """
        text = re.sub(r'^\*PAR:\s+', '', raw_text)
        text = re.sub(r'\x15\d+_\d+\x15', '', text)
        text = re.sub(r'<|>', '', text)
        # Bracketed codes must be removed *before* '[PAUSE]' is inserted,
        # otherwise the inserted token would be stripped again.
        text = re.sub(r'\[.*?\]', '', text)
        text = re.sub(r'\(\.+\)', '[PAUSE]', text)
        text = text.replace('_', ' ')
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def get_features(self, raw_text: str) -> dict:
        """Count each marker in ``self.patterns`` plus the number of words.

        Returns:
            A dict with one integer per pattern name and a ``'word_count'``
            entry.

        NOTE(review): *raw_text* is tokenized as given, so a ``*PAR:``
        prefix or a time mark left in the line is counted as a word.
        """
        stats = {
            name: len(pattern.findall(raw_text))
            for name, pattern in self.patterns.items()
        }
        stats['word_count'] = len(self._tokenize(raw_text))
        return stats

    def get_vector(self, raw_text: str, global_ttr_override=None) -> list:
        """Return ``[ttr, fillers, repetition, retracing, errors, pauses]``.

        Every count is divided by the line's word count (clamped to at
        least 1 to avoid division by zero). The ``'incomplete'`` count is
        computed by get_features() but is not part of the vector.

        Args:
            raw_text: The raw transcript line.
            global_ttr_override: If not ``None``, used verbatim as the
                type-token ratio instead of computing one from this line.
        """
        stats = self.get_features(raw_text)
        n = max(stats['word_count'], 1)

        if global_ttr_override is not None:
            ttr = global_ttr_override
        else:
            # With no words the token set is empty and this yields 0.0.
            ttr = len(set(self._tokenize(raw_text))) / n

        return [
            ttr,
            stats['fillers'] / n,
            stats['repetition'] / n,
            stats['retracing'] / n,
            stats['errors'] / n,
            stats['pauses'] / n,
        ]

class ChaParser:
    """Collect the ``*PAR:`` lines of a transcript and featurize each one."""

    def __init__(self):
        self.extractor = LiveFeatureExtractor()

    def parse(self, file_content_lines):
        """Extract cleaned text, feature vectors and raw text per ``*PAR:`` line.

        Args:
            file_content_lines: Iterable of lines; ``bytes`` entries are
                decoded as UTF-8, other entries are used as-is.

        Returns:
            A tuple ``(sentences, features, raw_lines)`` of three parallel
            lists, one entry per line starting with ``*PAR:``.
        """
        # Decode where needed and keep only the participant lines.
        participant_lines = []
        for entry in file_content_lines:
            text = entry.decode('utf-8') if isinstance(entry, bytes) else entry
            if text.startswith('*PAR:'):
                participant_lines.append(text)

        # Session-wide type-token ratio over every participant line, with
        # the speaker prefix and punctuation stripped before splitting.
        session_tokens = [
            token
            for text in participant_lines
            for token in re.sub(r'[^\w\s]', '', text.replace('*PAR:', '')).lower().split()
        ]
        if session_tokens:
            global_ttr = len(set(session_tokens)) / len(session_tokens)
        else:
            global_ttr = 0.0

        # One (display text, feature vector, raw text) row per line; the
        # extractor calls stay in per-line order.
        rows = [
            (
                self.extractor.clean_for_bert(text),
                self.extractor.get_vector(text, global_ttr_override=global_ttr),
                text.strip(),
            )
            for text in participant_lines
        ]

        sentences = [row[0] for row in rows]
        features = [row[1] for row in rows]
        raw_lines = [row[2] for row in rows]
        return sentences, features, raw_lines