Spaces:

cracker0935
/

adtrack-backend

Sleeping

App Files Files Community

adtrack-backend / preprocessing.py

cracker0935

add Backend_files

eb48929 verified 3 months ago

raw

history blame contribute delete

3.38 kB

	import re
	import numpy as np

	class LiveFeatureExtractor:
	    """Derive model inputs from a single transcript utterance line.

	    The line format is assumed to be CHAT-style (inferred from the ``*PAR:``
	    speaker tag, the ``\\x15start_end\\x15`` time bullets and the bracket
	    codes the patterns look for). Two views of a line are produced:

	    * ``clean_for_bert`` -- readable text with transcription codes removed,
	      presumably fed to a BERT-style model (per the method name).
	    * ``get_features`` / ``get_vector`` -- counts and per-word rates of
	      disfluency markers.
	    """

	    # Transcript scaffolding that is not spoken content. These are class-level
	    # so the text helpers work without touching instance state.
	    _SPEAKER_PREFIX = re.compile(r'^\*PAR:\s*')
	    _TIMESTAMP = re.compile(r'\x15\d+_\d+\x15')
	    _BRACKET_CODE = re.compile(r'\[.*?\]')
	    _FILLER = re.compile(r'&-([a-z]+)', re.IGNORECASE)
	    _PAUSE = re.compile(r'\(\.+\)')
	    _NON_WORD = re.compile(r'[^\w\s]')

	    def __init__(self):
	        # Marker name -> compiled pattern. Every key here becomes a key of the
	        # dict returned by ``get_features``.
	        #
	        # NOTE(review): ``repetition`` (one or more slashes) also matches
	        # ``[//]``, so each retracing marker is counted under both
	        # ``repetition`` and ``retracing`` -- confirm this is intended.
	        # NOTE(review): ``errors`` only matches ``[]`` and ``[.]`` -- confirm
	        # these are the codes the feature is meant to count.
	        self.patterns = {
	            'fillers': self._FILLER,
	            'repetition': re.compile(r'\[/+\]'),
	            'retracing': re.compile(r'\[//\]'),
	            'incomplete': re.compile(r'\+[\./]+'),
	            'errors': re.compile(r'\[\.?\]'),
	            'pauses': self._PAUSE,
	        }

	    def clean_for_bert(self, raw_text):
	        """Return ``raw_text`` as plain text with transcription codes removed.

	        Removes the leading ``*PAR:`` tag, time bullets, angle brackets and
	        bracketed codes; rewrites pause markers such as ``(.)`` / ``(..)`` to
	        the literal token ``[PAUSE]``; turns underscores into spaces and
	        collapses runs of whitespace.

	        Args:
	            raw_text: One transcript line as a ``str``.

	        Returns:
	            The cleaned, stripped string.
	        """
	        text = self._SPEAKER_PREFIX.sub('', raw_text)
	        text = self._TIMESTAMP.sub('', text)
	        # A character class is required here: an escaped pipe (``<\|>``) would
	        # only match the literal three-character string "<|>" and leave the
	        # angle brackets around scoped groups (``<I want> [//]``) in the text.
	        text = re.sub(r'[<>]', '', text)
	        text = self._BRACKET_CODE.sub('', text)
	        # Must run after bracket-code removal, otherwise the inserted
	        # ``[PAUSE]`` token would itself be stripped.
	        text = self._PAUSE.sub('[PAUSE]', text)
	        text = text.replace('_', ' ')
	        return re.sub(r'\s+', ' ', text).strip()

	    def _tokenize(self, raw_text):
	        """Return the lower-cased spoken words of ``raw_text`` as a list."""
	        # Strip scaffolding first so the speaker tag ("PAR") and the time
	        # bullet ("1000_2000") are not mistaken for words once punctuation is
	        # removed below.
	        text = self._SPEAKER_PREFIX.sub('', raw_text)
	        text = self._TIMESTAMP.sub('', text)
	        text = self._BRACKET_CODE.sub('', text)
	        # Same case-insensitive pattern that counts fillers, so a filler is
	        # never counted as a filler *and* as a word.
	        text = self._FILLER.sub('', text)
	        text = self._NON_WORD.sub('', text)
	        return text.lower().split()

	    def get_features(self, raw_text):
	        """Count disfluency markers and words in one transcript line.

	        Args:
	            raw_text: One transcript line as a ``str``; marker patterns are
	                matched against it unmodified.

	        Returns:
	            A dict with one integer count per key of ``self.patterns`` plus
	            ``'word_count'``, the number of spoken words (speaker tag, time
	            bullets, bracket codes, fillers and punctuation excluded).
	        """
	        stats = {
	            name: len(pattern.findall(raw_text))
	            for name, pattern in self.patterns.items()
	        }
	        stats['word_count'] = len(self._tokenize(raw_text))
	        return stats

	    def get_vector(self, raw_text, global_ttr_override=None):
	        """Build the numeric feature vector for one transcript line.

	        Args:
	            raw_text: One transcript line as a ``str``.
	            global_ttr_override: If not ``None``, used verbatim as the
	                type-token ratio instead of computing it from this line.

	        Returns:
	            A list of six numbers: ``[ttr, fillers, repetition, retracing,
	            errors, pauses]`` where every entry after ``ttr`` is the marker
	            count divided by the line's word count. The ``incomplete`` count
	            is not part of the vector.

	        NOTE(review): word counts no longer include the speaker tag or time
	        bullet, so rates are higher than before for tagged lines; re-validate
	        any model fitted on the earlier values.
	        """
	        stats = self.get_features(raw_text)
	        # Guard the divisions below against lines with no spoken words.
	        n = max(stats['word_count'], 1)

	        if global_ttr_override is not None:
	            ttr = global_ttr_override
	        else:
	            # With no words the set is empty, so this yields 0.0.
	            ttr = len(set(self._tokenize(raw_text))) / n

	        return [
	            ttr,
	            stats['fillers'] / n,
	            stats['repetition'] / n,
	            stats['retracing'] / n,
	            stats['errors'] / n,
	            stats['pauses'] / n,
	        ]

	class ChaParser:
	    """Extract participant utterances and their features from a transcript.

	    Only lines that start with the ``*PAR:`` speaker tag are used. The
	    transcript is assumed to be CHAT-style (inferred from that tag and the
	    codes stripped below).
	    """

	    _SPEAKER_TAG = '*PAR:'

	    # Non-lexical material removed before computing the session-wide
	    # type-token ratio. Kept local so this class does not rely on extractor
	    # internals.
	    _TIMESTAMP = re.compile(r'\x15\d+_\d+\x15')
	    _BRACKET_CODE = re.compile(r'\[.*?\]')
	    _FILLER = re.compile(r'&-([a-z]+)', re.IGNORECASE)
	    _NON_WORD = re.compile(r'[^\w\s]')

	    def __init__(self):
	        self.extractor = LiveFeatureExtractor()

	    def _session_words(self, line):
	        """Return the lower-cased spoken words of one ``*PAR:`` line."""
	        text = line[len(self._SPEAKER_TAG):]
	        # Time bullets must go first: once punctuation is stripped they would
	        # survive as tokens like "1000_2000", one new "word" per line, which
	        # inflates the type-token ratio.
	        text = self._TIMESTAMP.sub('', text)
	        # Bracket codes and fillers are excluded so the session-wide ratio
	        # counts the same kind of tokens as the per-utterance ratio it
	        # replaces.
	        text = self._BRACKET_CODE.sub('', text)
	        text = self._FILLER.sub('', text)
	        text = self._NON_WORD.sub('', text)
	        return text.lower().split()

	    def parse(self, file_content_lines):
	        """Parse transcript lines into texts, feature vectors and raw lines.

	        Args:
	            file_content_lines: Iterable of lines, each ``str`` or UTF-8
	                encoded ``bytes``. A single ``str``/``bytes`` blob is also
	                accepted and split into lines.

	        Returns:
	            A tuple ``(sentences, features, raw_lines)`` of three equally long
	            lists with one entry per ``*PAR:`` line, in input order:
	            ``sentences`` from ``extractor.clean_for_bert``, ``features`` from
	            ``extractor.get_vector`` (called with the session-wide type-token
	            ratio as ``global_ttr_override``), and ``raw_lines`` holding the
	            stripped original line.

	        Raises:
	            UnicodeDecodeError: If a ``bytes`` line is not valid UTF-8.

	        NOTE(review): the session-wide ratio now ignores time bullets, bracket
	        codes and fillers, so it is lower than before on annotated
	        transcripts; re-validate any model fitted on the earlier values.
	        """
	        # Iterating a bare string would walk it character by character and
	        # silently find no utterances.
	        if isinstance(file_content_lines, (str, bytes)):
	            file_content_lines = file_content_lines.splitlines()

	        par_lines = []
	        for line in file_content_lines:
	            if isinstance(line, bytes):
	                line = line.decode('utf-8')
	            if line.startswith(self._SPEAKER_TAG):
	                par_lines.append(line)

	        session_words = [
	            word for line in par_lines for word in self._session_words(line)
	        ]
	        if session_words:
	            global_ttr = len(set(session_words)) / len(session_words)
	        else:
	            global_ttr = 0.0

	        sentences = []
	        features = []
	        raw_lines = []
	        for line in par_lines:
	            sentences.append(self.extractor.clean_for_bert(line))
	            features.append(
	                self.extractor.get_vector(line, global_ttr_override=global_ttr)
	            )
	            raw_lines.append(line.strip())

	        return sentences, features, raw_lines