Spaces:

HunterNope
/

AutoCenzurer

Sleeping

AutoCenzurer/scripts/x_sensitive/extract_labeled_spans.py

ACZ-1 - Created gradio app for HF Space

5f2a5b3 3 months ago

2.05 kB

	import datasets

	def _pick_highlight(highlights, max_words):
	    """Return the longest highlight text of at most ``max_words`` words.

	    Each entry of ``highlights`` is expected to be a one-element sequence
	    holding the highlight string. Ties keep the earliest entry. Returns
	    ``''`` when no entry qualifies.
	    """
	    best_text = ''
	    best_len = 0
	    for hl in highlights:
	        if hl is None or len(hl) == 0:
	            continue  # nothing to read from an empty entry
	        hl_text = hl[0]
	        n_words = len(hl_text.split())
	        if best_len < n_words <= max_words:
	            best_text = hl_text
	            best_len = n_words
	    return best_text


	def _find_run(words, target):
	    """Return the first index where ``target`` occurs contiguously in ``words``, or -1."""
	    width = len(target)
	    for start in range(len(words) - width + 1):
	        if words[start:start + width] == target:
	            return start
	    return -1


	def _span_for_row(text, highlights, max_words):
	    """Build the context span for one row; ``''`` means "no usable span"."""
	    try:
	        n_highlights = len(highlights)
	    except TypeError:
	        # Missing cell (e.g. None / NaN) instead of a sequence of highlights.
	        return ''
	    # ``len(...) == 0`` rather than truthiness: the cell may be a numpy array.
	    if n_highlights == 0 or not isinstance(text, str):
	        return ''

	    chosen = _pick_highlight(highlights, max_words)
	    if not chosen:
	        return ''

	    words = text.split()
	    hl_words = chosen.split()
	    start = _find_run(words, hl_words)
	    if start == -1:
	        # The highlight cannot be located word-for-word in the text, so
	        # there is no context to add; return the highlight unchanged.
	        return chosen

	    # Grow the window [lo, hi) one word to the left, then one to the right,
	    # until it holds ``max_words`` words or the text is exhausted.
	    lo = start
	    hi = start + len(hl_words)
	    while hi - lo < max_words:
	        grew = False
	        if lo > 0:
	            lo -= 1
	            grew = True
	        if hi - lo < max_words and hi < len(words):
	            hi += 1
	            grew = True
	        if not grew:
	            break

	    return ' '.join(words[lo:hi])


	def extract_spans(df, highlight_field, max_words=10):
	    """Extract one short text span per row around its longest highlight.

	    For every row, the longest highlight of at most ``max_words`` words is
	    selected from ``df[highlight_field]``. If it is found word-for-word in
	    ``df['text']`` (whitespace tokenisation), it is padded with neighbouring
	    words, alternating left and right, up to ``max_words`` words in total;
	    otherwise the highlight text itself is used. Rows that yield no span
	    are dropped.

	    Args:
	        df: DataFrame with a ``'text'`` column and a ``highlight_field``
	            column. The input is not modified.
	        highlight_field: Name of the column holding, per row, a sequence of
	            one-element sequences of highlight strings.
	        max_words: Maximum number of words in a selected highlight and in
	            the returned span. Defaults to 10.

	    Returns:
	        A new DataFrame with columns ``'original_index'`` (the index label
	        of the source row) and ``'span'``, keeping the index labels of the
	        rows that produced a span.
	    """
	    # Iterate only the two columns that are needed instead of a row-wise
	    # ``apply``; this also works when ``df`` has no rows.
	    spans = [
	        _span_for_row(text, highlights, max_words)
	        for text, highlights in zip(df['text'], df[highlight_field])
	    ]

	    # Start from an index-only copy so the result never aliases ``df``.
	    result = df[[]].copy()
	    result['original_index'] = result.index
	    result['span'] = spans

	    return result[result['span'] != ''].copy()

	# Load the 'train' split of 'cardiffnlp/x_sensitive' and convert it to pandas.
	data = datasets.load_dataset('cardiffnlp/x_sensitive', split='train')
	df = data.to_pandas()

	# Keep rows where 'profanity' is 1 while 'conflictual' and 'selfharm' are not 1.
	has_profanity = df['profanity'] == 1
	not_conflictual = df['conflictual'] != 1
	not_selfharm = df['selfharm'] != 1
	df = df[has_profanity & not_conflictual & not_selfharm]

	# Extract one span per remaining row and write them as a ';'-separated CSV.
	new_df = extract_spans(df, 'profanity_highlight')
	new_df.to_csv('unharmful_profanity.csv', sep=';', index=False)