AutoCenzurer / scripts /x_sensitive /extract_normal_spans.py
HunterNope's picture
ACZ-1 - Created gradio app for HF Space
5f2a5b3
import datasets
import re
# Matches one or more "@user" mentions at the very start of the text.
# The \b after "user" keeps longer handles such as "@username" intact
# instead of chopping their first five characters off.
_LEADING_USER_MENTIONS = re.compile(r'^(?:@user\b\s*)+')


def cut_span(record, max_words=10):
    """Return the first ``max_words`` words of a record's text as one string.

    Leading "@user" mentions (one or more) are stripped first, so they do
    not count towards the word limit.

    Args:
        record: Mapping-like object (e.g. a dict or a pandas row) whose
            ``'text'`` entry is a string.
        max_words: Maximum number of whitespace-separated words to keep.
            Defaults to 10.

    Returns:
        The kept words joined by single spaces; an empty string when no
        words remain after stripping the mentions.
    """
    text = _LEADING_USER_MENTIONS.sub('', record['text'])
    # split() without arguments collapses any run of whitespace, so the
    # result is always single-space separated.
    return ' '.join(text.split()[:max_words])
# Load the 'train' split of cardiffnlp/x_sensitive and convert it to pandas.
data = datasets.load_dataset('cardiffnlp/x_sensitive', split='train')
df = data.to_pandas()

# Keep only the rows whose 'labels' entry is empty. The explicit copy() makes
# the filtered frame independent of the full one, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
df = df[df['labels'].apply(len) == 0].copy()

# Shorten every remaining text to its leading words (see cut_span).
df['span'] = df.apply(cut_span, axis=1)
# The index is not reset after filtering, so it still carries each row's
# pre-filter index label; store it as a regular column for the export.
df['original_index'] = df.index

# Export only the index and span columns as a semicolon-separated file.
new_df = df[['original_index', 'span']].copy()
new_df.to_csv('none.csv', sep=';', index=False)