File size: 529 Bytes
5f2a5b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import datasets
import re

def cut_span(record, max_words=10):
    """Return the first words of a record's text, without leading mentions.

    Any run of anonymised ``@user`` mentions at the very start of the text
    (optionally preceded by whitespace) is dropped, then the remainder is
    split on whitespace and truncated.

    Args:
        record: Mapping-like object (dict, pandas row, ...) exposing the raw
            text under the ``'text'`` key.
        max_words: Maximum number of whitespace-separated words to keep.
            Defaults to 10.

    Returns:
        The kept words joined by single spaces; an empty string when nothing
        remains after stripping the mentions.

    Raises:
        ValueError: If ``max_words`` is negative.
    """
    if max_words < 0:
        raise ValueError("max_words must be non-negative")

    text = record['text']
    # `\b` ensures only the exact "@user" token is removed, so a real handle
    # such as "@username" is not mangled into "name". The leading `\s*` lets
    # the prefix be recognised even when the text starts with whitespace.
    text = re.sub(r'^\s*(?:@user\b\s*)+', '', text)
    # split() with no argument collapses any whitespace run, so the result
    # is always single-space separated.
    return ' '.join(text.split()[:max_words])

# Load the training split and turn it into a DataFrame for filtering.
data = datasets.load_dataset('cardiffnlp/x_sensitive', split='train')
df = data.to_pandas()

# Keep only the rows whose `labels` entry is empty. The explicit copy makes
# the filtered frame independent of the original, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
df = df[df['labels'].apply(len) == 0].copy()

# Row-wise: cut_span reads record['text'] and returns the shortened text.
df['span'] = df.apply(cut_span, axis=1)
# Filtering keeps the index labels of the unfiltered frame; store them as a
# regular column because the CSV is written with index=False.
df['original_index'] = df.index
new_df = df[['original_index', 'span']].copy()

# Semicolon-separated output with columns: original_index;span
new_df.to_csv('none.csv', sep=';', index=False)