|
|
import re |
|
|
import os |
|
|
from transformers import BertTokenizer |
|
|
from datasketch import MinHash, MinHashLSH |
|
|
from nltk import ngrams |
|
|
|
|
|
|
|
|
def junk_eliminate(df, re_expression=r'[&#<>{}\[\]\\]', threshold=0.01, min_len=10):
    """Drop documents whose share of "suspicious" characters is too high.

    A per-document impurity score (matches of ``re_expression`` divided by
    the text length) is written to ``df['impurity']``; note that this adds a
    column to the caller's DataFrame in place.

    Args:
        df: DataFrame with a ``'text'`` column.
        re_expression: Regular expression matching one suspicious character.
        threshold: Documents with ``impurity >= threshold`` are removed.
        min_len: Texts shorter than this many characters are scored 0.

    Returns:
        A tuple ``(purity_df, impurity_ratio)``: the rows of ``df`` whose
        impurity is below ``threshold``, and the fraction of rows that were
        removed (``0.0`` for an empty DataFrame).
    """
    suspicious = re.compile(re_expression)

    def impurity(text, min_len=min_len):
        """Return the share of suspicious characters in a text."""
        # Missing values (None/NaN), empty strings and very short texts are
        # scored as clean instead of raising TypeError/ZeroDivisionError.
        if not isinstance(text, str) or not text or len(text) < min_len:
            return 0.0
        return len(suspicious.findall(text)) / len(text)

    # astype(float) keeps the column numeric even when df has no rows, so the
    # comparison below always yields a boolean mask.
    df['impurity'] = df['text'].apply(impurity, min_len=min_len).astype(float)

    total_num_docs = len(df)
    is_pure = df['impurity'] < threshold
    impurity_num_docs = total_num_docs - int(is_pure.sum())

    # Guard against division by zero on an empty DataFrame.
    impurity_ratio = impurity_num_docs / total_num_docs if total_num_docs else 0.0

    return df[is_pure], impurity_ratio
|
|
|
|
|
|
|
|
def toxic_eliminate(df, l_kind='en'):
    """Drop documents that contain a word from the LDNOOBW bad-word list.

    The list for ``l_kind`` is read from ``./<l_kind>``; it is downloaded
    there first if the file does not exist yet. Two helper columns are added
    to the caller's DataFrame in place: ``'banned_words_in_text'`` (sorted
    list of banned entries found) and ``'matches'`` (bool).

    Args:
        df: DataFrame with a ``'text'`` column.
        l_kind: Language code of the word list, e.g. ``'en'`` or ``'zh'``.

    Returns:
        A tuple ``(non_toxic_df, biased_content_ratio)``: the rows without
        any banned word, and the fraction of rows that were removed
        (``0.0`` for an empty DataFrame).

    Raises:
        ValueError: If ``l_kind`` is not a plain language code.
    """
    banned_words = _load_banned_words(l_kind)

    # Whitespace tokenization cannot find words in scripts written without
    # spaces, so those languages are matched by substring instead.
    # NOTE(review): the set of unsegmented languages is an assumption; extend
    # it if more word lists are used.
    substring_match = l_kind in ('zh', 'ja', 'th')

    def find_banned(text):
        """Return the sorted banned entries present in ``text``."""
        if not isinstance(text, str):  # None/NaN: nothing to match
            return []
        lowered = text.lower()
        if substring_match:
            return sorted(word for word in banned_words if word in lowered)
        # Tokenize once per document and intersect, instead of re-splitting
        # the text for every banned word.
        return sorted(banned_words.intersection(lowered.split()))

    df['banned_words_in_text'] = df['text'].apply(find_banned)
    # astype(bool) guarantees a boolean mask even when df has no rows.
    df['matches'] = df['banned_words_in_text'].apply(bool).astype(bool)

    total_num_docs = len(df)
    biased_num_docs = int(df['matches'].sum())
    biased_content_ratio = biased_num_docs / total_num_docs if total_num_docs else 0.0

    non_toxic_df = df[~df['matches']]

    return non_toxic_df, biased_content_ratio


def _load_banned_words(l_kind):
    """Return the LDNOOBW word list for ``l_kind`` as a set of lowercase strings."""
    # Function-scope import: the module's import block does not provide it.
    import urllib.request

    # l_kind ends up in a file path and a URL, so only accept plain language
    # codes such as 'en' or 'fr-CA-u-sd-caqc'.
    if not isinstance(l_kind, str) or not l_kind.replace('-', '').isalnum():
        raise ValueError(f"invalid language code: {l_kind!r}")

    path = f'./{l_kind}'
    if not os.path.exists(path):
        # The raw endpoint serves the plain-text list; the github.com
        # ".../blob/..." URL returns an HTML page instead.
        url = (
            "https://raw.githubusercontent.com/LDNOOBW/"
            "List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/"
            f"master/{l_kind}"
        )
        with urllib.request.urlopen(url, timeout=30) as response:
            payload = response.read()
        with open(path, 'wb') as f:
            f.write(payload)

    with open(path, 'r', encoding='utf-8') as f:
        # Skip blank lines: an empty entry would match every text when
        # substring matching is used.
        return {line.strip().lower() for line in f if line.strip()}
|
|
|
|
|
|
|
|
def short_eliminate(df, tokenizer=None, min_len=100):
    """Drop documents that have ``min_len`` tokens or fewer.

    The token count of every document is written to ``df['text_length']``;
    note that this adds a column to the caller's DataFrame in place.

    Args:
        df: DataFrame with a ``'text'`` column.
        tokenizer: Object with a ``tokenize(text)`` method returning a
            sequence of tokens. Defaults to the ``'bert-base-uncased'``
            ``BertTokenizer``, loaded on first use.
        min_len: Documents need strictly more than this many tokens to be kept.

    Returns:
        A tuple ``(not_short_df, too_short_doc_ratio)``: the rows that are
        long enough, and the fraction of rows that were removed (``0.0`` for
        an empty DataFrame).
    """
    if tokenizer is None:
        # Build the default lazily: constructing it in the signature would
        # load the vocabulary as a side effect of importing this module.
        # It is cached on the function so it is only built once.
        tokenizer = getattr(short_eliminate, '_default_tokenizer', None)
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            short_eliminate._default_tokenizer = tokenizer

    def token_count(text):
        """Return the number of tokens in ``text`` (0 for missing values)."""
        if not isinstance(text, str):
            return 0
        return len(tokenizer.tokenize(text))

    # astype(int) keeps the column numeric even when df has no rows.
    df['text_length'] = df['text'].apply(token_count).astype(int)

    total_num_docs = len(df)
    long_enough = df['text_length'] > min_len
    too_short_docs = total_num_docs - int(long_enough.sum())

    # Guard against division by zero on an empty DataFrame.
    too_short_doc_ratio = too_short_docs / total_num_docs if total_num_docs else 0.0

    return df[long_enough], too_short_doc_ratio
|
|
|
|
|
|
|
|
def process_data(df, ngram_size=13, num_perm=128):
    """Build a MinHash signature for every document in ``df['text']``.

    Each text is shingled into character n-grams of ``ngram_size`` and every
    shingle is fed to a ``MinHash`` with ``num_perm`` permutations.

    Args:
        df: DataFrame (or mapping) with a ``'text'`` entry to iterate over.
        ngram_size: Number of characters per shingle.
        num_perm: Number of permutations of each signature; must match the
            ``num_perm`` of any ``MinHashLSH`` the signatures are used with.

    Returns:
        Dict mapping the positional index of each document (0, 1, 2, ...)
        to its ``MinHash``.
    """
    minhashes = {}
    for idx, text in enumerate(df['text']):
        if not isinstance(text, str):
            # Missing values (None/NaN) get an empty signature instead of
            # crashing the n-gram generator.
            text = ""

        minhash = MinHash(num_perm=num_perm)
        if 0 < len(text) < ngram_size:
            # A text shorter than one shingle yields no n-grams at all; its
            # signature would stay empty and therefore be identical to that
            # of every other short text. Hash the whole text instead so that
            # only equal short texts collide.
            minhash.update(text.encode('utf-8'))
        else:
            for gram in ngrams(text, ngram_size):
                minhash.update("".join(gram).encode('utf-8'))

        minhashes[idx] = minhash

    return minhashes
|
|
|
|
|
def contamination_eliminate(train_dataset, test_dataset, threshold=0.8):
    """Estimate how much of the test set also appears in the training set.

    Signatures of all training documents are indexed in a ``MinHashLSH``;
    a test document counts as contaminated when querying the index with its
    signature returns at least one training document.

    Args:
        train_dataset: DataFrame with a ``'text'`` column (training split).
        test_dataset: DataFrame with a ``'text'`` column (test split).
        threshold: Jaccard similarity threshold passed to ``MinHashLSH``.

    Returns:
        The fraction of test documents that have a near-duplicate in the
        training set (``0.0`` when the test set is empty). Neither dataset
        is modified.
    """
    total_test_docs = len(test_dataset)
    if total_test_docs == 0:
        # Nothing to check; also avoids a division by zero below.
        return 0.0

    train_minhashes = process_data(train_dataset)
    test_minhashes = process_data(test_dataset)

    # num_perm must equal the value process_data uses for its signatures.
    lsh = MinHashLSH(threshold=threshold, num_perm=128)
    for idx, minhash in train_minhashes.items():
        lsh.insert(idx, minhash)

    duplicates_count = sum(
        1 for minhash in test_minhashes.values() if lsh.query(minhash)
    )

    return duplicates_count / total_test_docs
|
|
|
|
|
|
|
|
def duplication_eliminate(df, threshold=0.85, num_perm=128):
    """Find one representative per group of near-duplicate documents.

    Every text is turned into a MinHash over its whitespace-separated words
    and indexed in a ``MinHashLSH`` under the key ``str(position)``. Each
    document is then mapped to the smallest key among its LSH matches, so
    documents that match each other agree on the same representative.

    Args:
        df: DataFrame with a ``'text'`` column.
        threshold: Jaccard similarity threshold passed to ``MinHashLSH``.
        num_perm: Number of permutations of each MinHash signature.

    Returns:
        A tuple ``(unique_documents, duplication_ratio)``: the set of
        representative keys (positional indices as strings), and the
        fraction of documents that are not representatives (``0.0`` for an
        empty DataFrame).
    """
    total_documents = len(df)
    if total_documents == 0:
        # Nothing to deduplicate; also avoids a division by zero below.
        return set(), 0.0

    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    signatures = {}
    for i, text in enumerate(df['text']):
        minhash = MinHash(num_perm=num_perm)
        # Missing values (None/NaN) are treated as empty documents.
        words = text.split() if isinstance(text, str) else ()
        for word in words:
            minhash.update(word.encode('utf-8'))
        key = str(i)
        lsh.insert(key, minhash)
        # Keep the signature so it is not recomputed for the query pass.
        signatures[key] = minhash

    unique_documents = set()
    for key, minhash in signatures.items():
        # Fall back to the document itself should the query return nothing.
        candidates = lsh.query(minhash) or [key]
        # Taking the first match would be order-dependent: two duplicates
        # could each pick the other and both be counted as unique. The
        # smallest positional index is a stable choice.
        unique_documents.add(min(candidates, key=int))

    duplication_ratio = (total_documents - len(unique_documents)) / total_documents

    return unique_documents, duplication_ratio
|
|
|
|
|
|
|
|
|