| import os |
| from collections import defaultdict |
|
|
| from lm_eval.decontamination.janitor import ( |
| Janitor, |
| form_ngrams, |
| split_indices, |
| word_ngrams, |
| word_ngrams_indices, |
| ) |
|
|
|
|
# NOTE(review): presumably set to silence the HuggingFace tokenizers
# parallelism/fork warning while the tests run -- confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Sample sentence shared by the n-gram and split-index tests in this module.
TEST_SEQUENCE = (
    "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
    " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
)


# Cleaned text that test_janitor2 through test_janitor6 expect back from
# Janitor.clean_python: three intact lines, the head of a fourth line, the
# tail of another line, then three more intact lines.
JANITOR_EXPECTED = (
    "This is a @line #containing a certain number of characters, 76 to be exact. "
    "This is a @line #containing a certain number of characters, 76 to be exact. "
    "This is a @line #containing a certain number of characters, 76 to be exact. "
    "This is a @line #containing "
    " characters, 76 to be exact. "
    "This is a @line #containing a certain number of characters, 76 to be exact. "
    "This is a @line #containing a certain number of characters, 76 to be exact. "
    "This is a @line #containing a certain number of characters, 76 to be exact. "
)


# Contaminant strings registered by the 6-gram janitor tests; the second one
# swaps the order of "dirty" and "filthy".
JANITOR_FILTH1 = "filth lots of dirty filthy filth"
JANITOR_FILTH2 = "filth lots of filthy dirty filth"
|
|
|
|
def simple_ngram(sequence, n):
    """Return every run of ``n`` consecutive items of ``sequence`` as a tuple.

    This is a deliberately naive sliding-window implementation that the tests
    in this module use as the reference result.

    Args:
        sequence: Any iterable. A string produces character n-grams, a list
            of words produces word n-grams.
        n: Window length.

    Returns:
        A list of ``n``-tuples in order of appearance. The list is empty when
        ``sequence`` yields fewer than ``n`` items.
    """
    collected = []
    window = []
    for item in sequence:
        window.append(item)
        if len(window) != n:
            continue
        collected.append(tuple(window))
        # Slide forward by one: drop the oldest item so the next append
        # completes the following window.
        del window[0]
    return collected
|
|
|
|
def test_form_ngrams():
    """Check ``form_ngrams`` against ``simple_ngram`` on the characters of TEST_SEQUENCE."""
    for size in (1, 2, 3, 5, 13):
        expected = simple_ngram(TEST_SEQUENCE, size)
        # form_ngrams is fed an iterator rather than the string itself.
        actual = list(form_ngrams(iter(TEST_SEQUENCE), size))
        assert len(expected) == len(actual)
        assert expected == actual
|
|
|
|
def test_word_ngrams():
    """Check ``word_ngrams`` against space-joined reference n-grams of the split text."""
    tokens = TEST_SEQUENCE.split()

    for size in (1, 2, 3, 5, 13):
        # The reference yields tuples of words; word_ngrams is compared
        # against the same words joined by single spaces.
        expected = [" ".join(gram) for gram in simple_ngram(tokens, size)]
        actual = list(word_ngrams(TEST_SEQUENCE, size))
        assert len(expected) == len(actual)
        assert actual == expected
|
|
|
|
def test_split_indices():
    """Check ``split_indices`` against a character-by-character reference scan.

    The reference records each space-delimited word of TEST_SEQUENCE together
    with its inclusive ``(start, end)`` character positions.
    """
    text = TEST_SEQUENCE

    expected = []
    word_chars = []
    for pos, char in enumerate(text):
        if char != " ":
            word_chars.append(char)
            continue
        if word_chars:
            # A space ends the current word; the word occupies the positions
            # immediately before this space.
            word = "".join(word_chars)
            expected.append((word, (pos - len(word), pos - 1)))
            word_chars = []

    if word_chars:
        # The text does not end with a space, so flush the final word.
        word = "".join(word_chars)
        total = len(text)
        expected.append((word, (total - len(word), total - 1)))

    actual = list(split_indices(text))
    assert len(expected) == len(actual)
    assert expected == actual
|
|
|
|
def test_word_ngrams_indices():
    """Check ``word_ngrams_indices`` against positions located with ``str.find``.

    For every reference word n-gram the test searches TEST_SEQUENCE for the
    next occurrence that starts and ends on a word boundary and records its
    inclusive ``(start, end)`` character positions.
    """
    text = TEST_SEQUENCE
    last_index = len(text) - 1
    words = text.split()

    for size in (1, 2, 3, 5, 13):
        # Per-phrase search offset, so a repeated phrase resolves to its next
        # occurrence rather than the first one again.
        search_from = defaultdict(int)
        expected = []
        for gram in simple_ngram(words, size):
            phrase = " ".join(gram)
            while True:
                start = text.find(phrase, search_from[phrase])
                assert start != -1

                end = start + len(phrase) - 1
                search_from[phrase] = end + 1

                # Skip hits that begin or end in the middle of a word.
                starts_on_boundary = start == 0 or text[start - 1] == " "
                ends_on_boundary = end == last_index or text[end + 1] == " "
                if starts_on_boundary and ends_on_boundary:
                    break

            expected.append((phrase, (start, end)))

        actual = list(word_ngrams_indices(text, size))
        assert len(actual) == len(expected)
        assert actual == expected
|
|
|
|
| |
| |
|
|
|
|
| |
def test_janitor1():
    """Clean a text with one 1-gram contaminant followed by a short trailing block.

    Six clean lines precede ``FILTH.`` and only five follow it. The expected
    output keeps the first three lines plus the head of the fourth; nothing
    after the contaminant is expected to survive.
    """
    # NOTE(review): presumably the trailing block is dropped because what is
    # left of it after the removal window is shorter than
    # minimum_slice_length -- confirm against the Janitor implementation.
    dirty_text = (
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "FILTH. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
    )

    # Registered in lower case without punctuation; the text spells it "FILTH.".
    contaminant = "filth"

    surviving_text = (
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing "
    )

    cleaner = Janitor(
        ngram_n=1,
        window_to_remove=200,
        too_dirty_cutoff=10,
        minimum_slice_length=200,
    )

    # With nothing registered, the text must come back unchanged.
    assert "".join(cleaner.clean_python(dirty_text)) == dirty_text

    cleaner.register_contaminant(contaminant)
    assert cleaner.dirt_ngrams == {contaminant}

    assert "".join(cleaner.clean_python(dirty_text)) == surviving_text
|
|
|
|
def test_janitor2():
    """Clean a text with one 1-gram contaminant and six clean lines on each side.

    Same setup as ``test_janitor1`` but with one more trailing line; here the
    expected output (``JANITOR_EXPECTED``) keeps text on both sides of the
    contaminant.
    """
    dirty_text = (
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "FILTH. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
    )

    contaminant = "filth"

    cleaner = Janitor(
        ngram_n=1,
        window_to_remove=200,
        too_dirty_cutoff=10,
        minimum_slice_length=200,
    )

    # With nothing registered, the text must come back unchanged.
    assert "".join(cleaner.clean_python(dirty_text)) == dirty_text

    cleaner.register_contaminant(contaminant)
    assert cleaner.dirt_ngrams == {contaminant}

    assert "".join(cleaner.clean_python(dirty_text)) == JANITOR_EXPECTED
|
|
|
|
def test_janitor3():
    """Clean a text holding a single 6-gram contaminant between two six-line blocks.

    The janitor is built with ``ngram_n=6`` and ``JANITOR_FILTH1`` is
    registered; the text contains that phrase once, with different casing and
    a trailing period on the first word.
    """
    dirty_text = (
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "FILTH. lots of dirty filtHy FIlTh "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
    )

    cleaner = Janitor(
        ngram_n=6,
        window_to_remove=200,
        too_dirty_cutoff=10,
        minimum_slice_length=200,
    )

    # With nothing registered, the text must come back unchanged.
    assert "".join(cleaner.clean_python(dirty_text)) == dirty_text

    cleaner.register_contaminant(JANITOR_FILTH1)
    assert cleaner.dirt_ngrams == {JANITOR_FILTH1}

    assert "".join(cleaner.clean_python(dirty_text)) == JANITOR_EXPECTED
|
|
|
|
def test_janitor4():
    """Clean a text in which the same 6-gram contaminant appears twice.

    Three six-line blocks are separated by two occurrences of the phrase.
    The expected output is still ``JANITOR_EXPECTED``, so nothing of the
    middle block is expected to remain.
    """
    # NOTE(review): presumably the middle block vanishes because the removal
    # windows on both sides leave less than minimum_slice_length -- confirm
    # against the Janitor implementation.
    dirty_text = (
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "FILTH. lots of dirty filtHy FIlTh "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "FILTH. lots of dirty filtHy FIlTh "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
    )

    cleaner = Janitor(
        ngram_n=6,
        window_to_remove=200,
        too_dirty_cutoff=10,
        minimum_slice_length=200,
    )

    # With nothing registered, the text must come back unchanged.
    assert "".join(cleaner.clean_python(dirty_text)) == dirty_text

    cleaner.register_contaminant(JANITOR_FILTH1)
    assert cleaner.dirt_ngrams == {JANITOR_FILTH1}

    assert "".join(cleaner.clean_python(dirty_text)) == JANITOR_EXPECTED
|
|
|
|
def test_janitor5():
    """Clean a text containing one occurrence each of two different 6-gram contaminants.

    Both ``JANITOR_FILTH1`` and ``JANITOR_FILTH2`` are registered; each
    appears once in the text, between six-line blocks of clean lines.
    """
    dirty_text = (
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "FILTH. lots of dirty filtHy FIlTh "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "FILTH. lots of filtHy dirty FIlTh "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
    )

    contaminants = [JANITOR_FILTH1, JANITOR_FILTH2]

    cleaner = Janitor(
        ngram_n=6,
        window_to_remove=200,
        too_dirty_cutoff=10,
        minimum_slice_length=200,
    )

    # With nothing registered, the text must come back unchanged.
    assert "".join(cleaner.clean_python(dirty_text)) == dirty_text

    for contaminant in contaminants:
        cleaner.register_contaminant(contaminant)
    # Checked once, after both contaminants have been registered.
    assert cleaner.dirt_ngrams == set(contaminants)

    assert "".join(cleaner.clean_python(dirty_text)) == JANITOR_EXPECTED
|
|
|
|
def test_janitor6():
    """Clean a text with ten contaminated phrases in total (eight plus two).

    Eight consecutive occurrences of the first phrase and two of the second
    sit between six-line blocks of clean lines. The janitor is built with
    ``too_dirty_cutoff=10`` and the test still expects ``JANITOR_EXPECTED``.
    """
    # NOTE(review): presumably ten matches is the most the cutoff tolerates
    # before the whole text is discarded -- confirm against Janitor.
    dirty_text = (
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "FILTH. lots of dirty filtHy FIlTh "
        "FILTH. lots of dirty filtHy FIlTh "
        "FILTH. lots of dirty filtHy FIlTh "
        "FILTH. lots of dirty filtHy FIlTh "
        "FILTH. lots of dirty filtHy FIlTh "
        "FILTH. lots of dirty filtHy FIlTh "
        "FILTH. lots of dirty filtHy FIlTh "
        "FILTH. lots of dirty filtHy FIlTh "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "FILTH. lots of filtHy dirty FIlTh "
        "FILTH. lots of filtHy dirty FIlTh "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
    )

    contaminants = [JANITOR_FILTH1, JANITOR_FILTH2]

    cleaner = Janitor(
        ngram_n=6,
        window_to_remove=200,
        too_dirty_cutoff=10,
        minimum_slice_length=200,
    )

    # With nothing registered, the text must come back unchanged.
    assert "".join(cleaner.clean_python(dirty_text)) == dirty_text

    for contaminant in contaminants:
        cleaner.register_contaminant(contaminant)
    # Checked once, after both contaminants have been registered.
    assert cleaner.dirt_ngrams == set(contaminants)

    assert "".join(cleaner.clean_python(dirty_text)) == JANITOR_EXPECTED
|
|
|
|
def test_janitor7():
    """Clean a text with eleven contaminated phrases and expect an empty result.

    Same layout as ``test_janitor6`` but with a third occurrence of the
    second phrase (eight plus three). The janitor is built with
    ``too_dirty_cutoff=10`` and the cleaned output is expected to be ``""``.
    """
    # NOTE(review): presumably eleven matches exceed too_dirty_cutoff, so the
    # whole text is discarded -- confirm against the Janitor implementation.
    dirty_text = (
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "FILTH. lots of dirty filtHy FIlTh "
        "FILTH. lots of dirty filtHy FIlTh "
        "FILTH. lots of dirty filtHy FIlTh "
        "FILTH. lots of dirty filtHy FIlTh "
        "FILTH. lots of dirty filtHy FIlTh "
        "FILTH. lots of dirty filtHy FIlTh "
        "FILTH. lots of dirty filtHy FIlTh "
        "FILTH. lots of dirty filtHy FIlTh "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "FILTH. lots of filtHy dirty FIlTh "
        "FILTH. lots of filtHy dirty FIlTh "
        "FILTH. lots of filtHy dirty FIlTh "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
    )

    contaminants = [JANITOR_FILTH1, JANITOR_FILTH2]

    surviving_text = ""

    cleaner = Janitor(
        ngram_n=6,
        window_to_remove=200,
        too_dirty_cutoff=10,
        minimum_slice_length=200,
    )

    # With nothing registered, the text must come back unchanged.
    assert "".join(cleaner.clean_python(dirty_text)) == dirty_text

    for contaminant in contaminants:
        cleaner.register_contaminant(contaminant)
    # Checked once, after both contaminants have been registered.
    assert cleaner.dirt_ngrams == set(contaminants)

    assert "".join(cleaner.clean_python(dirty_text)) == surviving_text
|
|
|
|
def test_janitor8():
    """Placeholder test: the body is empty and makes no assertions."""
    pass
|
|