File size: 1,004 Bytes
002bd9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from unittest import TestCase

from datasets import Dataset
from minhash_deduplication import deduplicate_dataset, make_duplicate_clusters


def get_dataset():
    """Build the tiny in-memory ``Dataset`` used as a fixture by the tests.

    The dataset has three rows and three columns (``repo_name``, ``path``
    and ``content``). The first two rows hold the token ``"a "`` repeated
    20 and 30 times respectively; the third row holds ``"b "`` repeated
    7 times.

    Returns:
        A ``datasets.Dataset`` created with ``Dataset.from_dict``.
    """
    # One tuple per row: (repo_name, path, content).
    rows = [
        ("test_repo1", "test_1.py", "a " * 20),
        ("test_repo2", "test_2.py", "a " * 30),
        ("test_repo3", "unit_test.py", "b " * 7),
    ]
    # Pivot the rows into the column-oriented mapping from_dict expects.
    columns = {
        "repo_name": [repo for repo, _, _ in rows],
        "path": [path for _, path, _ in rows],
        "content": [content for _, _, content in rows],
    }
    return Dataset.from_dict(columns)


class MakeDuplicateClustersTest(TestCase):
    """Tests for ``make_duplicate_clusters`` and ``deduplicate_dataset``.

    Both tests run against the three-row fixture returned by
    ``get_dataset()``.
    """

    def test_make_duplicate_clusters(self):
        """The first cluster found at threshold 0.85 has two members."""
        ds = get_dataset()
        duplicate_clusters = make_duplicate_clusters(ds, 0.85)
        # Fail with a readable message instead of an IndexError when no
        # cluster was produced at all.
        self.assertTrue(duplicate_clusters, "expected at least one duplicate cluster")
        self.assertEqual(len(duplicate_clusters[0]), 2)

    def test_deduplicate_dataset(self):
        """Deduplication keeps two rows and annotates the first cluster."""
        ds = get_dataset()
        ds_filter, duplicate_clusters = deduplicate_dataset(ds)
        self.assertEqual(len(ds_filter), 2)
        # Guard the nested indexing below so an empty result is reported as
        # an assertion failure rather than an IndexError.
        self.assertTrue(duplicate_clusters, "expected at least one duplicate cluster")
        self.assertTrue(duplicate_clusters[0], "first duplicate cluster is empty")
        first_member = duplicate_clusters[0][0]
        self.assertEqual(first_member["copies"], 2)
        self.assertTrue(first_member["is_extreme"])