File size: 2,114 Bytes
a8639ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os
from urllib.parse import urlparse
import glob
import shutil
from tqdm import tqdm

# Input corpus directory and destination directory for the licensed subset.
path = os.path.expanduser("~/torch_datasets/github-python/mega_corpus_all_files")
output_path = os.path.expanduser(
    "~/torch_datasets/github-python/mega_licensed_all_files"
)

# Load both URL lists as sets of stripped, non-blank lines.
with open("python_files.txt", "r") as f:
    all_urls = set(filter(None, map(str.strip, f)))

with open("python_files_allowed.txt", "r") as f:
    allowed_urls = set(filter(None, map(str.strip, f)))

# Sanity check: every allowed URL should also appear in the full URL list.
missing_urls = allowed_urls.difference(all_urls)

if not missing_urls:
    print("All URLs in python_files_allowed.txt are contained in python_files.txt.")
else:
    print(
        "The following URLs are in python_files_allowed.txt but not in python_files.txt:"
    )
    print(*missing_urls, sep="\n")


# Rename .py files under the input path so that an accidentally repeated
# extension (e.g. "foo.py.py") is reduced to a single ".py".
def _single_py_extension(file_name):
    """Return file_name with a run of trailing ".py" suffixes reduced to one.

    Only repeated suffixes at the very end of the name are removed; a ".py"
    occurring elsewhere in the name (e.g. "a.pyx_b.py") is left alone, so the
    rest of the name is never truncated.
    """
    while file_name.endswith(".py.py"):
        file_name = file_name[: -len(".py")]
    return file_name


for root, _, files in tqdm(os.walk(path)):
    for file in files:
        if not file.endswith(".py"):
            continue
        new_file_name = _single_py_extension(file)
        if new_file_name == file:
            continue
        old_file_path = os.path.join(root, file)
        new_file_path = os.path.join(root, new_file_name)
        if os.path.exists(new_file_path):
            # os.rename silently replaces an existing file on POSIX; keep both
            # files rather than losing one of them.
            print(
                f"Skipping rename of {old_file_path}: {new_file_path} already exists."
            )
            continue
        os.rename(old_file_path, new_file_path)
print("Renaming completed.")


# Copy every top-level .py file of the corpus whose name matches an allowed
# repository ("owner/repo" taken from python_files_allowed.txt) to output_path.
def _repo_path(url):
    """Return the "owner/repo" part of url, or None if it cannot be derived.

    The first two non-empty segments of the URL path are joined with "/".
    None is returned when the URL has no network location (e.g. a line
    without a scheme) or fewer than two path segments, because a shorter
    pattern would match unrelated files.
    """
    parsed = urlparse(url)
    if not parsed.netloc:
        return None
    segments = [segment for segment in parsed.path.split("/") if segment]
    if len(segments) < 2:
        return None
    return "/".join(segments[:2])


# Read the list up front so the file is not held open during the copy.
with open("python_files_allowed.txt", "r") as f:
    urls = [line.strip() for line in f if line.strip()]

repo_paths = set()
for url in urls:
    repo_path = _repo_path(url)
    if repo_path is None:
        print(f"Skipping URL without an owner/repo path: {url}")
    else:
        repo_paths.add(repo_path)
print(repo_paths)

# shutil.copy fails if the destination directory does not exist yet.
os.makedirs(output_path, exist_ok=True)

num_existing = 0
all_files = glob.glob(os.path.join(path, "*.py"))

for file in (pbar := tqdm(all_files)):
    file_name = os.path.basename(file)
    # Match on the file name only: the directory part of the path also
    # contains underscores and must not be able to satisfy a repo pattern.
    # NOTE(review): presumably file names encode the URL path with "/"
    # replaced by "_"; a repository whose owner or name itself contains "_"
    # can never match here -- confirm whether that is intended.
    candidate = file_name.replace("_", "/")
    if any(repo_path in candidate for repo_path in repo_paths):
        num_existing += 1
        shutil.copy(file, os.path.join(output_path, file_name))
        pbar.set_description(f"Copied {num_existing} files")

print(f"Number of existing files: {num_existing}")