File size: 3,453 Bytes
a8639ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import requests
import os
from tqdm import tqdm
import time

# Phrases that mark a license text as acceptable. check_license_from_direct_url
# lowercases both the phrase and the fetched text and tests plain substring
# containment, so every entry here is effectively a case-insensitive substring.
#
# NOTE(review): because matching is by substring, the short entries also match
# inside ordinary words (e.g. "MIT" is contained in "permitted" and "ISC" in
# "disclaimer"), so texts that are not one of these licenses can still match.
# Confirm whether that breadth is intended before relying on the results.
allowed_licenses = [
    # MIT
    "MIT License",
    "MIT",
    # Apache
    "Apache License",
    "Apache 2.0",
    "Apache-2.0",
    # BSD family
    "BSD License",
    "BSD",
    "BSD-2-Clause",
    "BSD-3-Clause",
    # CC0 / public-domain style dedications
    "CC0",
    "CC0 1.0",
    "Creative Commons Zero",
    "Creative Commons CC0",
    "CC0 Public Domain Dedication",
    "Unlicense",
    "The Unlicense",
    "Public Domain",
    # WTFPL
    "WTFPL",
    "DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE",
    "WTFPL License",
    # ISC
    "ISC",
    "ISC License",
    # zlib
    "Zlib",
    "Zlib License",
    # Boost
    "Boost Software License",
    "BSL-1.0",
    "Boost License",
    # Artistic / PSF
    "Artistic License 2.0",
    "Python Software Foundation License",
    "PSF License",
    # Generic phrases rather than license names.
    # NOTE(review): presumably meant to catch warranty disclaimers and
    # WTFPL-style wording -- confirm these should stay.
    "SOFTWARE IS PROVIDED",
    "f*ck",
    "fuck",
]


# Base URLs for which a license check has already succeeded; filled in by
# check_license_from_file_url so later files under the same base skip the
# network lookup.
allowed_repos = []


def check_license_from_direct_url(direct_url, timeout=30):
    """Return True if the text served at *direct_url* mentions an allowed license.

    The response body is lowercased once and searched for each entry of
    ``allowed_licenses`` (also lowercased) as a plain substring.

    If the first response is HTTP 429 the function sleeps for five minutes
    and retries exactly once.

    Args:
        direct_url: URL of the candidate license file.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        True when any allowed license phrase occurs in the body, else False.

    Raises:
        ValueError: if the final response status is not 200.
        requests.RequestException: on connection problems or timeouts.
    """
    resp = requests.get(direct_url, timeout=timeout)

    if resp.status_code == 429:
        print("Rate-limited, waiting for 5m...")
        time.sleep(300)  # sleep for 5 minutes
        resp = requests.get(direct_url, timeout=timeout)

    if resp.status_code != 200:
        raise ValueError(
            f"unexpected HTTP status {resp.status_code} for {direct_url}"
        )

    # Lowercase the body once instead of once per license phrase.
    text = resp.text.lower()
    return any(lic.lower() in text for lic in allowed_licenses)


def check_license_from_file_url(file_url):
    """Tell whether the directory holding *file_url* ships an allowed license.

    The final path segment of the URL is dropped and the usual license file
    names are probed directly under what remains (i.e. next to the file
    itself). A base URL that passes is remembered in ``allowed_repos`` so it
    is not fetched again.

    Returns:
        True if the base is already known or one of the probed license files
        contains an allowed phrase; False otherwise.
    """
    # example: https://raw.githubusercontent.com/ssloy/tinyoptimizer/main/analyzer.py
    parent_url = file_url.rpartition("/")[0]

    # Known-good base: no network access needed.
    if parent_url in allowed_repos:
        return True

    candidate_names = (
        "LICENSE",
        "license",
        "LICENSE.txt",
        "license.txt",
        "LICENSE.md",
        "license.md",
    )
    for name in candidate_names:
        try:
            found = check_license_from_direct_url(f"{parent_url}/{name}")
        except ValueError:
            # This candidate could not be fetched; try the next name.
            continue
        if found:
            allowed_repos.append(parent_url)
            return True
    return False


def get_last_line_number(file_path):
    """Reads the last processed line number.

    Args:
        file_path: Path of the checkpoint file holding a single integer.

    Returns:
        The stored line number, or 0 when the file does not exist, does not
        contain an integer, or contains a negative one. A negative value is
        treated as corrupt because callers use the result as a slice start,
        where it would count from the end of the list.
    """
    try:
        with open(file_path, "r") as f:
            line_number = int(f.read().strip())
    except (FileNotFoundError, ValueError):
        # No checkpoint yet, or unreadable contents: start from the beginning.
        return 0
    return max(line_number, 0)


def save_last_line_number(file_path, line_number):
    """Saves the last processed line number.

    The value is written to a sibling temporary file which then replaces
    *file_path*, so an interruption during the write cannot leave the
    checkpoint empty or half written.

    Args:
        file_path: Path of the checkpoint file to (over)write.
        line_number: Value to store; written as ``str(line_number)``.
    """
    tmp_path = f"{file_path}.tmp"
    with open(tmp_path, "w") as f:
        f.write(str(line_number))
    # os.replace swaps the new file in as a single rename operation.
    os.replace(tmp_path, file_path)


# Driver: walk the URLs listed in python_files.txt, keep those whose license
# check passes, and checkpoint progress after every URL so an interrupted run
# can be resumed.
allowed_files = []
line_number_file = "license_line_number.txt"

with open("python_files.txt", "r") as f:
    files = [line.strip() for line in f if line.strip()]

last_line_number = get_last_line_number(line_number_file)

# When resuming from a checkpoint, reload what earlier runs already accepted;
# otherwise those results would be lost when the output file is reopened.
if last_line_number > 0 and os.path.exists("python_files_allowed.txt"):
    with open("python_files_allowed.txt", "r") as f:
        allowed_files = [line.strip() for line in f if line.strip()]
num_allowed = len(allowed_files)

# Append on resume, start with an empty file on a fresh run. Writing one line
# per accepted URL avoids rewriting the whole output file on every iteration.
with open(
    "python_files_allowed.txt", "a" if last_line_number > 0 else "w"
) as out:
    # Use enumerate to track line numbers
    for index, file in enumerate(
        tqdm(files[last_line_number:], initial=last_line_number, total=len(files)),
        start=last_line_number,
    ):
        try:
            is_allowed = check_license_from_file_url(file)
        except Exception as exc:
            # Best effort: one failing URL must not stop the run, but report
            # it instead of discarding the error silently.
            tqdm.write(f"Skipping {file}: {exc!r}")
            is_allowed = False

        if is_allowed:
            allowed_files.append(file)
            num_allowed += 1
            out.write(file + "\n")
            # Flush before checkpointing so the checkpoint never gets ahead
            # of the recorded results.
            out.flush()

        # Report the running total and save progress at each iteration
        tqdm.write(f"Allowed: {num_allowed}")
        save_last_line_number(line_number_file, index + 1)