|
|
import requests |
|
|
import os |
|
|
from tqdm import tqdm |
|
|
import time |
|
|
|
|
|
allowed_licenses = [ |
|
|
"MIT License", |
|
|
"MIT", |
|
|
"Apache License", |
|
|
"Apache 2.0", |
|
|
"Apache-2.0", |
|
|
"BSD License", |
|
|
"BSD", |
|
|
"BSD-2-Clause", |
|
|
"BSD-3-Clause", |
|
|
"CC0", |
|
|
"CC0 1.0", |
|
|
"Creative Commons Zero", |
|
|
"Creative Commons CC0", |
|
|
"CC0 Public Domain Dedication", |
|
|
"Unlicense", |
|
|
"The Unlicense", |
|
|
"Public Domain", |
|
|
"WTFPL", |
|
|
"DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE", |
|
|
"WTFPL License", |
|
|
"ISC", |
|
|
"ISC License", |
|
|
"Zlib", |
|
|
"Zlib License", |
|
|
"Boost Software License", |
|
|
"BSL-1.0", |
|
|
"Boost License", |
|
|
"Artistic License 2.0", |
|
|
"Python Software Foundation License", |
|
|
"PSF License", |
|
|
"SOFTWARE IS PROVIDED", |
|
|
"f*ck", |
|
|
"fuck", |
|
|
] |
|
|
|
|
|
|
|
|
allowed_repos = [] |
|
|
|
|
|
|
|
|
def check_license_from_direct_url(direct_url): |
|
|
resp = requests.get(direct_url) |
|
|
|
|
|
if resp.status_code == 200: |
|
|
text = resp.text |
|
|
for lic in allowed_licenses: |
|
|
if lic.lower() in text.lower(): |
|
|
return True |
|
|
elif resp.status_code == 429: |
|
|
print("Rate-limited, waiting for 5m...") |
|
|
time.sleep(300) |
|
|
resp = requests.get(direct_url) |
|
|
if resp.status_code == 200: |
|
|
text = resp.text |
|
|
for lic in allowed_licenses: |
|
|
if lic.lower() in text.lower(): |
|
|
return True |
|
|
else: |
|
|
raise ValueError |
|
|
else: |
|
|
raise ValueError |
|
|
return False |
|
|
|
|
|
|
|
|
def check_license_from_file_url(file_url): |
|
|
|
|
|
base = "/".join(file_url.split("/")[:-1]) |
|
|
suffixes = [ |
|
|
"LICENSE", |
|
|
"license", |
|
|
"LICENSE.txt", |
|
|
"license.txt", |
|
|
"LICENSE.md", |
|
|
"license.md", |
|
|
] |
|
|
if base in allowed_repos: |
|
|
return True |
|
|
for suffix in suffixes: |
|
|
try: |
|
|
if check_license_from_direct_url(f"{base}/{suffix}"): |
|
|
allowed_repos.append(base) |
|
|
return True |
|
|
except ValueError: |
|
|
continue |
|
|
return False |
|
|
|
|
|
|
|
|
def get_last_line_number(file_path): |
|
|
"""Reads the last processed line number.""" |
|
|
if os.path.exists(file_path): |
|
|
with open(file_path, "r") as f: |
|
|
try: |
|
|
return int(f.read().strip()) |
|
|
except ValueError: |
|
|
return 0 |
|
|
return 0 |
|
|
|
|
|
|
|
|
def save_last_line_number(file_path, line_number): |
|
|
"""Saves the last processed line number.""" |
|
|
with open(file_path, "w") as f: |
|
|
f.write(str(line_number)) |
|
|
|
|
|
|
|
|
allowed_files = [] |
|
|
line_number_file = "license_line_number.txt" |
|
|
|
|
|
with open("python_files.txt", "r") as f: |
|
|
files = [line.strip() for line in f if line.strip()] |
|
|
|
|
|
last_line_number = get_last_line_number(line_number_file) |
|
|
num_allowed = 0 |
|
|
|
|
|
|
|
|
for index, file in enumerate( |
|
|
tqdm(files[last_line_number:], initial=last_line_number, total=len(files)), |
|
|
start=last_line_number, |
|
|
): |
|
|
try: |
|
|
if check_license_from_file_url(file): |
|
|
allowed_files.append(file) |
|
|
num_allowed += 1 |
|
|
except Exception: |
|
|
pass |
|
|
|
|
|
tqdm.write(f"Allowed: {num_allowed}") |
|
|
save_last_line_number(line_number_file, index + 1) |
|
|
|
|
|
with open("python_files_allowed.txt", "w") as f: |
|
|
for file in allowed_files: |
|
|
f.write(file + "\n") |
|
|
|