# File size: 3,453 Bytes
# a8639ac
import os
import re
import time

import requests
from tqdm import tqdm
# Marker strings that are searched for (case-insensitively) in a fetched
# license file; a hit on any one of them marks the license as allowed.
allowed_licenses = [
    # --- MIT ---
    "MIT License",
    "MIT",
    # --- Apache ---
    "Apache License",
    "Apache 2.0",
    "Apache-2.0",
    # --- BSD ---
    "BSD License",
    "BSD",
    "BSD-2-Clause",
    "BSD-3-Clause",
    # --- CC0 / public-domain dedications ---
    "CC0",
    "CC0 1.0",
    "Creative Commons Zero",
    "Creative Commons CC0",
    "CC0 Public Domain Dedication",
    "Unlicense",
    "The Unlicense",
    "Public Domain",
    # --- WTFPL ---
    "WTFPL",
    "DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE",
    "WTFPL License",
    # --- ISC ---
    "ISC",
    "ISC License",
    # --- zlib ---
    "Zlib",
    "Zlib License",
    # --- Boost ---
    "Boost Software License",
    "BSL-1.0",
    "Boost License",
    # --- Artistic / PSF ---
    "Artistic License 2.0",
    "Python Software Foundation License",
    "PSF License",
    # --- Free-text heuristics ---
    # NOTE(review): presumably the warranty-disclaimer wording of MIT/BSD-style
    # texts that never name the license itself -- confirm.
    "SOFTWARE IS PROVIDED",
    # NOTE(review): presumably aimed at WTFPL-style joke licenses -- confirm.
    "f*ck",
    "fuck",
]

# Base URLs whose license has already been accepted; filled in at runtime so
# that later files under the same base skip the network lookup.
allowed_repos = []
def check_license_from_direct_url(direct_url, timeout=30):
    """Return True if the document at *direct_url* names an allowed license.

    The URL is fetched with ``requests.get``. On HTTP 429 the function waits
    five minutes and retries exactly once. The response text is then searched,
    case-insensitively, for any entry of the module-level ``allowed_licenses``
    list.

    Args:
        direct_url: URL of the license file to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run forever.

    Returns:
        True when at least one allowed marker occurs in the text, else False.

    Raises:
        ValueError: If the final response status is not 200 (including a
            second 429 after the retry).
        requests.RequestException: Network errors and timeouts propagate
            unchanged.
    """
    resp = requests.get(direct_url, timeout=timeout)
    if resp.status_code == 429:
        print("Rate-limited, waiting for 5m...")
        time.sleep(300)  # sleep for 5 minutes
        resp = requests.get(direct_url, timeout=timeout)
    if resp.status_code != 200:
        raise ValueError(
            f"unexpected HTTP status {resp.status_code} for {direct_url}"
        )

    if not allowed_licenses:
        # An empty alternation would match everywhere; nothing is allowed.
        return False

    # Match markers only as whole tokens. A plain substring test lets "MIT"
    # hit "permitted"/"limited" and "ISC" hit "disclaimer", which would accept
    # virtually every license text, GPL included. Lookarounds are used rather
    # than \b so markers that start or end with punctuation still work.
    alternatives = "|".join(re.escape(lic) for lic in allowed_licenses)
    marker_re = re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)", re.IGNORECASE)
    return marker_re.search(resp.text) is not None
def _github_repo_root(file_url):
    """Return the ``.../owner/repo/ref`` prefix of a nested raw GitHub URL.

    Returns None when *file_url* is not a raw.githubusercontent.com URL or the
    file already sits directly under the ref (no separate root to probe).
    """
    # "https://raw.githubusercontent.com/o/r/ref/dir/f.py".split("/") ->
    # ["https:", "", "raw.githubusercontent.com", "o", "r", "ref", "dir", "f.py"]
    parts = file_url.split("/")
    if len(parts) > 7 and parts[2] == "raw.githubusercontent.com":
        # NOTE: a ref that itself contains "/" (e.g. "refs/heads/main") yields
        # a wrong root; the lookup there simply fails and the file's own
        # directory is still tried afterwards.
        return "/".join(parts[:6])
    return None


def check_license_from_file_url(file_url):
    """Return True if a license file near *file_url* names an allowed license.

    Candidate locations are the repository root (for raw.githubusercontent.com
    URLs of files inside subdirectories) followed by the file's own directory.
    Looking only next to the file rejects every file in a subdirectory of a
    repository that keeps its license at the top level.

    Locations that pass are remembered in the module-level ``allowed_repos``
    list so later files under the same location need no network requests.

    Args:
        file_url: URL of a source file, e.g.
            https://raw.githubusercontent.com/ssloy/tinyoptimizer/main/analyzer.py

    Returns:
        True if any candidate license file matched, otherwise False.
    """
    base = "/".join(file_url.split("/")[:-1])
    locations = [base]
    root = _github_repo_root(file_url)
    if root is not None:
        # Probe the root first: it is the likelier place, and caching it
        # covers every other directory of the same repository.
        locations.insert(0, root)

    if any(location in allowed_repos for location in locations):
        return True

    suffixes = [
        "LICENSE",
        "license",
        "LICENSE.txt",
        "license.txt",
        "LICENSE.md",
        "license.md",
    ]
    for location in locations:
        for suffix in suffixes:
            try:
                if check_license_from_direct_url(f"{location}/{suffix}"):
                    allowed_repos.append(location)
                    return True
            except ValueError:
                # Candidate file missing or not retrievable; try the next one.
                continue
    return False
def get_last_line_number(file_path):
    """Read the last processed line number from *file_path*.

    Args:
        file_path: Path of the checkpoint file.

    Returns:
        The stored non-negative integer, or 0 when the file does not exist,
        its content is not an integer, or the stored value is negative.
    """
    if not os.path.exists(file_path):
        return 0
    with open(file_path, "r") as f:
        try:
            # read() stays inside the try: UnicodeDecodeError is a ValueError
            # subclass, so an undecodable file also falls back to 0.
            line_number = int(f.read().strip())
        except ValueError:
            return 0
    # A negative offset would make callers slice from the end of their list.
    return max(line_number, 0)
def save_last_line_number(file_path, line_number):
    """Save the last processed line number to *file_path*.

    The value is written to a sibling ``<file_path>.tmp`` file which is then
    moved over the target with ``os.replace``. Truncating the checkpoint in
    place would leave it empty if the process were interrupted mid-write,
    which would restart processing from line 0.

    Args:
        file_path: Path of the checkpoint file.
        line_number: Value to store; written as ``str(line_number)``.
    """
    tmp_path = f"{file_path}.tmp"
    with open(tmp_path, "w") as f:
        f.write(str(line_number))
    os.replace(tmp_path, file_path)
# Script body: check every URL listed in python_files.txt and record the ones
# whose repository carries an allowed license in python_files_allowed.txt.
# Progress is checkpointed after each URL so an interrupted run can resume.
allowed_files = []
line_number_file = "license_line_number.txt"

with open("python_files.txt", "r") as f:
    files = [line.strip() for line in f if line.strip()]

# Clamp the checkpoint so a stale or corrupt value cannot index outside the
# current input list.
last_line_number = max(0, min(get_last_line_number(line_number_file), len(files)))
num_allowed = 0

# When resuming, append so results of earlier runs are kept; only a fresh
# start truncates the output. Writing everything with "w" at the very end
# lost all earlier results on resume and on interruption.
output_mode = "a" if last_line_number > 0 else "w"

with open("python_files_allowed.txt", output_mode) as out_f:
    # Use enumerate to track line numbers
    for index, file in enumerate(
        tqdm(files[last_line_number:], initial=last_line_number, total=len(files)),
        start=last_line_number,
    ):
        try:
            is_allowed = check_license_from_file_url(file)
        except Exception as exc:
            # Best effort: one failing URL must not abort the run, but the
            # failure is reported instead of being silently dropped.
            tqdm.write(f"Skipping {file}: {exc!r}")
            is_allowed = False

        if is_allowed:
            allowed_files.append(file)
            num_allowed += 1
            # Persist the result before the checkpoint advances. A crash in
            # between re-checks this URL on resume (possible duplicate line)
            # rather than losing it.
            out_f.write(file + "\n")
            out_f.flush()

        # Print the running total and save progress at each iteration
        tqdm.write(f"Allowed: {num_allowed}")
        save_last_line_number(line_number_file, index + 1)
|