code-completion / scraping /files /license_check.py
jblitzar's picture
Upload folder using huggingface_hub
a8639ac verified
import requests
import os
from tqdm import tqdm
import time
allowed_licenses = [
"MIT License",
"MIT",
"Apache License",
"Apache 2.0",
"Apache-2.0",
"BSD License",
"BSD",
"BSD-2-Clause",
"BSD-3-Clause",
"CC0",
"CC0 1.0",
"Creative Commons Zero",
"Creative Commons CC0",
"CC0 Public Domain Dedication",
"Unlicense",
"The Unlicense",
"Public Domain",
"WTFPL",
"DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE",
"WTFPL License",
"ISC",
"ISC License",
"Zlib",
"Zlib License",
"Boost Software License",
"BSL-1.0",
"Boost License",
"Artistic License 2.0",
"Python Software Foundation License",
"PSF License",
"SOFTWARE IS PROVIDED",
"f*ck",
"fuck",
]
allowed_repos = []
def check_license_from_direct_url(direct_url):
resp = requests.get(direct_url)
if resp.status_code == 200:
text = resp.text
for lic in allowed_licenses:
if lic.lower() in text.lower():
return True
elif resp.status_code == 429:
print("Rate-limited, waiting for 5m...")
time.sleep(300) # sleep for 5 minutes
resp = requests.get(direct_url)
if resp.status_code == 200:
text = resp.text
for lic in allowed_licenses:
if lic.lower() in text.lower():
return True
else:
raise ValueError
else:
raise ValueError
return False
def check_license_from_file_url(file_url):
# example: https://raw.githubusercontent.com/ssloy/tinyoptimizer/main/analyzer.py
base = "/".join(file_url.split("/")[:-1])
suffixes = [
"LICENSE",
"license",
"LICENSE.txt",
"license.txt",
"LICENSE.md",
"license.md",
]
if base in allowed_repos:
return True
for suffix in suffixes:
try:
if check_license_from_direct_url(f"{base}/{suffix}"):
allowed_repos.append(base)
return True
except ValueError:
continue
return False
def get_last_line_number(file_path):
"""Reads the last processed line number."""
if os.path.exists(file_path):
with open(file_path, "r") as f:
try:
return int(f.read().strip())
except ValueError:
return 0
return 0
def save_last_line_number(file_path, line_number):
"""Saves the last processed line number."""
with open(file_path, "w") as f:
f.write(str(line_number))
allowed_files = []
line_number_file = "license_line_number.txt"
with open("python_files.txt", "r") as f:
files = [line.strip() for line in f if line.strip()]
last_line_number = get_last_line_number(line_number_file)
num_allowed = 0
# Use enumerate to track line numbers
for index, file in enumerate(
tqdm(files[last_line_number:], initial=last_line_number, total=len(files)),
start=last_line_number,
):
try:
if check_license_from_file_url(file):
allowed_files.append(file)
num_allowed += 1
except Exception:
pass
# Update the progress description and save progress at each iteration
tqdm.write(f"Allowed: {num_allowed}")
save_last_line_number(line_number_file, index + 1)
with open("python_files_allowed.txt", "w") as f:
for file in allowed_files:
f.write(file + "\n")