import hashlib
import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

import requests
from huggingface_hub import HfApi, snapshot_download

def extract_urls_from_file(input_filename, output_filename):
    """
    Extracts all URLs from an input text file and writes them to an output file.
    """
    URL_REGEX = r"https?://\S+|www\.\S+"

    try:
        with open(input_filename, "r", encoding="utf-8") as f_in:
            content = f_in.read()

        urls = re.findall(URL_REGEX, content)

        # Deduplicate and sort for a stable output order.
        unique_urls = sorted(set(urls))

        with open(output_filename, "w", encoding="utf-8") as f_out:
            for url in unique_urls:
                f_out.write(url + "\n")

        print(
            f"Found {len(unique_urls)} unique URLs and saved them to {output_filename}"
        )

    except FileNotFoundError:
        print(f"Error: The file '{input_filename}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
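
# Example usage (a minimal sketch; "notes.txt" and "urls.txt" are hypothetical
# file names, not inputs shipped with this script):
#
#   extract_urls_from_file("notes.txt", "urls.txt")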

def remove_chars_from_file(input_filename, chars_to_remove):
    """
    Reads a text file, removes specified characters, and writes the changes back to the file.

    Args:
        input_filename (str): The name of the input text file.
        chars_to_remove (list): A list of characters to be removed (e.g., [',', '"', '}']).
    """
    try:
        with open(input_filename, "r") as file:
            content = file.read()

        for char in chars_to_remove:
            content = content.replace(char, "")

        with open(input_filename, "w") as file:
            file.write(content)

        print(
            f"Successfully removed characters {chars_to_remove} from {input_filename}"
        )

    except FileNotFoundError:
        print(f"Error: The file '{input_filename}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

def calculate_file_hash(filepath, block_size=65536):
    """Calculates the SHA256 hash of a file's content."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, "rb") as f:
            # Read in fixed-size blocks so large files never load fully into memory.
            while chunk := f.read(block_size):
                sha256.update(chunk)
    except FileNotFoundError:
        return None

    return sha256.hexdigest()

def find_and_remove_duplicates(directory="."):
    """Finds duplicate files in the given directory, keeps the one with the shortest filename, and removes the rest."""
    hashes_to_files = defaultdict(list)

    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):
            file_hash = calculate_file_hash(filepath)
            if file_hash:
                hashes_to_files[file_hash].append(filepath)

    duplicates = {h: files for h, files in hashes_to_files.items() if len(files) > 1}

    if not duplicates:
        print("No duplicate files found.")
        return

    for file_hash, file_list in duplicates.items():
        # Keep the shortest path; everything after it in the sorted list is deleted.
        files_sorted_by_length = sorted(file_list, key=len)
        file_to_keep = files_sorted_by_length[0]
        files_to_delete = files_sorted_by_length[1:]

        print(f"\nDuplicate group (Hash: {file_hash[:10]}...):")
        print(f"  Keeping: {file_to_keep}")
        for file_to_delete in files_to_delete:
            try:
                os.remove(file_to_delete)
                print(f"  Deleted: {file_to_delete} (longer filename)")
            except OSError as e:
                print(f"  Error deleting {file_to_delete}: {e}")
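
# Example usage (hedged sketch: this permanently deletes files, so it is shown
# against a hypothetical scratch directory rather than the default "."):
#
#   find_and_remove_duplicates("downloads")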

def download_file(url, local_dir):
    """Helper function to download a single file."""
    try:
        # Derive a filename from the last path segment, dropping any query string.
        filename = url.split("/")[-1].split("?")[0] or "downloaded_file"
        save_path = os.path.join(local_dir, filename)

        response = requests.get(url, stream=True, timeout=10)
        response.raise_for_status()

        with open(save_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return f"Successfully downloaded: {filename}"
    except Exception as e:
        return f"Failed to download {url}: {e}"

def download_files_from_txt(filename, local_dir):
    """Main function to read URLs and download them using 20 threads."""
    if not os.path.exists(local_dir):
        os.makedirs(local_dir)

    with open(filename, "r") as f:
        urls = [line.strip() for line in f if line.strip()]

    with ThreadPoolExecutor(max_workers=20) as executor:
        futures = [executor.submit(download_file, url, local_dir) for url in urls]

        for future in futures:
            print(future.result())
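
# Example usage (hedged: "links.txt" is a hypothetical file with one URL per
# line, e.g. as produced by extract_urls_from_file above):
#
#   download_files_from_txt("links.txt", "downloads")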

def download_files_from_txt_aria(filename, local_dir):
    """Downloads every URL listed in a text file using aria2c (must be on PATH)."""
    command = [
        "aria2c",
        "--input-file",
        filename,
        "--dir",
        local_dir,
        "-c",  # continue partially downloaded files
        "-j",
        "30",  # up to 30 concurrent downloads
        "-x",
        "16",  # up to 16 connections per server
    ]
    print(f"Starting downloads with aria2c in directory: {os.path.abspath(local_dir)}")
    try:
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        print("All downloads finished successfully.")
    except subprocess.CalledProcessError as e:
        print(f"An error occurred during aria2c execution: {e.stderr}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        print(f"Finished processing download list: {filename}")

def download_hf_repo(repo_id, local_dir, repo_type, token):
    """
    Downloads an entire Hugging Face repository to a specified local directory.
    """
    if not token:
        token = os.getenv("HF_TOKEN")
    print(f"Downloading {repo_id} to {local_dir}...")

    os.makedirs(local_dir, exist_ok=True)

    downloaded_path = snapshot_download(
        repo_id=repo_id,
        local_dir=local_dir,
        token=token,
        local_dir_use_symlinks=False,
        repo_type=repo_type,
    )

    print(f"Download complete! Files are located in: {downloaded_path}")
    return downloaded_path
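
# Example usage (hedged: the repo id is a hypothetical placeholder; passing
# token=None falls back to the HF_TOKEN environment variable):
#
#   download_hf_repo("some-org/some-model", "models/some-model", "model", None)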

def remove_duplicate_lines(input_file_path, output_file_path):
    """
    Reads lines from input_file_path, removes duplicates, and writes
    unique lines to output_file_path while preserving order.
    """
    try:
        # Dicts preserve insertion order, so this deduplicates without reordering.
        unique_lines_dict = {}
        with open(input_file_path, "r") as input_file:
            for line in input_file:
                unique_lines_dict[line] = None

        unique_lines = unique_lines_dict.keys()

        with open(output_file_path, "w") as output_file:
            output_file.writelines(unique_lines)

        print(f"Duplicates removed. Unique lines saved to '{output_file_path}'")

    except FileNotFoundError:
        print(f"Error: The file '{input_file_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

def push_to_hf(repo_id, repo_type):
    """Uploads the current directory to a Hugging Face repo in a single commit."""
    api = HfApi()

    print(f"Uploading current directory to: {repo_id}")

    api.upload_folder(
        folder_path=".",
        repo_id=repo_id,
        repo_type=repo_type,
        commit_message="Initial model upload",
    )
    print("Upload complete!")

def push_large_folder_to_hf(repo_id, repo_type):
    """Uploads the current directory using the resumable large-folder uploader."""
    api = HfApi()
    print(f"Starting large folder upload to: {repo_id}")

    api.upload_large_folder(
        folder_path=".",
        repo_id=repo_id,
        repo_type=repo_type,
        ignore_patterns=[
            ".git/",
            "__pycache__/",
            "*.tmp",
            ".DS_Store",
            "*.cache",
            "*.trash",
        ],
    )

    print(
        "\nUpload complete! Progress was cached locally; if it failed, just run again to resume."
    )
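
# Usage note (hedged): upload_large_folder caches progress and resumes after
# failures, so it suits multi-gigabyte repos, whereas push_to_hf above makes a
# single commit. The repo id below is a hypothetical placeholder:
#
#   push_large_folder_to_hf("some-org/some-dataset", "dataset")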

def get_model_hash(model_path):
    """
    Get the hash of a model file
    """
    try:
        with open(model_path, "rb") as f:
            # Hash only the last ~10 MB; for large model files this is much
            # faster than hashing the whole file.
            f.seek(-10000 * 1024, 2)
            hash_result = hashlib.md5(f.read()).hexdigest()

        return hash_result
    except OSError:
        # File is smaller than ~10 MB, so the negative seek failed: hash it all.
        with open(model_path, "rb") as f:
            hash_result = hashlib.md5(f.read()).hexdigest()

        return hash_result
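
# Note: the vr/mdx model-data JSONs consumed by iterate_and_hash() below are
# keyed by hashes computed with this exact scheme (MD5 of the last ~10 MB), so
# changing the read size or algorithm here would break those lookups.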

def download_file_if_missing(url, local_path):
    """
    Download a file from a URL if it doesn't exist locally
    """
    print(f"Checking if {local_path} needs to be downloaded from {url}")
    if not os.path.exists(local_path):
        print(f"Downloading {url} to {local_path}")
        with requests.get(url, stream=True, timeout=10) as r:
            r.raise_for_status()
            with open(local_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"Downloaded {url} to {local_path}")
    else:
        print(f"{local_path} already exists. Skipping download.")

def load_json_data(file_path):
    """
    Load JSON data from a file
    """
    print(f"Loading JSON data from {file_path}")
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
        print(f"Loaded JSON data successfully from {file_path}")
        return data
    except FileNotFoundError:
        print(f"{file_path} not found.")
        sys.exit(1)

def iterate_and_hash(
    directory,
    vr_model_data_url,
    mdx_model_data_url,
    vr_model_data_local_path,
    mdx_model_data_local_path,
    output_path,
):
    """
    Iterate through a directory and hash all model files
    """
    print(f"Iterating through directory {directory} to hash model files")
    model_files = [
        (file, os.path.join(root, file))
        for root, _, files in os.walk(directory)
        for file in files
        if file.endswith((".pth", ".onnx"))
    ]

    download_file_if_missing(vr_model_data_url, vr_model_data_local_path)
    download_file_if_missing(mdx_model_data_url, mdx_model_data_local_path)

    vr_model_data = load_json_data(vr_model_data_local_path)
    mdx_model_data = load_json_data(mdx_model_data_local_path)

    # Later keys win on collision, so MDX entries override VR entries.
    combined_model_params = {
        **vr_model_data,
        **mdx_model_data,
    }

    model_info_list = []
    for file, file_path in sorted(model_files):
        file_hash = get_model_hash(file_path)
        model_info = {
            "file": file,
            "hash": file_hash,
            "params": combined_model_params.get(file_hash, "Parameters not found"),
        }
        model_info_list.append(model_info)

    print(f"Writing model info list to {output_path}")
    with open(output_path, "w", encoding="utf-8") as json_file:
        json.dump(model_info_list, json_file, indent=4)
    print(f"Successfully wrote model info list to {output_path}")
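
# Example usage (hedged: every URL and path below is a hypothetical
# placeholder for the real model-data locations):
#
#   iterate_and_hash(
#       "models",
#       "https://example.com/vr_model_data.json",
#       "https://example.com/mdx_model_data.json",
#       "vr_model_data.json",
#       "mdx_model_data.json",
#       "model_info.json",
#   )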

def sort_links_by_extension(input_file, output_file):
    # Lower numbers sort first; unknown extensions fall to the back (priority 100).
    priority = {
        ".json": 0,
        ".yaml": 1,
        ".th": 2,
        ".pth": 3,
        ".ckpt": 4,
        ".onnx": 5,
    }

    try:
        with open(input_file, "r") as f:
            links = [line.strip() for line in f if line.strip()]

        def sort_key(link):
            _, ext = os.path.splitext(link.lower())
            return priority.get(ext, 100), link

        sorted_links = sorted(links, key=sort_key)

        with open(output_file, "w") as f:
            for link in sorted_links:
                f.write(link + "\n")

        print(f"Successfully sorted links into: {output_file}")

    except FileNotFoundError:
        print(f"Error: The file '{input_file}' was not found.")
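
# Example usage (hedged: the file names are hypothetical):
#
#   sort_links_by_extension("links.txt", "links_sorted.txt")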

def get_links_from_json(file_input):
    """Collects download links from a JSON file mapping model names to link lists."""
    try:
        with open(file_input, "r") as file:
            data = json.load(file)
    except FileNotFoundError:
        print("Error: file not found.")
        data = {}

    all_links = []
    for model_name, links in data.items():
        # Skip entries whose value is not a non-empty list of links.
        if not isinstance(links, list) or len(links) == 0:
            continue
        all_links.extend(links)
    return all_links
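
# Example end-to-end flow (hedged sketch: "models.json" is a hypothetical file
# mapping model names to lists of download URLs; only the flow is illustrative):
#
#   links = get_links_from_json("models.json")
#   with open("links.txt", "w") as f:
#       f.write("\n".join(links) + "\n")
#   download_files_from_txt_aria("links.txt", "downloads")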