import hashlib
import json
import os
import subprocess
import sys
from collections import defaultdict
from urllib.parse import urlsplit

# The third-party packages are only needed by the network helpers. Importing
# them optionally keeps the pure-stdlib helpers (hashing, de-duplication,
# link sorting) usable on machines where they are not installed.
try:
    import requests
except ImportError:  # needed only by download_file_if_missing
    requests = None

try:
    from huggingface_hub import HfApi, snapshot_download
except ImportError:  # needed only by the Hugging Face helpers
    HfApi = None
    snapshot_download = None

# Number of trailing bytes hashed by get_model_hash (10000 KiB). The value must
# not change: the resulting digests are used as lookup keys in the model-data
# JSON files.
_MODEL_HASH_TAIL_BYTES = 10000 * 1024


def _require(dependency, package):
    """Return *dependency*, or raise ImportError if the optional package is missing."""
    if dependency is None:
        raise ImportError(
            f"The '{package}' package is required for this operation but is not installed."
        )
    return dependency


def calculate_file_hash(filepath, block_size=65536):
    """Return the SHA-256 hex digest of a file's content.

    The file is read in chunks of ``block_size`` bytes. Returns ``None`` when
    the file does not exist (for example, it was deleted during a scan).
    """
    sha256 = hashlib.sha256()
    try:
        with open(filepath, "rb") as f:
            while chunk := f.read(block_size):
                sha256.update(chunk)
    except FileNotFoundError:
        return None
    return sha256.hexdigest()


def find_and_remove_duplicates(directory="."):
    """Delete duplicate files in ``directory``, keeping the shortest-named copy.

    Only regular files directly inside ``directory`` are considered (no
    recursion). Files are grouped by SHA-256 of their content; within each
    group the file with the shortest name is kept and the others are removed.
    Names of equal length are ordered alphabetically so the outcome does not
    depend on the order in which the OS lists the directory.
    """
    hashes_to_files = defaultdict(list)

    # Step 1: Hash all files in the directory
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            continue
        try:
            file_hash = calculate_file_hash(filepath)
        except OSError as e:
            # One unreadable file must not abort the whole scan.
            print(f" Skipping unreadable file {filepath}: {e}")
            continue
        if file_hash:
            hashes_to_files[file_hash].append(filepath)

    # Step 2: Identify duplicate groups (more than one file per hash)
    duplicates = {h: files for h, files in hashes_to_files.items() if len(files) > 1}
    if not duplicates:
        print("No duplicate files found.")
        return

    # Step 3: Keep the shortest name in each group and delete the rest
    for file_hash, file_list in duplicates.items():
        file_to_keep, *files_to_delete = sorted(
            file_list, key=lambda path: (len(path), path)
        )
        print(f"\nDuplicate group (Hash: {file_hash[:10]}...):")
        print(f" Keeping: {file_to_keep}")
        for file_to_delete in files_to_delete:
            try:
                os.remove(file_to_delete)
                print(f" Deleted: {file_to_delete} (longer filename)")
            except OSError as e:
                print(f" Error deleting {file_to_delete}: {e}")


def download_files_from_txt(filename, local_dir):
    """Download every URL listed in ``filename`` into ``local_dir`` using aria2c.

    Errors (aria2c missing, non-zero exit status) are reported on stdout and
    not raised. The completion messages are printed only when aria2c exits
    successfully.
    """
    command = [
        "aria2c",
        "--input-file",
        filename,
        "--dir",
        local_dir,
        "-c",  # Continue downloading a partially downloaded file
        "-j",
        "30",  # Set max concurrent downloads (adjust as needed)
        "-x",
        "16",  # Set max connections per server (adjust as needed)
    ]
    print(f"Starting downloads with aria2c in directory: {os.path.abspath(local_dir)}")
    try:
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"An error occurred during aria2c execution: {e.stderr}")
    except (OSError, ValueError) as e:
        # OSError: aria2c is not installed / not executable.
        # ValueError: invalid arguments or undecodable process output.
        print(f"An unexpected error occurred: {e}")
    else:
        print("All downloads finished successfully.")
        print(f"Downloaded all files: {filename}")


def download_hf_repo(repo_id, local_dir, repo_type, token):
    """Download an entire Hugging Face repository to a local directory.

    When ``token`` is falsy, the ``HF_TOKEN`` environment variable is used
    instead. ``local_dir`` is created if needed. Returns the path reported by
    ``snapshot_download``.
    """
    download = _require(snapshot_download, "huggingface_hub")
    if not token:
        token = os.getenv("HF_TOKEN")
    print(f"Downloading {repo_id} to {local_dir}...")
    # Ensure the target directory exists
    os.makedirs(local_dir, exist_ok=True)
    # Download the snapshot
    downloaded_path = download(
        repo_id=repo_id,
        local_dir=local_dir,
        token=token,
        local_dir_use_symlinks=False,  # Set to False to ensure actual files are moved to local_dir
        repo_type=repo_type,
    )
    print(f"Download complete! Files are located in: {downloaded_path}")
    return downloaded_path


def remove_duplicate_lines(input_file_path, output_file_path):
    """Copy ``input_file_path`` to ``output_file_path`` without duplicate lines.

    The first occurrence of each line is kept and the original order is
    preserved. Lines are compared without their trailing newline, so a final
    line that lacks a newline still counts as a duplicate of an earlier
    identical line. Errors are reported on stdout and not raised.
    """
    try:
        # dicts preserve insertion order, so this acts as an ordered set:
        # key = line content, value = the line exactly as first seen.
        first_seen = {}
        with open(input_file_path, "r", encoding="utf-8") as input_file:
            for line in input_file:
                first_seen.setdefault(line.rstrip("\n"), line)
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.writelines(first_seen.values())
        print(f"Duplicates removed. Unique lines saved to '{output_file_path}'")
    except FileNotFoundError:
        print(f"Error: The file '{input_file_path}' was not found.")
    except (OSError, UnicodeError) as e:
        print(f"An error occurred: {e}")


def push_to_hf(repo_id, repo_type):
    """Upload the current working directory to the root of a Hugging Face repo."""
    api = _require(HfApi, "huggingface_hub")()
    print(f"Uploading current directory to: {repo_id}")
    # Upload everything in the current directory ('.') to the repo root
    api.upload_folder(
        folder_path=".",
        repo_id=repo_id,
        repo_type=repo_type,
        commit_message="Initial model upload",
    )
    print("Upload complete!")


def push_large_folder_to_hf(repo_id, repo_type):
    """Upload the current working directory with ``HfApi.upload_large_folder``.

    Common junk files and directories are excluded through ``ignore_patterns``.
    """
    api = _require(HfApi, "huggingface_hub")()
    print(f"Starting large folder upload to: {repo_id}")
    # upload_large_folder is used for resilience and speed on big uploads.
    api.upload_large_folder(
        folder_path=".",
        repo_id=repo_id,
        repo_type=repo_type,
        # Optional: ignore large junk files to save time
        ignore_patterns=[
            ".git/",
            "__pycache__/",
            "*.tmp",
            ".DS_Store",
            "*.cache",
            "*.trash",
        ],
    )
    print(
        "\nUpload complete! Progress was cached locally; if it failed, just run again to resume."
    )


def get_model_hash(model_path):
    """Return the MD5 hex digest identifying a model file.

    Only the last 10000 KiB of the file are hashed; files smaller than that
    are hashed in full.
    """
    with open(model_path, "rb") as f:
        size = f.seek(0, os.SEEK_END)
        # Clamp to the start of the file instead of relying on a failed
        # negative seek to detect small files.
        f.seek(max(size - _MODEL_HASH_TAIL_BYTES, 0))
        return hashlib.md5(f.read()).hexdigest()


def download_file_if_missing(url, local_path):
    """Download ``url`` to ``local_path`` unless that path already exists.

    The data is streamed to ``<local_path>.part`` and renamed into place only
    after the download completed, so an interrupted download never leaves a
    truncated file that later calls would mistake for a finished one.

    Raises whatever ``requests`` raises on network or HTTP errors.
    """
    print(f"Checking if {local_path} needs to be downloaded from {url}")
    if os.path.exists(local_path):
        print(f"{local_path} already exists. Skipping download.")
        return

    http = _require(requests, "requests")
    print(f"Downloading {url} to {local_path}")
    partial_path = f"{local_path}.part"
    try:
        with http.get(url, stream=True, timeout=10) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, local_path)
    finally:
        # After a successful os.replace the partial file no longer exists;
        # on any failure this removes the incomplete data.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded {url} to {local_path}")


def load_json_data(file_path):
    """Load and return the JSON document stored at ``file_path``.

    Exits the process with status 1 when the file does not exist.
    """
    print(f"Loading JSON data from {file_path}")
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"{file_path} not found.")
        sys.exit(1)
    print(f"Loaded JSON data successfully from {file_path}")
    return data


def iterate_and_hash(
    directory,
    vr_model_data_url,
    mdx_model_data_url,
    vr_model_data_local_path,
    mdx_model_data_local_path,
    output_path=None,
):
    """Hash every ``.pth``/``.onnx`` file under ``directory`` and write a JSON report.

    The two model-data JSON files are downloaded if missing and merged (MDX
    entries override VR entries with the same key). Each model file's hash is
    looked up in the merged data. The report is a list of
    ``{"file", "hash", "params"}`` objects sorted by file name.

    ``output_path`` selects the report location. When omitted, the
    module-level ``OUTPUT_PATH`` constant is used.
    """
    if output_path is None:
        # NOTE(review): OUTPUT_PATH is expected to be defined elsewhere in
        # this module; it is not visible in this part of the file.
        output_path = OUTPUT_PATH

    print(f"Iterating through directory {directory} to hash model files")
    model_files = [
        (file, os.path.join(root, file))
        for root, _, files in os.walk(directory)
        for file in files
        if file.endswith((".pth", ".onnx"))
    ]

    download_file_if_missing(vr_model_data_url, vr_model_data_local_path)
    download_file_if_missing(mdx_model_data_url, mdx_model_data_local_path)
    vr_model_data = load_json_data(vr_model_data_local_path)
    mdx_model_data = load_json_data(mdx_model_data_local_path)
    combined_model_params = {
        **vr_model_data,
        **mdx_model_data,
    }

    model_info_list = []
    for file, file_path in sorted(model_files):
        file_hash = get_model_hash(file_path)
        model_info_list.append(
            {
                "file": file,
                "hash": file_hash,
                "params": combined_model_params.get(file_hash, "Parameters not found"),
            }
        )

    print(f"Writing model info list to {output_path}")
    with open(output_path, "w", encoding="utf-8") as json_file:
        json.dump(model_info_list, json_file, indent=4)
    print(f"Successfully wrote model info list to {output_path}")


def _link_extension(link):
    """Return the lower-cased file extension of a link, ignoring query and fragment."""
    try:
        path = urlsplit(link).path
    except ValueError:
        # Malformed URL: fall back to treating the whole text as a path.
        path = link
    return os.path.splitext(path.lower())[1]


def sort_links_by_extension(input_file, output_file):
    """Write the links from ``input_file`` to ``output_file`` sorted by file type.

    Blank lines are dropped. Links are ordered by the priority of their
    extension (``.json``, ``.yaml``, ``.th``, ``.pth``, ``.ckpt``, ``.onnx``,
    then everything else) and alphabetically within the same priority. The
    extension is taken from the URL path, so a suffix such as
    ``?download=true`` does not hide it.
    """
    # Custom priority order; unknown extensions sort last (index 100).
    priority = {
        ".json": 0,
        ".yaml": 1,
        ".th": 2,
        ".pth": 3,
        ".ckpt": 4,
        ".onnx": 5,
    }

    def sort_key(link):
        return priority.get(_link_extension(link), 100), link

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            links = [stripped for line in f if (stripped := line.strip())]
    except FileNotFoundError:
        print(f"Error: The file '{input_file}' was not found.")
        return

    try:
        with open(output_file, "w", encoding="utf-8") as f:
            f.writelines(f"{link}\n" for link in sorted(links, key=sort_key))
    except OSError as e:
        print(f"Error: could not write '{output_file}': {e}")
        return
    print(f"Successfully sorted links into: {output_file}")


# 1. Load the JSON data
# Ensure 'models.json' is in your current directory
def get_links_from_json(file_input):
    try:
        with open(file_input, "r") as file:
            data = json.load(file)
    except FileNotFoundError:
        print("Error: 'models.json' not found.")
        data = {}
    # 2. Process and Download
    for model_name, links in data.items():
        if not isinstance(links, list) or len(links) == 0:
            continue