import hashlib
import json
import os
import subprocess
import sys
from collections import defaultdict

import requests
from huggingface_hub import HfApi, snapshot_download


def calculate_file_hash(filepath, block_size=65536):
    """Calculate the SHA256 hash of a file's contents."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, "rb") as f:
            while chunk := f.read(block_size):
                sha256.update(chunk)
    except FileNotFoundError:
        return None  # The file may have been deleted mid-scan
    return sha256.hexdigest()
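
# A minimal sketch (hypothetical filenames): byte-identical files hash to the
# same digest regardless of name, which is what the dedup pass below relies on.
# Left commented out so importing this module stays side-effect free.
# assert calculate_file_hash("model.pth") == calculate_file_hash("model (1).pth")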


def find_and_remove_duplicates(directory="."):
    """Find duplicate files in the given directory and keep the one with the shortest filename."""
    hashes_to_files = defaultdict(list)

    # Step 1: Hash every regular file in the directory
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):
            file_hash = calculate_file_hash(filepath)
            if file_hash:
                hashes_to_files[file_hash].append(filepath)

    # Step 2: Identify duplicate groups (more than one file per hash)
    duplicates = {h: files for h, files in hashes_to_files.items() if len(files) > 1}
    if not duplicates:
        print("No duplicate files found.")
        return

    # Step 3: Within each group, keep the shortest filename and delete the rest
    for file_hash, file_list in duplicates.items():
        # Sort by path length (ascending); since all paths share the same
        # directory prefix, this orders by filename length. Ties keep an
        # arbitrary file.
        files_sorted_by_length = sorted(file_list, key=len)
        file_to_keep = files_sorted_by_length[0]
        files_to_delete = files_sorted_by_length[1:]

        print(f"\nDuplicate group (hash: {file_hash[:10]}...):")
        print(f"  Keeping: {file_to_keep}")
        for file_to_delete in files_to_delete:
            try:
                os.remove(file_to_delete)
                print(f"  Deleted: {file_to_delete} (longer filename)")
            except OSError as e:
                print(f"  Error deleting {file_to_delete}: {e}")


def download_files_from_txt(filename, local_dir):
    """Download every URL listed in `filename` (one per line) with aria2c."""
    command = [
        "aria2c",
        "--input-file",
        filename,
        "--dir",
        local_dir,
        "-c",        # Continue partially downloaded files
        "-j", "30",  # Max concurrent downloads (adjust as needed)
        "-x", "16",  # Max connections per server (adjust as needed)
    ]
    print(f"Starting downloads with aria2c in directory: {os.path.abspath(local_dir)}")
    try:
        # Execute the command
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        print("All downloads finished successfully.")
    except subprocess.CalledProcessError as e:
        print(f"An error occurred during aria2c execution: {e.stderr}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        # os.remove(filename)
        print(f"Finished processing input file: {filename}")


def download_hf_repo(repo_id, local_dir, repo_type, token):
    """
    Download an entire Hugging Face repository to a specified local directory.
    """
    if not token:
        token = os.getenv("HF_TOKEN")

    print(f"Downloading {repo_id} to {local_dir}...")
    # Ensure the target directory exists
    os.makedirs(local_dir, exist_ok=True)

    # Download the snapshot
    downloaded_path = snapshot_download(
        repo_id=repo_id,
        local_dir=local_dir,
        token=token,
        local_dir_use_symlinks=False,  # Write real files into local_dir instead of symlinks
        repo_type=repo_type,
    )
    print(f"Download complete! Files are located in: {downloaded_path}")
    return downloaded_path
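
# Example (hypothetical repo id): fetch a model repository, falling back to
# the HF_TOKEN environment variable when no token is passed.
# download_hf_repo("someuser/some-model", "./some-model", "model", token=None)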


def remove_duplicate_lines(input_file_path, output_file_path):
    """
    Read lines from input_file_path, remove duplicates, and write the
    unique lines to output_file_path while preserving their order.
    """
    try:
        # Dict keys preserve insertion order (Python 3.7+), so a dict
        # doubles as an ordered set here; duplicate lines are ignored.
        unique_lines_dict = {}
        with open(input_file_path, "r") as input_file:
            for line in input_file:
                unique_lines_dict[line] = None

        unique_lines = unique_lines_dict.keys()
        with open(output_file_path, "w") as output_file:
            output_file.writelines(unique_lines)

        print(f"Duplicates removed. Unique lines saved to '{output_file_path}'")
    except FileNotFoundError:
        print(f"Error: The file '{input_file_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")


def push_to_hf(repo_id, repo_type):
    api = HfApi()
    print(f"Uploading current directory to: {repo_id}")
    # Upload everything in the current directory ('.') to the repo root
    api.upload_folder(
        folder_path=".",
        repo_id=repo_id,
        repo_type=repo_type,
        commit_message="Initial model upload",
    )
    print("Upload complete!")


def push_large_folder_to_hf(repo_id, repo_type):
    api = HfApi()
    print(f"Starting large folder upload to: {repo_id}")
    # upload_large_folder is resilient and fast: it handles multi-threading
    # and caches progress locally so an interrupted run can resume.
    api.upload_large_folder(
        folder_path=".",
        repo_id=repo_id,
        repo_type=repo_type,
        # Optional: ignore large junk files to save time
        ignore_patterns=[
            ".git/",
            "__pycache__/",
            "*.tmp",
            ".DS_Store",
            "*.cache",
            "*.trash",
        ],
    )
    print(
        "\nUpload complete! Progress was cached locally; if it failed, just run again to resume."
    )
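
# Example (hypothetical repo id): both upload helpers read credentials from
# the ambient huggingface_hub login (e.g. `huggingface-cli login` or HF_TOKEN).
# push_large_folder_to_hf("someuser/some-dataset", "dataset")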


def get_model_hash(model_path):
    """
    Get the hash of a model file by MD5-hashing roughly its last 10 MB.
    """
    try:
        with open(model_path, "rb") as f:
            # Move the file pointer to ~10 MB before the end of the file
            f.seek(-10000 * 1024, 2)
            return hashlib.md5(f.read()).hexdigest()
    except IOError:
        # Files smaller than ~10 MB cannot seek that far back; hash the whole file
        with open(model_path, "rb") as f:
            return hashlib.md5(f.read()).hexdigest()


def download_file_if_missing(url, local_path):
    """
    Download a file from a URL if it doesn't exist locally.
    """
    print(f"Checking if {local_path} needs to be downloaded from {url}")
    if not os.path.exists(local_path):
        print(f"Downloading {url} to {local_path}")
        with requests.get(url, stream=True, timeout=10) as r:
            r.raise_for_status()
            with open(local_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"Downloaded {url} to {local_path}")
    else:
        print(f"{local_path} already exists. Skipping download.")


def load_json_data(file_path):
    """
    Load JSON data from a file.
    """
    print(f"Loading JSON data from {file_path}")
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
        print(f"Loaded JSON data successfully from {file_path}")
        return data
    except FileNotFoundError:
        print(f"{file_path} not found.")
        sys.exit(1)


def iterate_and_hash(
    directory,
    vr_model_data_url,
    mdx_model_data_url,
    vr_model_data_local_path,
    mdx_model_data_local_path,
    output_path,
):
    """
    Iterate through a directory, hash all model files, and write the
    collected model info as JSON to output_path.
    """
    print(f"Iterating through directory {directory} to hash model files")
    model_files = [
        (file, os.path.join(root, file))
        for root, _, files in os.walk(directory)
        for file in files
        if file.endswith((".pth", ".onnx"))
    ]

    download_file_if_missing(vr_model_data_url, vr_model_data_local_path)
    download_file_if_missing(mdx_model_data_url, mdx_model_data_local_path)

    vr_model_data = load_json_data(vr_model_data_local_path)
    mdx_model_data = load_json_data(mdx_model_data_local_path)
    combined_model_params = {
        **vr_model_data,
        **mdx_model_data,
    }

    # Look up each model's parameters by its content hash
    model_info_list = []
    for file, file_path in sorted(model_files):
        file_hash = get_model_hash(file_path)
        model_info = {
            "file": file,
            "hash": file_hash,
            "params": combined_model_params.get(file_hash, "Parameters not found"),
        }
        model_info_list.append(model_info)

    print(f"Writing model info list to {output_path}")
    with open(output_path, "w", encoding="utf-8") as json_file:
        json.dump(model_info_list, json_file, indent=4)
    print(f"Successfully wrote model info list to {output_path}")


def sort_links_by_extension(input_file, output_file):
    """Sort the links in input_file by extension priority and write them to output_file."""
    # Custom priority order: config files first, then model weights
    priority = {
        ".json": 0,
        ".yaml": 1,
        ".th": 2,
        ".pth": 3,
        ".ckpt": 4,
        ".onnx": 5,
    }
    try:
        with open(input_file, "r") as f:
            # Read lines and strip whitespace/newlines
            links = [line.strip() for line in f if line.strip()]

        def sort_key(link):
            # Extract the extension (case-insensitive)
            _, ext = os.path.splitext(link.lower())
            # Return the priority index; unknown extensions sort last (index 100)
            return priority.get(ext, 100), link

        # Sort the links
        sorted_links = sorted(links, key=sort_key)

        with open(output_file, "w") as f:
            for link in sorted_links:
                f.write(link + "\n")

        print(f"Successfully sorted links into: {output_file}")
    except FileNotFoundError:
        print(f"Error: The file '{input_file}' was not found.")


def get_links_from_json(file_input):
    """Load a model-name -> link-list mapping from a JSON file and process each entry."""
    # 1. Load the JSON data
    try:
        with open(file_input, "r") as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"Error: '{file_input}' not found.")
        data = {}

    # 2. Process and download
    for model_name, links in data.items():
        # Skip entries whose value is not a non-empty list of links
        if not isinstance(links, list) or len(links) == 0:
            continue