import hashlib
import json
import os
import subprocess
import sys
from collections import defaultdict

import requests
from huggingface_hub import HfApi, snapshot_download


def calculate_file_hash(filepath, block_size=65536):
    """Calculate the SHA256 hash of a file's contents."""
    sha256 = hashlib.sha256()
    try:
        with open(filepath, "rb") as f:
            while chunk := f.read(block_size):
                sha256.update(chunk)
    except FileNotFoundError:
        return None  # The file may have been deleted mid-scan
    return sha256.hexdigest()
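
# A minimal sketch (hypothetical filenames): byte-identical files hash to the
# same digest regardless of name, which is what the dedup pass below relies on.
# Left commented out so importing this module stays side-effect free.
# assert calculate_file_hash("model.pth") == calculate_file_hash("model (1).pth")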


def find_and_remove_duplicates(directory="."):
    """Find duplicate files in the given directory and keep the one with the shortest filename."""
    hashes_to_files = defaultdict(list)

    # Step 1: Hash every regular file in the directory
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):
            file_hash = calculate_file_hash(filepath)
            if file_hash:
                hashes_to_files[file_hash].append(filepath)

    # Step 2: Identify duplicate groups (more than one file per hash)
    duplicates = {h: files for h, files in hashes_to_files.items() if len(files) > 1}
    if not duplicates:
        print("No duplicate files found.")
        return

    # Step 3: Within each group, keep the shortest filename and delete the rest
    for file_hash, file_list in duplicates.items():
        # Sort by path length (ascending); since all paths share the same
        # directory prefix, this orders by filename length. Ties keep an
        # arbitrary file.
        files_sorted_by_length = sorted(file_list, key=len)
        file_to_keep = files_sorted_by_length[0]
        files_to_delete = files_sorted_by_length[1:]

        print(f"\nDuplicate group (hash: {file_hash[:10]}...):")
        print(f"  Keeping: {file_to_keep}")
        for file_to_delete in files_to_delete:
            try:
                os.remove(file_to_delete)
                print(f"  Deleted: {file_to_delete} (longer filename)")
            except OSError as e:
                print(f"  Error deleting {file_to_delete}: {e}")


def download_files_from_txt(filename, local_dir):
    """Download every URL listed in `filename` (one per line) with aria2c."""
    command = [
        "aria2c",
        "--input-file",
        filename,
        "--dir",
        local_dir,
        "-c",        # Continue partially downloaded files
        "-j", "30",  # Max concurrent downloads (adjust as needed)
        "-x", "16",  # Max connections per server (adjust as needed)
    ]
    print(f"Starting downloads with aria2c in directory: {os.path.abspath(local_dir)}")
    try:
        # Execute the command
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        print("All downloads finished successfully.")
    except subprocess.CalledProcessError as e:
        print(f"An error occurred during aria2c execution: {e.stderr}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        # os.remove(filename)
        print(f"Finished processing input file: {filename}")


def download_hf_repo(repo_id, local_dir, repo_type, token):
    """
    Download an entire Hugging Face repository to a specified local directory.
    """
    if not token:
        token = os.getenv("HF_TOKEN")

    print(f"Downloading {repo_id} to {local_dir}...")
    # Ensure the target directory exists
    os.makedirs(local_dir, exist_ok=True)

    # Download the snapshot
    downloaded_path = snapshot_download(
        repo_id=repo_id,
        local_dir=local_dir,
        token=token,
        local_dir_use_symlinks=False,  # Write real files into local_dir instead of symlinks
        repo_type=repo_type,
    )
    print(f"Download complete! Files are located in: {downloaded_path}")
    return downloaded_path
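
# Example (hypothetical repo id): fetch a model repository, falling back to
# the HF_TOKEN environment variable when no token is passed.
# download_hf_repo("someuser/some-model", "./some-model", "model", token=None)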


def remove_duplicate_lines(input_file_path, output_file_path):
    """
    Read lines from input_file_path, remove duplicates, and write the
    unique lines to output_file_path while preserving their order.
    """
    try:
        # Dict keys preserve insertion order (Python 3.7+), so a dict
        # doubles as an ordered set here; duplicate lines are ignored.
        unique_lines_dict = {}
        with open(input_file_path, "r") as input_file:
            for line in input_file:
                unique_lines_dict[line] = None

        unique_lines = unique_lines_dict.keys()
        with open(output_file_path, "w") as output_file:
            output_file.writelines(unique_lines)

        print(f"Duplicates removed. Unique lines saved to '{output_file_path}'")
    except FileNotFoundError:
        print(f"Error: The file '{input_file_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")


def push_to_hf(repo_id, repo_type):
    api = HfApi()
    print(f"Uploading current directory to: {repo_id}")
    # Upload everything in the current directory ('.') to the repo root
    api.upload_folder(
        folder_path=".",
        repo_id=repo_id,
        repo_type=repo_type,
        commit_message="Initial model upload",
    )
    print("Upload complete!")


def push_large_folder_to_hf(repo_id, repo_type):
    api = HfApi()
    print(f"Starting large folder upload to: {repo_id}")
    # upload_large_folder is resilient and fast: it handles multi-threading
    # and caches progress locally so an interrupted run can resume.
    api.upload_large_folder(
        folder_path=".",
        repo_id=repo_id,
        repo_type=repo_type,
        # Optional: ignore large junk files to save time
        ignore_patterns=[
            ".git/",
            "__pycache__/",
            "*.tmp",
            ".DS_Store",
            "*.cache",
            "*.trash",
        ],
    )
    print(
        "\nUpload complete! Progress was cached locally; if it failed, just run again to resume."
    )
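
# Example (hypothetical repo id): both upload helpers read credentials from
# the ambient huggingface_hub login (e.g. `huggingface-cli login` or HF_TOKEN).
# push_large_folder_to_hf("someuser/some-dataset", "dataset")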


def get_model_hash(model_path):
    """
    Get the hash of a model file by MD5-hashing roughly its last 10 MB.
    """
    try:
        with open(model_path, "rb") as f:
            # Move the file pointer to ~10 MB before the end of the file
            f.seek(-10000 * 1024, 2)
            return hashlib.md5(f.read()).hexdigest()
    except IOError:
        # Files smaller than ~10 MB cannot seek that far back; hash the whole file
        with open(model_path, "rb") as f:
            return hashlib.md5(f.read()).hexdigest()


def download_file_if_missing(url, local_path):
    """
    Download a file from a URL if it doesn't exist locally.
    """
    print(f"Checking if {local_path} needs to be downloaded from {url}")
    if not os.path.exists(local_path):
        print(f"Downloading {url} to {local_path}")
        with requests.get(url, stream=True, timeout=10) as r:
            r.raise_for_status()
            with open(local_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"Downloaded {url} to {local_path}")
    else:
        print(f"{local_path} already exists. Skipping download.")


def load_json_data(file_path):
    """
    Load JSON data from a file.
    """
    print(f"Loading JSON data from {file_path}")
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
        print(f"Loaded JSON data successfully from {file_path}")
        return data
    except FileNotFoundError:
        print(f"{file_path} not found.")
        sys.exit(1)


def iterate_and_hash(
    directory,
    vr_model_data_url,
    mdx_model_data_url,
    vr_model_data_local_path,
    mdx_model_data_local_path,
    output_path,
):
    """
    Iterate through a directory, hash all model files, and write the
    collected model info as JSON to output_path.
    """
    print(f"Iterating through directory {directory} to hash model files")
    model_files = [
        (file, os.path.join(root, file))
        for root, _, files in os.walk(directory)
        for file in files
        if file.endswith((".pth", ".onnx"))
    ]

    download_file_if_missing(vr_model_data_url, vr_model_data_local_path)
    download_file_if_missing(mdx_model_data_url, mdx_model_data_local_path)

    vr_model_data = load_json_data(vr_model_data_local_path)
    mdx_model_data = load_json_data(mdx_model_data_local_path)
    combined_model_params = {
        **vr_model_data,
        **mdx_model_data,
    }

    # Look up each model's parameters by its content hash
    model_info_list = []
    for file, file_path in sorted(model_files):
        file_hash = get_model_hash(file_path)
        model_info = {
            "file": file,
            "hash": file_hash,
            "params": combined_model_params.get(file_hash, "Parameters not found"),
        }
        model_info_list.append(model_info)

    print(f"Writing model info list to {output_path}")
    with open(output_path, "w", encoding="utf-8") as json_file:
        json.dump(model_info_list, json_file, indent=4)
    print(f"Successfully wrote model info list to {output_path}")


def sort_links_by_extension(input_file, output_file):
    """Sort the links in input_file by extension priority and write them to output_file."""
    # Custom priority order: config files first, then model weights
    priority = {
        ".json": 0,
        ".yaml": 1,
        ".th": 2,
        ".pth": 3,
        ".ckpt": 4,
        ".onnx": 5,
    }
    try:
        with open(input_file, "r") as f:
            # Read lines and strip whitespace/newlines
            links = [line.strip() for line in f if line.strip()]

        def sort_key(link):
            # Extract the extension (case-insensitive)
            _, ext = os.path.splitext(link.lower())
            # Return the priority index; unknown extensions sort last (index 100)
            return priority.get(ext, 100), link

        # Sort the links
        sorted_links = sorted(links, key=sort_key)

        with open(output_file, "w") as f:
            for link in sorted_links:
                f.write(link + "\n")

        print(f"Successfully sorted links into: {output_file}")
    except FileNotFoundError:
        print(f"Error: The file '{input_file}' was not found.")


def get_links_from_json(file_input):
    """Load a model-name -> link-list mapping from a JSON file and process each entry."""
    # 1. Load the JSON data
    try:
        with open(file_input, "r") as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"Error: '{file_input}' not found.")
        data = {}

    # 2. Process and download
    for model_name, links in data.items():
        # Skip entries whose value is not a non-empty list of links
        if not isinstance(links, list) or len(links) == 0:
            continue