# lainlives's picture
# Add files using upload-large-folder tool
# 34d5737 verified
import hashlib
import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
import requests
from huggingface_hub import HfApi, snapshot_download
def extract_urls_from_file(input_filename, output_filename):
    """
    Collect the URLs found in a text file and save the unique ones, sorted.

    A URL is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``. Both files are handled as UTF-8.
    Problems are reported with ``print``; nothing is raised to the caller.

    Args:
        input_filename (str): Text file to scan.
        output_filename (str): File that receives one URL per line.
    """
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    try:
        with open(input_filename, "r", encoding="utf-8") as source:
            text = source.read()

        # A set drops repeated matches; sorting gives a stable output order.
        distinct_urls = sorted(set(url_pattern.findall(text)))

        with open(output_filename, "w", encoding="utf-8") as sink:
            sink.write("".join(f"{link}\n" for link in distinct_urls))

        print(
            f"Found {len(distinct_urls)} unique URLs and saved them to {output_filename}"
        )
    except FileNotFoundError:
        print(f"Error: The file '{input_filename}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
def remove_chars_from_file(input_filename, chars_to_remove):
    """
    Strip every occurrence of the given substrings from a text file, in place.

    The file is read and rewritten as UTF-8, the same encoding
    ``extract_urls_from_file`` uses, so the result does not depend on the
    platform's default locale encoding. Problems are reported with ``print``;
    nothing is raised to the caller.

    Args:
        input_filename (str): The name of the text file to rewrite.
        chars_to_remove (list): Strings to delete (e.g., [',', '"', '}']).
            Entries are removed with ``str.replace``, so multi-character
            entries work as well.
    """
    try:
        with open(input_filename, "r", encoding="utf-8") as file:
            content = file.read()

        for char in chars_to_remove:
            content = content.replace(char, "")

        with open(input_filename, "w", encoding="utf-8") as file:
            file.write(content)

        print(
            f"Successfully removed characters {chars_to_remove} from {input_filename}"
        )
    except FileNotFoundError:
        print(f"Error: The file '{input_filename}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
def calculate_file_hash(filepath, block_size=65536):
    """Return the SHA256 hex digest of a file's content, or None if the file is missing."""
    digest = hashlib.sha256()
    try:
        with open(filepath, "rb") as stream:
            # Read fixed-size blocks until read() returns b"" at end of file.
            for block in iter(lambda: stream.read(block_size), b""):
                digest.update(block)
    except FileNotFoundError:
        # The file may have been deleted while a scan was in progress.
        return None
    return digest.hexdigest()
def find_and_remove_duplicates(directory="."):
    """
    Delete duplicate files in ``directory``, keeping the shortest filename.

    Only regular files directly inside ``directory`` are examined (no
    recursion). Files are grouped by the hash returned by
    ``calculate_file_hash``; within each group the file with the shortest
    name is kept and the others are removed. Ties on length are broken
    alphabetically, so the outcome does not depend on directory listing order.

    Args:
        directory (str): Directory to scan. Defaults to the current directory.
    """
    hashes_to_files = defaultdict(list)

    # Step 1: hash every regular file in the directory.
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            continue
        file_hash = calculate_file_hash(filepath)
        if file_hash:  # None means the file disappeared while scanning
            hashes_to_files[file_hash].append(filepath)

    # Step 2: keep only the hashes shared by more than one file.
    duplicates = {h: files for h, files in hashes_to_files.items() if len(files) > 1}
    if not duplicates:
        print("No duplicate files found.")
        return

    def keep_order(path):
        # Rank by filename (not full path) length, then by name for ties.
        name = os.path.basename(path)
        return len(name), name

    # Step 3: in each group keep the first-ranked file and delete the rest.
    for file_hash, file_list in duplicates.items():
        file_to_keep, *files_to_delete = sorted(file_list, key=keep_order)
        print(f"\nDuplicate group (Hash: {file_hash[:10]}...):")
        print(f" Keeping: {file_to_keep}")
        for file_to_delete in files_to_delete:
            try:
                os.remove(file_to_delete)
                print(f" Deleted: {file_to_delete} (longer filename)")
            except OSError as e:
                print(f" Error deleting {file_to_delete}: {e}")
def download_file(url, local_dir):
    """
    Download a single URL into ``local_dir`` and describe the outcome.

    The local filename is the last path segment of the URL with any query
    string removed (e.g. https://example.com/file.jpg -> file.jpg), falling
    back to "downloaded_file" when that segment is empty.

    Args:
        url (str): Address to fetch.
        local_dir (str): Existing directory that receives the file.

    Returns:
        str: A "Successfully downloaded: ..." or "Failed to download ..."
        message. Errors are folded into the message rather than raised, so
        the function can be used as a thread-pool task.
    """
    try:
        filename = url.split("/")[-1].split("?")[0] or "downloaded_file"
        save_path = os.path.join(local_dir, filename)

        # The context manager releases the streamed connection even when
        # writing fails part-way through.
        with requests.get(url, stream=True, timeout=10) as response:
            response.raise_for_status()
            with open(save_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        return f"Successfully downloaded: {filename}"
    except Exception as e:
        return f"Failed to download {url}: {e}"
def download_files_from_txt(filename, local_dir, max_workers=20):
    """
    Download every URL listed in a text file using a pool of threads.

    Args:
        filename (str): Text file with one URL per line; blank lines are
            skipped and surrounding whitespace is stripped.
        local_dir (str): Target directory; created if it does not exist.
        max_workers (int): Number of concurrent downloads. Defaults to 20.
    """
    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(local_dir, exist_ok=True)

    with open(filename, "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(download_file, url, local_dir) for url in urls]
        # Results are reported in submission order; each result() call blocks
        # until that particular download has finished.
        for future in futures:
            print(future.result())
def download_files_from_txt_aria(filename, local_dir):
    """
    Download every URL listed in ``filename`` by running ``aria2c``.

    aria2c's output is captured rather than shown; its stderr is printed only
    when the process exits with a non-zero status. All failures are reported
    with ``print`` and nothing is raised to the caller.

    Args:
        filename (str): Text file handed to aria2c via ``--input-file``.
        local_dir (str): Directory handed to aria2c via ``--dir``.
    """
    command = [
        "aria2c",
        "--input-file",
        filename,
        "--dir",
        local_dir,
        "-c",  # Continue downloading a partially downloaded file
        "-j",
        "30",  # Set max concurrent downloads (adjust as needed)
        "-x",
        "16",  # Set max connections per server (adjust as needed)
    ]
    print(f"Starting downloads with aria2c in directory: {os.path.abspath(local_dir)}")
    try:
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except FileNotFoundError:
        # Raised by subprocess when the aria2c executable cannot be located.
        print("aria2c executable was not found; install aria2 and ensure it is on PATH.")
    except subprocess.CalledProcessError as e:
        print(f"An error occurred during aria2c execution: {e.stderr}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    else:
        print("All downloads finished successfully.")
    finally:
        # Neutral wording: this line is printed after failures as well.
        print(f"Finished processing URL list: {filename}")
def download_hf_repo(repo_id, local_dir, repo_type, token):
    """
    Downloads an entire Hugging Face repository to a specified local directory.

    Args:
        repo_id (str): Repository identifier passed to ``snapshot_download``.
        local_dir (str): Destination directory; created if missing.
        repo_type (str): Repository type passed to ``snapshot_download``.
        token (str | None): Access token. When falsy, the ``HF_TOKEN``
            environment variable is used instead (which may also be unset).

    Returns:
        The path returned by ``snapshot_download``.
    """
    token = token or os.getenv("HF_TOKEN")

    print(f"Downloading {repo_id} to {local_dir}...")
    # Ensure the target directory exists
    os.makedirs(local_dir, exist_ok=True)

    downloaded_path = snapshot_download(
        repo_id=repo_id,
        local_dir=local_dir,
        token=token,
        # Set to False to ensure actual files (not symlinks) end up in local_dir.
        # NOTE(review): recent huggingface_hub releases appear to deprecate this
        # argument; confirm against the installed version before removing it.
        local_dir_use_symlinks=False,
        repo_type=repo_type,
    )
    print(f"Download complete! Files are located in: {downloaded_path}")
    return downloaded_path
def remove_duplicate_lines(input_file_path, output_file_path):
    """
    Copy a text file, dropping repeated lines while preserving first-seen order.

    Lines are compared without their trailing newline, so a final line that
    lacks a newline is still recognised as a duplicate of an earlier identical
    line. Every line written to the output ends with a newline. The input is
    read completely before the output is opened, so both arguments may name
    the same file. Problems are reported with ``print``; nothing is raised.

    Args:
        input_file_path (str): File to read.
        output_file_path (str): File that receives the unique lines.
    """
    try:
        with open(input_file_path, "r", encoding="utf-8") as input_file:
            # dict keys keep insertion order (Python 3.7+), which gives an
            # ordered set of the lines seen.
            unique_lines = dict.fromkeys(line.rstrip("\n") for line in input_file)

        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.writelines(f"{line}\n" for line in unique_lines)

        print(f"Duplicates removed. Unique lines saved to '{output_file_path}'")
    except FileNotFoundError:
        print(f"Error: The file '{input_file_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
def push_to_hf(repo_id, repo_type):
    """
    Upload the current working directory to the root of a Hugging Face repo.

    Args:
        repo_id (str): Target repository identifier.
        repo_type (str): Repository type forwarded to ``HfApi.upload_folder``.
    """
    client = HfApi()
    print(f"Uploading current directory to: {repo_id}")

    # Everything under '.' is sent in a single commit.
    upload_options = {
        "folder_path": ".",
        "repo_id": repo_id,
        "repo_type": repo_type,
        "commit_message": "Initial model upload",
    }
    client.upload_folder(**upload_options)
    print("Upload complete!")
def push_large_folder_to_hf(repo_id, repo_type):
    """
    Upload the current working directory with ``HfApi.upload_large_folder``.

    Args:
        repo_id (str): Target repository identifier.
        repo_type (str): Repository type forwarded to the upload call.
    """
    client = HfApi()
    print(f"Starting large folder upload to: {repo_id}")

    # Junk files and directories that are not worth uploading.
    skipped_patterns = [
        ".git/",
        "__pycache__/",
        "*.tmp",
        ".DS_Store",
        "*.cache",
        "*.trash",
    ]

    # upload_large_folder is used for resilience and speed; per the final
    # message below, an interrupted upload can be resumed by running again.
    client.upload_large_folder(
        folder_path=".",
        repo_id=repo_id,
        repo_type=repo_type,
        ignore_patterns=skipped_patterns,
    )
    print(
        "\nUpload complete! Progress was cached locally; if it failed, just run again to resume."
    )
def get_model_hash(model_path):
    """
    Return the MD5 hex digest that identifies a model file.

    Only the last 10000 * 1024 bytes (roughly 10 MB) of the file are hashed.
    If that attempt raises IOError (for instance when seeking that far back
    from the end is not possible), the whole file is hashed instead.
    """
    tail_size = 10000 * 1024
    try:
        with open(model_path, "rb") as model_file:
            # Position the pointer tail_size bytes before the end of the file.
            model_file.seek(-tail_size, os.SEEK_END)
            digest = hashlib.md5(model_file.read())
    except IOError:
        with open(model_path, "rb") as model_file:
            digest = hashlib.md5(model_file.read())
    return digest.hexdigest()
def download_file_if_missing(url, local_path):
    """
    Download a file from a URL if it doesn't exist locally.

    The data is first written to ``<local_path>.part`` and moved into place
    only after the transfer completes, so an interrupted download never leaves
    a truncated file that a later run would mistake for a finished one.

    Args:
        url (str): Address to fetch.
        local_path (str): Destination file path.

    Raises:
        Whatever ``requests`` or the file operations raise; errors are not
        swallowed here.
    """
    print(f"Checking if {local_path} needs to be downloaded from {url}")
    if os.path.exists(local_path):
        print(f"{local_path} already exists. Skipping download.")
        return

    print(f"Downloading {url} to {local_path}")
    partial_path = f"{local_path}.part"
    try:
        with requests.get(url, stream=True, timeout=10) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, local_path)
    finally:
        # After a successful replace the partial file no longer exists; it is
        # only still present (and removed here) when something failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded {url} to {local_path}")
def load_json_data(file_path):
    """
    Parse a UTF-8 JSON file and return its content.

    If the file does not exist, a message is printed and the process exits
    with status 1.
    """
    print(f"Loading JSON data from {file_path}")
    try:
        handle = open(file_path, "r", encoding="utf-8")
    except FileNotFoundError:
        print(f"{file_path} not found.")
        sys.exit(1)

    with handle:
        parsed = json.load(handle)
    print(f"Loaded JSON data successfully from {file_path}")
    return parsed
def iterate_and_hash(
    directory,
    vr_model_data_url,
    mdx_model_data_url,
    vr_model_data_local_path,
    mdx_model_data_local_path,
):
    """
    Hash every ``.pth`` / ``.onnx`` file under ``directory`` and write a JSON report.

    Each report entry holds the file name, its ``get_model_hash`` value and
    the parameters stored under that hash in the VR or MDX model data (MDX
    entries override VR entries that share a key), or the string
    "Parameters not found". The report is written to ``OUTPUT_PATH``, a
    module-level name that is not defined inside this function.
    """
    print(f"Iterating through directory {directory} to hash model files")

    # Collect (file name, full path) pairs for every model file in the tree.
    model_files = []
    for root, _, files in os.walk(directory):
        for name in files:
            if name.endswith((".pth", ".onnx")):
                model_files.append((name, os.path.join(root, name)))

    # Fetch both parameter tables if they are not available locally yet.
    download_file_if_missing(vr_model_data_url, vr_model_data_local_path)
    download_file_if_missing(mdx_model_data_url, mdx_model_data_local_path)

    vr_params = load_json_data(vr_model_data_local_path)
    mdx_params = load_json_data(mdx_model_data_local_path)
    params_by_hash = {**vr_params, **mdx_params}

    report = []
    for name, path in sorted(model_files):
        digest = get_model_hash(path)
        report.append(
            {
                "file": name,
                "hash": digest,
                "params": params_by_hash.get(digest, "Parameters not found"),
            }
        )

    print(f"Writing model info list to {OUTPUT_PATH}")
    with open(OUTPUT_PATH, "w", encoding="utf-8") as json_file:
        json.dump(report, json_file, indent=4)
    print(f"Successfully wrote model info list to {OUTPUT_PATH}")
def sort_links_by_extension(input_file, output_file):
    """
    Sort the links in a text file by file extension and write them out.

    Links are ordered .json, .yaml, .th, .pth, .ckpt, .onnx, then everything
    else; links with the same rank are ordered alphabetically. The extension
    is matched case-insensitively and any query string or fragment is ignored,
    so ``model.onnx?download=true`` is ranked as ``.onnx``.

    Args:
        input_file (str): Text file with one link per line; blank lines are
            skipped and surrounding whitespace is stripped.
        output_file (str): File that receives the sorted links, one per line.
    """
    # Custom priority order; lower numbers come first.
    priority = {
        ".json": 0,
        ".yaml": 1,
        ".th": 2,
        ".pth": 3,
        ".ckpt": 4,
        ".onnx": 5,
    }
    unknown_rank = 100  # extensions not listed above go to the end

    def sort_key(link):
        # Drop "?query" and "#fragment" before looking for the extension.
        path_part = link.lower().split("?", 1)[0].split("#", 1)[0]
        _, ext = os.path.splitext(path_part)
        return priority.get(ext, unknown_rank), link

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            links = [line.strip() for line in f if line.strip()]

        sorted_links = sorted(links, key=sort_key)

        with open(output_file, "w", encoding="utf-8") as f:
            f.writelines(f"{link}\n" for link in sorted_links)
        print(f"Successfully sorted links into: {output_file}")
    except FileNotFoundError:
        print(f"Error: The file '{input_file}' was not found.")
# 1. Load the JSON data
def get_links_from_json(file_input):
try:
with open(file_input, "r") as file:
data = json.load(file)
except FileNotFoundError:
print("Error: file not found.")
data = {}
# 2. Process and Download
for model_name, links in data.items():
if not isinstance(links, list) or len(links) == 0:
continue