| import requests | |
| from concurrent.futures import ThreadPoolExecutor | |
| from tqdm import tqdm | |
| import os | |
| import random | |
# Input list: a text file with one URL per line to be sized up.
file_path = "python_files.txt"
# Determine how large a remote file is without downloading it.
def get_file_size(url):
    """Return the remote file's size in bytes, or 0 on any failure.

    Issues a HEAD request (following redirects) and parses the
    Content-Length header. Network errors, timeouts, and a missing or
    non-numeric header all resolve to 0 rather than raising.
    """
    try:
        head = requests.head(url, allow_redirects=True, timeout=10)
        # Servers that omit Content-Length are counted as size 0.
        return int(head.headers.get("Content-Length", 0))
    except (requests.RequestException, ValueError):
        return 0
# Sum the remote sizes of a random sample of the URLs listed in a file.
def calculate_total_size(file_path, sample_size=5000):
    """Return the combined size in bytes of up to *sample_size* URLs
    sampled uniformly from *file_path* (one URL per line).

    Prints a message and returns 0 if *file_path* does not exist.
    Sizes are fetched concurrently via HEAD requests; URLs that fail
    or lack a Content-Length header contribute 0.

    *sample_size* generalizes the previously hard-coded 5000 cap and
    defaults to it, so existing callers are unaffected.
    """
    if not os.path.exists(file_path):
        print("File not found!")
        return 0
    with open(file_path, "r") as file:
        urls = [line.strip() for line in file if line.strip()]
    # Shuffle-then-truncate gives a uniform sample without replacement.
    random.shuffle(urls)
    urls = urls[:sample_size]
    # Threads overlap the network waits; tqdm shows per-URL progress.
    with ThreadPoolExecutor() as executor:
        file_sizes = list(
            tqdm(
                executor.map(get_file_size, urls),
                total=len(urls),
                desc="Processing URLs.",
            )
        )
    return sum(file_sizes)
# Script entry point: report the sampled total size in megabytes.
# Guarded so importing this module does not trigger thousands of
# network requests as a side effect.
if __name__ == "__main__":
    total_size = calculate_total_size(file_path)
    print(f"Total size of all files: {total_size / (1024 * 1024):.2f} MB")