Spaces:

tfrere
/

leaderboard-parser-agent

Build error

App Files Files Community

leaderboard-parser-agent / src /file_utils.py

tfrere HF Staff

first commit

0821095 11 months ago

raw

history blame contribute delete

10.6 kB

	"""
	Utilities for file management.
	"""
	import json
	import os
	import datetime
	import shutil
	import time
	import random
	import tempfile
	import logging
	from filelock import FileLock

	# Module-level logger named "leaderboard-parser", used by the helpers in this
	# file to report warnings and errors.
	logger = logging.getLogger("leaderboard-parser")

	def save_results(results, file_path):
	    """
	    Save results to a JSON file.

	    The payload is serialized in memory before the file is opened, so a
	    serialization failure (for example a value json cannot encode) leaves
	    any existing file at file_path untouched instead of truncated.

	    Args:
	        results: The results to save (must be JSON-serializable)
	        file_path: The path to the file

	    Raises:
	        TypeError: If results contains a value json cannot serialize
	        ValueError: If results contains a circular reference
	        OSError: If the file cannot be opened or written
	    """
	    # Serialize first: open(..., "w") truncates immediately, so dumping
	    # straight into the handle would destroy the old content on failure.
	    payload = json.dumps(results, indent=2)

	    # Explicit encoding keeps the file readable by the utf-8 reader used
	    # elsewhere in this module, regardless of the platform default.
	    with open(file_path, "w", encoding="utf-8") as f:
	        f.write(payload)


	def create_category_slug(category_name):
	    """
	    Build the slug for a category name.

	    The name is lowercased and every space or underscore becomes a hyphen,
	    so the resulting slug never contains an underscore.

	    Args:
	        category_name: The category name

	    Returns:
	        The category slug, or "" when category_name is empty or None
	    """
	    if category_name:
	        slug = category_name.lower()
	        # Both separators map to a hyphen; underscores are reserved for
	        # joining the slug with a uid.
	        for separator in (" ", "_"):
	            slug = slug.replace(separator, "-")
	        return slug
	    return ""


	def create_combined_id(category, uid):
	    """
	    Build the combined identifier for a leaderboard.

	    The category is first normalized with create_category_slug, then joined
	    to the uid with a single underscore.

	    Args:
	        category: The category name
	        uid: The UID of the leaderboard

	    Returns:
	        The combined identifier in the format category_slug_uid
	    """
	    slug = create_category_slug(category)
	    return "_".join((slug, format(uid)))


	def validate_leaderboard_result(result):
	    """
	    Validate a leaderboard result and repair its identifiers in place.

	    Checks performed:
	    1. The result is a dict
	    2. 'original_uid' and 'category' are present
	    3. 'category' is normalized (rewritten with its slug if not)
	    4. 'uid' equals the combination of the normalized category and
	       original_uid (added or overwritten if missing or different)

	    Args:
	        result: The leaderboard result to validate (dict); it is modified
	            in place when a correction is needed

	    Returns:
	        The same dict after corrections, or None if validation is impossible
	    """
	    if not isinstance(result, dict):
	        logger.error("Validation error: the result is not a dictionary")
	        return None

	    # Both fields are mandatory; report the first one that is missing
	    for required_field in ("original_uid", "category"):
	        if required_field not in result:
	            logger.error(f"Validation error: {required_field} missing from result")
	            return None

	    raw_category = result["category"]
	    slug = create_category_slug(raw_category)
	    if slug != raw_category:
	        logger.warning(f"Category not normalized: '{raw_category}' -> '{slug}'")
	        result["category"] = slug

	    # The uid is always derived from the normalized category and original_uid
	    expected_uid = create_combined_id(slug, result["original_uid"])

	    if "uid" in result:
	        current_uid = result["uid"]
	        if current_uid != expected_uid:
	            logger.warning(f"uid inconsistent: '{current_uid}' does not match '{expected_uid}', correction applied")
	            result["uid"] = expected_uid
	    else:
	        logger.warning(f"uid missing, adding calculated uid: {expected_uid}")
	        result["uid"] = expected_uid

	    return result


	def load_and_validate_results(file_path):
	    """
	    Loads results from the file without strict validation.

	    Two layouts are accepted: a JSON array of result objects, or a JSON
	    object with a "leaderboards" mapping of uid -> result (converted to a
	    list, with each mapping key stored as the entry's "uid").

	    Entries that are not JSON objects are skipped with a warning, and the
	    sort falls back to a text comparison when "category" / "original_uid"
	    values are null or of mixed types. Without this, one malformed entry
	    would make the whole file load as an empty list.

	    Args:
	        file_path: Path to the results file

	    Returns:
	        List of results sorted by (category, original_uid), or empty list
	        in case of error
	    """
	    try:
	        # Load results from the file
	        try:
	            with open(file_path, "r", encoding="utf-8") as f:
	                results_data = json.load(f)
	        except (FileNotFoundError, json.JSONDecodeError) as e:
	            logger.warning(f"Unable to load file {file_path}: {str(e)}")
	            return []

	        # Convert from dict with "leaderboards" to array if necessary
	        if isinstance(results_data, dict) and "leaderboards" in results_data:
	            leaderboards = results_data["leaderboards"]
	            if isinstance(leaderboards, dict):
	                # Non-dict items are passed through here and filtered below
	                results_data = [
	                    {**item, "uid": uid} if isinstance(item, dict) else item
	                    for uid, item in leaderboards.items()
	                ]
	            else:
	                # Not a mapping: rejected by the list check below
	                results_data = None

	        # Ensure results_data is a list
	        if not isinstance(results_data, list):
	            logger.warning(f"Invalid data format in {file_path}, initializing empty list")
	            return []

	        # Drop entries that cannot be treated as results instead of failing
	        valid_results = [entry for entry in results_data if isinstance(entry, dict)]
	        skipped = len(results_data) - len(valid_results)
	        if skipped:
	            logger.warning(f"Skipped {skipped} non-object entries in {file_path}")
	        results_data = valid_results

	        # Sort results
	        try:
	            results_data.sort(key=lambda x: (x.get("category", ""), x.get("original_uid", "")))
	        except TypeError:
	            # Null or mixed-type keys cannot be compared with each other;
	            # order by their text form so the data is still returned.
	            results_data.sort(
	                key=lambda x: tuple(
	                    "" if x.get(field) is None else str(x.get(field))
	                    for field in ("category", "original_uid")
	                )
	            )

	        logger.info(f"Load successful: {len(results_data)} results")
	        return results_data

	    except Exception as e:
	        logger.error(f"Error loading results: {str(e)}")
	        return []


	def update_leaderboard_result(leaderboard_result, file_path, max_wait_seconds=30):
	    """
	    Updates a leaderboard result in the specified file.
	    If an entry with the same uid already exists, it is updated.
	    Otherwise, a new entry is added.

	    The read-modify-write cycle runs under a file lock ("<file_path>.lock"),
	    and the new content is written to a temporary file in the same
	    directory and then swapped in with os.replace.

	    Args:
	        leaderboard_result: The leaderboard result to update (must contain a uid)
	        file_path: Path to the results file
	        max_wait_seconds: Maximum wait time for file lock (in seconds)

	    Returns:
	        Updated results list or None in case of error
	    """
	    if not leaderboard_result or "uid" not in leaderboard_result:
	        logger.error("Unable to update: invalid or missing leaderboard result or uid")
	        return None

	    # Create parent directory if necessary. A bare filename has an empty
	    # dirname and os.makedirs("") raises, so fall back to the current dir.
	    parent_dir = os.path.dirname(file_path) or "."
	    os.makedirs(parent_dir, exist_ok=True)

	    # Use a lock to avoid concurrent writes
	    lock_path = f"{file_path}.lock"
	    lock = FileLock(lock_path, timeout=max_wait_seconds)

	    try:
	        with lock:
	            # Load existing results
	            current_results = load_and_validate_results(file_path)

	            # Index by uid for easy update (entries without a uid are not kept)
	            results_by_uid = {r["uid"]: r for r in current_results if "uid" in r}

	            # Update or add result
	            uid = leaderboard_result["uid"]
	            if uid in results_by_uid:
	                # Merge the new fields into the existing entry
	                results_by_uid[uid].update(leaderboard_result)
	                logger.info(f"Result updated for uid: {uid}")
	            else:
	                results_by_uid[uid] = leaderboard_result
	                logger.info(f"New result added for uid: {uid}")

	            # Convert to list for writing
	            updated_results = list(results_by_uid.values())

	            # Sort results
	            try:
	                updated_results.sort(key=lambda x: (x.get("category", ""), x.get("original_uid", "")))
	            except TypeError:
	                # Null or mixed-type keys cannot be compared with each
	                # other; order by their text form instead of aborting.
	                updated_results.sort(
	                    key=lambda x: tuple(
	                        "" if x.get(field) is None else str(x.get(field))
	                        for field in ("category", "original_uid")
	                    )
	                )

	            # Write to a temporary file in the target directory, then swap it
	            # in. NOTE: mkstemp creates the file accessible only by its owner,
	            # and that mode carries over to file_path after the replace.
	            fd, temp_path = tempfile.mkstemp(dir=parent_dir)
	            try:
	                with os.fdopen(fd, "w", encoding="utf-8") as f:
	                    json.dump(updated_results, f, indent=2, ensure_ascii=False)

	                # os.replace overwrites an existing destination atomically
	                # (same filesystem), unlike a move that may copy then delete.
	                os.replace(temp_path, file_path)
	            except Exception:
	                # Clean up the temporary file, then let the outer handler report
	                if os.path.exists(temp_path):
	                    os.unlink(temp_path)
	                raise

	            logger.info(f"File updated successfully: {file_path}")
	            return updated_results

	    except Exception as e:
	        logger.error(f"Error updating file {file_path}: {str(e)}")
	        return None


	def split_combined_id(combined_id):
	    """
	    Break a combined identifier (category_uid) into category and uid.

	    Only the first underscore "_" acts as the separator; any further
	    underscores stay in the uid part.

	    Args:
	        combined_id: The combined identifier (category_uid)

	    Returns:
	        A tuple (category, uid); (None, combined_id) if there is no
	        underscore; (None, None) if combined_id is empty or None
	    """
	    if not combined_id:
	        return None, None

	    category, separator, uid = combined_id.partition("_")
	    if separator:
	        return category, uid

	    # No underscore found: the whole value is a uid without category
	    return None, combined_id


	def format_datetime(dt_str):
	    """
	    Render a datetime (or datetime string) in a human readable format.

	    A datetime object is formatted directly. A string is parsed with
	    datetime.fromisoformat first, then with a list of fallback patterns.

	    Args:
	        dt_str: The datetime string (or datetime object) to format

	    Returns:
	        The formatted datetime string, or dt_str unchanged when it cannot
	        be parsed or formatted
	    """
	    # Patterns tried in order when fromisoformat rejects the input
	    fallback_patterns = (
	        "%Y-%m-%dT%H:%M:%S.%f%z",
	        "%Y-%m-%dT%H:%M:%S.%f",
	        "%Y-%m-%dT%H:%M:%S%z",
	        "%Y-%m-%dT%H:%M:%S",
	        "%Y-%m-%d %H:%M:%S",
	        "%Y-%m-%d",
	    )

	    try:
	        if isinstance(dt_str, datetime.datetime):
	            # Already a datetime object, nothing to parse
	            parsed = dt_str
	        else:
	            try:
	                parsed = datetime.datetime.fromisoformat(dt_str)
	            except ValueError:
	                parsed = None
	                for pattern in fallback_patterns:
	                    try:
	                        parsed = datetime.datetime.strptime(dt_str, pattern)
	                    except ValueError:
	                        continue
	                    break
	                if parsed is None:
	                    # No pattern matched: hand the input back untouched
	                    return dt_str

	        return parsed.strftime("%d/%m/%Y à %H:%M:%S")
	    except (ValueError, TypeError) as e:
	        print(f"Error formatting date {dt_str}: {e}")
	        return dt_str


	def clean_output_files(results_file):
	    """
	    Reset the results file to an empty JSON list, keeping a backup.

	    When the file already exists it is first copied to
	    "<results_file>.backup"; the results file is then rewritten as [].

	    Args:
	        results_file: The results file to clean
	    """
	    # Preserve the previous content before overwriting it
	    if os.path.exists(results_file):
	        backup_path = f"{results_file}.backup"
	        shutil.copy2(results_file, backup_path)
	        print(f"Backup of {results_file} created in {backup_path}")

	    # Write an empty list as the new content
	    empty_payload = json.dumps([], indent=2)
	    with open(results_file, "w") as handle:
	        handle.write(empty_payload)
	    print(f"File {results_file} cleaned")