ai-deadlines

Running

App Files Files Community

ai-deadlines / .github /scripts /update_conferences.py

nielsr HF Staff

Ruff

004e138 2 months ago

raw

history blame contribute delete

10.4 kB

	import yaml
	import requests
	from datetime import datetime
	from typing import Dict, List, Any


	def fetch_conference_files() -> List[Dict[str, Any]]:
	"""Fetch all conference YAML files from ccfddl repository."""

	# First get the directory listing from GitHub API
	api_url = "https://api.github.com/repos/ccfddl/ccf-deadlines/contents/conference/AI"
	response = requests.get(api_url)
	files = response.json()

	conferences = []
	for file in files:
	if file["name"].endswith(".yml"):
	yaml_content = requests.get(file["download_url"]).text
	conf_data = yaml.safe_load(yaml_content)
	# The data is a list with a single item
	if isinstance(conf_data, list) and len(conf_data) > 0:
	conferences.append(conf_data[0])

	return conferences


	def parse_date_range(date_str: str, year: str) -> tuple[str, str]:
	"""Parse various date formats and return start and end dates."""
	# Remove the year if it appears at the end of the string
	date_str = date_str.replace(f", {year}", "")

	# Handle various date formats
	try:
	# Split into start and end dates
	if " - " in date_str:
	start, end = date_str.split(" - ")
	elif "-" in date_str:
	start, end = date_str.split("-")
	else:
	# For single date format like "May 19, 2025"
	start = end = date_str

	# Clean up month abbreviations
	month_map = {
	"Sept": "September", # Handle Sept before Sep
	"Jan": "January",
	"Feb": "February",
	"Mar": "March",
	"Apr": "April",
	"Jun": "June",
	"Jul": "July",
	"Aug": "August",
	"Sep": "September",
	"Oct": "October",
	"Nov": "November",
	"Dec": "December",
	}

	# Create a set of all month names (full and abbreviated)
	all_months = set(month_map.keys()) \| set(month_map.values())

	# Handle cases like "April 29-May 4"
	has_month = any(month in end for month in all_months)
	if not has_month:
	# End is just a day number, use start's month
	start_parts = start.split()
	if len(start_parts) >= 1:
	end = f"{start_parts[0]} {end.strip()}"

	# Replace month abbreviations
	for abbr, full in month_map.items():
	start = start.replace(abbr, full)
	end = end.replace(abbr, full)

	# Clean up any extra spaces
	start = " ".join(start.split())
	end = " ".join(end.split())

	# Parse start date
	start_date = datetime.strptime(f"{start}, {year}", "%B %d, %Y")

	# Parse end date
	end_date = datetime.strptime(f"{end}, {year}", "%B %d, %Y")

	return start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d")

	except Exception as e:
	raise ValueError(f"Could not parse date: {date_str} ({e})")


	def transform_conference_data(
	conferences: List[Dict[str, Any]],
	) -> List[Dict[str, Any]]:
	"""Transform ccfddl format to our format."""
	transformed = []
	current_year = datetime.now().year

	for conf in conferences:
	# Get the most recent or upcoming conference instance
	recent_conf = None
	if "confs" in conf:
	for instance in conf["confs"]:
	if instance["year"] >= current_year:
	recent_conf = instance
	break

	if not recent_conf:
	continue

	# Transform to our format
	transformed_conf = {
	"title": conf.get("title", ""),
	"year": recent_conf["year"],
	"id": recent_conf["id"],
	"full_name": conf.get("description", ""),
	"link": recent_conf.get("link", ""),
	"deadline": recent_conf.get("timeline", [{}])[0].get("deadline", ""),
	"timezone": recent_conf.get("timezone", ""),
	"date": recent_conf.get("date", ""),
	"tags": [], # We'll need to maintain a mapping for tags
	}

	# Handle city and country fields instead of place
	place = recent_conf.get("place", "")
	if place:
	# Try to parse the place into city and country if it contains a comma
	if "," in place:
	city, country = place.split(",", 1)
	transformed_conf["city"] = city.strip()
	transformed_conf["country"] = country.strip()
	else:
	# If we can't parse, just set the country
	transformed_conf["country"] = place.strip()

	# Add optional fields
	timeline = recent_conf.get("timeline", [{}])[0]
	if "abstract_deadline" in timeline:
	transformed_conf["abstract_deadline"] = timeline["abstract_deadline"]

	# Parse date range for start/end
	try:
	if transformed_conf["date"]:
	start_date, end_date = parse_date_range(
	transformed_conf["date"], str(transformed_conf["year"])
	)
	transformed_conf["start"] = start_date
	transformed_conf["end"] = end_date
	except Exception as e:
	print(f"Warning: Could not parse date for {transformed_conf['title']}: {e}")

	# Add rankings as separate field
	if "rank" in conf:
	rankings = []
	for rank_type, rank_value in conf["rank"].items():
	rankings.append(f"{rank_type.upper()}: {rank_value}")
	if rankings:
	transformed_conf["rankings"] = ", ".join(rankings)

	transformed.append(transformed_conf)

	return transformed


	def main():
	try:
	# Fetch current conferences.yml
	current_file = "src/data/conferences.yml"
	with open(current_file, "r") as f:
	current_conferences = yaml.safe_load(f)

	# Fetch and transform new data
	new_conferences = fetch_conference_files()
	if not new_conferences:
	print("Warning: No conferences fetched from ccfddl")
	return

	transformed_conferences = transform_conference_data(new_conferences)
	if not transformed_conferences:
	print("Warning: No conferences transformed")
	return

	# Create a dictionary of current conferences by ID
	current_conf_dict = {conf["id"]: conf for conf in current_conferences}

	# Create a set of existing conference title+year combinations to check for duplicates
	existing_conf_keys = {
	(conf["title"], conf["year"]) for conf in current_conferences
	}

	# Update or add new conferences while preserving existing ones
	for new_conf in transformed_conferences:
	# Check if this is a duplicate based on title and year
	conf_key = (new_conf["title"], new_conf["year"])

	# Skip if we already have a conference with this title and year but different ID
	if (
	conf_key in existing_conf_keys
	and new_conf["id"] not in current_conf_dict
	):
	print(
	f"Skipping duplicate conference: {new_conf['title']} {new_conf['year']} (ID: {new_conf['id']})"
	)
	continue

	if new_conf["id"] in current_conf_dict:
	# Update existing conference while preserving fields
	curr_conf = current_conf_dict[new_conf["id"]]

	# Preserve existing fields
	preserved_fields = [
	"tags",
	"venue",
	"hindex",
	"submission_deadline",
	"timezone_submission",
	"rebuttal_period_start",
	"rebuttal_period_end",
	"final_decision_date",
	"review_release_date",
	"commitment_deadline",
	"start",
	"end",
	"note",
	"city",
	"country", # Added city and country to preserved fields
	]
	for field in preserved_fields:
	if field in curr_conf:
	new_conf[field] = curr_conf[field]

	# If start/end not in current conference but we parsed them, keep the parsed ones
	if "start" not in curr_conf and "start" in new_conf:
	new_conf["start"] = new_conf["start"]
	if "end" not in curr_conf and "end" in new_conf:
	new_conf["end"] = new_conf["end"]

	# Preserve existing rankings if available
	if "rankings" in curr_conf:
	new_conf["rankings"] = curr_conf["rankings"]

	# Update the conference in the dictionary
	current_conf_dict[new_conf["id"]] = new_conf
	else:
	# Add new conference to the dictionary
	current_conf_dict[new_conf["id"]] = new_conf
	# Add to our set of existing conference keys
	existing_conf_keys.add(conf_key)

	# Convert back to list and sort by deadline
	all_conferences = list(current_conf_dict.values())
	all_conferences.sort(key=lambda x: x.get("deadline", "9999"))

	# Write back to file with newlines between conferences
	with open(current_file, "w") as f:
	for i, conf in enumerate(all_conferences):
	if i > 0:
	f.write("\n\n") # Add two newlines between conferences

	yaml_str = yaml.dump(
	[conf],
	allow_unicode=True,
	sort_keys=False,
	default_flow_style=False,
	explicit_start=False,
	explicit_end=False,
	width=float("inf"),
	indent=2,
	default_style=None,
	)
	f.write(yaml_str.rstrip()) # Remove trailing whitespace

	# Add final newline
	f.write("\n")

	print(f"Successfully updated {len(all_conferences)} conferences")

	except Exception as e:
	print(f"Error: {e}")
	raise


# Run the update only when executed as a script, not when imported.
if __name__ == "__main__":
    main()