# ai-deadlines / .github/scripts/update_conferences_new.py
# Author: nielsr (HF Staff) — commit f3cc2d7 "Refactor conference data"
import yaml
import requests
import os
import re
from datetime import datetime
from typing import Dict, List, Any
def fetch_conference_files() -> List[Dict[str, Any]]:
    """Fetch all conference YAML files from the ccfddl repository.

    Lists the ``conference/AI`` directory via the GitHub contents API and
    downloads every ``.yml`` file in it.

    Returns:
        One dict per conference — the first element of each YAML file's
        top-level list (files holding anything else are skipped).

    Raises:
        requests.HTTPError: If the API listing or a file download fails.
    """
    # First get the directory listing from the GitHub API.
    api_url = "https://api.github.com/repos/ccfddl/ccf-deadlines/contents/conference/AI"
    # Timeouts keep a stalled connection from hanging the CI job forever;
    # raise_for_status fails loudly instead of parsing an error page.
    response = requests.get(api_url, timeout=30)
    response.raise_for_status()
    files = response.json()

    conferences = []
    for file in files:
        if file['name'].endswith('.yml'):
            file_response = requests.get(file['download_url'], timeout=30)
            file_response.raise_for_status()
            conf_data = yaml.safe_load(file_response.text)
            # Each file holds a list with a single conference entry.
            if isinstance(conf_data, list) and conf_data:
                conferences.append(conf_data[0])
    return conferences
def parse_date_range(date_str: str, year: str) -> tuple[str, str]:
    """Parse a conference date string into ISO-formatted (start, end) dates.

    Handles formats such as:
      * ``"May 19, 2025"``        — single day
      * ``"June 9-12"``           — range within one month
      * ``"April 29-May 4"``      — range crossing a month boundary
      * ``"Jul 11 - Jul 15"``     — spaced range with abbreviations

    Args:
        date_str: Human-readable date or date range.
        year: Four-digit year as a string, applied to both endpoints.

    Returns:
        Tuple of ``(start, end)`` formatted as ``YYYY-MM-DD``.

    Raises:
        ValueError: If the string cannot be parsed.
    """
    # Remove the year if it appears at the end of the string.
    date_str = date_str.replace(f", {year}", "")

    try:
        # Split into start and end dates.
        if ' - ' in date_str:
            start, end = date_str.split(' - ')
        elif '-' in date_str:
            start, end = date_str.split('-')
        else:
            # Single-date format like "May 19".
            start = end = date_str

        # Month abbreviation -> full name. Whole-word regex replacement below
        # makes entry order irrelevant ('Sep' cannot match inside 'Sept').
        month_map = {
            'Sept': 'September',
            'Jan': 'January',
            'Feb': 'February',
            'Mar': 'March',
            'Apr': 'April',
            'Jun': 'June',
            'Jul': 'July',
            'Aug': 'August',
            'Sep': 'September',
            'Oct': 'October',
            'Nov': 'November',
            'Dec': 'December'
        }

        # All recognizable month tokens (abbreviated and full). 'May' must be
        # added explicitly: it has no abbreviation, so it appears nowhere in
        # month_map and ranges ending in May were previously mis-handled.
        all_months = set(month_map) | set(month_map.values()) | {'May'}

        # Handle cases like "June 9-12": if the end part carries no month
        # name, reuse the start's month.
        has_month = any(month in end for month in all_months)
        if not has_month:
            start_parts = start.split()
            if len(start_parts) >= 1:
                end = f"{start_parts[0]} {end.strip()}"

        # Expand abbreviations as whole words only. Plain str.replace would
        # corrupt already-full names (e.g. "March" -> "Marchch").
        for abbr, full in month_map.items():
            pattern = rf'\b{abbr}\b'
            start = re.sub(pattern, full, start)
            end = re.sub(pattern, full, end)

        # Collapse any extra whitespace.
        start = ' '.join(start.split())
        end = ' '.join(end.split())

        start_date = datetime.strptime(f"{start}, {year}", "%B %d, %Y")
        end_date = datetime.strptime(f"{end}, {year}", "%B %d, %Y")

        return start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')
    except Exception as e:
        raise ValueError(f"Could not parse date: {date_str} ({e})")
def transform_conference_data(conferences: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Transform ccfddl-format conference records into our format.

    For each conference, picks the first instance whose year is the current
    year or later, flattens its fields into our schema, splits ``place``
    into city/country, and derives ISO start/end dates from the date string.

    Args:
        conferences: Raw ccfddl conference dicts (as fetched upstream).

    Returns:
        Transformed conference dicts; conferences with no current/upcoming
        instance are dropped.
    """
    transformed = []
    current_year = datetime.now().year

    for conf in conferences:
        # Get the most recent or upcoming conference instance.
        # NOTE(review): picks the FIRST instance with year >= current, so it
        # assumes 'confs' is sorted ascending by year — confirm upstream.
        recent_conf = None
        for instance in conf.get('confs', []):
            if instance['year'] >= current_year:
                recent_conf = instance
                break
        if not recent_conf:
            continue

        # Guard against a present-but-empty 'timeline' list: the previous
        # `recent_conf.get('timeline', [{}])[0]` raised IndexError because
        # the .get default only applies when the key is missing entirely.
        timeline = (recent_conf.get('timeline') or [{}])[0]

        transformed_conf = {
            'title': conf.get('title', ''),
            'year': recent_conf['year'],
            'id': recent_conf['id'],
            'full_name': conf.get('description', ''),
            'link': recent_conf.get('link', ''),
            'deadline': timeline.get('deadline', ''),
            'timezone': recent_conf.get('timezone', ''),
            'date': recent_conf.get('date', ''),
            'tags': [],  # We'll need to maintain a mapping for tags
        }

        # ccfddl has a single 'place' string; split into city/country fields.
        place = recent_conf.get('place', '')
        if place:
            if ',' in place:
                city, country = place.split(',', 1)
                transformed_conf['city'] = city.strip()
                transformed_conf['country'] = country.strip()
            else:
                # Cannot split reliably — record it as the country only.
                transformed_conf['country'] = place.strip()

        # Optional fields.
        if 'abstract_deadline' in timeline:
            transformed_conf['abstract_deadline'] = timeline['abstract_deadline']

        # Parse the human-readable date range into ISO start/end dates;
        # date parsing failures are non-fatal (warn and continue).
        try:
            if transformed_conf['date']:
                start_date, end_date = parse_date_range(
                    transformed_conf['date'],
                    str(transformed_conf['year'])
                )
                transformed_conf['start'] = start_date
                transformed_conf['end'] = end_date
        except Exception as e:
            print(f"Warning: Could not parse date for {transformed_conf['title']}: {e}")

        # Flatten rankings (e.g. {'ccf': 'A'}) into a display string.
        if 'rank' in conf:
            rankings = [f"{rank_type.upper()}: {rank_value}"
                        for rank_type, rank_value in conf['rank'].items()]
            if rankings:
                transformed_conf['rankings'] = ', '.join(rankings)

        transformed.append(transformed_conf)
    return transformed
def load_all_current_conferences() -> Dict[str, List[Dict[str, Any]]]:
    """Load every per-conference YAML file, grouped by conference title.

    Returns:
        Mapping of conference title -> list of conference entries, built
        from the files in ``src/data/conferences``; empty dict when the
        directory does not exist.
    """
    conferences_dir = 'src/data/conferences'
    if not os.path.exists(conferences_dir):
        return {}

    conference_groups: Dict[str, List[Dict[str, Any]]] = {}
    yaml_files = [name for name in os.listdir(conferences_dir) if name.endswith('.yml')]
    for name in yaml_files:
        with open(os.path.join(conferences_dir, name), 'r') as handle:
            entries = yaml.safe_load(handle)
        if entries:
            # Every entry in a file shares one title; key the group on it.
            conference_groups[entries[0]['title']] = entries
    return conference_groups
def create_filename_from_title(title: str) -> str:
    """Build a filesystem-safe slug from a conference title.

    Lowercases the title, drops every character other than letters, digits,
    whitespace, ``&``, parentheses and ``-``, turns runs of whitespace into
    single underscores, spells ``&`` out as ``and``, and trims any leading
    or trailing underscores.
    """
    slug = title.lower()
    slug = re.sub(r'[^a-zA-Z0-9\s&()-]', '', slug)  # drop unsafe characters
    slug = re.sub(r'\s+', '_', slug)                # whitespace -> underscores
    slug = slug.replace('&', 'and').strip('_')
    return slug
def update_conference_loader():
    """Regenerate ``src/utils/conferenceLoader.ts`` from the YAML files on disk.

    Emits one TypeScript import per conference YAML file plus a combined
    ``allConferencesData`` array, so the frontend picks up new files
    without manual edits.
    """
    conferences_dir = 'src/data/conferences'
    loader_path = 'src/utils/conferenceLoader.ts'

    # Collect the conference YAML files in stable (sorted) order.
    conference_files = []
    if os.path.exists(conferences_dir):
        for filename in sorted(os.listdir(conferences_dir)):
            if filename.endswith('.yml'):
                conference_files.append(filename)

    # Generate import statements. Hyphens are invalid in TS identifiers,
    # so they become underscores in the variable name.
    imports = []
    variable_names = []
    for filename in conference_files:
        var_name = filename.replace('.yml', '').replace('-', '_') + 'Data'
        variable_names.append(var_name)
        # Bug fix: the import path must reference the actual file; it
        # previously emitted a literal "(unknown)" placeholder.
        imports.append(f"import {var_name} from '@/data/conferences/{filename}';")

    # Generate the loader file content.
    loader_content = f"""import {{ Conference }} from '@/types/conference';
// Import all conference YAML files
{chr(10).join(imports)}
// Combine all conference data into a single array
const allConferencesData: Conference[] = [
{chr(10).join(f' ...{var_name},' for var_name in variable_names)}
];
export default allConferencesData;"""

    # Write the loader file.
    with open(loader_path, 'w') as f:
        f.write(loader_content)

    print(f"Updated conference loader with {len(conference_files)} conference files")
def main():
    """Sync local per-conference YAML files with upstream ccfddl data.

    Fetches conference metadata from the ccfddl repository, merges it with
    the locally curated files (hand-maintained fields win over fetched
    data), rewrites each per-conference YAML file, and regenerates the
    TypeScript loader. Re-raises any error after logging it so CI fails.
    """
    try:
        # Load current conferences from individual files.
        current_conference_groups = load_all_current_conferences()

        # Fetch and transform new data.
        new_conferences = fetch_conference_files()
        if not new_conferences:
            print("Warning: No conferences fetched from ccfddl")
            return

        transformed_conferences = transform_conference_data(new_conferences)
        if not transformed_conferences:
            print("Warning: No conferences transformed")
            return

        # Create the conferences directory if it doesn't exist.
        conferences_dir = 'src/data/conferences'
        os.makedirs(conferences_dir, exist_ok=True)

        # Group new conferences by title.
        new_conference_groups = {}
        for conf in transformed_conferences:
            title = conf['title']
            if title not in new_conference_groups:
                new_conference_groups[title] = []
            new_conference_groups[title].append(conf)

        # Update each conference group.
        updated_count = 0
        for title, new_confs in new_conference_groups.items():
            filename = create_filename_from_title(title) + '.yml'
            filepath = os.path.join(conferences_dir, filename)

            # Index current entries by conference id so we can merge by id.
            current_confs = current_conference_groups.get(title, [])
            current_conf_dict = {conf['id']: conf for conf in current_confs}

            for new_conf in new_confs:
                if new_conf['id'] in current_conf_dict:
                    # Update existing conference; hand-curated fields in the
                    # local file take precedence over freshly fetched data.
                    curr_conf = current_conf_dict[new_conf['id']]
                    preserved_fields = [
                        'tags', 'venue', 'hindex', 'submission_deadline',
                        'timezone_submission', 'rebuttal_period_start',
                        'rebuttal_period_end', 'final_decision_date',
                        'review_release_date', 'commitment_deadline',
                        'start', 'end', 'note', 'city', 'country', 'deadlines'
                    ]
                    for field in preserved_fields:
                        if field in curr_conf:
                            new_conf[field] = curr_conf[field]
                    # Preserve existing rankings if available.
                    if 'rankings' in curr_conf:
                        new_conf['rankings'] = curr_conf['rankings']
                    current_conf_dict[new_conf['id']] = new_conf
                else:
                    # Brand-new conference instance; nothing to merge with.
                    current_conf_dict[new_conf['id']] = new_conf

            # Convert back to a list and sort by year (missing years last).
            all_confs = list(current_conf_dict.values())
            all_confs.sort(key=lambda x: x.get('year', 9999))

            # Write this conference group to its own file.
            with open(filepath, 'w') as f:
                yaml.dump(all_confs, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

            updated_count += 1
            # Bug fix: report the actual filename; this previously printed a
            # literal "(unknown)" placeholder.
            print(f"Updated {filename} with {len(all_confs)} entries")

        # Regenerate the TypeScript loader to match the files on disk.
        update_conference_loader()
        print(f"Successfully updated {updated_count} conference files")
    except Exception as e:
        print(f"Error: {e}")
        raise
# Script entry point: run the full sync when executed directly (e.g. by CI).
if __name__ == "__main__":
    main()