Spaces:

Nicolas53
/

auto-reporting

Sleeping

auto-reporting / scripts-new /common /itu_api.py

MATIAS Nicolas

patch for discontinued Q

dcfa568 27 days ago

34.8 kB

	"""ITU website data fetching functions.

	Fetches organizational data from ITU web pages:
	- Meeting documents (contributions, TDs)
	- Question details and leadership
	- Study group structure
	- Working party management
	"""

	import pycurl
	import sys
	from io import BytesIO
	from urllib.parse import parse_qs, urlparse, unquote
	from bs4 import BeautifulSoup

	from common.models import (
	ValueAndLink, TableRow, AElement, Column, Row, Table,
	Role, Question, WorkingParty, WorkItem,
	QuestionStructure, WorkingPartyStructure, StudyGroupStructure,
	)

	# Module-level debug switch. Nothing in the visible code reads it —
	# TODO confirm whether another part of the project still depends on it.
	debug = False


	def _extract_source_from_url(href, fallback_text):
	"""Extract source name from URL 'source' parameter, with fallback to displayed text.

	The ITU website sometimes truncates displayed source names but keeps
	the full name in the URL's source filter parameter.
	"""
	if not href:
	return fallback_text
	try:
	parsed = urlparse(href)
	params = parse_qs(parsed.query)
	if 'source' in params and params['source']:
	return unquote(params['source'][0])
	except Exception:
	pass
	return fallback_text


	# ---------------------------------------------------------------------------
	# Meeting documents
	# ---------------------------------------------------------------------------

	def get_documents(document_type=None, group=None, working_party=None,
	                  questions=None, start=None):
	    """Fetch a meeting's document listing from the ITU meeting document system.

	    Requests ``meetingdoc.asp`` page by page (``PageLB`` offset) and turns
	    every listed document into a TableRow.

	    Args:
	        document_type: 'C' (Contributions), 'GEN', 'PLEN', or 'WP'.
	        group: Study group number (e.g. 17).
	        working_party: Working party number; only used to build the 'WP'
	            document type.
	        questions: Controls question filtering:
	            - int: the server is asked for that single question;
	            - list of numbers: no server-side filter is added; rows are
	              filtered locally, keeping rows related to any listed
	              question or to ``QALL/<group>``;
	            - any other non-None value (e.g. 'QALL'): the server is asked
	              for ``QALL/<group>``;
	            - None: no filtering.
	        start: Meeting start date string, YYYYMMDD or YYMMDD. Required in
	            practice: ``len(start)`` is evaluated unconditionally, so the
	            default of None raises TypeError.

	    Returns:
	        List of TableRow objects, or None when ``document_type`` is not one
	        of the supported values.
	    """
	    # Normalize start date: accept both YYYYMMDD (8 chars) and YYMMDD (6 chars)
	    if len(start) == 8:
	        # YYYYMMDD -> extract YY and convert to YYMMDD
	        year = int(start[2:4])
	        start = start[2:] # Strip first 2 digits (century)
	    else:
	        # YYMMDD format
	        year = int(start[0:2])
	    # Two-digit study period prefix used in the "T<period>-SG..." parent id:
	    # the year rounded down to a multiple of 4, plus 1.
	    # NOTE(review): get_work_programme in this module treats 2022-2024 as its
	    # own study period; this formula yields 21 for year 22 — confirm the
	    # prefix is right for meetings in 2020-2024.
	    period = str(int(year / 4) * 4 + 1)
	    url_base = (f"https://www.itu.int/md/meetingdoc.asp?lang=en"
	                f"&parent=T{period}-SG{group}-{start}-")

	    type_map = {'C': 'C', 'GEN': 'TD-GEN', 'PLEN': 'TD-PLEN',
	                'WP': f'TD-WP{working_party}'}
	    if document_type not in type_map:
	        return None
	    url_base += type_map[document_type]

	    # Names used for local filtering when `questions` is a list.
	    question_names = []
	    all_questions_name = f'QALL/{group}'

	    if questions is not None:
	        if isinstance(questions, list):
	            for q in questions:
	                question_names.append(f'Q{q}/{group}')
	        elif isinstance(questions, int):
	            url_base += f'&question=Q{questions}/{group}'
	        else:
	            url_base += f'&question=QALL/{group}'

	    # `first` is the PageLB offset: number of data rows consumed so far.
	    first = 0
	    table_rows = []

	    while True:
	        url = f"{url_base}&PageLB={first}"
	        response = _fetch_url(url, encoding='iso8859-2')
	        tables = _parse_html_tables(response)

	        # Stays 0 when the page holds no document table, which ends paging.
	        nrows = 0
	        for table in tables:
	            if not _is_document_table(table):
	                continue

	            # Data rows are indices 2 .. len-2 (see the range below), i.e.
	            # len - 3 rows; presumably two header rows and one footer row.
	            nrows = len(table.rows) - 3
	            if nrows <= 0:
	                break
	            first += nrows

	            for i in range(2, len(table.rows) - 1):
	                row = table.rows[i]
	                number = None
	                rev = None
	                title = None
	                source = None
	                related_questions = None

	                for col_idx, column in enumerate(row.columns):
	                    if col_idx == 1: # Number
	                        if column.aElements:
	                            # Prefer the <strong> text of the link, else its cleaned text.
	                            value = None
	                            if column.aElements[0].strongElements:
	                                value = column.aElements[0].strongElements[0]
	                            elif column.aElements[0].contents:
	                                value = _clean(column.aElements[0].get_text())
	                            href = column.aElements[0].href
	                            number = ValueAndLink(value, href)
	                        # First child of the first <font> element is taken as the revision marker.
	                        if column.fontElements:
	                            rev = column.fontElements[0][0]

	                    elif col_idx == 2: # Title
	                        if column.contents:
	                            # Keep the first content node up to the first carriage
	                            # return, then drop any trailing "[from ..." note.
	                            idx = column.contents[0].find("\r")
	                            title = column.contents[0][:idx] if idx >= 0 else column.contents[0]
	                            idx = title.find("[from ")
	                            if idx >= 0:
	                                title = title[:idx]

	                    elif col_idx == 3: # Source
	                        if column.aElements:
	                            href = column.aElements[0].href
	                            displayed_text = column.aElements[0].get_text()
	                            # The link's 'source' parameter wins over the displayed text.
	                            source_name = _extract_source_from_url(href, displayed_text)
	                            source = ValueAndLink(source_name, href)

	                    elif col_idx == 4: # Related questions
	                        related_questions = [
	                            ValueAndLink(a.get_text(), a.href)
	                            for a in column.aElements
	                        ]

	                # Filter by question if a list was provided
	                selected = True
	                if isinstance(questions, list):
	                    selected = False
	                    for q in (related_questions or []):
	                        # Documents addressed to all questions always pass.
	                        if q.value == all_questions_name:
	                            selected = True
	                            break
	                        for qn in question_names:
	                            if qn == q.value:
	                                selected = True
	                        if selected:
	                            break

	                if selected:
	                    table_rows.append(TableRow(
	                        number=number, rev=rev, title=title,
	                        source=source, questions=related_questions
	                    ))

	        # No data rows on this page (or no document table): paging is done.
	        if nrows <= 0:
	            break

	    return table_rows


	# ---------------------------------------------------------------------------
	# Question details
	# ---------------------------------------------------------------------------

	def get_question(group=None, question=None, start=None):
	    """Fetch question leadership data from the ITU LOQR page.

	    First locates the block of table rows belonging to ``Q<question>/<group>``,
	    then reads the spans of those rows into a Question and its Role entries.

	    Args:
	        group: Study group number
	        question: Question number
	        start: Start date string; only the first four characters (the year)
	            are used.

	    Returns:
	        Question object with roles, or None when no end row for the
	        question's block was found.
	    """
	    year = int(start[:4])
	    # "Period" query value derived from the year: (year - 1953) / 4, truncated.
	    period = str(int((year - 1953) / 4))
	    url = (f"https://www.itu.int/net4/ITU-T/lists/loqr.aspx"
	           f"?Group={group}&Period={period}")
	    question_name = f'Q{question}/{group}'

	    response = _fetch_url(url, encoding='utf-8')
	    soup = BeautifulSoup(response, "html.parser")

	    # Common prefix of the span/anchor ids inspected below.
	    prefix = 'ContentPlaceHolder1_dtlRappQues_'
	    selected_table = None
	    first_row = None
	    last_row = None

	    # Pass 1: find [first_row, last_row) — from the row whose question header
	    # span starts with question_name up to the next header span (or the last
	    # row of a table).
	    for table in soup.find_all("table"):
	        rows = table.find_all("tr")
	        if not isinstance(rows, list):
	            continue
	        for i in range(len(rows)):
	            row = rows[i]
	            for td in row.find_all("td"):
	                spans = td.find_all("span")
	                if not (isinstance(spans, list) and spans):
	                    continue
	                if 'id' not in spans[0].attrs:
	                    continue
	                span_id = spans[0].attrs['id']
	                if span_id.startswith(prefix + 'lblQWP_') or i == len(rows) - 1:
	                    if selected_table is None:
	                        # Still searching for the start of the wanted question.
	                        contents = spans[0].contents
	                        if (isinstance(contents, list) and contents and
	                                contents[0].startswith(question_name)):
	                            selected_table = table
	                            first_row = i
	                    else:
	                        # Start already found: this header/last row ends the block.
	                        last_row = i
	                        break
	            if last_row is not None:
	                break
	        if last_row is not None:
	            break

	    if last_row is None:
	        return None

	    question_details = Question(group=group, question=question)
	    rows = selected_table.find_all("tr")

	    # Pass 2: read the spans of each row in the block; the span id suffix
	    # decides which field the span's first child node fills.
	    for i in range(first_row, last_row):
	        row = rows[i]
	        role = Role()
	        for td in row.find_all("td"):
	            spans = td.find_all("span")
	            a_elements = td.find_all("a")
	            for span in spans:
	                if 'id' not in span.attrs or not span.contents:
	                    continue
	                span_id = span.attrs['id']
	                content = span.contents[0]

	                if span_id.startswith(prefix + 'lblQWP_'):
	                    # Header text contains "(WP<n>/..."; extract <n> when present.
	                    idx1 = content.find('(WP')
	                    if idx1 >= 0:
	                        idx2 = content.find('/', idx1 + 3)
	                        if idx2 > 0:
	                            try:
	                                question_details.workingParty = int(content[idx1 + 3:idx2])
	                            except ValueError:
	                                pass
	                elif span_id.startswith(prefix + 'lblQuestion69_'):
	                    question_details.title = content
	                elif span_id.startswith(prefix + 'lblFName_'):
	                    role.firstName = content
	                elif span_id.startswith(prefix + 'lblLName_'):
	                    role.lastName = content
	                elif span_id.startswith(prefix + 'lblRole_'):
	                    role.roleName = content
	                elif span_id.startswith(prefix + 'lblCompany_'):
	                    role.company = content
	                elif span_id.startswith(prefix + 'lblAddress_'):
	                    # Takes the second-to-last child node of the address span;
	                    # presumably the last node is a trailing <br/> — TODO confirm.
	                    if len(span.contents) > 1:
	                        role.address = span.contents[-2]
	                elif span_id.startswith(prefix + 'telLabel_'):
	                    role.tel = content
	                elif span_id.startswith(prefix + 'lblEmail_'):
	                    # The address itself is in a sibling anchor of the same cell,
	                    # with '@' written as '[at]'.
	                    for a_el in a_elements:
	                        if 'id' in a_el.attrs and a_el.attrs['id'].startswith(prefix + 'linkemail_'):
	                            if a_el.contents:
	                                role.email = a_el.contents[0].replace('[at]', '@')
	                    # The e-mail span closes one person's record: store it and
	                    # start a fresh Role for the next person.
	                    question_details.addRole(role)
	                    role = Role()

	    return question_details


	# ---------------------------------------------------------------------------
	# Study group structure
	# ---------------------------------------------------------------------------

	def get_study_group(group=None, start=None):
	    """Fetch the SG organizational hierarchy (WPs and Questions).

	    Walks every table row of the ``sgstructure.aspx`` page (skipping each
	    table's first row) and builds working parties and questions from the
	    ids of the spans found in the cells.

	    Args:
	        group: Study group number
	        start: Start date string (YYYY/MM/DD); only the first four
	            characters (the year) are used.

	    Returns:
	        StudyGroupStructure object.
	    """
	    year = int(start[:4])
	    # "Period" query value derived from the year: (year - 1953) / 4, truncated.
	    period = str(int((year - 1953) / 4))
	    url = (f"https://www.itu.int/net4/ITU-T/lists/sgstructure.aspx"
	           f"?Group={group}&Period={period}")

	    response = _fetch_url(url, encoding='utf-8')
	    soup = BeautifulSoup(response, "html.parser")

	    sg = StudyGroupStructure(group=group)
	    # Most recently seen working party / question; titles and questions found
	    # later are attached to these.
	    current_wp = None
	    current_q = None

	    for table in soup.find_all("table"):
	        rows = table.find_all("tr")
	        if not isinstance(rows, list):
	            continue
	        for i in range(1, len(rows)):
	            row = rows[i]
	            for td in row.find_all("td"):
	                # Per-cell markers so each kind of value is taken only once per cell.
	                wp = None
	                wp_title = None
	                title = None
	                question = None

	                spans = td.find_all("span")
	                if not (isinstance(spans, list) and spans):
	                    continue

	                for span in spans:
	                    if 'id' not in span.attrs:
	                        continue
	                    span_id = span.attrs['id']

	                    if 'lblQWP_' in span_id and question is None:
	                        # Question header: "Q<n>/<group>" inside a <strong>.
	                        strongs = span.find_all("strong")
	                        if strongs and strongs[0].contents:
	                            question = str(strongs[0].contents[0])
	                            if question.startswith('Q'):
	                                # Check if this question is discontinued (text in parent row)
	                                parent_row = span.find_parent('tr')
	                                if parent_row and 'discontinued' in parent_row.get_text().lower():
	                                    # Skip it and make sure a following title is
	                                    # not attached to the previous question.
	                                    current_q = None
	                                    continue
	                                pos = question.find('/')
	                                number = int(question[1:pos])
	                                current_q = QuestionStructure(number=number)
	                                if current_wp is not None:
	                                    current_wp.questions.append(current_q)
	                            else:
	                                current_q = None

	                    elif 'lblBlk' in span_id and wp is None:
	                        # Working party header: "WP<n>/<group>" inside a <strong>.
	                        strongs = span.find_all("strong")
	                        if strongs and strongs[0].contents:
	                            wp = str(strongs[0].contents[0])
	                            if wp.startswith('WP'):
	                                pos = wp.find('/')
	                                number = int(wp[2:pos])
	                                current_wp = WorkingPartyStructure(number=number)
	                                sg.workingParties.append(current_wp)
	                            else:
	                                # Non-WP block: following questions are not attached to any WP.
	                                current_wp = None

	                    elif 'lblQuestion' in span_id:
	                        # Title span: plain text is a question title, text wrapped
	                        # in <strong> is a working party title.
	                        strongs = span.find_all("strong")
	                        if not strongs and title is None:
	                            title = str(span.contents[0])
	                            if current_q is not None:
	                                current_q.title = title
	                        if strongs and wp_title is None and strongs[0].contents:
	                            wp_title = str(strongs[0].contents[0])
	                            if current_wp is not None:
	                                current_wp.title = wp_title

	    return sg


	# ---------------------------------------------------------------------------
	# Working party management
	# ---------------------------------------------------------------------------

	def get_working_party(group=None, working_party=None, questions=None, start=None):
	    """Collect the leadership roles of one working party from the ITU management page.

	    Every table row of ``mgmt.aspx`` (except each table's first row) is read
	    as one person. A person with a title, first name and last name becomes a
	    Role, stored on the working party when the row names this working party,
	    or in ``sg_roles`` when the row names no working party at all.

	    Args:
	        group: Study group number
	        working_party: Working party number
	        questions: List of question structures, passed through to WorkingParty
	        start: Start date string (YYYY/MM/DD); only the year is used

	    Returns:
	        WorkingParty object with roles.
	    """
	    # Span-id fragments in the order they are tested; the first hit decides
	    # which field receives the span text.
	    span_fields = (
	        ("lblFName", 'first_name'),
	        ("lblLName", 'last_name'),
	        ("lblWP", 'wp'),
	        ("lblTitle", 'title'),
	        ("lblSubrole", 'sub_role'),
	        ("lblCompany", 'company'),
	        ("lblAddress", 'address'),
	        ("lblTel", 'tel'),
	    )

	    wp_details = WorkingParty(
	        group=group, workingParty=working_party, questions=questions
	    )
	    wp_name = f"WP{working_party}/{group}"

	    period = str(int((int(start[:4]) - 1953) / 4))
	    url = (f"https://www.itu.int/net4/ITU-T/lists/mgmt.aspx"
	           f"?Group={group}&Period={period}")

	    soup = BeautifulSoup(_fetch_url(url, encoding='utf-8'), "html.parser")

	    for table in soup.find_all("table"):
	        rows = table.find_all("tr")
	        if not isinstance(rows, list):
	            continue

	        for row in rows[1:]:
	            person = dict.fromkeys(field for _, field in span_fields)
	            email = None

	            for td in row.find_all("td"):
	                for span in td.find_all("span"):
	                    if 'id' not in span.attrs or not span.contents:
	                        continue
	                    span_id = span.attrs['id']
	                    target = next(
	                        (field for marker, field in span_fields if marker in span_id),
	                        None,
	                    )
	                    # The last child that is not a line break wins.
	                    for content in span.contents:
	                        text = str(content)
	                        if text == '<br/>' or target is None:
	                            continue
	                        person[target] = text

	                for a_el in td.find_all("a"):
	                    if 'id' in a_el.attrs and "Email" in a_el.attrs['id']:
	                        email = str(a_el.contents[0]).replace('[at]', '@')

	            if not (person['title'] and person['first_name'] and person['last_name']):
	                continue

	            role = Role(
	                roleName=person['title'],
	                firstName=person['first_name'],
	                lastName=person['last_name'],
	                company=person['company'],
	                address=person['address'],
	                email=email,
	                tel=person['tel'],
	            )
	            wp = person['wp']
	            if wp is not None and wp.lower() == wp_name.lower():
	                wp_details.addRole(role)
	            elif wp is None or wp.strip() == '':
	                # SG-level roles (Counsellor, Project officer, etc.)
	                wp_details.sg_roles.append(role)

	    return wp_details


	# ---------------------------------------------------------------------------
	# Work programme
	# ---------------------------------------------------------------------------

	# Search page of the public ITU-T work programme; get_work_programme fetches
	# it as-is and again with isn_* query parameters appended.
	_WP_SEARCH_URL = "https://www.itu.int/ITU-T/workprog/wp_search.aspx"

	# Column headings of the work programme tabular view, in the same order in
	# which _parse_work_programme_table reads the cells by index.
	# NOTE(review): not referenced by the visible code — confirm it is still needed.
	_TABULAR_HEADERS = [
	    'Work item', 'Question', 'Equiv. Num.', 'Status', 'Timing',
	    'Approval process', 'Version', 'Liaison relationship',
	    'Subject/Title', 'Priority',
	]


	def get_work_programme(group=None, question=None, working_party=None, start=None):
	    """Scrape the work items of a working party (or one question) from the ITU work programme.

	    The search page identifies study period, study group, working party and
	    question by internal ISN values, so these are looked up from the page's
	    dropdowns first and then used to request the tabular view.

	    Args:
	        group: Study group number (e.g. 17)
	        question: Question number (e.g. 11), or None for all questions
	        working_party: Working party number (e.g. 1)
	        start: Start date string (YYYY/MM/DD); only the year is used

	    Returns:
	        List of WorkItem objects; an empty list when a required ISN cannot
	        be found.
	    """
	    # Map the year to the first year of its study period (e.g. 2026 -> 2025).
	    # Periods: 2001-2004, 2005-2008, ..., 2017-2020, 2022-2024, 2025-2028.
	    year = int(start[:4])
	    if year >= 2025:
	        period_start = 2025 + 4 * ((year - 2025) // 4)
	    elif year >= 2022:
	        period_start = 2022  # three-year period 2022-2024
	    elif year >= 2017:
	        period_start = 2017
	    else:
	        period_start = 2001 + 4 * ((year - 2001) // 4)
	    study_period_prefix = f"{period_start}-"

	    # Step 1: study period and study group ISNs from the plain search page.
	    landing = BeautifulSoup(_fetch_url(_WP_SEARCH_URL, encoding='utf-8'), 'html.parser')
	    isn_sp = _find_dropdown_isn(landing, 'study_period', study_period_prefix)
	    isn_sg = _find_dropdown_isn(landing, 'study_group', f'SG{group}:')
	    if isn_sp is None or isn_sg is None:
	        print(f"Could not find ISN for study period {study_period_prefix}* or SG{group}")
	        return []

	    # Step 2: working party and question ISNs, available once SP and SG are set.
	    scoped_url = f"{_WP_SEARCH_URL}?isn_sp={isn_sp}&isn_sg={isn_sg}"
	    scoped = BeautifulSoup(_fetch_url(scoped_url, encoding='utf-8'), 'html.parser')

	    isn_wp = _find_dropdown_isn(scoped, 'working_party', f'WP{working_party}/{group}:')
	    if isn_wp is None:
	        print(f"Could not find ISN for WP{working_party}/{group}")
	        return []

	    if question is None:
	        # -1 selects "any question" under the working party.
	        isn_qu = '-1'
	    else:
	        isn_qu = _find_dropdown_isn(scoped, 'question', f'Q{question}/{group}:')
	        if isn_qu is None:
	            print(f"Could not find ISN for Q{question}/{group}")
	            return []

	    # Step 3: tabular view with every status and a large page size.
	    results_url = (f"{_WP_SEARCH_URL}?isn_sp={isn_sp}&isn_sg={isn_sg}"
	                   f"&isn_wp={isn_wp}&isn_qu={isn_qu}"
	                   f"&isn_status=-1,8,1,3,7,2,4,6,5&pg_size=100&details=0&field=acdefghijo")
	    print(f" Fetching work programme from: {results_url}")
	    results = BeautifulSoup(_fetch_url(results_url, encoding='utf-8'), 'html.parser')

	    work_items = _parse_work_programme_table(results)

	    # Index every wp_item.aspx link on the page by its link text so items
	    # that came without a link can be matched by name.
	    detail_links = {}
	    for anchor in results.find_all('a'):
	        target = anchor.get('href', '')
	        if 'wp_item.aspx?isn=' not in target:
	            continue
	        label = anchor.get_text().strip()
	        if label:
	            detail_links[label] = target

	    if detail_links:
	        print(f" Found {len(detail_links)} detail page links: {list(detail_links.keys())[:5]}...")
	    else:
	        print(f" No detail page links found on search results page")

	    matched = 0
	    for item in work_items:
	        if item.workItem and not item.detailLink:
	            item.detailLink = detail_links.get(item.workItem, '')
	        if item.detailLink:
	            matched += 1
	    if detail_links:
	        print(f" Matched {matched}/{len(work_items)} work items to detail links")

	    return work_items


	def _find_dropdown_isn(soup, dropdown_name, match_prefix):
	    """Look up the ISN behind a dropdown entry.

	    Scans every ``<select>`` whose id contains ``dropdown_name`` and returns
	    the ``value`` of the first option whose stripped text starts with
	    ``match_prefix``. Options with value '-1' (or without a value) are
	    ignored. Returns None when nothing matches.
	    """
	    candidates = (
	        select for select in soup.find_all('select')
	        if dropdown_name in select.get('id', '')
	    )
	    for select in candidates:
	        for option in select.find_all('option'):
	            isn = option.get('value', '-1')
	            if isn != '-1' and option.get_text().strip().startswith(match_prefix):
	                return isn
	    return None


	def _parse_work_programme_table(soup):
	    """Extract WorkItem objects from the tabular view of a work programme page.

	    The first row (in any table with at least two rows) that has ten or more
	    cells, starts with 'Work item' and contains 'Status' is taken as the
	    header; all later rows of that table with ten or more cells become work
	    items. Returns an empty list when no such header exists.
	    """
	    cell_tags = ['td', 'th']

	    for table in soup.find_all('table'):
	        rows = table.find_all('tr')
	        if len(rows) < 2:
	            continue

	        for header_idx, candidate in enumerate(rows):
	            headings = [cell.get_text().strip() for cell in candidate.find_all(cell_tags)]
	            is_header = (len(headings) >= 10
	                         and headings[0] == 'Work item'
	                         and 'Status' in headings)
	            if not is_header:
	                continue

	            # Header found: everything below it in this table is data.
	            items = []
	            for data_row in rows[header_idx + 1:]:
	                cells = data_row.find_all(cell_tags)
	                vals = [cell.get_text().strip() for cell in cells]
	                if len(vals) < 10:
	                    continue

	                # Link to the detail page, when the name cell carries one.
	                anchor = cells[0].find('a') if cells else None
	                detail_link = anchor.get('href', '') if anchor else ''

	                items.append(WorkItem(
	                    # The name is the cell text up to the first space.
	                    workItem=vals[0].split(' ')[0],
	                    question=vals[1],
	                    title=vals[8],
	                    timing=vals[4],
	                    group=None,
	                    period=None,
	                    version=vals[6],
	                    status=vals[3],
	                    approvalProcess=vals[5],
	                    equivNum=vals[2],
	                    detailLink=detail_link,
	                ))
	            return items

	    return []


	def get_work_item_editors(work_item_details, max_workers=8):
	    """Look up the editors of each work item on its detail page.

	    Only items with a ``detailLink`` are fetched. The detail pages are
	    requested concurrently through a thread pool; a failed fetch is reported
	    on stdout and skipped.

	    Args:
	        work_item_details: Iterable of work items (``detailLink`` and
	            ``workItem`` attributes are read).
	        max_workers: Size of the thread pool.

	    Returns:
	        dict: work_item_name -> "Editor1, Editor2" (items without editors
	        are omitted).
	    """
	    from concurrent.futures import ThreadPoolExecutor, as_completed

	    linked = [item for item in work_item_details if item.detailLink]
	    if not linked:
	        return {}

	    editors = {}
	    with ThreadPoolExecutor(max_workers=max_workers) as pool:
	        pending = {}
	        for item in linked:
	            pending[pool.submit(_fetch_one_editor, item)] = item

	        for finished in as_completed(pending):
	            wi = pending[finished]
	            try:
	                names = finished.result()
	            except Exception as e:
	                print(f" editors for {wi.workItem} = [fetch failed: {e}]")
	                continue

	            if not names:
	                print(f" editors for {wi.workItem} = [none found]")
	                continue

	            joined = ", ".join(names)
	            editors[wi.workItem] = joined
	            print(f" editors for {wi.workItem} = {joined}")

	    return editors


	def _fetch_one_editor(wi):
	    """Return the editor names listed on one work item's detail page.

	    ``wi.detailLink`` may be absolute (an ``http://`` scheme is rewritten to
	    ``https://``) or relative to the work programme directory. Names are
	    taken from spans of the form "<name>, Editor": first from the cell next
	    to the "Contact..." cell, and, if that yields nothing, from any span on
	    the page.
	    """
	    marker = ', Editor'

	    link = wi.detailLink
	    if link.startswith('http'):
	        url = link.replace('http://', 'https://')
	    else:
	        url = "https://www.itu.int/ITU-T/workprog/" + link

	    soup = BeautifulSoup(_fetch_url(url, encoding='utf-8'), 'html.parser')

	    # Strategy 1: the first cell of the Contact(s) row that is not the label cell.
	    contact_cell = None
	    for td in soup.find_all('td'):
	        if not td.get_text().strip().startswith('Contact'):
	            continue
	        tr = td.find_parent('tr')
	        if tr:
	            contact_cell = next(
	                (sibling for sibling in tr.find_all('td') if sibling != td),
	                None,
	            )
	        break  # only the first "Contact" cell is considered

	    names = []
	    if contact_cell:
	        for span in contact_cell.find_all('span'):
	            text = span.get_text().strip()
	            if marker not in text:
	                continue
	            name = text[:text.rfind(marker)].strip()
	            if name:
	                names.append(name)

	    # Strategy 2: fallback — any span on the page ending with the marker.
	    if not names:
	        for span in soup.find_all('span'):
	            text = span.get_text().strip()
	            if not text.endswith(marker):
	                continue
	            name = text[:-len(marker)].strip()
	            if name:
	                names.append(name)

	    return names


	# ---------------------------------------------------------------------------
	# Meeting dates from document listing page
	# ---------------------------------------------------------------------------

	def get_meeting_info(group, start):
	    """Read a meeting's dates and location from its ITU document page.

	    The page text is searched for:
	    - "Meeting from YYYY-MM-DD to YYYY-MM-DD"
	    - "held in Country [City]"

	    Args:
	        group: Study group number (e.g. 17)
	        start: Start date string (YYYYMMDD format)

	    Returns:
	        dict with 'start', 'end' (datetime), 'country', 'place' (empty
	        strings when the location is not found), or None when the page
	        cannot be fetched or the dates are missing or unparsable.
	    """
	    import re
	    from datetime import datetime

	    short_date = start[2:]            # YYMMDD
	    year = int(start[2:4])
	    period = str(int(year / 4) * 4 + 1)
	    url = f"https://www.itu.int/md/T{period}-SG{group}-{short_date}-TD/en"

	    try:
	        html = _fetch_url(url, encoding='iso8859-2')
	    except Exception as e:
	        print(f" Warning: could not fetch meeting page: {e}")
	        return None

	    # Dates are mandatory: without them there is no result at all.
	    date_match = re.search(
	        r'Meeting\s+from\s+(\d{4}-\d{2}-\d{2})\s+to\s+(\d{4}-\d{2}-\d{2})',
	        html,
	        re.IGNORECASE,
	    )
	    if not date_match:
	        print(f" Warning: could not find meeting dates on page")
	        return None

	    iso_format = "%Y-%m-%d"
	    try:
	        first_day = datetime.strptime(date_match.group(1), iso_format)
	        last_day = datetime.strptime(date_match.group(2), iso_format)
	    except ValueError as e:
	        print(f" Warning: could not parse meeting dates: {e}")
	        return None
	    result = {'start': first_day, 'end': last_day}

	    # The location is optional and degrades to empty strings.
	    place_match = re.search(
	        r'held\s+in\s+([A-Za-z\s]+)\s*\[([A-Za-z\s]+)\]',
	        html,
	        re.IGNORECASE,
	    )
	    if place_match:
	        result['country'] = place_match.group(1).strip()
	        result['place'] = place_match.group(2).strip()
	    else:
	        print(f" Warning: could not find meeting place on page")
	        result['country'] = ''
	        result['place'] = ''

	    return result


	# ---------------------------------------------------------------------------
	# Next SG meeting info
	# ---------------------------------------------------------------------------

	def get_next_sg_meeting(group, after_date=None):
	    """Scrape the ITU SG page to find the next full SG meeting.

	    Looks for spans/strongs titled "SGxx meeting" on the study group's main
	    page (2025-2028 study period), skips entries that are a Content Week,
	    Plenary, WP meeting or photo link, and extracts city and date range from
	    the surrounding text.

	    Args:
	        group: Study group number (e.g. 17)
	        after_date: datetime — only return meetings starting strictly after
	            this date (optional)

	    Returns:
	        dict with 'city', 'country', 'date_range', 'start_date', 'end_date'
	        for the first matching entry, or None if the page cannot be fetched
	        or no entry matches. 'country' is only filled in for Geneva.
	    """
	    import re
	    from datetime import datetime

	    url = (f"https://www.itu.int/en/ITU-T/studygroups/"
	           f"2025-2028/{group}/Pages/default.aspx")
	    try:
	        html = _fetch_url(url)
	    except Exception as e:
	        print(f" Warning: could not fetch SG{group} page: {e}")
	        return None

	    soup = BeautifulSoup(html, 'html.parser')

	    title_pattern = re.compile(rf'\bSG\s*{group}\s+meeting\b', re.IGNORECASE)
	    # Plain alternation: the previous pattern escaped the pipes, so it only
	    # matched a literal "content week|plenary|..." string and excluded nothing.
	    exclude = re.compile(r'content week|plenary|photo|WP\d', re.IGNORECASE)
	    # ",City, DD-DD Month YYYY" or ",City, D Month YYYY". Whitespace after the
	    # commas and around the hyphen is optional so that both "3-11" and
	    # "3 - 11" are accepted.
	    where_when = re.compile(
	        r',\s*([A-Za-z\s]+?),\s*'
	        r'(\d{1,2}(?:\s*-\s*\d{1,2})?)\s+'
	        r'([A-Za-z]+)\s+'
	        r'(\d{4})'
	    )
	    date_format = "%d %B %Y"

	    for tag in soup.find_all(['span', 'strong'], string=title_pattern):
	        if exclude.search(tag.get_text(strip=True)):
	            continue

	        # The location/date is typically in the next sibling span or in the
	        # parent's text.
	        parent = tag.parent
	        if parent is None:
	            continue
	        # Remove zero-width spaces before matching.
	        full_text = parent.get_text(strip=True).replace('\u200b', '')

	        m = where_when.search(full_text)
	        if not m:
	            continue

	        city = m.group(1).strip()
	        day_range = m.group(2).strip()
	        month_str = m.group(3).strip()
	        year = m.group(4).strip()

	        # Split "DD-DD" into first and (optional) last day.
	        start_day, hyphen, end_day = day_range.partition('-')
	        try:
	            start_date = datetime.strptime(
	                f"{start_day.strip()} {month_str} {year}", date_format)
	        except ValueError:
	            # Not a full month name or an impossible date: try the next entry.
	            continue

	        if after_date and start_date <= after_date:
	            continue

	        # A single-day entry, or an unparsable last day, ends on the start date.
	        end_date = start_date
	        if hyphen:
	            try:
	                end_date = datetime.strptime(
	                    f"{end_day.strip()} {month_str} {year}", date_format)
	            except ValueError:
	                pass

	        return {
	            'city': city,
	            'country': 'Switzerland' if city.lower() == 'geneva' else '',
	            'date_range': f"{day_range} {month_str} {year}",
	            'start_date': start_date,
	            'end_date': end_date,
	        }

	    return None


	# ---------------------------------------------------------------------------
	# Internal helpers
	# ---------------------------------------------------------------------------

	def _fetch_url(url, encoding='utf-8'):
	    """Fetch a URL using pycurl and return the decoded response body.

	    Args:
	        url: Address to request.
	        encoding: Codec used to decode the response bytes.

	    Returns:
	        The response body as a string.

	    Raises:
	        pycurl.error: If the transfer fails.
	        UnicodeDecodeError: If the body is not valid in ``encoding``.
	    """
	    buf = BytesIO()
	    request = pycurl.Curl()
	    try:
	        request.setopt(request.URL, url)
	        request.setopt(request.WRITEDATA, buf)
	        request.perform()
	    finally:
	        # Always release the curl handle; this function is called many times
	        # (also from worker threads), so unclosed handles would pile up.
	        request.close()
	    return buf.getvalue().decode(encoding)


	def _clean(value):
	"""Remove non-printable characters (< 32) from a string."""
	return ''.join(ch for ch in str(value) if ord(ch) >= 32)


	def _parse_html_tables(html):
	    """Convert every ``<table>`` of an HTML document into a Table object.

	    Each ``<tr>`` becomes a Row and each ``<td>`` a Column. A Column keeps
	    the cell's raw contents, its links as AElement objects, and the contents
	    of its ``<font>`` and ``<strong>`` elements.
	    """
	    def build_column(cell):
	        # One Column per <td>.
	        column = Column()

	        for link in cell.find_all("a"):
	            column.aElements.append(AElement(
	                href=link.attrs.get('href'),
	                strongElements=[strong.get_text() for strong in link.find_all("strong")],
	                contents=link.contents,
	                text=link.get_text(),
	            ))

	        for font in cell.find_all("font"):
	            column.fontElements.append(font.contents)

	        for strong in cell.find_all("strong"):
	            column.strongElements.append(strong.contents)

	        column.contents = cell.contents
	        return column

	    parsed = []
	    document = BeautifulSoup(html, "html.parser")
	    for html_table in document.find_all("table"):
	        table = Table()
	        for html_row in html_table.find_all("tr"):
	            row = Row()
	            for cell in html_row.find_all("td"):
	                row.columns.append(build_column(cell))
	            table.rows.append(row)
	        parsed.append(table)
	    return parsed


	def _is_document_table(table):
	"""Check if a parsed Table is the meeting documents table (has Number/Title/Source/AI headers)."""
	if len(table.rows) < 2:
	return False
	row = table.rows[1]
	if len(row.columns) < 6:
	return False

	expected = {1: 'Number', 2: 'Title', 3: 'Source', 4: 'AI/Question'}
	for col_idx, text in expected.items():
	col = row.columns[col_idx]
	if (not col.strongElements or not col.strongElements[0] or
	col.strongElements[0][0] != text):
	return False
	return True