Spaces:

INLEXIO
/

openalex-search

Sleeping

App Files Files Community

openalex-search / src /streamlit_app.py

INLEXIO

Update src/streamlit_app.py

cc10842 verified 4 months ago

raw

history blame contribute delete

44.9 kB

	import streamlit as st
	import json
	import pandas as pd
	from io import BytesIO
	from collections import defaultdict, Counter
	import unicodedata

	# --- Page setup ---------------------------------------------------------
	# Browser-tab title and icon, plus the wide layout used by the whole app.
	st.set_page_config(page_title="OpenAlex Author Search", page_icon="🔍", layout="wide")

	# App-wide styling: padding for the main area, dark-blue buttons that get
	# darker on hover, and a matching colour for top-level headings.
	_GLOBAL_CSS = """
	<style>
	.main {
	padding: 2rem;
	}
	.stButton>button {
	background-color: #164A78;
	color: white;
	font-size: 16px;
	padding: 0.5rem 2rem;
	border-radius: 5px;
	border: none;
	}
	.stButton>button:hover {
	background-color: #0d3050;
	}
	h1 {
	color: #164A78;
	}
	</style>
	"""
	# The stylesheet is raw HTML, so it has to be rendered with unsafe_allow_html.
	st.markdown(_GLOBAL_CSS, unsafe_allow_html=True)

	# ============================================================================
	# UTILITY FUNCTIONS
	# ============================================================================

	def normalize_author_name(name):
	    """Return an ASCII, whitespace-normalised form of an author name.

	    The result is used as a grouping key so that spelling variants of the
	    same name ("José García", "Jose  Garcia") collapse into one profile.

	    Steps, in order:
	      1. Unicode dash variants are mapped to a plain hyphen.
	      2. The text is NFD-decomposed and every non-ASCII code point is
	         dropped, which removes combining accents.
	      3. Runs of whitespace are collapsed to a single space and the ends
	         are trimmed.

	    Args:
	        name: The display name to normalise. Falsy values (None, '') are
	            returned unchanged.

	    Returns:
	        The normalised name, or ``name`` itself when it is falsy.
	    """
	    if not name:
	        return name

	    # The dashes must be translated BEFORE the ASCII fold: they are non-ASCII
	    # and have no NFD decomposition, so encode(..., 'ignore') would silently
	    # delete them and glue the two halves of a hyphenated name together.
	    # U+2013 en dash, U+2014 em dash, U+2212 minus, U+2010 hyphen,
	    # U+2011 non-breaking hyphen.
	    dash_table = str.maketrans(dict.fromkeys('\u2013\u2014\u2212\u2010\u2011', '-'))
	    hyphenated = name.translate(dash_table)

	    decomposed = unicodedata.normalize('NFD', hyphenated)
	    ascii_name = decomposed.encode('ascii', 'ignore').decode('ascii')

	    # split() with no argument drops leading/trailing whitespace as well.
	    return ' '.join(ascii_name.split())

	# Two-letter country code -> English display name.
	# Entries are grouped by region for readability only; lookups do not depend
	# on the grouping. 'UK' is kept as an alias of 'GB'.
	COUNTRY_CODES = {
	    # --- Europe (plus Armenia and Georgia, listed here alphabetically) ---
	    'AD': 'Andorra', 'AL': 'Albania', 'AM': 'Armenia', 'AT': 'Austria',
	    'AX': 'Åland Islands', 'BA': 'Bosnia and Herzegovina', 'BE': 'Belgium',
	    'BG': 'Bulgaria', 'BY': 'Belarus', 'CH': 'Switzerland', 'CY': 'Cyprus',
	    'CZ': 'Czech Republic', 'DE': 'Germany', 'DK': 'Denmark', 'EE': 'Estonia',
	    'ES': 'Spain', 'FI': 'Finland', 'FO': 'Faroe Islands', 'FR': 'France',
	    'GB': 'United Kingdom', 'UK': 'United Kingdom', 'GE': 'Georgia',
	    'GG': 'Guernsey', 'GI': 'Gibraltar', 'GR': 'Greece', 'HR': 'Croatia',
	    'HU': 'Hungary', 'IE': 'Ireland', 'IM': 'Isle of Man', 'IS': 'Iceland',
	    'IT': 'Italy', 'JE': 'Jersey', 'LI': 'Liechtenstein', 'LT': 'Lithuania',
	    'LU': 'Luxembourg', 'LV': 'Latvia', 'MC': 'Monaco', 'MD': 'Moldova',
	    'ME': 'Montenegro', 'MK': 'North Macedonia', 'MT': 'Malta', 'NL': 'Netherlands',
	    'NO': 'Norway', 'PL': 'Poland', 'PT': 'Portugal', 'RO': 'Romania',
	    'RS': 'Serbia', 'RU': 'Russia', 'SE': 'Sweden', 'SI': 'Slovenia',
	    'SJ': 'Svalbard and Jan Mayen', 'SK': 'Slovakia', 'SM': 'San Marino',
	    'UA': 'Ukraine', 'VA': 'Vatican City', 'XK': 'Kosovo',
	    # --- Asia ---
	    'AE': 'United Arab Emirates', 'AF': 'Afghanistan', 'AZ': 'Azerbaijan',
	    'BD': 'Bangladesh', 'BH': 'Bahrain', 'BN': 'Brunei', 'BT': 'Bhutan',
	    'CN': 'China', 'HK': 'Hong Kong', 'ID': 'Indonesia', 'IL': 'Israel',
	    'IN': 'India', 'IQ': 'Iraq', 'IR': 'Iran', 'JO': 'Jordan', 'JP': 'Japan',
	    'KG': 'Kyrgyzstan', 'KH': 'Cambodia', 'KP': 'North Korea', 'KR': 'South Korea',
	    'KW': 'Kuwait', 'KZ': 'Kazakhstan', 'LA': 'Laos', 'LB': 'Lebanon',
	    'LK': 'Sri Lanka', 'MM': 'Myanmar', 'MN': 'Mongolia', 'MO': 'Macau',
	    'MV': 'Maldives', 'MY': 'Malaysia', 'NP': 'Nepal', 'OM': 'Oman',
	    'PH': 'Philippines', 'PK': 'Pakistan', 'PS': 'Palestine', 'QA': 'Qatar',
	    'SA': 'Saudi Arabia', 'SG': 'Singapore', 'SY': 'Syria', 'TH': 'Thailand',
	    'TJ': 'Tajikistan', 'TL': 'Timor-Leste', 'TM': 'Turkmenistan', 'TR': 'Turkey',
	    'TW': 'Taiwan', 'UZ': 'Uzbekistan', 'VN': 'Vietnam', 'YE': 'Yemen',
	    # --- Africa ---
	    'AO': 'Angola', 'BF': 'Burkina Faso', 'BI': 'Burundi', 'BJ': 'Benin',
	    'BW': 'Botswana', 'CD': 'Democratic Republic of the Congo',
	    'CF': 'Central African Republic', 'CG': 'Republic of the Congo',
	    'CI': 'Ivory Coast', 'CM': 'Cameroon', 'CV': 'Cape Verde', 'DJ': 'Djibouti',
	    'DZ': 'Algeria', 'EG': 'Egypt', 'EH': 'Western Sahara', 'ER': 'Eritrea',
	    'ET': 'Ethiopia', 'GA': 'Gabon', 'GH': 'Ghana', 'GM': 'Gambia',
	    'GN': 'Guinea', 'GQ': 'Equatorial Guinea', 'GW': 'Guinea-Bissau',
	    'KE': 'Kenya', 'KM': 'Comoros', 'LR': 'Liberia', 'LS': 'Lesotho',
	    'LY': 'Libya', 'MA': 'Morocco', 'MG': 'Madagascar', 'ML': 'Mali',
	    'MR': 'Mauritania', 'MU': 'Mauritius', 'MW': 'Malawi', 'MZ': 'Mozambique',
	    'NA': 'Namibia', 'NE': 'Niger', 'NG': 'Nigeria', 'RE': 'Réunion',
	    'RW': 'Rwanda', 'SC': 'Seychelles', 'SD': 'Sudan', 'SL': 'Sierra Leone',
	    'SN': 'Senegal', 'SO': 'Somalia', 'SS': 'South Sudan',
	    'ST': 'São Tomé and Príncipe', 'SZ': 'Eswatini', 'TD': 'Chad', 'TG': 'Togo',
	    'TN': 'Tunisia', 'TZ': 'Tanzania', 'UG': 'Uganda', 'YT': 'Mayotte',
	    'ZA': 'South Africa', 'ZM': 'Zambia', 'ZW': 'Zimbabwe',
	    # --- North America, Central America and the Caribbean ---
	    'AG': 'Antigua and Barbuda', 'AI': 'Anguilla', 'AW': 'Aruba',
	    'BB': 'Barbados', 'BL': 'Saint Barthélemy', 'BM': 'Bermuda',
	    'BQ': 'Caribbean Netherlands', 'BS': 'Bahamas', 'BZ': 'Belize',
	    'CA': 'Canada', 'CR': 'Costa Rica', 'CU': 'Cuba', 'CW': 'Curaçao',
	    'DM': 'Dominica', 'DO': 'Dominican Republic', 'GD': 'Grenada',
	    'GL': 'Greenland', 'GP': 'Guadeloupe', 'GT': 'Guatemala', 'HN': 'Honduras',
	    'HT': 'Haiti', 'JM': 'Jamaica', 'KN': 'Saint Kitts and Nevis',
	    'KY': 'Cayman Islands', 'LC': 'Saint Lucia', 'MF': 'Saint Martin',
	    'MQ': 'Martinique', 'MS': 'Montserrat', 'MX': 'Mexico', 'NI': 'Nicaragua',
	    'PA': 'Panama', 'PM': 'Saint Pierre and Miquelon', 'PR': 'Puerto Rico',
	    'SV': 'El Salvador', 'SX': 'Sint Maarten', 'TC': 'Turks and Caicos Islands',
	    'TT': 'Trinidad and Tobago', 'US': 'United States',
	    'VC': 'Saint Vincent and the Grenadines', 'VG': 'British Virgin Islands',
	    'VI': 'U.S. Virgin Islands',
	    # --- South America ---
	    'AR': 'Argentina', 'BO': 'Bolivia', 'BR': 'Brazil', 'CL': 'Chile',
	    'CO': 'Colombia', 'EC': 'Ecuador', 'FK': 'Falkland Islands',
	    'GF': 'French Guiana', 'GY': 'Guyana', 'PE': 'Peru', 'PY': 'Paraguay',
	    'SR': 'Suriname', 'UY': 'Uruguay', 'VE': 'Venezuela',
	    # --- Oceania ---
	    'AS': 'American Samoa', 'AU': 'Australia', 'CK': 'Cook Islands',
	    'FJ': 'Fiji', 'FM': 'Micronesia', 'GU': 'Guam', 'KI': 'Kiribati',
	    'MH': 'Marshall Islands', 'MP': 'Northern Mariana Islands',
	    'NC': 'New Caledonia', 'NF': 'Norfolk Island', 'NR': 'Nauru', 'NU': 'Niue',
	    'NZ': 'New Zealand', 'PF': 'French Polynesia', 'PG': 'Papua New Guinea',
	    'PN': 'Pitcairn Islands', 'PW': 'Palau', 'SB': 'Solomon Islands',
	    'TK': 'Tokelau', 'TO': 'Tonga', 'TV': 'Tuvalu',
	    'UM': 'U.S. Minor Outlying Islands', 'VU': 'Vanuatu',
	    'WF': 'Wallis and Futuna', 'WS': 'Samoa'
	}

	# Continent name -> list of the two-letter country codes assigned to it.
	# Each code appears under exactly one continent. Transcontinental countries
	# are filed under a single continent (e.g. Turkey under Asia, Russia under
	# Europe). 'UK' is kept next to 'GB' as an alias.
	CONTINENT_MAP = {
	    'Europe': ['AD', 'AL', 'AT', 'AX', 'BA', 'BE', 'BG', 'BY', 'CH', 'CY',
	               'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FO', 'FR', 'GB', 'UK',
	               'GG', 'GI', 'GR', 'HR', 'HU', 'IE', 'IM', 'IS', 'IT', 'JE',
	               'LI', 'LT', 'LU', 'LV', 'MC', 'MD', 'ME', 'MK', 'MT', 'NL',
	               # 'RU' was missing, which made Russia resolve to 'Unknown'.
	               'NO', 'PL', 'PT', 'RO', 'RS', 'RU', 'SE', 'SI', 'SJ', 'SK',
	               'SM', 'UA', 'VA', 'XK'],
	    'Asia': ['AE', 'AF', 'AM', 'AZ', 'BD', 'BH', 'BN', 'BT', 'CN', 'GE',
	             'HK', 'ID', 'IL', 'IN', 'IQ', 'IR', 'JO', 'JP', 'KG', 'KH',
	             'KP', 'KR', 'KW', 'KZ', 'LA', 'LB', 'LK', 'MM', 'MN', 'MO',
	             'MV', 'MY', 'NP', 'OM', 'PH', 'PK', 'PS', 'QA', 'SA', 'SG',
	             'SY', 'TH', 'TJ', 'TL', 'TM', 'TR', 'TW', 'UZ', 'VN', 'YE'],
	    'Africa': ['AO', 'BF', 'BI', 'BJ', 'BW', 'CD', 'CF', 'CG', 'CI', 'CM',
	               'CV', 'DJ', 'DZ', 'EG', 'EH', 'ER', 'ET', 'GA', 'GH', 'GM',
	               'GN', 'GQ', 'GW', 'KE', 'KM', 'LR', 'LS', 'LY', 'MA', 'MG',
	               'ML', 'MR', 'MU', 'MW', 'MZ', 'NA', 'NE', 'NG', 'RE', 'RW',
	               'SC', 'SD', 'SL', 'SN', 'SO', 'SS', 'ST', 'SZ', 'TD', 'TG',
	               'TN', 'TZ', 'UG', 'YT', 'ZA', 'ZM', 'ZW'],
	    'North America': ['AG', 'AI', 'AW', 'BB', 'BL', 'BM', 'BQ', 'BS', 'BZ',
	                      'CA', 'CR', 'CU', 'CW', 'DM', 'DO', 'GD', 'GL', 'GP',
	                      'GT', 'HN', 'HT', 'JM', 'KN', 'KY', 'LC', 'MF', 'MQ',
	                      'MS', 'MX', 'NI', 'PA', 'PM', 'PR', 'SV', 'SX', 'TC',
	                      'TT', 'US', 'VC', 'VG', 'VI'],
	    'South America': ['AR', 'BO', 'BR', 'CL', 'CO', 'EC', 'FK', 'GF', 'GY',
	                      'PE', 'PY', 'SR', 'UY', 'VE'],
	    'Oceania': ['AS', 'AU', 'CK', 'FJ', 'FM', 'GU', 'KI', 'MH', 'MP', 'NC',
	                'NF', 'NR', 'NU', 'NZ', 'PF', 'PG', 'PN', 'PW', 'SB', 'TK',
	                'TO', 'TV', 'UM', 'VU', 'WF', 'WS']
	}

	def get_country_name(code):
	    """Return the display name for a two-letter country code.

	    The lookup is case-insensitive and ignores surrounding whitespace.

	    Args:
	        code: A country code such as 'US' or 'de'.

	    Returns:
	        The name from COUNTRY_CODES, or ``code`` unchanged when it is not in
	        the table. Falsy input (None, '') is returned as-is instead of
	        raising AttributeError.
	    """
	    if not code:
	        return code
	    return COUNTRY_CODES.get(code.strip().upper(), code)

	def get_continent(country_code):
	    """Return the continent name for a two-letter country code.

	    The lookup is case-insensitive and ignores surrounding whitespace.

	    Args:
	        country_code: A country code such as 'US' or 'de'.

	    Returns:
	        The CONTINENT_MAP key whose list contains the code, or 'Unknown'
	        when the code is missing from every list or is falsy (None, '').
	    """
	    if not country_code:
	        # Previously None raised AttributeError on .upper().
	        return 'Unknown'

	    cc = country_code.strip().upper()
	    for continent, codes in CONTINENT_MAP.items():
	        if cc in codes:
	            return continent
	    return 'Unknown'

	def process_works_to_author_profiles(works, topic_filter=None, journal_filter=None, country_filter=None):
	    """Aggregate a list of works into one profile per author.

	    Works that fail any supplied filter are skipped entirely; every named
	    author of a remaining work gets that work added to their profile.

	    Args:
	        works: Iterable of work dicts in the simplified format produced by
	            transform_openalex_api_to_excel_format(). None/empty entries are
	            ignored.
	        topic_filter: Lower-case substring that must occur in the work's
	            primary topic name, or None for no topic filtering.
	        journal_filter: Lower-case substring that must occur in the work's
	            journal name, or None for no journal filtering.
	        country_filter: Lower-case substring that must occur in the code or
	            the name of at least one country attached to any authorship of
	            the work, or None for no country filtering.

	    Returns:
	        A defaultdict keyed by normalize_author_name(display_name). Each value
	        is a dict with:
	          'count' (int), 'citations' (list of per-work citation counts),
	          'topics' / 'coauthors' / 'journals' / 'countries' (Counter),
	          'topic_ids' (topic name -> topic id),
	          'orcid', 'openalex_id', 'display_name' (first non-empty value seen).
	    """
	    author_profiles = defaultdict(lambda: {
	        'count': 0,
	        'citations': [],
	        'topics': Counter(),
	        'topic_ids': {},  # topic display name -> topic id
	        'coauthors': Counter(),
	        'journals': Counter(),
	        'countries': Counter(),
	        'orcid': '',
	        'openalex_id': '',
	        'display_name': ''
	    })

	    for work in works or []:
	        if not work:
	            # The transform step returns None for empty records; skip them
	            # instead of crashing on work.get().
	            continue

	        # `or` rather than a .get() default: the key may be present with a
	        # null value, and None would break the later sum()/sorted().
	        citations = work.get('cited_by_count') or 0

	        primary_loc = work.get('primary_location') or {}
	        source = primary_loc.get('source') or {}
	        # An empty name is treated like a missing one so that '' is never
	        # counted as a journal/topic.
	        journal = source.get('display_name') or 'Unknown'
	        has_journal = journal != 'Unknown'

	        topic = work.get('primary_topic') or {}
	        topic_name = topic.get('display_name') or 'Unknown'
	        topic_id = topic.get('id') or ''
	        has_topic = topic_name != 'Unknown'

	        authorships = [a for a in (work.get('authorships') or []) if a]

	        # --- Filters -----------------------------------------------------
	        # The 'Unknown' placeholder never satisfies a filter (otherwise a
	        # query such as "know" would match every work without a topic).
	        if topic_filter and not (has_topic and topic_filter in topic_name.lower()):
	            continue

	        if journal_filter and not (has_journal and journal_filter in journal.lower()):
	            continue

	        if country_filter:
	            work_has_country = any(
	                country_filter in get_country_name(code).lower()
	                or country_filter in code.lower()
	                for authorship in authorships
	                for code in (authorship.get('countries') or [])
	                if code
	            )
	            if not work_has_country:
	                continue

	        # Display names of everyone on the paper, computed once per work and
	        # reused for each author's co-author tally.
	        names_on_work = [
	            (authorship.get('author') or {}).get('display_name') or ''
	            for authorship in authorships
	        ]

	        # --- Per-author aggregation --------------------------------------
	        for authorship in authorships:
	            author_info = authorship.get('author') or {}
	            author_name = author_info.get('display_name') or 'Unknown'

	            if author_name == 'Unknown':
	                continue

	            profile = author_profiles[normalize_author_name(author_name)]

	            if not profile['display_name']:
	                profile['display_name'] = author_name

	            profile['count'] += 1
	            profile['citations'].append(citations)

	            # Keep the first ORCID / OpenAlex id encountered for this author.
	            if author_info.get('orcid') and not profile['orcid']:
	                profile['orcid'] = author_info['orcid']
	            if author_info.get('id') and not profile['openalex_id']:
	                profile['openalex_id'] = author_info['id']

	            if has_topic:
	                profile['topics'][topic_name] += 1
	                if topic_id and topic_name not in profile['topic_ids']:
	                    profile['topic_ids'][topic_name] = topic_id

	            for other_name in names_on_work:
	                if other_name and other_name != author_name:
	                    profile['coauthors'][other_name] += 1

	            if has_journal:
	                profile['journals'][journal] += 1

	            for country_code in authorship.get('countries') or []:
	                if country_code:
	                    profile['countries'][country_code] += 1

	    return author_profiles

	def transform_openalex_api_to_excel_format(api_work):
	    """Convert one OpenAlex API work record into the app's simplified format.

	    Args:
	        api_work: A work dict as returned in the ``results`` list of the
	            OpenAlex works endpoint.

	    Returns:
	        None when ``api_work`` is empty/None, otherwise a dict with the keys
	        'doi', 'publication_year', 'type', 'cited_by_count',
	        'primary_location' ({'source': {'display_name': str}}),
	        'biblio' ({'issue': ...}), 'primary_topic' (dict or None),
	        'mesh' (always an empty list) and 'authorships' (list of
	        {'author': {'id', 'display_name', 'orcid'}, 'countries': [codes]}).
	        URL-style OpenAlex ids are reduced to their last path segment.
	    """
	    if not api_work:
	        return None

	    def short_id(value):
	        """Return the last '/'-separated segment of an id ('' if missing)."""
	        return value.split('/')[-1] if value else ''

	    def id_and_name(entity):
	        """Reduce an OpenAlex entity stub to {'id', 'display_name'}."""
	        entity = entity or {}
	        return {
	            'id': short_id(entity.get('id')),
	            'display_name': entity.get('display_name') or '',
	        }

	    # Primary topic: the first entry of `topics`, falling back to the
	    # record's own `primary_topic` when the list is absent or empty.
	    topics = api_work.get('topics') or []
	    topic = (topics[0] if topics else None) or api_work.get('primary_topic')
	    primary_topic = None
	    if topic:
	        primary_topic = id_and_name(topic)
	        for level in ('subfield', 'field', 'domain'):
	            primary_topic[level] = id_and_name(topic.get(level))

	    authorships = []
	    for authorship in api_work.get('authorships') or []:
	        author = (authorship or {}).get('author')
	        if not author:
	            continue

	        # Country codes come from the author's institutions. dict.fromkeys
	        # de-duplicates while keeping first-seen order, so the output (and
	        # the JSON download built from it) is deterministic.
	        countries = list(dict.fromkeys(
	            institution['country_code']
	            for institution in (authorship.get('institutions') or [])
	            if institution and institution.get('country_code')
	        ))
	        if not countries:
	            # No institution carried a country: fall back to the
	            # authorship-level `countries` list when the record has one.
	            countries = list(dict.fromkeys(
	                code for code in (authorship.get('countries') or []) if code
	            ))

	        authorships.append({
	            'author': {
	                'id': short_id(author.get('id')),
	                # `or ''`: the API sends explicit nulls (e.g. "orcid": null).
	                'display_name': author.get('display_name') or '',
	                'orcid': author.get('orcid') or ''
	            },
	            'countries': countries
	        })

	    # primary_location and its source may each be null in the API response.
	    primary_location = api_work.get('primary_location') or {}
	    source = primary_location.get('source') or {}
	    source_name = source.get('display_name', '')

	    return {
	        'doi': api_work.get('doi', ''),
	        'publication_year': api_work.get('publication_year', ''),
	        'type': api_work.get('type', ''),
	        # A null count would break the averaging done on these values later.
	        'cited_by_count': api_work.get('cited_by_count') or 0,
	        'primary_location': {
	            'source': {
	                'display_name': source_name
	            }
	        },
	        'biblio': {
	            'issue': (api_work.get('biblio') or {}).get('issue', '')
	        },
	        'primary_topic': primary_topic,
	        'mesh': [],
	        'authorships': authorships
	    }

	# ============================================================================
	# STREAMLIT UI
	# ============================================================================

	# --- Page header ----------------------------------------------------------
	st.title("🔍 OpenAlex Author Search")
	st.markdown("Search and analyze author data from OpenAlex")

	# Address shown in the sidebar and sent along with OpenAlex requests.
	_CONTACT_EMAIL = "halozen@pm.me"

	# --- Sidebar: usage instructions and API settings -------------------------
	with st.sidebar:
	    st.header("📖 How to Use")
	    st.markdown("""
	Option 1: Fetch from OpenAlex
	1. Paste your OpenAlex URL
	2. Click "Fetch Data"
	3. Search and download results

	Option 2: Upload File
	1. Upload JSON file
	2. Search and download results

	Option 3: Direct API Search
	1. Enter search criteria
	2. Click "Search"
	3. Analyze results
	""")

	    st.markdown("---")

	    st.subheader("⚙️ API Settings")
	    st.caption("💡 Email included in API requests for OpenAlex 'polite pool' (faster access)")
	    st.code(_CONTACT_EMAIL, language=None)

	# The address is fixed (not user-editable); it is stored in session state so
	# the fetch code in the tabs can read it.
	st.session_state.user_email = _CONTACT_EMAIL

	# `works` holds the currently loaded list of works; it starts out empty and
	# is filled in by whichever tab loads data.
	if 'works' not in st.session_state:
	    st.session_state.works = None

	# One tab per way of getting data into the app.
	tab1, tab2, tab3 = st.tabs(["🌐 Fetch from OpenAlex", "📁 Upload File", "🔍 Direct API Search"])

	# Tab 1: take an OpenAlex web/API URL, re-issue its `filter` against the API
	# page by page, and store the transformed works in session state.
	with tab1:
	    st.subheader("Fetch Data from OpenAlex")

	    openalex_url = st.text_input(
	        "Paste OpenAlex URL (web or API format)",
	        placeholder="https://openalex.org/works?filter=...",
	        help="Paste either the web URL or API URL from OpenAlex"
	    )

	    col1, col2 = st.columns([1, 3])
	    with col1:
	        max_pages = st.number_input(
	            "Max Pages",
	            min_value=1,
	            max_value=50,
	            value=5,
	            help="Each page has up to 200 works. 5 pages = 1,000 works max"
	        )

	    if st.button("🔍 Fetch Data from OpenAlex", type="primary"):
	        if not openalex_url:
	            st.error("Please enter an OpenAlex URL")
	        else:
	            # Imported here (once) rather than inside the paging loop.
	            import urllib.parse
	            import urllib.request

	            progress_bar = None
	            status_text = None
	            try:
	                # Only the `filter` query parameter of the pasted URL is
	                # carried over to the API request.
	                parsed = urllib.parse.urlparse(openalex_url.strip())
	                query_params = urllib.parse.parse_qs(parsed.query)

	                api_params = {}
	                if 'filter' in query_params:
	                    api_params['filter'] = query_params['filter'][0]
	                api_params['per-page'] = 200

	                # parse_qs() decodes the value, so it must be re-encoded:
	                # urlopen() rejects URLs containing raw spaces. Characters
	                # that are part of the filter syntax are left readable.
	                api_url = "https://api.openalex.org/works?" + urllib.parse.urlencode(
	                    api_params, safe=':,|<>', quote_via=urllib.parse.quote
	                )

	                st.info(f"📡 Fetching from OpenAlex API... (up to {max_pages} pages)")
	                st.code(api_url, language=None)  # Show the API URL being used

	                all_works = []

	                progress_bar = st.progress(0)
	                status_text = st.empty()

	                for page in range(1, max_pages + 1):
	                    page_url = f"{api_url}&page={page}"

	                    status_text.text(f"Fetching page {page}/{max_pages}...")

	                    # The mailto in the User-Agent identifies the caller to
	                    # OpenAlex (see the sidebar caption).
	                    req = urllib.request.Request(page_url)
	                    req.add_header('User-Agent', f'Mozilla/5.0 (mailto:{st.session_state.user_email})')

	                    # Timeout so a stalled connection surfaces as an error
	                    # instead of hanging the app.
	                    with urllib.request.urlopen(req, timeout=60) as response:
	                        data = json.loads(response.read().decode())
	                    results = data.get('results', [])

	                    if not results:
	                        break

	                    for work in results:
	                        transformed = transform_openalex_api_to_excel_format(work)
	                        # The transform returns None for empty records; those
	                        # must not end up in the works list.
	                        if transformed:
	                            all_works.append(transformed)

	                    progress_bar.progress(page / max_pages)

	                    # A short page means this was the last one.
	                    if len(results) < 200:
	                        break

	                if all_works:
	                    st.session_state.works = all_works
	                    st.success(f"✅ Fetched {len(all_works):,} works from OpenAlex")

	                    # Offer to download the JSON
	                    json_data = json.dumps(all_works, indent=2)
	                    st.download_button(
	                        label="💾 Download Raw Data (JSON)",
	                        data=json_data,
	                        file_name="openalex_data.json",
	                        mime="application/json"
	                    )
	                else:
	                    st.warning("No works found. Try a different URL or check your filters.")

	            except Exception as e:
	                st.error(f"❌ Error fetching data: {str(e)}")
	                st.info("Make sure your URL is valid and try again.")
	            finally:
	                # Remove the progress widgets on success and on failure.
	                if progress_bar is not None:
	                    progress_bar.empty()
	                if status_text is not None:
	                    status_text.empty()

	# Tab 2: load works from a previously saved JSON file. Three shapes are
	# accepted: a raw API response ({'results': [...]}), a list of raw API works
	# (recognised by a 'title' key on the first item), or a list already in the
	# app's simplified format.
	with tab2:
	    st.subheader("Upload Existing Data File")

	    uploaded_file = st.file_uploader(
	        "Upload your OpenAlex data (JSON)",
	        type=['json'],
	        help="Upload a previously saved JSON file"
	    )

	    if uploaded_file:
	        try:
	            data = json.load(uploaded_file)

	            if isinstance(data, dict) and 'results' in data:
	                raw_works = data['results'] or []
	                needs_transform = True
	            elif isinstance(data, list):
	                raw_works = data
	                # Guard the probe: the first item may not be a dict.
	                needs_transform = bool(data) and isinstance(data[0], dict) and 'title' in data[0]
	            else:
	                st.error("❌ Unexpected JSON format")
	                st.stop()

	            if needs_transform:
	                # Drop records the transform rejects (it returns None for
	                # empty input) so later processing never sees None.
	                transformed = (transform_openalex_api_to_excel_format(w) for w in raw_works)
	                st.session_state.works = [w for w in transformed if w]
	                st.info(f"ℹ️ Transformed {len(st.session_state.works)} works from OpenAlex API format")
	            else:
	                # Already in the simplified format; keep only usable records.
	                st.session_state.works = [w for w in raw_works if isinstance(w, dict) and w]

	            st.success(f"✅ Loaded {len(st.session_state.works):,} works from file")

	        except json.JSONDecodeError:
	            st.error("❌ Invalid JSON file")
	        except Exception as e:
	            st.error(f"❌ Error processing file: {str(e)}")

	# Tab 3: build an OpenAlex works query from form fields, download the
	# matching works and store them (plus the search parameters) in session
	# state.
	with tab3:
	    import urllib.parse
	    import urllib.request
	    from datetime import datetime

	    st.subheader("Direct API Search")
	    st.markdown("Search OpenAlex directly without needing a pre-built URL")

	    # Upper bound for the year pickers follows the calendar instead of being
	    # hard-coded.
	    current_year = datetime.now().year

	    # Search filters. Each widget is defined exactly once: a second, unkeyed
	    # copy of these inputs used to be rendered below and silently overrode
	    # the values entered in the first set.
	    col1, col2 = st.columns(2)

	    with col1:
	        api_author_name = st.text_input(
	            "Author Name",
	            placeholder="e.g., John Smith",
	            help="Search for works by a specific author",
	            key="api_author_name"
	        )

	        api_institution = st.text_input(
	            "Institution",
	            placeholder="e.g., Harvard University",
	            help="Filter by institution/affiliation",
	            key="api_institution"
	        )

	        api_topic = st.text_input(
	            "Topic/Keyword",
	            placeholder="e.g., machine learning",
	            help="Search by topic or keyword",
	            key="api_topic"
	        )

	        api_journals = st.text_area(
	            "Journal(s)",
	            placeholder="Nature\nScience\nCell",
	            help="Enter journal names, one per line. Leave blank for all journals.",
	            key="api_journals"
	        )

	    with col2:
	        api_year_from = st.number_input(
	            "Publication Year From",
	            min_value=1900,
	            max_value=current_year,
	            value=min(2020, current_year),
	            help="Start year for publication range",
	            key="api_year_from"
	        )

	        api_year_to = st.number_input(
	            "Publication Year To",
	            min_value=1900,
	            max_value=current_year,
	            value=current_year,
	            help="End year for publication range",
	            key="api_year_to"
	        )

	        api_min_citations = st.number_input(
	            "Minimum Citations",
	            min_value=0,
	            max_value=10000,
	            value=0,
	            help="Filter works with at least this many citations",
	            key="api_min_citations"
	        )

	        api_max_citations = st.number_input(
	            "Maximum Citations",
	            min_value=0,
	            max_value=100000,
	            value=0,
	            help="Filter works with at most this many citations (0 = no limit)",
	            key="api_max_citations"
	        )

	        api_max_results = st.number_input(
	            "Maximum Results",
	            min_value=100,
	            max_value=50000,
	            value=1000,
	            step=100,
	            help="Maximum number of works to retrieve (Warning: >10,000 may be slow)",
	            key="api_max_results"
	        )

	    # Warning for large requests
	    if api_max_results > 10000:
	        st.warning(f"⚠️ Requesting {api_max_results:,} results may take several minutes and could cause memory issues on free hosting.")

	    if st.button("🔍 Search OpenAlex API", type="primary"):
	        progress_bar = None
	        status_text = None
	        try:
	            # Record search start time
	            search_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

	            # --- Build the filter list -----------------------------------
	            filters = []

	            if api_author_name:
	                filters.append(f'author.search:{api_author_name}')

	            if api_institution:
	                filters.append(f'authorships.institutions.display_name.search:{api_institution}')

	            if api_topic:
	                filters.append(f'title_and_abstract.search:{api_topic}')

	            if api_year_from and api_year_to:
	                filters.append(f'publication_year:{api_year_from}-{api_year_to}')

	            # Citation bounds are expressed with strict comparisons, hence
	            # the -1 / +1 to make them inclusive.
	            if api_min_citations > 0:
	                filters.append(f'cited_by_count:>{api_min_citations - 1}')

	            if api_max_citations > 0:
	                filters.append(f'cited_by_count:<{api_max_citations + 1}')

	            # Journals: one name per line, combined with OR. In OpenAlex
	            # filters the OR separator is a bare pipe; the previous '\|'
	            # sent a literal backslash to the API.
	            journal_list = [j.strip() for j in api_journals.split('\n') if j.strip()] if api_journals else []
	            if journal_list:
	                journal_search = '|'.join(journal_list)
	                filters.append(f'primary_location.source.display_name.search:{journal_search}')

	            if not filters:
	                st.warning("Please enter at least one search criterion")
	                st.stop()

	            # Restrict to article/review/letter work types.
	            filters.append('type:article|review|letter')

	            # --- Build the API URL ---------------------------------------
	            # User text may contain spaces and other characters that
	            # urlopen() refuses in a URL, so the filter is percent-encoded;
	            # the characters of the filter syntax itself stay readable.
	            filter_string = ','.join(filters)
	            encoded_filter = urllib.parse.quote(filter_string, safe=':,|<>')
	            api_url = f"https://api.openalex.org/works?filter={encoded_filter}&per-page=200"

	            # Store search parameters for summary
	            search_params = {
	                'author': api_author_name if api_author_name else 'Any',
	                'institution': api_institution if api_institution else 'Any',
	                'topic': api_topic if api_topic else 'Any',
	                'journals': journal_list if journal_list else ['Any'],
	                'years': f'{api_year_from}-{api_year_to}',
	                'min_citations': api_min_citations,
	                'max_citations': api_max_citations if api_max_citations > 0 else 'No limit',
	                'search_date': search_datetime
	            }

	            st.info("📡 Searching OpenAlex API...")
	            st.code(api_url, language=None)

	            all_works = []
	            # Number of 200-work pages needed, rounded up (progress text only).
	            max_pages = (api_max_results + 199) // 200

	            progress_bar = st.progress(0)
	            status_text = st.empty()

	            # --- Download ------------------------------------------------
	            # Cursor paging: numbered pages stop working after the first
	            # 10,000 results, while this form allows up to 50,000. The first
	            # request uses cursor=*; each response's meta.next_cursor feeds
	            # the next request until it comes back empty.
	            cursor = '*'
	            page = 0
	            while cursor and len(all_works) < api_max_results:
	                page += 1
	                page_url = f"{api_url}&cursor={urllib.parse.quote(cursor, safe='*')}"
	                status_text.text(f"Fetching page {page}/{max_pages}... ({len(all_works)} works so far)")

	                req = urllib.request.Request(page_url)
	                req.add_header('User-Agent', f'Mozilla/5.0 (mailto:{st.session_state.user_email})')

	                with urllib.request.urlopen(req, timeout=60) as response:
	                    data = json.loads(response.read().decode())
	                results = data.get('results', [])

	                if not results:
	                    break

	                for work in results:
	                    transformed = transform_openalex_api_to_excel_format(work)
	                    if transformed:  # Only add if transformation succeeded
	                        all_works.append(transformed)

	                progress_bar.progress(min(len(all_works) / api_max_results, 1.0))

	                # A short page means there is nothing left to fetch.
	                if len(results) < 200:
	                    break

	                cursor = (data.get('meta') or {}).get('next_cursor')

	            # The last page may overshoot the requested maximum.
	            all_works = all_works[:api_max_results]

	            if all_works:
	                st.session_state.works = all_works
	                st.session_state.search_params = search_params  # Store search parameters

	                st.success(f"✅ Found {len(all_works):,} works from OpenAlex")

	                # Display search summary
	                st.markdown("---")
	                st.subheader("📊 Search Summary")

	                summary_col1, summary_col2 = st.columns(2)

	                with summary_col1:
	                    st.markdown(f"Search Date: {search_params['search_date']}")
	                    st.markdown(f"Author: {search_params['author']}")
	                    st.markdown(f"Institution: {search_params['institution']}")
	                    st.markdown(f"Topic: {search_params['topic']}")

	                with summary_col2:
	                    st.markdown(f"Years: {search_params['years']}")
	                    st.markdown(f"Min Citations: {search_params['min_citations']}")
	                    st.markdown(f"Max Citations: {search_params['max_citations']}")
	                    if search_params['journals'] != ['Any']:
	                        # Show at most three journal names, then an ellipsis.
	                        st.markdown(f"Journals: {', '.join(search_params['journals'][:3])}{'...' if len(search_params['journals']) > 3 else ''}")
	                    else:
	                        st.markdown("Journals: Any")

	                st.markdown(f"Total Works Retrieved: {len(all_works):,}")
	                st.markdown("---")

	                # Offer to download the JSON
	                json_data = json.dumps(all_works, indent=2)
	                st.download_button(
	                    label="💾 Download Raw Data (JSON)",
	                    data=json_data,
	                    file_name=f"openalex_api_search_{search_datetime.replace(':', '-').replace(' ', '_')}.json",
	                    mime="application/json"
	                )
	            else:
	                st.warning("No works found. Try different search criteria.")

	        except Exception as e:
	            st.error(f"❌ Error searching API: {str(e)}")
	            st.info("Make sure your search criteria are valid and try again.")
	        finally:
	            # Remove the progress widgets on success and on failure.
	            if progress_bar is not None:
	                progress_bar.empty()
	            if status_text is not None:
	                status_text.empty()

	# Author search: shown only once works are loaded. Builds per-author
	# profiles from the loaded works, filters/sorts them, renders an HTML table
	# and offers an Excel export.
	if st.session_state.works:
	    import html
	    import statistics
	    import urllib.parse

	    def _link(href, text):
	        """Return an <a> tag with the URL and the label HTML-escaped."""
	        return f'<a href="{html.escape(href, quote=True)}" target="_blank">{html.escape(text)}</a>'

	    st.markdown("---")
	    st.header("🔎 Search Authors")

	    # Search criteria in columns
	    col1, col2 = st.columns(2)

	    with col1:
	        topic_search = st.text_input(
	            "🔬 Search by Topic",
	            placeholder="e.g., neuroscience",
	            help="Filter works by topic keyword (case-insensitive)"
	        )

	        author_search = st.text_input(
	            "👤 Search by Author Name",
	            placeholder="e.g., Smith",
	            help="Filter authors by name (partial match)"
	        )

	    with col2:
	        journal_search = st.text_input(
	            "📄 Search by Journal",
	            placeholder="e.g., Nature",
	            help="Filter works by journal name"
	        )

	        country_search = st.text_input(
	            "🌍 Search by Country",
	            placeholder="e.g., United States or US",
	            help="Filter authors by country (name or code)"
	        )

	    # Additional options
	    col3, col4, col5 = st.columns(3)

	    with col3:
	        min_articles = st.number_input(
	            "Minimum Articles",
	            min_value=1,
	            max_value=100,
	            value=3,
	            help="Minimum number of publications"
	        )

	    with col4:
	        max_results = st.number_input(
	            "Maximum Results",
	            min_value=1,
	            max_value=500,
	            value=50,
	            help="Maximum number of authors to display"
	        )

	    with col5:
	        # The option labels double as the result column names used for
	        # sorting below.
	        sort_by = st.selectbox(
	            "Sort By",
	            ["Count", "Average Citations", "Median Citations"],
	            help="How to sort the results"
	        )

	    # Additional display options
	    st.markdown("Display Options:")
	    col6, col7 = st.columns(2)
	    with col6:
	        link_type = st.selectbox(
	            "Link Author Names To",
	            ["None", "ORCID", "OpenAlex"],
	            help="Make author names clickable links"
	        )
	    with col7:
	        link_topics = st.checkbox(
	            "Link Topics to OpenAlex",
	            value=True,
	            help="Make topic names clickable"
	        )

	    # Search button
	    if st.button("🔍 Search Authors", type="primary"):
	        with st.spinner("Processing author profiles..."):

	            profiles = process_works_to_author_profiles(
	                st.session_state.works,
	                topic_filter=topic_search.lower() if topic_search else None,
	                journal_filter=journal_search.lower() if journal_search else None,
	                country_filter=country_search.lower() if country_search else None
	            )

	            name_query = author_search.lower() if author_search else ''

	            # One (export_row, display_row, has_author_link) tuple per author.
	            # The plain-text export row and the HTML display row are built
	            # side by side, so the export never has to strip markup.
	            records = []
	            for normalized_name, profile in profiles.items():
	                if profile['count'] < min_articles:
	                    continue

	                # Author name filter: match the normalised or the display form.
	                if name_query:
	                    if name_query not in normalized_name.lower() and name_query not in profile['display_name'].lower():
	                        continue

	                # Null counts are treated as zero citations.
	                citations = [c or 0 for c in profile['citations']]
	                # statistics.median averages the two middle values for an
	                # even number of works (the old index-based pick returned the
	                # upper one).
	                median_cites = statistics.median(citations) if citations else 0
	                avg_cites = round(sum(citations) / len(citations), 1) if citations else 0

	                most_common_country = profile['countries'].most_common(1)
	                country_code = most_common_country[0][0] if most_common_country else ''
	                country_name = get_country_name(country_code) if country_code else ''
	                continent = get_continent(country_code) if country_code else ''

	                topic_names = [t for t, _ in profile['topics'].most_common(5)]
	                top_topics = ', '.join(topic_names)
	                top_coauthors = ', '.join([c for c, _ in profile['coauthors'].most_common(5)])
	                top_journals = ', '.join([j for j, _ in profile['journals'].most_common(5)])

	                export_row = {
	                    'Author': profile['display_name'],
	                    'Count': profile['count'],
	                    'Median Citations': median_cites,
	                    'Average Citations': avg_cites,
	                    'Country': country_name,
	                    'Continent': continent,
	                    'Top Topics': top_topics,
	                    'Top Co-authors': top_coauthors,
	                    'Top Journals': top_journals
	                }

	                # The table is rendered as raw HTML, and all text in it comes
	                # from fetched or uploaded data, so every string is escaped.
	                display_row = {
	                    column: html.escape(value) if isinstance(value, str) else value
	                    for column, value in export_row.items()
	                }

	                if link_topics and topic_names:
	                    linked_topics = []
	                    for topic in topic_names:
	                        topic_id = profile['topic_ids'].get(topic, '')
	                        if topic_id:
	                            # Keep only the trailing id segment, lower-cased
	                            # for the filter (e.g. T10316 -> t10316).
	                            topic_id = topic_id.split('/')[-1].lower()
	                            href = (
	                                "https://openalex.org/works?filter=primary_topic.id:"
	                                f"{urllib.parse.quote(topic_id, safe='')}"
	                                ",primary_location.source.type:source-types/journal"
	                                "&group_by=publication_year,open_access.is_oa,primary_topic.id,"
	                                "authorships.institutions.lineage,type,authorships.author.id,"
	                                "primary_location.source.id,primary_location.source.type&page=1"
	                            )
	                        else:
	                            # Fallback to a topic search if no ID is available.
	                            href = "https://openalex.org/topics?search=" + urllib.parse.quote_plus(topic)
	                        linked_topics.append(_link(href, topic))
	                    display_row['Top Topics'] = ', '.join(linked_topics)

	                # Optional link on the author name.
	                has_author_link = False
	                if link_type == "ORCID" and profile['orcid']:
	                    display_row['Author'] = _link(profile['orcid'], profile['display_name'])
	                    has_author_link = True
	                elif link_type == "OpenAlex" and profile['openalex_id']:
	                    # Extract just the ID from the full URL if needed.
	                    openalex_id = profile['openalex_id'].split('/')[-1]
	                    display_row['Author'] = _link(
	                        f"https://openalex.org/authors/{urllib.parse.quote(openalex_id, safe='')}",
	                        profile['display_name']
	                    )
	                    has_author_link = True

	                records.append((export_row, display_row, has_author_link))

	            # Highest value first; sort_by is one of the numeric column names.
	            records.sort(key=lambda record: record[0][sort_by], reverse=True)

	        # The on-screen table is capped at max_results; the export keeps every
	        # matching author, in the same sort order.
	        shown = records[:max_results]
	        results = [display_row for _, display_row, _ in shown]

	        if results:
	            st.success(f"✅ Found {len(results)} matching authors")

	            df = pd.DataFrame(results)
	            # Built only when there are rows: indexing columns of an empty
	            # DataFrame used to raise KeyError before the "no match" warning
	            # could be shown.
	            df_export = pd.DataFrame([export_row for export_row, _, _ in records])

	            # Custom CSS for better table display
	            st.markdown("""
	<style>
	/* Style the HTML table */
	table {
	width: 100%;
	border-collapse: collapse;
	margin: 20px 0;
	font-size: 14px;
	}

	table thead tr {
	background-color: #164A78;
	color: white;
	text-align: left;
	font-weight: bold;
	}

	table th, table td {
	padding: 12px 15px;
	border: 1px solid #ddd;
	}

	table tbody tr {
	border-bottom: 1px solid #dddddd;
	}

	table tbody tr:nth-of-type(even) {
	background-color: #f3f3f3;
	}

	table tbody tr:hover {
	background-color: #e8f4f8;
	}

	/* Center numeric columns - Count, Median, Average */
	table td:nth-child(2),
	table td:nth-child(3),
	table td:nth-child(4) {
	text-align: center;
	}

	/* Enable text wrapping */
	table td {
	white-space: normal;
	word-wrap: break-word;
	max-width: 300px;
	}

	/* Make links blue and underlined */
	table a {
	color: #1e88e5;
	text-decoration: underline;
	}

	table a:hover {
	color: #0d47a1;
	}
	</style>
	""", unsafe_allow_html=True)

	            # escape=False keeps the <a> tags clickable; cell text was
	            # already escaped when the rows were built.
	            st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)

	            # Summary statistics (over the displayed rows)
	            col1, col2, col3, col4 = st.columns(4)
	            with col1:
	                st.metric("Total Authors", len(results))
	            with col2:
	                st.metric("Avg Publications", f"{df['Count'].mean():.1f}")
	            with col3:
	                st.metric("Avg Citations", f"{df['Average Citations'].mean():.1f}")
	            with col4:
	                linked_count = sum(1 for _, _, has_link in shown if has_link)
	                st.metric("With Links", linked_count if link_type != "None" else "N/A")

	            # Download button - plain-text rows, no HTML
	            output = BytesIO()
	            with pd.ExcelWriter(output, engine='openpyxl') as writer:
	                df_export.to_excel(writer, index=False, sheet_name='Author Search Results')

	            st.download_button(
	                label="📥 Download Results (Excel)",
	                data=output.getvalue(),
	                file_name="author_search_results.xlsx",
	                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
	            )

	        else:
	            st.warning("No authors match your search criteria. Try adjusting your filters.")

	else:
	    st.info("👆 Fetch data from OpenAlex or upload a JSON file to get started")