import streamlit as st
import json
import pandas as pd
from io import BytesIO
from collections import defaultdict, Counter
import unicodedata
# Configure the Streamlit page; must run before any other st.* call.
st.set_page_config(
    page_title="OpenAlex Author Search",
    page_icon="๐",  # NOTE(review): mojibake — the original emoji was lost in an encoding round-trip; confirm intended icon
    layout="wide"
)
# Custom CSS injection point (currently empty; kept so styling can be added later).
st.markdown("""
""", unsafe_allow_html=True)
# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================
# Unicode dash-like characters that must survive normalization as ASCII '-'.
# (hyphen, non-breaking hyphen, figure dash, en dash, em dash, horizontal bar, minus)
_DASH_CHARS = '\u2010\u2011\u2012\u2013\u2014\u2015\u2212'

def normalize_author_name(name):
    """Normalize an author name for matching across spelling variants.

    Folds Unicode dashes to ASCII '-', strips accents via NFD decomposition +
    ASCII transliteration, and collapses runs of whitespace.

    Note: the dash folding happens *before* encode('ascii', 'ignore');
    doing it afterwards (as the original code did) is dead code, because the
    'ignore' step has already deleted every non-ASCII dash.

    Args:
        name: author display name, possibly empty/None.

    Returns:
        Normalized ASCII name; falsy input is returned unchanged.
    """
    if not name:
        return name
    for dash in _DASH_CHARS:
        name = name.replace(dash, '-')
    decomposed = unicodedata.normalize('NFD', name)
    ascii_name = decomposed.encode('ascii', 'ignore').decode('ascii')
    # Collapse internal whitespace runs and trim the ends.
    return ' '.join(ascii_name.split()).strip()
# ISO 3166-1 alpha-2 code -> English country name.
# Both 'GB' and the informal 'UK' map to the United Kingdom.
# Accented names (Åland Islands, Réunion, São Tomé and Príncipe, Saint
# Barthélemy, Curaçao) repaired from encoding-corrupted literals.
COUNTRY_CODES = {
    'AD': 'Andorra', 'AL': 'Albania', 'AM': 'Armenia', 'AT': 'Austria',
    'AX': 'Åland Islands', 'BA': 'Bosnia and Herzegovina', 'BE': 'Belgium',
    'BG': 'Bulgaria', 'BY': 'Belarus', 'CH': 'Switzerland', 'CY': 'Cyprus',
    'CZ': 'Czech Republic', 'DE': 'Germany', 'DK': 'Denmark', 'EE': 'Estonia',
    'ES': 'Spain', 'FI': 'Finland', 'FO': 'Faroe Islands', 'FR': 'France',
    'GB': 'United Kingdom', 'UK': 'United Kingdom', 'GE': 'Georgia',
    'GG': 'Guernsey', 'GI': 'Gibraltar', 'GR': 'Greece', 'HR': 'Croatia',
    'HU': 'Hungary', 'IE': 'Ireland', 'IM': 'Isle of Man', 'IS': 'Iceland',
    'IT': 'Italy', 'JE': 'Jersey', 'LI': 'Liechtenstein', 'LT': 'Lithuania',
    'LU': 'Luxembourg', 'LV': 'Latvia', 'MC': 'Monaco', 'MD': 'Moldova',
    'ME': 'Montenegro', 'MK': 'North Macedonia', 'MT': 'Malta', 'NL': 'Netherlands',
    'NO': 'Norway', 'PL': 'Poland', 'PT': 'Portugal', 'RO': 'Romania',
    'RS': 'Serbia', 'RU': 'Russia', 'SE': 'Sweden', 'SI': 'Slovenia',
    'SJ': 'Svalbard and Jan Mayen', 'SK': 'Slovakia', 'SM': 'San Marino',
    'UA': 'Ukraine', 'VA': 'Vatican City', 'XK': 'Kosovo',
    'AE': 'United Arab Emirates', 'AF': 'Afghanistan', 'AZ': 'Azerbaijan',
    'BD': 'Bangladesh', 'BH': 'Bahrain', 'BN': 'Brunei', 'BT': 'Bhutan',
    'CN': 'China', 'HK': 'Hong Kong', 'ID': 'Indonesia', 'IL': 'Israel',
    'IN': 'India', 'IQ': 'Iraq', 'IR': 'Iran', 'JO': 'Jordan', 'JP': 'Japan',
    'KG': 'Kyrgyzstan', 'KH': 'Cambodia', 'KP': 'North Korea', 'KR': 'South Korea',
    'KW': 'Kuwait', 'KZ': 'Kazakhstan', 'LA': 'Laos', 'LB': 'Lebanon',
    'LK': 'Sri Lanka', 'MM': 'Myanmar', 'MN': 'Mongolia', 'MO': 'Macau',
    'MV': 'Maldives', 'MY': 'Malaysia', 'NP': 'Nepal', 'OM': 'Oman',
    'PH': 'Philippines', 'PK': 'Pakistan', 'PS': 'Palestine', 'QA': 'Qatar',
    'SA': 'Saudi Arabia', 'SG': 'Singapore', 'SY': 'Syria', 'TH': 'Thailand',
    'TJ': 'Tajikistan', 'TL': 'Timor-Leste', 'TM': 'Turkmenistan', 'TR': 'Turkey',
    'TW': 'Taiwan', 'UZ': 'Uzbekistan', 'VN': 'Vietnam', 'YE': 'Yemen',
    'AO': 'Angola', 'BF': 'Burkina Faso', 'BI': 'Burundi', 'BJ': 'Benin',
    'BW': 'Botswana', 'CD': 'Democratic Republic of the Congo',
    'CF': 'Central African Republic', 'CG': 'Republic of the Congo',
    'CI': 'Ivory Coast', 'CM': 'Cameroon', 'CV': 'Cape Verde', 'DJ': 'Djibouti',
    'DZ': 'Algeria', 'EG': 'Egypt', 'EH': 'Western Sahara', 'ER': 'Eritrea',
    'ET': 'Ethiopia', 'GA': 'Gabon', 'GH': 'Ghana', 'GM': 'Gambia',
    'GN': 'Guinea', 'GQ': 'Equatorial Guinea', 'GW': 'Guinea-Bissau',
    'KE': 'Kenya', 'KM': 'Comoros', 'LR': 'Liberia', 'LS': 'Lesotho',
    'LY': 'Libya', 'MA': 'Morocco', 'MG': 'Madagascar', 'ML': 'Mali',
    'MR': 'Mauritania', 'MU': 'Mauritius', 'MW': 'Malawi', 'MZ': 'Mozambique',
    'NA': 'Namibia', 'NE': 'Niger', 'NG': 'Nigeria', 'RE': 'Réunion',
    'RW': 'Rwanda', 'SC': 'Seychelles', 'SD': 'Sudan', 'SL': 'Sierra Leone',
    'SN': 'Senegal', 'SO': 'Somalia', 'SS': 'South Sudan',
    'ST': 'São Tomé and Príncipe', 'SZ': 'Eswatini', 'TD': 'Chad', 'TG': 'Togo',
    'TN': 'Tunisia', 'TZ': 'Tanzania', 'UG': 'Uganda', 'YT': 'Mayotte',
    'ZA': 'South Africa', 'ZM': 'Zambia', 'ZW': 'Zimbabwe',
    'AG': 'Antigua and Barbuda', 'AI': 'Anguilla', 'AW': 'Aruba',
    'BB': 'Barbados', 'BL': 'Saint Barthélemy', 'BM': 'Bermuda',
    'BQ': 'Caribbean Netherlands', 'BS': 'Bahamas', 'BZ': 'Belize',
    'CA': 'Canada', 'CR': 'Costa Rica', 'CU': 'Cuba', 'CW': 'Curaçao',
    'DM': 'Dominica', 'DO': 'Dominican Republic', 'GD': 'Grenada',
    'GL': 'Greenland', 'GP': 'Guadeloupe', 'GT': 'Guatemala', 'HN': 'Honduras',
    'HT': 'Haiti', 'JM': 'Jamaica', 'KN': 'Saint Kitts and Nevis',
    'KY': 'Cayman Islands', 'LC': 'Saint Lucia', 'MF': 'Saint Martin',
    'MQ': 'Martinique', 'MS': 'Montserrat', 'MX': 'Mexico', 'NI': 'Nicaragua',
    'PA': 'Panama', 'PM': 'Saint Pierre and Miquelon', 'PR': 'Puerto Rico',
    'SV': 'El Salvador', 'SX': 'Sint Maarten', 'TC': 'Turks and Caicos Islands',
    'TT': 'Trinidad and Tobago', 'US': 'United States',
    'VC': 'Saint Vincent and the Grenadines', 'VG': 'British Virgin Islands',
    'VI': 'U.S. Virgin Islands',
    'AR': 'Argentina', 'BO': 'Bolivia', 'BR': 'Brazil', 'CL': 'Chile',
    'CO': 'Colombia', 'EC': 'Ecuador', 'FK': 'Falkland Islands',
    'GF': 'French Guiana', 'GY': 'Guyana', 'PE': 'Peru', 'PY': 'Paraguay',
    'SR': 'Suriname', 'UY': 'Uruguay', 'VE': 'Venezuela',
    'AS': 'American Samoa', 'AU': 'Australia', 'CK': 'Cook Islands',
    'FJ': 'Fiji', 'FM': 'Micronesia', 'GU': 'Guam', 'KI': 'Kiribati',
    'MH': 'Marshall Islands', 'MP': 'Northern Mariana Islands',
    'NC': 'New Caledonia', 'NF': 'Norfolk Island', 'NR': 'Nauru', 'NU': 'Niue',
    'NZ': 'New Zealand', 'PF': 'French Polynesia', 'PG': 'Papua New Guinea',
    'PN': 'Pitcairn Islands', 'PW': 'Palau', 'SB': 'Solomon Islands',
    'TK': 'Tokelau', 'TO': 'Tonga', 'TV': 'Tuvalu',
    'UM': 'U.S. Minor Outlying Islands', 'VU': 'Vanuatu',
    'WF': 'Wallis and Futuna', 'WS': 'Samoa'
}
# Continent name -> set of ISO 3166-1 alpha-2 codes.
# frozenset gives O(1) membership tests for get_continent() instead of the
# original linear scan through lists.
CONTINENT_MAP = {
    'Europe': frozenset([
        'AD', 'AL', 'AT', 'AX', 'BA', 'BE', 'BG', 'BY', 'CH', 'CY',
        'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FO', 'FR', 'GB', 'UK',
        'GG', 'GI', 'GR', 'HR', 'HU', 'IE', 'IM', 'IS', 'IT', 'JE',
        'LI', 'LT', 'LU', 'LV', 'MC', 'MD', 'ME', 'MK', 'MT', 'NL',
        'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SJ', 'SK', 'SM',
        'UA', 'VA', 'XK']),
    'Asia': frozenset([
        'AE', 'AF', 'AM', 'AZ', 'BD', 'BH', 'BN', 'BT', 'CN', 'GE',
        'HK', 'ID', 'IL', 'IN', 'IQ', 'IR', 'JO', 'JP', 'KG', 'KH',
        'KP', 'KR', 'KW', 'KZ', 'LA', 'LB', 'LK', 'MM', 'MN', 'MO',
        'MV', 'MY', 'NP', 'OM', 'PH', 'PK', 'PS', 'QA', 'SA', 'SG',
        'SY', 'TH', 'TJ', 'TL', 'TM', 'TR', 'TW', 'UZ', 'VN', 'YE']),
    'Africa': frozenset([
        'AO', 'BF', 'BI', 'BJ', 'BW', 'CD', 'CF', 'CG', 'CI', 'CM',
        'CV', 'DJ', 'DZ', 'EG', 'EH', 'ER', 'ET', 'GA', 'GH', 'GM',
        'GN', 'GQ', 'GW', 'KE', 'KM', 'LR', 'LS', 'LY', 'MA', 'MG',
        'ML', 'MR', 'MU', 'MW', 'MZ', 'NA', 'NE', 'NG', 'RE', 'RW',
        'SC', 'SD', 'SL', 'SN', 'SO', 'SS', 'ST', 'SZ', 'TD', 'TG',
        'TN', 'TZ', 'UG', 'YT', 'ZA', 'ZM', 'ZW']),
    'North America': frozenset([
        'AG', 'AI', 'AW', 'BB', 'BL', 'BM', 'BQ', 'BS', 'BZ',
        'CA', 'CR', 'CU', 'CW', 'DM', 'DO', 'GD', 'GL', 'GP',
        'GT', 'HN', 'HT', 'JM', 'KN', 'KY', 'LC', 'MF', 'MQ',
        'MS', 'MX', 'NI', 'PA', 'PM', 'PR', 'SV', 'SX', 'TC',
        'TT', 'US', 'VC', 'VG', 'VI']),
    'South America': frozenset([
        'AR', 'BO', 'BR', 'CL', 'CO', 'EC', 'FK', 'GF', 'GY',
        'PE', 'PY', 'SR', 'UY', 'VE']),
    'Oceania': frozenset([
        'AS', 'AU', 'CK', 'FJ', 'FM', 'GU', 'KI', 'MH', 'MP', 'NC',
        'NF', 'NR', 'NU', 'NZ', 'PF', 'PG', 'PN', 'PW', 'SB', 'TK',
        'TO', 'TV', 'UM', 'VU', 'WF', 'WS'])
}
def get_country_name(code):
    """Map an ISO 3166-1 alpha-2 code to its country name.

    Unknown codes fall back to the code itself (in its original casing).
    """
    key = code.upper()
    return COUNTRY_CODES.get(key, code)
def get_continent(country_code):
    """Return the continent name for an ISO country code, or 'Unknown'."""
    cc = country_code.upper()
    return next(
        (name for name, codes in CONTINENT_MAP.items() if cc in codes),
        'Unknown',
    )
def process_works_to_author_profiles(works, topic_filter=None, journal_filter=None, country_filter=None):
"""Process works into author profiles with filtering"""
author_profiles = defaultdict(lambda: {
'count': 0,
'citations': [],
'topics': Counter(),
'topic_ids': {}, # Store topic IDs
'coauthors': Counter(),
'journals': Counter(),
'countries': Counter(),
'orcid': '',
'openalex_id': '',
'display_name': ''
})
for work in works:
citations = work.get('cited_by_count', 0)
primary_loc = work.get('primary_location', {})
source = primary_loc.get('source', {}) if primary_loc else {}
journal = source.get('display_name', 'Unknown')
topic = work.get('primary_topic')
topic_name = topic.get('display_name', 'Unknown') if topic else 'Unknown'
topic_id = topic.get('id', '') if topic else ''
# Apply filters
if topic_filter and topic_filter not in topic_name.lower():
continue
if journal_filter and journal_filter not in journal.lower():
continue
if country_filter:
work_has_country = False
for authorship in work.get('authorships', []):
countries = authorship.get('countries', [])
for country_code in countries:
if country_code:
country_name = get_country_name(country_code).lower()
if country_filter in country_name or country_filter in country_code.lower():
work_has_country = True
break
if work_has_country:
break
if not work_has_country:
continue
# Process authors
for authorship in work.get('authorships', []):
author_info = authorship.get('author', {})
author_name = author_info.get('display_name', 'Unknown')
if not author_name or author_name == 'Unknown':
continue
normalized_name = normalize_author_name(author_name)
profile = author_profiles[normalized_name]
if not profile['display_name']:
profile['display_name'] = author_name
profile['count'] += 1
profile['citations'].append(citations)
if author_info.get('orcid') and not profile['orcid']:
profile['orcid'] = author_info['orcid']
if author_info.get('id') and not profile['openalex_id']:
profile['openalex_id'] = author_info['id']
if topic_name != 'Unknown':
profile['topics'][topic_name] += 1
# Store the topic ID for this topic name
if topic_name not in profile['topic_ids'] and topic_id:
profile['topic_ids'][topic_name] = topic_id
for other_auth in work.get('authorships', []):
other_name = other_auth.get('author', {}).get('display_name', '')
if other_name and other_name != author_name:
profile['coauthors'][other_name] += 1
if journal != 'Unknown':
profile['journals'][journal] += 1
countries = authorship.get('countries', [])
for country_code in countries:
if country_code:
profile['countries'][country_code] += 1
return author_profiles
def _short_id(openalex_id):
    """Return the trailing segment of an OpenAlex ID URL ('' for falsy input)."""
    return openalex_id.split('/')[-1] if openalex_id else ''

def _slim_entity(entity):
    """Reduce a topic sub-entity (subfield/field/domain) to {'id', 'display_name'}."""
    if not entity:
        return {'id': '', 'display_name': ''}
    return {
        'id': _short_id(entity.get('id', '')),
        'display_name': entity.get('display_name', '')
    }

def transform_openalex_api_to_excel_format(api_work):
    """Convert a full OpenAlex API work record into the simplified export shape.

    Keeps only the fields the rest of the app consumes: DOI, year, type,
    citation count, source name, biblio issue, the first topic as
    'primary_topic', and authorships with per-author country codes derived
    from institution affiliations.

    Args:
        api_work: raw work dict from the OpenAlex /works endpoint.

    Returns:
        Simplified work dict, or None for falsy input.
    """
    if not api_work:
        return None
    # First listed topic stands in as the primary topic.
    primary_topic = None
    topics = api_work.get('topics', [])
    if topics:
        topic = topics[0]
        primary_topic = {
            'id': _short_id(topic.get('id', '')),
            'display_name': topic.get('display_name', ''),
            'subfield': _slim_entity(topic.get('subfield')),
            'field': _slim_entity(topic.get('field')),
            'domain': _slim_entity(topic.get('domain')),
        }
    # Authorships: keep author identity plus deduplicated institution countries.
    authorships = []
    for authorship in api_work.get('authorships', []):
        author = authorship.get('author', {})
        if not author:
            continue
        countries = {
            inst.get('country_code', '')
            for inst in authorship.get('institutions', [])
            if inst and inst.get('country_code', '')
        }
        authorships.append({
            'author': {
                'id': _short_id(author.get('id', '')),
                'display_name': author.get('display_name', ''),
                'orcid': author.get('orcid', '')
            },
            # sorted() makes the output deterministic; the original
            # list(set(...)) order varied with hash randomization.
            'countries': sorted(countries)
        })
    # Source (journal) name, guarding a missing/None location or source.
    primary_location = api_work.get('primary_location', {})
    source_name = ''
    if primary_location and primary_location.get('source'):
        source_name = primary_location['source'].get('display_name', '')
    return {
        'doi': api_work.get('doi', ''),
        'publication_year': api_work.get('publication_year', ''),
        'type': api_work.get('type', ''),
        'cited_by_count': api_work.get('cited_by_count', 0),
        'primary_location': {'source': {'display_name': source_name}},
        'biblio': {
            'issue': api_work.get('biblio', {}).get('issue', '') if api_work.get('biblio') else ''
        },
        'primary_topic': primary_topic,
        'mesh': [],
        'authorships': authorships
    }
# ============================================================================
# STREAMLIT UI
# ============================================================================
st.title("๐ OpenAlex Author Search")
st.markdown("Search and analyze author data from OpenAlex")
# Sidebar: usage instructions plus the contact email sent with API requests.
with st.sidebar:
    st.header("๐ How to Use")
    st.markdown("""
**Option 1: Fetch from OpenAlex**
1. Paste your OpenAlex URL
2. Click "Fetch Data"
3. Search and download results
**Option 2: Upload File**
1. Upload JSON file
2. Search and download results
**Option 3: Direct API Search**
1. Enter search criteria
2. Click "Search"
3. Analyze results
""")
    st.markdown("---")
    st.subheader("โ๏ธ API Settings")
    st.caption("๐ก Email included in API requests for OpenAlex 'polite pool' (faster access)")
    st.code("halozen@pm.me", language=None)
# Hardcoded contact email, later embedded in the User-Agent header of every
# OpenAlex request (polite-pool identification).
st.session_state.user_email = "halozen@pm.me"
# Initialize session state for the loaded works dataset.
if 'works' not in st.session_state:
    st.session_state.works = None
# Create tabs for the three data-input methods.
tab1, tab2, tab3 = st.tabs(["๐ Fetch from OpenAlex", "๐ Upload File", "๐ Direct API Search"])
# Tab 1: fetch works directly from a pasted OpenAlex URL (web or API form).
with tab1:
    st.subheader("Fetch Data from OpenAlex")
    openalex_url = st.text_input(
        "Paste OpenAlex URL (web or API format)",
        placeholder="https://openalex.org/works?filter=...",
        help="Paste either the web URL or API URL from OpenAlex"
    )
    col1, col2 = st.columns([1, 3])
    with col1:
        max_pages = st.number_input(
            "Max Pages",
            min_value=1,
            max_value=50,
            value=5,
            help="Each page has up to 200 works. 5 pages = 1,000 works max"
        )
    if st.button("๐ Fetch Data from OpenAlex", type="primary"):
        if not openalex_url:
            st.error("Please enter an OpenAlex URL")
        else:
            try:
                import urllib.parse
                import urllib.request  # hoisted: was re-imported on every page iteration

                # Convert whatever URL form was pasted into an API URL,
                # keeping only the 'filter' query parameter.
                parsed = urllib.parse.urlparse(openalex_url)
                query_params = urllib.parse.parse_qs(parsed.query)
                api_url = "https://api.openalex.org/works"
                if 'filter' in query_params:
                    api_url = f"{api_url}?filter={query_params['filter'][0]}"
                # Request the maximum page size supported by OpenAlex.
                separator = '&' if '?' in api_url else '?'
                api_url = f"{api_url}{separator}per-page=200"
                st.info(f"๐ก Fetching from OpenAlex API... (up to {max_pages} pages)")
                st.code(api_url, language=None)  # show the API URL being used
                all_works = []
                progress_bar = st.progress(0)
                status_text = st.empty()
                for page in range(1, max_pages + 1):
                    joiner = '&' if '?' in api_url else '?'
                    page_url = f"{api_url}{joiner}page={page}"
                    status_text.text(f"Fetching page {page}/{max_pages}...")
                    # Identify ourselves for OpenAlex's polite pool.
                    req = urllib.request.Request(page_url)
                    req.add_header('User-Agent', f'Mozilla/5.0 (mailto:{st.session_state.user_email})')
                    with urllib.request.urlopen(req) as response:
                        data = json.loads(response.read().decode())
                    results = data.get('results', [])
                    if not results:
                        break
                    for work in results:
                        transformed = transform_openalex_api_to_excel_format(work)
                        if transformed:  # skip rejected works (consistent with tab3)
                            all_works.append(transformed)
                    progress_bar.progress(page / max_pages)
                    # A short page means it was the last one.
                    if len(results) < 200:
                        break
                progress_bar.empty()
                status_text.empty()
                if all_works:
                    st.session_state.works = all_works
                    st.success(f"✅ Fetched {len(all_works):,} works from OpenAlex")
                    # Offer the raw (transformed) data as a JSON download.
                    json_data = json.dumps(all_works, indent=2)
                    st.download_button(
                        label="๐พ Download Raw Data (JSON)",
                        data=json_data,
                        file_name="openalex_data.json",
                        mime="application/json"
                    )
                else:
                    st.warning("No works found. Try a different URL or check your filters.")
            except Exception as e:
                st.error(f"โ Error fetching data: {str(e)}")
                st.info("Make sure your URL is valid and try again.")
# Tab 2: load a previously saved JSON dataset from disk.
with tab2:
    st.subheader("Upload Existing Data File")
    uploaded_file = st.file_uploader(
        "Upload your OpenAlex data (JSON)",
        type=['json'],
        help="Upload a previously saved JSON file"
    )
    if uploaded_file:
        try:
            data = json.load(uploaded_file)
            # Accept three shapes: a raw API response ({'results': [...]}),
            # a list of raw API works, or an already-transformed list.
            if isinstance(data, dict) and 'results' in data:
                api_works = data['results']
                # Drop works the transform rejects (returns None).
                st.session_state.works = [
                    w for w in (transform_openalex_api_to_excel_format(x) for x in api_works) if w
                ]
                st.info(f"โน๏ธ Transformed {len(st.session_state.works)} works from OpenAlex API format")
            elif isinstance(data, list):
                # Raw API works carry a 'title' field; transformed works do not.
                if data and isinstance(data[0], dict) and 'title' in data[0]:
                    st.session_state.works = [
                        w for w in (transform_openalex_api_to_excel_format(x) for x in data) if w
                    ]
                    st.info(f"โน๏ธ Transformed {len(st.session_state.works)} works from OpenAlex API format")
                else:
                    st.session_state.works = data
            else:
                st.error("โ Unexpected JSON format")
                st.stop()
            st.success(f"✅ Loaded {len(st.session_state.works):,} works from file")
        except json.JSONDecodeError:
            st.error("โ Invalid JSON file")
        except Exception as e:
            st.error(f"โ Error processing file: {str(e)}")
# Tab 3: build an OpenAlex query from individual criteria and fetch results.
with tab3:
    st.subheader("Direct API Search")
    st.markdown("Search OpenAlex directly without needing a pre-built URL")
    # Search filters
    col1, col2 = st.columns(2)
    with col1:
        api_author_name = st.text_input(
            "Author Name",
            placeholder="e.g., John Smith",
            help="Search for works by a specific author",
            key="api_author_name"
        )
        api_institution = st.text_input(
            "Institution",
            placeholder="e.g., Harvard University",
            help="Filter by institution/affiliation",
            key="api_institution"
        )
        api_topic = st.text_input(
            "Topic/Keyword",
            placeholder="e.g., machine learning",
            help="Search by topic or keyword",
            key="api_topic"
        )
        api_journals = st.text_area(
            "Journal(s)",
            placeholder="Nature\nScience\nCell",
            help="Enter journal names, one per line. Leave blank for all journals.",
            key="api_journals"
        )
    with col2:
        api_year_from = st.number_input(
            "Publication Year From",
            min_value=1900,
            max_value=2025,
            value=2020,
            help="Start year for publication range",
            key="api_year_from"
        )
        api_year_to = st.number_input(
            "Publication Year To",
            min_value=1900,
            max_value=2025,
            value=2025,
            help="End year for publication range",
            key="api_year_to"
        )
        api_min_citations = st.number_input(
            "Minimum Citations",
            min_value=0,
            max_value=10000,
            value=0,
            help="Filter works with at least this many citations",
            key="api_min_citations"
        )
        api_max_citations = st.number_input(
            "Maximum Citations",
            min_value=0,
            max_value=100000,
            value=0,
            help="Filter works with at most this many citations (0 = no limit)",
            key="api_max_citations"
        )
        api_max_results = st.number_input(
            "Maximum Results",
            min_value=100,
            max_value=50000,
            value=1000,
            step=100,
            help="Maximum number of works to retrieve (Warning: >10,000 may be slow)",
            key="api_max_results"
        )
    # NOTE: a second, unkeyed copy of the author/institution/topic/year widgets
    # used to be created here. It triggered Streamlit duplicate-widget errors
    # and shadowed the keyed values above, so it has been removed.
    # Warning for large requests
    if api_max_results > 10000:
        st.warning(f"โ ๏ธ Requesting {api_max_results:,} results may take several minutes and could cause memory issues on free hosting.")
    if st.button("๐ Search OpenAlex API", type="primary"):
        try:
            from datetime import datetime
            # Record search start time (used in the summary and download name).
            search_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            # Build the OpenAlex filter expression from the non-empty criteria.
            filters = []
            if api_author_name:
                filters.append(f'author.search:{api_author_name}')
            if api_institution:
                filters.append(f'authorships.institutions.display_name.search:{api_institution}')
            if api_topic:
                filters.append(f'title_and_abstract.search:{api_topic}')
            if api_year_from and api_year_to:
                filters.append(f'publication_year:{api_year_from}-{api_year_to}')
            # Citation bounds: OpenAlex exposes strict </> comparisons only,
            # hence the +/- 1 adjustments for inclusive bounds.
            if api_min_citations > 0:
                filters.append(f'cited_by_count:>{api_min_citations - 1}')
            if api_max_citations > 0:
                filters.append(f'cited_by_count:<{api_max_citations + 1}')
            # Journal filter: OR the names with '|' in one search clause, e.g.
            # primary_location.source.display_name.search:journal1|journal2
            if api_journals:
                journal_list = [j.strip() for j in api_journals.split('\n') if j.strip()]
                if journal_list:
                    journal_search = '|'.join(journal_list)
                    filters.append(f'primary_location.source.display_name.search:{journal_search}')
            # Restrict to substantive publication types.
            filters.append('type:article|review|letter')
            if not filters or filters == ['type:article|review|letter']:
                st.warning("Please enter at least one search criterion")
                st.stop()
            filter_string = ','.join(filters)
            api_url = f"https://api.openalex.org/works?filter={filter_string}&per-page=200"
            # Remember the criteria so the summary can display them later.
            search_params = {
                'author': api_author_name if api_author_name else 'Any',
                'institution': api_institution if api_institution else 'Any',
                'topic': api_topic if api_topic else 'Any',
                'journals': journal_list if api_journals else ['Any'],
                'years': f'{api_year_from}-{api_year_to}',
                'min_citations': api_min_citations,
                'max_citations': api_max_citations if api_max_citations > 0 else 'No limit',
                'search_date': search_datetime
            }
            st.info(f"๐ก Searching OpenAlex API...")
            st.code(api_url, language=None)
            all_works = []
            max_pages = (api_max_results // 200) + 1
            progress_bar = st.progress(0)
            status_text = st.empty()
            import urllib.request  # hoisted: was re-imported on every page iteration
            for page in range(1, max_pages + 1):
                page_url = f"{api_url}&page={page}"
                status_text.text(f"Fetching page {page}/{max_pages}... ({len(all_works)} works so far)")
                # Identify ourselves for OpenAlex's polite pool.
                req = urllib.request.Request(page_url)
                req.add_header('User-Agent', f'Mozilla/5.0 (mailto:{st.session_state.user_email})')
                with urllib.request.urlopen(req) as response:
                    data = json.loads(response.read().decode())
                results = data.get('results', [])
                if not results:
                    break
                for work in results:
                    transformed = transform_openalex_api_to_excel_format(work)
                    if transformed:  # only add if transformation succeeded
                        all_works.append(transformed)
                progress_bar.progress(min(page / max_pages, 1.0))
                # Respect the requested cap, then stop on a short (last) page.
                if len(all_works) >= api_max_results:
                    all_works = all_works[:api_max_results]
                    break
                if len(results) < 200:
                    break
            progress_bar.empty()
            status_text.empty()
            if all_works:
                st.session_state.works = all_works
                st.session_state.search_params = search_params  # persist for reruns
                st.success(f"✅ Found {len(all_works):,} works from OpenAlex")
                # Display search summary
                st.markdown("---")
                st.subheader("๐ Search Summary")
                summary_col1, summary_col2 = st.columns(2)
                with summary_col1:
                    st.markdown(f"**Search Date:** {search_params['search_date']}")
                    st.markdown(f"**Author:** {search_params['author']}")
                    st.markdown(f"**Institution:** {search_params['institution']}")
                    st.markdown(f"**Topic:** {search_params['topic']}")
                with summary_col2:
                    st.markdown(f"**Years:** {search_params['years']}")
                    st.markdown(f"**Min Citations:** {search_params['min_citations']}")
                    st.markdown(f"**Max Citations:** {search_params['max_citations']}")
                    if search_params['journals'] != ['Any']:
                        st.markdown(f"**Journals:** {', '.join(search_params['journals'][:3])}{'...' if len(search_params['journals']) > 3 else ''}")
                    else:
                        st.markdown(f"**Journals:** Any")
                st.markdown(f"**Total Works Retrieved:** {len(all_works):,}")
                st.markdown("---")
                # Offer the raw results for download, stamped with the search time.
                json_data = json.dumps(all_works, indent=2)
                st.download_button(
                    label="๐พ Download Raw Data (JSON)",
                    data=json_data,
                    file_name=f"openalex_api_search_{search_datetime.replace(':', '-').replace(' ', '_')}.json",
                    mime="application/json"
                )
            else:
                st.warning("No works found. Try different search criteria.")
        except Exception as e:
            st.error(f"โ Error searching API: {str(e)}")
            st.info("Make sure your search criteria are valid and try again.")
# Only show the author-search interface once a dataset has been loaded
# (via URL fetch, file upload, or direct API search).
if st.session_state.works:
    st.markdown("---")
    st.header("๐ Search Authors")
    # Search criteria in two columns: topic/author on the left,
    # journal/country on the right.
    col1, col2 = st.columns(2)
    with col1:
        topic_search = st.text_input(
            "๐ฌ Search by Topic",
            placeholder="e.g., neuroscience",
            help="Filter works by topic keyword (case-insensitive)"
        )
        author_search = st.text_input(
            "๐ค Search by Author Name",
            placeholder="e.g., Smith",
            help="Filter authors by name (partial match)"
        )
    with col2:
        journal_search = st.text_input(
            "๐ Search by Journal",
            placeholder="e.g., Nature",
            help="Filter works by journal name"
        )
        country_search = st.text_input(
            "๐ Search by Country",
            placeholder="e.g., United States or US",
            help="Filter authors by country (name or code)"
        )
    # Thresholds and result-shaping options.
    col3, col4, col5 = st.columns(3)
    with col3:
        min_articles = st.number_input(
            "Minimum Articles",
            min_value=1,
            max_value=100,
            value=3,
            help="Minimum number of publications"
        )
    with col4:
        max_results = st.number_input(
            "Maximum Results",
            min_value=1,
            max_value=500,
            value=50,
            help="Maximum number of authors to display"
        )
    with col5:
        sort_by = st.selectbox(
            "Sort By",
            ["Count", "Average Citations", "Median Citations"],
            help="How to sort the results"
        )
    # Display options: whether author names / topics become clickable links.
    st.markdown("**Display Options:**")
    col6, col7 = st.columns(2)
    with col6:
        link_type = st.selectbox(
            "Link Author Names To",
            ["None", "ORCID", "OpenAlex"],
            help="Make author names clickable links"
        )
    with col7:
        link_topics = st.checkbox(
            "Link Topics to OpenAlex",
            value=True,
            help="Make topic names clickable"
        )
    # Search button
if st.button("๐ Search Authors", type="primary"):
with st.spinner("Processing author profiles..."):
# Process works
profiles = process_works_to_author_profiles(
st.session_state.works,
topic_filter=topic_search.lower() if topic_search else None,
journal_filter=journal_search.lower() if journal_search else None,
country_filter=country_search.lower() if country_search else None
)
# Build results
results = []
for normalized_name, profile in profiles.items():
if profile['count'] < min_articles:
continue
# Author name filter
if author_search:
display_name = profile['display_name'].lower()
if author_search.lower() not in normalized_name.lower() and author_search.lower() not in display_name:
continue
citations = profile['citations']
median_cites = sorted(citations)[len(citations)//2] if citations else 0
avg_cites = round(sum(citations) / len(citations), 1) if citations else 0
most_common_country = profile['countries'].most_common(1)
country_code = most_common_country[0][0] if most_common_country else ''
country_name = get_country_name(country_code) if country_code else ''
continent = get_continent(country_code) if country_code else ''
top_topics = ', '.join([t for t, _ in profile['topics'].most_common(5)])
# Create clickable topic links if enabled
if link_topics and top_topics:
topic_list = [t for t, _ in profile['topics'].most_common(5)]
# Create OpenAlex works filter links using stored topic IDs
linked_topics = []
for topic in topic_list:
# Get the topic ID if we have it
topic_id = profile['topic_ids'].get(topic, '')
if topic_id:
# Extract just the ID (e.g., "T12345" from full URL or already formatted)
if '/' in topic_id:
topic_id = topic_id.split('/')[-1]
# Convert to lowercase for the filter (e.g., T10316 -> t10316)
topic_id_lower = topic_id.lower()
# Link to works filtered by this topic with additional parameters
link = f'{topic}'
else:
# Fallback to search if no ID available
topic_encoded = topic.replace(' ', '+')
link = f'{topic}'
linked_topics.append(link)
top_topics_display = ', '.join(linked_topics)
else:
top_topics_display = top_topics
top_coauthors = ', '.join([c for c, _ in profile['coauthors'].most_common(5)])
top_journals = ', '.join([j for j, _ in profile['journals'].most_common(5)])
# Create author name with optional link
author_display = profile['display_name']
if link_type == "ORCID" and profile['orcid']:
author_display = f'{profile["display_name"]}'
elif link_type == "OpenAlex" and profile['openalex_id']:
# Extract just the ID from the full URL if needed
openalex_id = profile['openalex_id'].split('/')[-1] if '/' in profile['openalex_id'] else profile['openalex_id']
author_display = f'{profile["display_name"]}'
results.append({
'Author': author_display,
'Count': profile['count'],
'Median Citations': median_cites,
'Average Citations': avg_cites,
'Country': country_name,
'Continent': continent,
'Top Topics': top_topics_display,
'Top Co-authors': top_coauthors,
'Top Journals': top_journals
})
# For Excel export, create a version without HTML
df_export = pd.DataFrame(results).copy()
# Remove HTML tags for Excel export
df_export['Author'] = df_export['Author'].str.replace('<[^<]+?>', '', regex=True)
df_export['Top Topics'] = df_export['Top Topics'].str.replace('<[^<]+?>', '', regex=True)
# Sort results
if sort_by == "Count":
results.sort(key=lambda x: x['Count'], reverse=True)
elif sort_by == "Average Citations":
results.sort(key=lambda x: x['Average Citations'], reverse=True)
elif sort_by == "Median Citations":
results.sort(key=lambda x: x['Median Citations'], reverse=True)
# Limit results
results = results[:max_results]
if results:
st.success(f"โ
Found {len(results)} matching authors")
# Convert to DataFrame
df = pd.DataFrame(results)
# Custom CSS for better table display
st.markdown("""
""", unsafe_allow_html=True)
# Display results as HTML table with clickable links
st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
# Summary statistics
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Total Authors", len(results))
with col2:
st.metric("Avg Publications", f"{df['Count'].mean():.1f}")
with col3:
st.metric("Avg Citations", f"{df['Average Citations'].mean():.1f}")
with col4:
orcid_count = sum(1 for r in results if link_type == "ORCID" and '