import streamlit as st
import json
import pandas as pd
from io import BytesIO
from collections import defaultdict, Counter
import unicodedata
# Configure the Streamlit page; must run before any other st.* call.
st.set_page_config(
    page_title="OpenAlex Author Search",
    page_icon="๐",  # NOTE(review): mojibake — the original emoji was lost in an encoding round-trip; confirm intended icon
    layout="wide"
)
# Custom CSS injection point (currently empty; kept so styling can be added later).
st.markdown("""
""", unsafe_allow_html=True)
# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================
# Unicode dash-like characters that must survive normalization as ASCII '-'.
# (hyphen, non-breaking hyphen, figure dash, en dash, em dash, horizontal bar, minus)
_DASH_CHARS = '\u2010\u2011\u2012\u2013\u2014\u2015\u2212'

def normalize_author_name(name):
    """Normalize an author name for matching across spelling variants.

    Folds Unicode dashes to ASCII '-', strips accents via NFD decomposition +
    ASCII transliteration, and collapses runs of whitespace.

    Note: the dash folding happens *before* encode('ascii', 'ignore');
    doing it afterwards (as the original code did) is dead code, because the
    'ignore' step has already deleted every non-ASCII dash.

    Args:
        name: author display name, possibly empty/None.

    Returns:
        Normalized ASCII name; falsy input is returned unchanged.
    """
    if not name:
        return name
    for dash in _DASH_CHARS:
        name = name.replace(dash, '-')
    decomposed = unicodedata.normalize('NFD', name)
    ascii_name = decomposed.encode('ascii', 'ignore').decode('ascii')
    # Collapse internal whitespace runs and trim the ends.
    return ' '.join(ascii_name.split()).strip()
# ISO 3166-1 alpha-2 code -> English country name.
# Both 'GB' and the informal 'UK' map to the United Kingdom.
# Accented names (Åland Islands, Réunion, São Tomé and Príncipe, Saint
# Barthélemy, Curaçao) repaired from encoding-corrupted literals.
COUNTRY_CODES = {
    'AD': 'Andorra', 'AL': 'Albania', 'AM': 'Armenia', 'AT': 'Austria',
    'AX': 'Åland Islands', 'BA': 'Bosnia and Herzegovina', 'BE': 'Belgium',
    'BG': 'Bulgaria', 'BY': 'Belarus', 'CH': 'Switzerland', 'CY': 'Cyprus',
    'CZ': 'Czech Republic', 'DE': 'Germany', 'DK': 'Denmark', 'EE': 'Estonia',
    'ES': 'Spain', 'FI': 'Finland', 'FO': 'Faroe Islands', 'FR': 'France',
    'GB': 'United Kingdom', 'UK': 'United Kingdom', 'GE': 'Georgia',
    'GG': 'Guernsey', 'GI': 'Gibraltar', 'GR': 'Greece', 'HR': 'Croatia',
    'HU': 'Hungary', 'IE': 'Ireland', 'IM': 'Isle of Man', 'IS': 'Iceland',
    'IT': 'Italy', 'JE': 'Jersey', 'LI': 'Liechtenstein', 'LT': 'Lithuania',
    'LU': 'Luxembourg', 'LV': 'Latvia', 'MC': 'Monaco', 'MD': 'Moldova',
    'ME': 'Montenegro', 'MK': 'North Macedonia', 'MT': 'Malta', 'NL': 'Netherlands',
    'NO': 'Norway', 'PL': 'Poland', 'PT': 'Portugal', 'RO': 'Romania',
    'RS': 'Serbia', 'RU': 'Russia', 'SE': 'Sweden', 'SI': 'Slovenia',
    'SJ': 'Svalbard and Jan Mayen', 'SK': 'Slovakia', 'SM': 'San Marino',
    'UA': 'Ukraine', 'VA': 'Vatican City', 'XK': 'Kosovo',
    'AE': 'United Arab Emirates', 'AF': 'Afghanistan', 'AZ': 'Azerbaijan',
    'BD': 'Bangladesh', 'BH': 'Bahrain', 'BN': 'Brunei', 'BT': 'Bhutan',
    'CN': 'China', 'HK': 'Hong Kong', 'ID': 'Indonesia', 'IL': 'Israel',
    'IN': 'India', 'IQ': 'Iraq', 'IR': 'Iran', 'JO': 'Jordan', 'JP': 'Japan',
    'KG': 'Kyrgyzstan', 'KH': 'Cambodia', 'KP': 'North Korea', 'KR': 'South Korea',
    'KW': 'Kuwait', 'KZ': 'Kazakhstan', 'LA': 'Laos', 'LB': 'Lebanon',
    'LK': 'Sri Lanka', 'MM': 'Myanmar', 'MN': 'Mongolia', 'MO': 'Macau',
    'MV': 'Maldives', 'MY': 'Malaysia', 'NP': 'Nepal', 'OM': 'Oman',
    'PH': 'Philippines', 'PK': 'Pakistan', 'PS': 'Palestine', 'QA': 'Qatar',
    'SA': 'Saudi Arabia', 'SG': 'Singapore', 'SY': 'Syria', 'TH': 'Thailand',
    'TJ': 'Tajikistan', 'TL': 'Timor-Leste', 'TM': 'Turkmenistan', 'TR': 'Turkey',
    'TW': 'Taiwan', 'UZ': 'Uzbekistan', 'VN': 'Vietnam', 'YE': 'Yemen',
    'AO': 'Angola', 'BF': 'Burkina Faso', 'BI': 'Burundi', 'BJ': 'Benin',
    'BW': 'Botswana', 'CD': 'Democratic Republic of the Congo',
    'CF': 'Central African Republic', 'CG': 'Republic of the Congo',
    'CI': 'Ivory Coast', 'CM': 'Cameroon', 'CV': 'Cape Verde', 'DJ': 'Djibouti',
    'DZ': 'Algeria', 'EG': 'Egypt', 'EH': 'Western Sahara', 'ER': 'Eritrea',
    'ET': 'Ethiopia', 'GA': 'Gabon', 'GH': 'Ghana', 'GM': 'Gambia',
    'GN': 'Guinea', 'GQ': 'Equatorial Guinea', 'GW': 'Guinea-Bissau',
    'KE': 'Kenya', 'KM': 'Comoros', 'LR': 'Liberia', 'LS': 'Lesotho',
    'LY': 'Libya', 'MA': 'Morocco', 'MG': 'Madagascar', 'ML': 'Mali',
    'MR': 'Mauritania', 'MU': 'Mauritius', 'MW': 'Malawi', 'MZ': 'Mozambique',
    'NA': 'Namibia', 'NE': 'Niger', 'NG': 'Nigeria', 'RE': 'Réunion',
    'RW': 'Rwanda', 'SC': 'Seychelles', 'SD': 'Sudan', 'SL': 'Sierra Leone',
    'SN': 'Senegal', 'SO': 'Somalia', 'SS': 'South Sudan',
    'ST': 'São Tomé and Príncipe', 'SZ': 'Eswatini', 'TD': 'Chad', 'TG': 'Togo',
    'TN': 'Tunisia', 'TZ': 'Tanzania', 'UG': 'Uganda', 'YT': 'Mayotte',
    'ZA': 'South Africa', 'ZM': 'Zambia', 'ZW': 'Zimbabwe',
    'AG': 'Antigua and Barbuda', 'AI': 'Anguilla', 'AW': 'Aruba',
    'BB': 'Barbados', 'BL': 'Saint Barthélemy', 'BM': 'Bermuda',
    'BQ': 'Caribbean Netherlands', 'BS': 'Bahamas', 'BZ': 'Belize',
    'CA': 'Canada', 'CR': 'Costa Rica', 'CU': 'Cuba', 'CW': 'Curaçao',
    'DM': 'Dominica', 'DO': 'Dominican Republic', 'GD': 'Grenada',
    'GL': 'Greenland', 'GP': 'Guadeloupe', 'GT': 'Guatemala', 'HN': 'Honduras',
    'HT': 'Haiti', 'JM': 'Jamaica', 'KN': 'Saint Kitts and Nevis',
    'KY': 'Cayman Islands', 'LC': 'Saint Lucia', 'MF': 'Saint Martin',
    'MQ': 'Martinique', 'MS': 'Montserrat', 'MX': 'Mexico', 'NI': 'Nicaragua',
    'PA': 'Panama', 'PM': 'Saint Pierre and Miquelon', 'PR': 'Puerto Rico',
    'SV': 'El Salvador', 'SX': 'Sint Maarten', 'TC': 'Turks and Caicos Islands',
    'TT': 'Trinidad and Tobago', 'US': 'United States',
    'VC': 'Saint Vincent and the Grenadines', 'VG': 'British Virgin Islands',
    'VI': 'U.S. Virgin Islands',
    'AR': 'Argentina', 'BO': 'Bolivia', 'BR': 'Brazil', 'CL': 'Chile',
    'CO': 'Colombia', 'EC': 'Ecuador', 'FK': 'Falkland Islands',
    'GF': 'French Guiana', 'GY': 'Guyana', 'PE': 'Peru', 'PY': 'Paraguay',
    'SR': 'Suriname', 'UY': 'Uruguay', 'VE': 'Venezuela',
    'AS': 'American Samoa', 'AU': 'Australia', 'CK': 'Cook Islands',
    'FJ': 'Fiji', 'FM': 'Micronesia', 'GU': 'Guam', 'KI': 'Kiribati',
    'MH': 'Marshall Islands', 'MP': 'Northern Mariana Islands',
    'NC': 'New Caledonia', 'NF': 'Norfolk Island', 'NR': 'Nauru', 'NU': 'Niue',
    'NZ': 'New Zealand', 'PF': 'French Polynesia', 'PG': 'Papua New Guinea',
    'PN': 'Pitcairn Islands', 'PW': 'Palau', 'SB': 'Solomon Islands',
    'TK': 'Tokelau', 'TO': 'Tonga', 'TV': 'Tuvalu',
    'UM': 'U.S. Minor Outlying Islands', 'VU': 'Vanuatu',
    'WF': 'Wallis and Futuna', 'WS': 'Samoa'
}
# Continent name -> set of ISO 3166-1 alpha-2 codes.
# frozenset gives O(1) membership tests for get_continent() instead of the
# original linear scan through lists.
CONTINENT_MAP = {
    'Europe': frozenset([
        'AD', 'AL', 'AT', 'AX', 'BA', 'BE', 'BG', 'BY', 'CH', 'CY',
        'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FO', 'FR', 'GB', 'UK',
        'GG', 'GI', 'GR', 'HR', 'HU', 'IE', 'IM', 'IS', 'IT', 'JE',
        'LI', 'LT', 'LU', 'LV', 'MC', 'MD', 'ME', 'MK', 'MT', 'NL',
        'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SJ', 'SK', 'SM',
        'UA', 'VA', 'XK']),
    'Asia': frozenset([
        'AE', 'AF', 'AM', 'AZ', 'BD', 'BH', 'BN', 'BT', 'CN', 'GE',
        'HK', 'ID', 'IL', 'IN', 'IQ', 'IR', 'JO', 'JP', 'KG', 'KH',
        'KP', 'KR', 'KW', 'KZ', 'LA', 'LB', 'LK', 'MM', 'MN', 'MO',
        'MV', 'MY', 'NP', 'OM', 'PH', 'PK', 'PS', 'QA', 'SA', 'SG',
        'SY', 'TH', 'TJ', 'TL', 'TM', 'TR', 'TW', 'UZ', 'VN', 'YE']),
    'Africa': frozenset([
        'AO', 'BF', 'BI', 'BJ', 'BW', 'CD', 'CF', 'CG', 'CI', 'CM',
        'CV', 'DJ', 'DZ', 'EG', 'EH', 'ER', 'ET', 'GA', 'GH', 'GM',
        'GN', 'GQ', 'GW', 'KE', 'KM', 'LR', 'LS', 'LY', 'MA', 'MG',
        'ML', 'MR', 'MU', 'MW', 'MZ', 'NA', 'NE', 'NG', 'RE', 'RW',
        'SC', 'SD', 'SL', 'SN', 'SO', 'SS', 'ST', 'SZ', 'TD', 'TG',
        'TN', 'TZ', 'UG', 'YT', 'ZA', 'ZM', 'ZW']),
    'North America': frozenset([
        'AG', 'AI', 'AW', 'BB', 'BL', 'BM', 'BQ', 'BS', 'BZ',
        'CA', 'CR', 'CU', 'CW', 'DM', 'DO', 'GD', 'GL', 'GP',
        'GT', 'HN', 'HT', 'JM', 'KN', 'KY', 'LC', 'MF', 'MQ',
        'MS', 'MX', 'NI', 'PA', 'PM', 'PR', 'SV', 'SX', 'TC',
        'TT', 'US', 'VC', 'VG', 'VI']),
    'South America': frozenset([
        'AR', 'BO', 'BR', 'CL', 'CO', 'EC', 'FK', 'GF', 'GY',
        'PE', 'PY', 'SR', 'UY', 'VE']),
    'Oceania': frozenset([
        'AS', 'AU', 'CK', 'FJ', 'FM', 'GU', 'KI', 'MH', 'MP', 'NC',
        'NF', 'NR', 'NU', 'NZ', 'PF', 'PG', 'PN', 'PW', 'SB', 'TK',
        'TO', 'TV', 'UM', 'VU', 'WF', 'WS'])
}
def get_country_name(code):
    """Map an ISO 3166-1 alpha-2 code to its country name.

    Unknown codes fall back to the code itself (in its original casing).
    """
    key = code.upper()
    return COUNTRY_CODES.get(key, code)
def get_continent(country_code):
    """Return the continent name for an ISO country code, or 'Unknown'."""
    cc = country_code.upper()
    return next(
        (name for name, codes in CONTINENT_MAP.items() if cc in codes),
        'Unknown',
    )
def process_works_to_author_profiles(works, topic_filter=None, journal_filter=None, country_filter=None):
"""Process works into author profiles with filtering"""
author_profiles = defaultdict(lambda: {
'count': 0,
'citations': [],
'topics': Counter(),
'topic_ids': {}, # Store topic IDs
'coauthors': Counter(),
'journals': Counter(),
'countries': Counter(),
'orcid': '',
'openalex_id': '',
'display_name': ''
})
for work in works:
citations = work.get('cited_by_count', 0)
primary_loc = work.get('primary_location', {})
source = primary_loc.get('source', {}) if primary_loc else {}
journal = source.get('display_name', 'Unknown')
topic = work.get('primary_topic')
topic_name = topic.get('display_name', 'Unknown') if topic else 'Unknown'
topic_id = topic.get('id', '') if topic else ''
# Apply filters
if topic_filter and topic_filter not in topic_name.lower():
continue
if journal_filter and journal_filter not in journal.lower():
continue
if country_filter:
work_has_country = False
for authorship in work.get('authorships', []):
countries = authorship.get('countries', [])
for country_code in countries:
if country_code:
country_name = get_country_name(country_code).lower()
if country_filter in country_name or country_filter in country_code.lower():
work_has_country = True
break
if work_has_country:
break
if not work_has_country:
continue
# Process authors
for authorship in work.get('authorships', []):
author_info = authorship.get('author', {})
author_name = author_info.get('display_name', 'Unknown')
if not author_name or author_name == 'Unknown':
continue
normalized_name = normalize_author_name(author_name)
profile = author_profiles[normalized_name]
if not profile['display_name']:
profile['display_name'] = author_name
profile['count'] += 1
profile['citations'].append(citations)
if author_info.get('orcid') and not profile['orcid']:
profile['orcid'] = author_info['orcid']
if author_info.get('id') and not profile['openalex_id']:
profile['openalex_id'] = author_info['id']
if topic_name != 'Unknown':
profile['topics'][topic_name] += 1
# Store the topic ID for this topic name
if topic_name not in profile['topic_ids'] and topic_id:
profile['topic_ids'][topic_name] = topic_id
for other_auth in work.get('authorships', []):
other_name = other_auth.get('author', {}).get('display_name', '')
if other_name and other_name != author_name:
profile['coauthors'][other_name] += 1
if journal != 'Unknown':
profile['journals'][journal] += 1
countries = authorship.get('countries', [])
for country_code in countries:
if country_code:
profile['countries'][country_code] += 1
return author_profiles
def _short_id(openalex_id):
    """Return the trailing segment of an OpenAlex ID URL ('' for falsy input)."""
    return openalex_id.split('/')[-1] if openalex_id else ''

def _slim_entity(entity):
    """Reduce a topic sub-entity (subfield/field/domain) to {'id', 'display_name'}."""
    if not entity:
        return {'id': '', 'display_name': ''}
    return {
        'id': _short_id(entity.get('id', '')),
        'display_name': entity.get('display_name', '')
    }

def transform_openalex_api_to_excel_format(api_work):
    """Convert a full OpenAlex API work record into the simplified export shape.

    Keeps only the fields the rest of the app consumes: DOI, year, type,
    citation count, source name, biblio issue, the first topic as
    'primary_topic', and authorships with per-author country codes derived
    from institution affiliations.

    Args:
        api_work: raw work dict from the OpenAlex /works endpoint.

    Returns:
        Simplified work dict, or None for falsy input.
    """
    if not api_work:
        return None
    # First listed topic stands in as the primary topic.
    primary_topic = None
    topics = api_work.get('topics', [])
    if topics:
        topic = topics[0]
        primary_topic = {
            'id': _short_id(topic.get('id', '')),
            'display_name': topic.get('display_name', ''),
            'subfield': _slim_entity(topic.get('subfield')),
            'field': _slim_entity(topic.get('field')),
            'domain': _slim_entity(topic.get('domain')),
        }
    # Authorships: keep author identity plus deduplicated institution countries.
    authorships = []
    for authorship in api_work.get('authorships', []):
        author = authorship.get('author', {})
        if not author:
            continue
        countries = {
            inst.get('country_code', '')
            for inst in authorship.get('institutions', [])
            if inst and inst.get('country_code', '')
        }
        authorships.append({
            'author': {
                'id': _short_id(author.get('id', '')),
                'display_name': author.get('display_name', ''),
                'orcid': author.get('orcid', '')
            },
            # sorted() makes the output deterministic; the original
            # list(set(...)) order varied with hash randomization.
            'countries': sorted(countries)
        })
    # Source (journal) name, guarding a missing/None location or source.
    primary_location = api_work.get('primary_location', {})
    source_name = ''
    if primary_location and primary_location.get('source'):
        source_name = primary_location['source'].get('display_name', '')
    return {
        'doi': api_work.get('doi', ''),
        'publication_year': api_work.get('publication_year', ''),
        'type': api_work.get('type', ''),
        'cited_by_count': api_work.get('cited_by_count', 0),
        'primary_location': {'source': {'display_name': source_name}},
        'biblio': {
            'issue': api_work.get('biblio', {}).get('issue', '') if api_work.get('biblio') else ''
        },
        'primary_topic': primary_topic,
        'mesh': [],
        'authorships': authorships
    }
# ============================================================================
# STREAMLIT UI
# ============================================================================
st.title("๐ OpenAlex Author Search")
st.markdown("Search and analyze author data from OpenAlex")
# Sidebar: usage instructions plus the contact email sent with API requests.
with st.sidebar:
    st.header("๐ How to Use")
    st.markdown("""
**Option 1: Fetch from OpenAlex**
1. Paste your OpenAlex URL
2. Click "Fetch Data"
3. Search and download results
**Option 2: Upload File**
1. Upload JSON file
2. Search and download results
**Option 3: Direct API Search**
1. Enter search criteria
2. Click "Search"
3. Analyze results
""")
    st.markdown("---")
    st.subheader("โ๏ธ API Settings")
    st.caption("๐ก Email included in API requests for OpenAlex 'polite pool' (faster access)")
    st.code("halozen@pm.me", language=None)
# Hardcoded contact email, later embedded in the User-Agent header of every
# OpenAlex request (polite-pool identification).
st.session_state.user_email = "halozen@pm.me"
# Initialize session state for the loaded works dataset.
if 'works' not in st.session_state:
    st.session_state.works = None
# Create tabs for the three data-input methods.
tab1, tab2, tab3 = st.tabs(["๐ Fetch from OpenAlex", "๐ Upload File", "๐ Direct API Search"])
# Tab 1: fetch works directly from a pasted OpenAlex URL (web or API form).
with tab1:
    st.subheader("Fetch Data from OpenAlex")
    openalex_url = st.text_input(
        "Paste OpenAlex URL (web or API format)",
        placeholder="https://openalex.org/works?filter=...",
        help="Paste either the web URL or API URL from OpenAlex"
    )
    col1, col2 = st.columns([1, 3])
    with col1:
        max_pages = st.number_input(
            "Max Pages",
            min_value=1,
            max_value=50,
            value=5,
            help="Each page has up to 200 works. 5 pages = 1,000 works max"
        )
    if st.button("๐ Fetch Data from OpenAlex", type="primary"):
        if not openalex_url:
            st.error("Please enter an OpenAlex URL")
        else:
            try:
                import urllib.parse
                import urllib.request  # hoisted: was re-imported on every page iteration

                # Convert whatever URL form was pasted into an API URL,
                # keeping only the 'filter' query parameter.
                parsed = urllib.parse.urlparse(openalex_url)
                query_params = urllib.parse.parse_qs(parsed.query)
                api_url = "https://api.openalex.org/works"
                if 'filter' in query_params:
                    api_url = f"{api_url}?filter={query_params['filter'][0]}"
                # Request the maximum page size supported by OpenAlex.
                separator = '&' if '?' in api_url else '?'
                api_url = f"{api_url}{separator}per-page=200"
                st.info(f"๐ก Fetching from OpenAlex API... (up to {max_pages} pages)")
                st.code(api_url, language=None)  # show the API URL being used
                all_works = []
                progress_bar = st.progress(0)
                status_text = st.empty()
                for page in range(1, max_pages + 1):
                    joiner = '&' if '?' in api_url else '?'
                    page_url = f"{api_url}{joiner}page={page}"
                    status_text.text(f"Fetching page {page}/{max_pages}...")
                    # Identify ourselves for OpenAlex's polite pool.
                    req = urllib.request.Request(page_url)
                    req.add_header('User-Agent', f'Mozilla/5.0 (mailto:{st.session_state.user_email})')
                    with urllib.request.urlopen(req) as response:
                        data = json.loads(response.read().decode())
                    results = data.get('results', [])
                    if not results:
                        break
                    for work in results:
                        transformed = transform_openalex_api_to_excel_format(work)
                        if transformed:  # skip rejected works (consistent with tab3)
                            all_works.append(transformed)
                    progress_bar.progress(page / max_pages)
                    # A short page means it was the last one.
                    if len(results) < 200:
                        break
                progress_bar.empty()
                status_text.empty()
                if all_works:
                    st.session_state.works = all_works
                    st.success(f"✅ Fetched {len(all_works):,} works from OpenAlex")
                    # Offer the raw (transformed) data as a JSON download.
                    json_data = json.dumps(all_works, indent=2)
                    st.download_button(
                        label="๐พ Download Raw Data (JSON)",
                        data=json_data,
                        file_name="openalex_data.json",
                        mime="application/json"
                    )
                else:
                    st.warning("No works found. Try a different URL or check your filters.")
            except Exception as e:
                st.error(f"โ Error fetching data: {str(e)}")
                st.info("Make sure your URL is valid and try again.")
# Tab 2: load a previously saved JSON dataset from disk.
with tab2:
    st.subheader("Upload Existing Data File")
    uploaded_file = st.file_uploader(
        "Upload your OpenAlex data (JSON)",
        type=['json'],
        help="Upload a previously saved JSON file"
    )
    if uploaded_file:
        try:
            data = json.load(uploaded_file)
            # Accept three shapes: a raw API response ({'results': [...]}),
            # a list of raw API works, or an already-transformed list.
            if isinstance(data, dict) and 'results' in data:
                api_works = data['results']
                # Drop works the transform rejects (returns None).
                st.session_state.works = [
                    w for w in (transform_openalex_api_to_excel_format(x) for x in api_works) if w
                ]
                st.info(f"โน๏ธ Transformed {len(st.session_state.works)} works from OpenAlex API format")
            elif isinstance(data, list):
                # Raw API works carry a 'title' field; transformed works do not.
                if data and isinstance(data[0], dict) and 'title' in data[0]:
                    st.session_state.works = [
                        w for w in (transform_openalex_api_to_excel_format(x) for x in data) if w
                    ]
                    st.info(f"โน๏ธ Transformed {len(st.session_state.works)} works from OpenAlex API format")
                else:
                    st.session_state.works = data
            else:
                st.error("โ Unexpected JSON format")
                st.stop()
            st.success(f"✅ Loaded {len(st.session_state.works):,} works from file")
        except json.JSONDecodeError:
            st.error("โ Invalid JSON file")
        except Exception as e:
            st.error(f"โ Error processing file: {str(e)}")
# Tab 3: build an OpenAlex query from individual criteria and fetch results.
with tab3:
    st.subheader("Direct API Search")
    st.markdown("Search OpenAlex directly without needing a pre-built URL")
    # Search filters
    col1, col2 = st.columns(2)
    with col1:
        api_author_name = st.text_input(
            "Author Name",
            placeholder="e.g., John Smith",
            help="Search for works by a specific author",
            key="api_author_name"
        )
        api_institution = st.text_input(
            "Institution",
            placeholder="e.g., Harvard University",
            help="Filter by institution/affiliation",
            key="api_institution"
        )
        api_topic = st.text_input(
            "Topic/Keyword",
            placeholder="e.g., machine learning",
            help="Search by topic or keyword",
            key="api_topic"
        )
        api_journals = st.text_area(
            "Journal(s)",
            placeholder="Nature\nScience\nCell",
            help="Enter journal names, one per line. Leave blank for all journals.",
            key="api_journals"
        )
    with col2:
        api_year_from = st.number_input(
            "Publication Year From",
            min_value=1900,
            max_value=2025,
            value=2020,
            help="Start year for publication range",
            key="api_year_from"
        )
        api_year_to = st.number_input(
            "Publication Year To",
            min_value=1900,
            max_value=2025,
            value=2025,
            help="End year for publication range",
            key="api_year_to"
        )
        api_min_citations = st.number_input(
            "Minimum Citations",
            min_value=0,
            max_value=10000,
            value=0,
            help="Filter works with at least this many citations",
            key="api_min_citations"
        )
        api_max_citations = st.number_input(
            "Maximum Citations",
            min_value=0,
            max_value=100000,
            value=0,
            help="Filter works with at most this many citations (0 = no limit)",
            key="api_max_citations"
        )
        api_max_results = st.number_input(
            "Maximum Results",
            min_value=100,
            max_value=50000,
            value=1000,
            step=100,
            help="Maximum number of works to retrieve (Warning: >10,000 may be slow)",
            key="api_max_results"
        )
    # NOTE: a second, unkeyed copy of the author/institution/topic/year widgets
    # used to be created here. It triggered Streamlit duplicate-widget errors
    # and shadowed the keyed values above, so it has been removed.
    # Warning for large requests
    if api_max_results > 10000:
        st.warning(f"โ ๏ธ Requesting {api_max_results:,} results may take several minutes and could cause memory issues on free hosting.")
    if st.button("๐ Search OpenAlex API", type="primary"):
        try:
            from datetime import datetime
            # Record search start time (used in the summary and download name).
            search_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            # Build the OpenAlex filter expression from the non-empty criteria.
            filters = []
            if api_author_name:
                filters.append(f'author.search:{api_author_name}')
            if api_institution:
                filters.append(f'authorships.institutions.display_name.search:{api_institution}')
            if api_topic:
                filters.append(f'title_and_abstract.search:{api_topic}')
            if api_year_from and api_year_to:
                filters.append(f'publication_year:{api_year_from}-{api_year_to}')
            # Citation bounds: OpenAlex exposes strict </> comparisons only,
            # hence the +/- 1 adjustments for inclusive bounds.
            if api_min_citations > 0:
                filters.append(f'cited_by_count:>{api_min_citations - 1}')
            if api_max_citations > 0:
                filters.append(f'cited_by_count:<{api_max_citations + 1}')
            # Journal filter: OR the names with '|' in one search clause, e.g.
            # primary_location.source.display_name.search:journal1|journal2
            if api_journals:
                journal_list = [j.strip() for j in api_journals.split('\n') if j.strip()]
                if journal_list:
                    journal_search = '|'.join(journal_list)
                    filters.append(f'primary_location.source.display_name.search:{journal_search}')
            # Restrict to substantive publication types.
            filters.append('type:article|review|letter')
            if not filters or filters == ['type:article|review|letter']:
                st.warning("Please enter at least one search criterion")
                st.stop()
            filter_string = ','.join(filters)
            api_url = f"https://api.openalex.org/works?filter={filter_string}&per-page=200"
            # Remember the criteria so the summary can display them later.
            search_params = {
                'author': api_author_name if api_author_name else 'Any',
                'institution': api_institution if api_institution else 'Any',
                'topic': api_topic if api_topic else 'Any',
                'journals': journal_list if api_journals else ['Any'],
                'years': f'{api_year_from}-{api_year_to}',
                'min_citations': api_min_citations,
                'max_citations': api_max_citations if api_max_citations > 0 else 'No limit',
                'search_date': search_datetime
            }
            st.info(f"๐ก Searching OpenAlex API...")
            st.code(api_url, language=None)
            all_works = []
            max_pages = (api_max_results // 200) + 1
            progress_bar = st.progress(0)
            status_text = st.empty()
            import urllib.request  # hoisted: was re-imported on every page iteration
            for page in range(1, max_pages + 1):
                page_url = f"{api_url}&page={page}"
                status_text.text(f"Fetching page {page}/{max_pages}... ({len(all_works)} works so far)")
                # Identify ourselves for OpenAlex's polite pool.
                req = urllib.request.Request(page_url)
                req.add_header('User-Agent', f'Mozilla/5.0 (mailto:{st.session_state.user_email})')
                with urllib.request.urlopen(req) as response:
                    data = json.loads(response.read().decode())
                results = data.get('results', [])
                if not results:
                    break
                for work in results:
                    transformed = transform_openalex_api_to_excel_format(work)
                    if transformed:  # only add if transformation succeeded
                        all_works.append(transformed)
                progress_bar.progress(min(page / max_pages, 1.0))
                # Respect the requested cap, then stop on a short (last) page.
                if len(all_works) >= api_max_results:
                    all_works = all_works[:api_max_results]
                    break
                if len(results) < 200:
                    break
            progress_bar.empty()
            status_text.empty()
            if all_works:
                st.session_state.works = all_works
                st.session_state.search_params = search_params  # persist for reruns
                st.success(f"✅ Found {len(all_works):,} works from OpenAlex")
                # Display search summary
                st.markdown("---")
                st.subheader("๐ Search Summary")
                summary_col1, summary_col2 = st.columns(2)
                with summary_col1:
                    st.markdown(f"**Search Date:** {search_params['search_date']}")
                    st.markdown(f"**Author:** {search_params['author']}")
                    st.markdown(f"**Institution:** {search_params['institution']}")
                    st.markdown(f"**Topic:** {search_params['topic']}")
                with summary_col2:
                    st.markdown(f"**Years:** {search_params['years']}")
                    st.markdown(f"**Min Citations:** {search_params['min_citations']}")
                    st.markdown(f"**Max Citations:** {search_params['max_citations']}")
                    if search_params['journals'] != ['Any']:
                        st.markdown(f"**Journals:** {', '.join(search_params['journals'][:3])}{'...' if len(search_params['journals']) > 3 else ''}")
                    else:
                        st.markdown(f"**Journals:** Any")
                st.markdown(f"**Total Works Retrieved:** {len(all_works):,}")
                st.markdown("---")
                # Offer the raw results for download, stamped with the search time.
                json_data = json.dumps(all_works, indent=2)
                st.download_button(
                    label="๐พ Download Raw Data (JSON)",
                    data=json_data,
                    file_name=f"openalex_api_search_{search_datetime.replace(':', '-').replace(' ', '_')}.json",
                    mime="application/json"
                )
            else:
                st.warning("No works found. Try different search criteria.")
        except Exception as e:
            st.error(f"โ Error searching API: {str(e)}")
            st.info("Make sure your search criteria are valid and try again.")
# Only show the author-search interface once a dataset has been loaded
# (via URL fetch, file upload, or direct API search).
if st.session_state.works:
    st.markdown("---")
    st.header("๐ Search Authors")
    # Search criteria in two columns: topic/author on the left,
    # journal/country on the right.
    col1, col2 = st.columns(2)
    with col1:
        topic_search = st.text_input(
            "๐ฌ Search by Topic",
            placeholder="e.g., neuroscience",
            help="Filter works by topic keyword (case-insensitive)"
        )
        author_search = st.text_input(
            "๐ค Search by Author Name",
            placeholder="e.g., Smith",
            help="Filter authors by name (partial match)"
        )
    with col2:
        journal_search = st.text_input(
            "๐ Search by Journal",
            placeholder="e.g., Nature",
            help="Filter works by journal name"
        )
        country_search = st.text_input(
            "๐ Search by Country",
            placeholder="e.g., United States or US",
            help="Filter authors by country (name or code)"
        )
    # Thresholds and result-shaping options.
    col3, col4, col5 = st.columns(3)
    with col3:
        min_articles = st.number_input(
            "Minimum Articles",
            min_value=1,
            max_value=100,
            value=3,
            help="Minimum number of publications"
        )
    with col4:
        max_results = st.number_input(
            "Maximum Results",
            min_value=1,
            max_value=500,
            value=50,
            help="Maximum number of authors to display"
        )
    with col5:
        sort_by = st.selectbox(
            "Sort By",
            ["Count", "Average Citations", "Median Citations"],
            help="How to sort the results"
        )
    # Display options: whether author names / topics become clickable links.
    st.markdown("**Display Options:**")
    col6, col7 = st.columns(2)
    with col6:
        link_type = st.selectbox(
            "Link Author Names To",
            ["None", "ORCID", "OpenAlex"],
            help="Make author names clickable links"
        )
    with col7:
        link_topics = st.checkbox(
            "Link Topics to OpenAlex",
            value=True,
            help="Make topic names clickable"
        )
    # Search button
if st.button("๐ Search Authors", type="primary"):
with st.spinner("Processing author profiles..."):
# Process works
profiles = process_works_to_author_profiles(
st.session_state.works,
topic_filter=topic_search.lower() if topic_search else None,
journal_filter=journal_search.lower() if journal_search else None,
country_filter=country_search.lower() if country_search else None
)
# Build results
results = []
for normalized_name, profile in profiles.items():
if profile['count'] < min_articles:
continue
# Author name filter
if author_search:
display_name = profile['display_name'].lower()
if author_search.lower() not in normalized_name.lower() and author_search.lower() not in display_name:
continue
citations = profile['citations']
median_cites = sorted(citations)[len(citations)//2] if citations else 0
avg_cites = round(sum(citations) / len(citations), 1) if citations else 0
most_common_country = profile['countries'].most_common(1)
country_code = most_common_country[0][0] if most_common_country else ''
country_name = get_country_name(country_code) if country_code else ''
continent = get_continent(country_code) if country_code else ''
top_topics = ', '.join([t for t, _ in profile['topics'].most_common(5)])
# Create clickable topic links if enabled
if link_topics and top_topics:
topic_list = [t for t, _ in profile['topics'].most_common(5)]
# Create OpenAlex works filter links using stored topic IDs
linked_topics = []
for topic in topic_list:
# Get the topic ID if we have it
topic_id = profile['topic_ids'].get(topic, '')
if topic_id:
# Extract just the ID (e.g., "T12345" from full URL or already formatted)
if '/' in topic_id:
topic_id = topic_id.split('/')[-1]
# Convert to lowercase for the filter (e.g., T10316 -> t10316)
topic_id_lower = topic_id.lower()
# Link to works filtered by this topic with additional parameters
link = f'{topic}'
else:
# Fallback to search if no ID available
topic_encoded = topic.replace(' ', '+')
link = f'{topic}'
linked_topics.append(link)
top_topics_display = ', '.join(linked_topics)
else:
top_topics_display = top_topics
top_coauthors = ', '.join([c for c, _ in profile['coauthors'].most_common(5)])
top_journals = ', '.join([j for j, _ in profile['journals'].most_common(5)])
# Create author name with optional link
author_display = profile['display_name']
if link_type == "ORCID" and profile['orcid']:
author_display = f'{profile["display_name"]}'
elif link_type == "OpenAlex" and profile['openalex_id']:
# Extract just the ID from the full URL if needed
openalex_id = profile['openalex_id'].split('/')[-1] if '/' in profile['openalex_id'] else profile['openalex_id']
author_display = f'{profile["display_name"]}'
results.append({
'Author': author_display,
'Count': profile['count'],
'Median Citations': median_cites,
'Average Citations': avg_cites,
'Country': country_name,
'Continent': continent,
'Top Topics': top_topics_display,
'Top Co-authors': top_coauthors,
'Top Journals': top_journals
})
# For Excel export, create a version without HTML
df_export = pd.DataFrame(results).copy()
# Remove HTML tags for Excel export
df_export['Author'] = df_export['Author'].str.replace('<[^<]+?>', '', regex=True)
df_export['Top Topics'] = df_export['Top Topics'].str.replace('<[^<]+?>', '', regex=True)
# Sort results
if sort_by == "Count":
results.sort(key=lambda x: x['Count'], reverse=True)
elif sort_by == "Average Citations":
results.sort(key=lambda x: x['Average Citations'], reverse=True)
elif sort_by == "Median Citations":
results.sort(key=lambda x: x['Median Citations'], reverse=True)
# Limit results
results = results[:max_results]
if results:
st.success(f"โ
Found {len(results)} matching authors")
# Convert to DataFrame
df = pd.DataFrame(results)
# Custom CSS for better table display
st.markdown("""
""", unsafe_allow_html=True)
# Display results as HTML table with clickable links
st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
# Summary statistics
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Total Authors", len(results))
with col2:
st.metric("Avg Publications", f"{df['Count'].mean():.1f}")
with col3:
st.metric("Avg Citations", f"{df['Average Citations'].mean():.1f}")
with col4:
orcid_count = sum(1 for r in results if link_type == "ORCID" and '