Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import json | |
| import pandas as pd | |
| from io import BytesIO | |
| from collections import defaultdict, Counter | |
| import unicodedata | |
| # Configure the Streamlit page: browser-tab title, page icon, and the "wide" layout. | |
| st.set_page_config( | |
| page_title="OpenAlex Author Search", | |
| page_icon="π", | |
| layout="wide" | |
| ) | |
| # Custom CSS, injected as raw HTML via unsafe_allow_html=True: | |
| # - pads the .main container, | |
| # - styles buttons (dark blue #164A78 background, white text, rounded corners, | |
| #   darker #0d3050 background on hover), | |
| # - colors h1 headings with the same dark blue. | |
| st.markdown(""" | |
| <style> | |
| .main { | |
| padding: 2rem; | |
| } | |
| .stButton>button { | |
| background-color: #164A78; | |
| color: white; | |
| font-size: 16px; | |
| padding: 0.5rem 2rem; | |
| border-radius: 5px; | |
| border: none; | |
| } | |
| .stButton>button:hover { | |
| background-color: #0d3050; | |
| } | |
| h1 { | |
| color: #164A78; | |
| } | |
| </style> | |
| """, unsafe_allow_html=True) | |
| # ============================================================================ | |
| # UTILITY FUNCTIONS | |
| # ============================================================================ | |
# Unicode dash-like characters that are folded into a plain ASCII hyphen.
# They must be mapped BEFORE the ASCII-only encode below, which would
# otherwise silently drop them (turning "Jean–Pierre" into "JeanPierre").
_DASH_TRANSLATION = str.maketrans({
    '\u2010': '-',  # hyphen
    '\u2011': '-',  # non-breaking hyphen
    '\u2012': '-',  # figure dash
    '\u2013': '-',  # en dash
    '\u2014': '-',  # em dash
    '\u2015': '-',  # horizontal bar
    '\u2212': '-',  # minus sign
})


def normalize_author_name(name):
    """Normalize an author name so spelling variants collapse to one key.

    Steps, in order:
      1. Unicode dashes are replaced by an ASCII hyphen.
      2. The name is NFD-decomposed and every non-ASCII code point
         (e.g. combining accents) is dropped, so "José" becomes "Jose".
      3. Runs of whitespace are collapsed to single spaces and the ends
         are trimmed.

    Args:
        name: Author name as a string; may be empty or None.

    Returns:
        The normalized ASCII name. Falsy input (None, "") is returned
        unchanged.
    """
    if not name:
        return name
    dashed = name.translate(_DASH_TRANSLATION)
    decomposed = unicodedata.normalize('NFD', dashed)
    ascii_name = decomposed.encode('ascii', 'ignore').decode('ascii')
    # split()/join collapses internal whitespace and trims both ends.
    return ' '.join(ascii_name.split())
# Two-letter country code -> English display name.
# 'UK' is kept as a non-ISO alias of 'GB'. Names containing non-ASCII
# letters are stored as proper Unicode text (not mis-decoded bytes).
COUNTRY_CODES = {
    # Europe and the Caucasus
    'AD': 'Andorra', 'AL': 'Albania', 'AM': 'Armenia', 'AT': 'Austria',
    'AX': 'Åland Islands', 'BA': 'Bosnia and Herzegovina', 'BE': 'Belgium',
    'BG': 'Bulgaria', 'BY': 'Belarus', 'CH': 'Switzerland', 'CY': 'Cyprus',
    'CZ': 'Czech Republic', 'DE': 'Germany', 'DK': 'Denmark', 'EE': 'Estonia',
    'ES': 'Spain', 'FI': 'Finland', 'FO': 'Faroe Islands', 'FR': 'France',
    'GB': 'United Kingdom', 'UK': 'United Kingdom', 'GE': 'Georgia',
    'GG': 'Guernsey', 'GI': 'Gibraltar', 'GR': 'Greece', 'HR': 'Croatia',
    'HU': 'Hungary', 'IE': 'Ireland', 'IM': 'Isle of Man', 'IS': 'Iceland',
    'IT': 'Italy', 'JE': 'Jersey', 'LI': 'Liechtenstein', 'LT': 'Lithuania',
    'LU': 'Luxembourg', 'LV': 'Latvia', 'MC': 'Monaco', 'MD': 'Moldova',
    'ME': 'Montenegro', 'MK': 'North Macedonia', 'MT': 'Malta', 'NL': 'Netherlands',
    'NO': 'Norway', 'PL': 'Poland', 'PT': 'Portugal', 'RO': 'Romania',
    'RS': 'Serbia', 'RU': 'Russia', 'SE': 'Sweden', 'SI': 'Slovenia',
    'SJ': 'Svalbard and Jan Mayen', 'SK': 'Slovakia', 'SM': 'San Marino',
    'UA': 'Ukraine', 'VA': 'Vatican City', 'XK': 'Kosovo',
    # Asia
    'AE': 'United Arab Emirates', 'AF': 'Afghanistan', 'AZ': 'Azerbaijan',
    'BD': 'Bangladesh', 'BH': 'Bahrain', 'BN': 'Brunei', 'BT': 'Bhutan',
    'CN': 'China', 'HK': 'Hong Kong', 'ID': 'Indonesia', 'IL': 'Israel',
    'IN': 'India', 'IQ': 'Iraq', 'IR': 'Iran', 'JO': 'Jordan', 'JP': 'Japan',
    'KG': 'Kyrgyzstan', 'KH': 'Cambodia', 'KP': 'North Korea', 'KR': 'South Korea',
    'KW': 'Kuwait', 'KZ': 'Kazakhstan', 'LA': 'Laos', 'LB': 'Lebanon',
    'LK': 'Sri Lanka', 'MM': 'Myanmar', 'MN': 'Mongolia', 'MO': 'Macau',
    'MV': 'Maldives', 'MY': 'Malaysia', 'NP': 'Nepal', 'OM': 'Oman',
    'PH': 'Philippines', 'PK': 'Pakistan', 'PS': 'Palestine', 'QA': 'Qatar',
    'SA': 'Saudi Arabia', 'SG': 'Singapore', 'SY': 'Syria', 'TH': 'Thailand',
    'TJ': 'Tajikistan', 'TL': 'Timor-Leste', 'TM': 'Turkmenistan', 'TR': 'Turkey',
    'TW': 'Taiwan', 'UZ': 'Uzbekistan', 'VN': 'Vietnam', 'YE': 'Yemen',
    # Africa
    'AO': 'Angola', 'BF': 'Burkina Faso', 'BI': 'Burundi', 'BJ': 'Benin',
    'BW': 'Botswana', 'CD': 'Democratic Republic of the Congo',
    'CF': 'Central African Republic', 'CG': 'Republic of the Congo',
    'CI': 'Ivory Coast', 'CM': 'Cameroon', 'CV': 'Cape Verde', 'DJ': 'Djibouti',
    'DZ': 'Algeria', 'EG': 'Egypt', 'EH': 'Western Sahara', 'ER': 'Eritrea',
    'ET': 'Ethiopia', 'GA': 'Gabon', 'GH': 'Ghana', 'GM': 'Gambia',
    'GN': 'Guinea', 'GQ': 'Equatorial Guinea', 'GW': 'Guinea-Bissau',
    'KE': 'Kenya', 'KM': 'Comoros', 'LR': 'Liberia', 'LS': 'Lesotho',
    'LY': 'Libya', 'MA': 'Morocco', 'MG': 'Madagascar', 'ML': 'Mali',
    'MR': 'Mauritania', 'MU': 'Mauritius', 'MW': 'Malawi', 'MZ': 'Mozambique',
    'NA': 'Namibia', 'NE': 'Niger', 'NG': 'Nigeria', 'RE': 'Réunion',
    'RW': 'Rwanda', 'SC': 'Seychelles', 'SD': 'Sudan', 'SL': 'Sierra Leone',
    'SN': 'Senegal', 'SO': 'Somalia', 'SS': 'South Sudan',
    'ST': 'São Tomé and Príncipe', 'SZ': 'Eswatini', 'TD': 'Chad', 'TG': 'Togo',
    'TN': 'Tunisia', 'TZ': 'Tanzania', 'UG': 'Uganda', 'YT': 'Mayotte',
    'ZA': 'South Africa', 'ZM': 'Zambia', 'ZW': 'Zimbabwe',
    # North America, Central America and the Caribbean
    'AG': 'Antigua and Barbuda', 'AI': 'Anguilla', 'AW': 'Aruba',
    'BB': 'Barbados', 'BL': 'Saint Barthélemy', 'BM': 'Bermuda',
    'BQ': 'Caribbean Netherlands', 'BS': 'Bahamas', 'BZ': 'Belize',
    'CA': 'Canada', 'CR': 'Costa Rica', 'CU': 'Cuba', 'CW': 'Curaçao',
    'DM': 'Dominica', 'DO': 'Dominican Republic', 'GD': 'Grenada',
    'GL': 'Greenland', 'GP': 'Guadeloupe', 'GT': 'Guatemala', 'HN': 'Honduras',
    'HT': 'Haiti', 'JM': 'Jamaica', 'KN': 'Saint Kitts and Nevis',
    'KY': 'Cayman Islands', 'LC': 'Saint Lucia', 'MF': 'Saint Martin',
    'MQ': 'Martinique', 'MS': 'Montserrat', 'MX': 'Mexico', 'NI': 'Nicaragua',
    'PA': 'Panama', 'PM': 'Saint Pierre and Miquelon', 'PR': 'Puerto Rico',
    'SV': 'El Salvador', 'SX': 'Sint Maarten', 'TC': 'Turks and Caicos Islands',
    'TT': 'Trinidad and Tobago', 'US': 'United States',
    'VC': 'Saint Vincent and the Grenadines', 'VG': 'British Virgin Islands',
    'VI': 'U.S. Virgin Islands',
    # South America
    'AR': 'Argentina', 'BO': 'Bolivia', 'BR': 'Brazil', 'CL': 'Chile',
    'CO': 'Colombia', 'EC': 'Ecuador', 'FK': 'Falkland Islands',
    'GF': 'French Guiana', 'GY': 'Guyana', 'PE': 'Peru', 'PY': 'Paraguay',
    'SR': 'Suriname', 'UY': 'Uruguay', 'VE': 'Venezuela',
    # Oceania
    'AS': 'American Samoa', 'AU': 'Australia', 'CK': 'Cook Islands',
    'FJ': 'Fiji', 'FM': 'Micronesia', 'GU': 'Guam', 'KI': 'Kiribati',
    'MH': 'Marshall Islands', 'MP': 'Northern Mariana Islands',
    'NC': 'New Caledonia', 'NF': 'Norfolk Island', 'NR': 'Nauru', 'NU': 'Niue',
    'NZ': 'New Zealand', 'PF': 'French Polynesia', 'PG': 'Papua New Guinea',
    'PN': 'Pitcairn Islands', 'PW': 'Palau', 'SB': 'Solomon Islands',
    'TK': 'Tokelau', 'TO': 'Tonga', 'TV': 'Tuvalu',
    'UM': 'U.S. Minor Outlying Islands', 'VU': 'Vanuatu',
    'WF': 'Wallis and Futuna', 'WS': 'Samoa',
}
# Continent name -> list of two-letter country codes assigned to it.
# Each code appears in exactly one list. 'RU' is listed under Europe so that
# Russia no longer falls through to 'Unknown' in continent lookups.
CONTINENT_MAP = {
    'Europe': ['AD', 'AL', 'AT', 'AX', 'BA', 'BE', 'BG', 'BY', 'CH', 'CY',
               'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FO', 'FR', 'GB', 'UK',
               'GG', 'GI', 'GR', 'HR', 'HU', 'IE', 'IM', 'IS', 'IT', 'JE',
               'LI', 'LT', 'LU', 'LV', 'MC', 'MD', 'ME', 'MK', 'MT', 'NL',
               'NO', 'PL', 'PT', 'RO', 'RS', 'RU', 'SE', 'SI', 'SJ', 'SK',
               'SM', 'UA', 'VA', 'XK'],
    'Asia': ['AE', 'AF', 'AM', 'AZ', 'BD', 'BH', 'BN', 'BT', 'CN', 'GE',
             'HK', 'ID', 'IL', 'IN', 'IQ', 'IR', 'JO', 'JP', 'KG', 'KH',
             'KP', 'KR', 'KW', 'KZ', 'LA', 'LB', 'LK', 'MM', 'MN', 'MO',
             'MV', 'MY', 'NP', 'OM', 'PH', 'PK', 'PS', 'QA', 'SA', 'SG',
             'SY', 'TH', 'TJ', 'TL', 'TM', 'TR', 'TW', 'UZ', 'VN', 'YE'],
    'Africa': ['AO', 'BF', 'BI', 'BJ', 'BW', 'CD', 'CF', 'CG', 'CI', 'CM',
               'CV', 'DJ', 'DZ', 'EG', 'EH', 'ER', 'ET', 'GA', 'GH', 'GM',
               'GN', 'GQ', 'GW', 'KE', 'KM', 'LR', 'LS', 'LY', 'MA', 'MG',
               'ML', 'MR', 'MU', 'MW', 'MZ', 'NA', 'NE', 'NG', 'RE', 'RW',
               'SC', 'SD', 'SL', 'SN', 'SO', 'SS', 'ST', 'SZ', 'TD', 'TG',
               'TN', 'TZ', 'UG', 'YT', 'ZA', 'ZM', 'ZW'],
    'North America': ['AG', 'AI', 'AW', 'BB', 'BL', 'BM', 'BQ', 'BS', 'BZ',
                      'CA', 'CR', 'CU', 'CW', 'DM', 'DO', 'GD', 'GL', 'GP',
                      'GT', 'HN', 'HT', 'JM', 'KN', 'KY', 'LC', 'MF', 'MQ',
                      'MS', 'MX', 'NI', 'PA', 'PM', 'PR', 'SV', 'SX', 'TC',
                      'TT', 'US', 'VC', 'VG', 'VI'],
    'South America': ['AR', 'BO', 'BR', 'CL', 'CO', 'EC', 'FK', 'GF', 'GY',
                      'PE', 'PY', 'SR', 'UY', 'VE'],
    'Oceania': ['AS', 'AU', 'CK', 'FJ', 'FM', 'GU', 'KI', 'MH', 'MP', 'NC',
                'NF', 'NR', 'NU', 'NZ', 'PF', 'PG', 'PN', 'PW', 'SB', 'TK',
                'TO', 'TV', 'UM', 'VU', 'WF', 'WS'],
}
def get_country_name(code):
    """Return the display name for a country code.

    The lookup is case-insensitive. When the code is not present in
    COUNTRY_CODES, the value passed in is returned unchanged.
    """
    lookup_key = code.upper()
    if lookup_key in COUNTRY_CODES:
        return COUNTRY_CODES[lookup_key]
    return code
def get_continent(country_code):
    """Return the continent whose CONTINENT_MAP list contains the code.

    The lookup is case-insensitive; 'Unknown' is returned when no
    continent lists the code.
    """
    wanted = country_code.upper()
    return next(
        (name for name, members in CONTINENT_MAP.items() if wanted in members),
        'Unknown',
    )
def _work_matches_country(authorships, country_filter):
    """Return True if any authorship has a country whose code or name contains the filter."""
    for authorship in authorships:
        for country_code in authorship.get('countries') or []:
            if not country_code:
                continue
            country_name = get_country_name(country_code).lower()
            if country_filter in country_name or country_filter in country_code.lower():
                return True
    return False


def process_works_to_author_profiles(works, topic_filter=None, journal_filter=None, country_filter=None):
    """Aggregate work records into per-author profiles, with optional filtering.

    Args:
        works: Iterable of work dicts in the simplified format used by this
            app (keys read here: 'cited_by_count', 'primary_location',
            'primary_topic', 'authorships'). Falsy entries are skipped.
        topic_filter: Lower-cased substring that must occur in the work's
            primary topic name, or None for no topic filtering.
        journal_filter: Lower-cased substring that must occur in the work's
            journal name, or None for no journal filtering.
        country_filter: Lower-cased substring that must occur in the country
            name or code of at least one author of the work, or None.

    Returns:
        A defaultdict keyed by normalized author name. Each value is a dict
        with 'count', 'citations' (one entry per work), Counters for
        'topics', 'coauthors', 'journals' and 'countries', a 'topic_ids'
        mapping (topic name -> topic ID), and the first seen 'orcid',
        'openalex_id' and 'display_name'.

    Missing or null fields (null source, author, countries, citation count)
    are treated as absent instead of raising. Empty journal/topic names are
    treated as 'Unknown' and therefore not counted.
    """
    def _new_profile():
        return {
            'count': 0,
            'citations': [],
            'topics': Counter(),
            'topic_ids': {},  # topic name -> topic ID
            'coauthors': Counter(),
            'journals': Counter(),
            'countries': Counter(),
            'orcid': '',
            'openalex_id': '',
            'display_name': '',
        }

    author_profiles = defaultdict(_new_profile)

    for work in works or []:
        if not work:
            continue

        # "or" fallbacks cover both missing keys and explicit JSON nulls.
        citations = work.get('cited_by_count') or 0
        primary_loc = work.get('primary_location') or {}
        source = primary_loc.get('source') or {}
        journal = source.get('display_name') or 'Unknown'
        topic = work.get('primary_topic') or {}
        topic_name = topic.get('display_name') or 'Unknown'
        topic_id = topic.get('id') or ''
        authorships = [a for a in (work.get('authorships') or []) if a]

        # Apply work-level filters
        if topic_filter and topic_filter not in topic_name.lower():
            continue
        if journal_filter and journal_filter not in journal.lower():
            continue
        if country_filter and not _work_matches_country(authorships, country_filter):
            continue

        # Resolve every author's display name once; reused for the coauthor tally.
        author_names = [
            (a.get('author') or {}).get('display_name') or ''
            for a in authorships
        ]

        for authorship, author_name in zip(authorships, author_names):
            if not author_name or author_name == 'Unknown':
                continue
            author_info = authorship.get('author') or {}

            profile = author_profiles[normalize_author_name(author_name)]
            if not profile['display_name']:
                profile['display_name'] = author_name
            profile['count'] += 1
            profile['citations'].append(citations)

            # Keep the first non-empty identifiers seen for this author.
            if author_info.get('orcid') and not profile['orcid']:
                profile['orcid'] = author_info['orcid']
            if author_info.get('id') and not profile['openalex_id']:
                profile['openalex_id'] = author_info['id']

            if topic_name != 'Unknown':
                profile['topics'][topic_name] += 1
                if topic_id and topic_name not in profile['topic_ids']:
                    profile['topic_ids'][topic_name] = topic_id

            for other_name in author_names:
                if other_name and other_name != author_name:
                    profile['coauthors'][other_name] += 1

            if journal != 'Unknown':
                profile['journals'][journal] += 1

            for country_code in authorship.get('countries') or []:
                if country_code:
                    profile['countries'][country_code] += 1

    return author_profiles
def _short_openalex_id(value):
    """Return the last '/'-separated segment of an ID string ('' when missing)."""
    return value.split('/')[-1] if value else ''


def _taxonomy_level(node):
    """Reduce a topic's subfield/field/domain entry to {'id', 'display_name'}."""
    if not node:
        return {'id': '', 'display_name': ''}
    return {
        'id': _short_openalex_id(node.get('id')),
        'display_name': node.get('display_name', ''),
    }


def transform_openalex_api_to_excel_format(api_work):
    """Convert an OpenAlex API work record to the simplified format used by this app.

    Args:
        api_work: One work dict as returned in the API's 'results' list.

    Returns:
        A dict with keys 'doi', 'publication_year', 'type', 'cited_by_count',
        'primary_location', 'biblio', 'primary_topic', 'mesh' and
        'authorships', or None when api_work is falsy.

    Notes:
        - The primary topic is the first entry of 'topics'; when 'topics' is
          missing or empty, the work's 'primary_topic' entry is used instead.
        - IDs are shortened to their last path segment.
        - Each author's countries are collected from institution
          'country_code' values, de-duplicated in first-seen order so the
          output is deterministic.
        - Null or missing nested fields (authorships, institutions, source,
          biblio) are treated as empty instead of raising.
    """
    if not api_work:
        return None

    # Extract primary topic
    topics = api_work.get('topics') or []
    topic = topics[0] if topics else api_work.get('primary_topic')
    primary_topic = None
    if topic is not None:
        primary_topic = {
            'id': _short_openalex_id(topic.get('id')),
            'display_name': topic.get('display_name', ''),
            'subfield': _taxonomy_level(topic.get('subfield')),
            'field': _taxonomy_level(topic.get('field')),
            'domain': _taxonomy_level(topic.get('domain')),
        }

    # Extract authorships
    authorships = []
    for authorship in api_work.get('authorships') or []:
        if not authorship:
            continue
        author = authorship.get('author')
        if not author:
            continue
        institution_codes = (
            (institution or {}).get('country_code')
            for institution in authorship.get('institutions') or []
        )
        # dict.fromkeys removes duplicates while keeping first-seen order.
        countries = list(dict.fromkeys(code for code in institution_codes if code))
        authorships.append({
            'author': {
                'id': _short_openalex_id(author.get('id')),
                'display_name': author.get('display_name', ''),
                'orcid': author.get('orcid', ''),
            },
            'countries': countries,
        })

    # Safely extract the primary location's source name
    source = (api_work.get('primary_location') or {}).get('source') or {}
    source_name = source.get('display_name', '')

    # Build simplified work object
    return {
        'doi': api_work.get('doi', ''),
        'publication_year': api_work.get('publication_year', ''),
        'type': api_work.get('type', ''),
        'cited_by_count': api_work.get('cited_by_count', 0),
        'primary_location': {
            'source': {
                'display_name': source_name,
            },
        },
        'biblio': {
            'issue': (api_work.get('biblio') or {}).get('issue', ''),
        },
        'primary_topic': primary_topic,
        'mesh': [],
        'authorships': authorships,
    }
| # ============================================================================ | |
| # STREAMLIT UI | |
| # ============================================================================ | |
| # Page header: app title and a one-line description. | |
| st.title("π OpenAlex Author Search") | |
| st.markdown("Search and analyze author data from OpenAlex") | |
| # Sidebar: usage instructions for the three input options, then an API-settings note. | |
| with st.sidebar: | |
| st.header("π How to Use") | |
| st.markdown(""" | |
| **Option 1: Fetch from OpenAlex** | |
| 1. Paste your OpenAlex URL | |
| 2. Click "Fetch Data" | |
| 3. Search and download results | |
| **Option 2: Upload File** | |
| 1. Upload JSON file | |
| 2. Search and download results | |
| **Option 3: Direct API Search** | |
| 1. Enter search criteria | |
| 2. Click "Search" | |
| 3. Analyze results | |
| """) | |
| st.markdown("---") | |
| st.subheader("βοΈ API Settings") | |
| st.caption("π‘ Email included in API requests for OpenAlex 'polite pool' (faster access)") | |
| st.code("halozen@pm.me", language=None) | |
| # Hardcoded contact email, stored in session state; the fetch code further down | |
| # reads st.session_state.user_email when building the request's User-Agent header. | |
| st.session_state.user_email = "halozen@pm.me" | |
| # Initialize session state for works: None until a fetch, upload, or API search stores a list. | |
| if 'works' not in st.session_state: | |
| st.session_state.works = None | |
| # Create tabs for different input methods (URL fetch, file upload, direct API search) | |
| tab1, tab2, tab3 = st.tabs(["π Fetch from OpenAlex", "π Upload File", "π Direct API Search"]) | |
# Tab 1: fetch works from OpenAlex using a pasted web/API URL.
with tab1:
    st.subheader("Fetch Data from OpenAlex")
    openalex_url = st.text_input(
        "Paste OpenAlex URL (web or API format)",
        placeholder="https://openalex.org/works?filter=...",
        help="Paste either the web URL or API URL from OpenAlex"
    )
    col1, col2 = st.columns([1, 3])
    with col1:
        max_pages = st.number_input(
            "Max Pages",
            min_value=1,
            max_value=50,
            value=5,
            help="Each page has up to 200 works. 5 pages = 1,000 works max"
        )
    if st.button("π Fetch Data from OpenAlex", type="primary"):
        if not openalex_url:
            st.error("Please enter an OpenAlex URL")
        else:
            try:
                import urllib.parse
                import urllib.request

                # Only the 'filter' query parameter of the pasted URL is carried over.
                parsed = urllib.parse.urlparse(openalex_url.strip())
                query_params = urllib.parse.parse_qs(parsed.query)

                api_params = {}
                if 'filter' in query_params:
                    api_params['filter'] = query_params['filter'][0]
                api_params['per-page'] = 200

                # parse_qs returns DECODED values, so the filter must be
                # re-encoded: a raw space (or other control character) in the
                # URL makes urlopen raise before any request is sent.
                # ':', ',' and '|' are kept readable; they are filter syntax.
                api_url = "https://api.openalex.org/works?" + urllib.parse.urlencode(
                    api_params, safe=':,|', quote_via=urllib.parse.quote
                )

                st.info(f"π‘ Fetching from OpenAlex API... (up to {max_pages} pages)")
                st.code(api_url, language=None)  # Show the API URL being used

                all_works = []
                progress_bar = st.progress(0)
                status_text = st.empty()

                for page in range(1, max_pages + 1):
                    page_url = f"{api_url}&page={page}"
                    status_text.text(f"Fetching page {page}/{max_pages}...")

                    # The contact email is sent in the User-Agent header.
                    req = urllib.request.Request(page_url)
                    req.add_header('User-Agent', f'Mozilla/5.0 (mailto:{st.session_state.user_email})')
                    # Timeout so a stalled connection cannot hang the app indefinitely.
                    with urllib.request.urlopen(req, timeout=60) as response:
                        data = json.loads(response.read().decode())

                    results = data.get('results', [])
                    if not results:
                        break

                    # Transform to the simplified format; skip records that
                    # could not be transformed (None) so later processing
                    # never sees a null work.
                    for work in results:
                        transformed = transform_openalex_api_to_excel_format(work)
                        if transformed:
                            all_works.append(transformed)

                    progress_bar.progress(page / max_pages)

                    # Fewer than a full page of results means this was the last page.
                    if len(results) < 200:
                        break

                progress_bar.empty()
                status_text.empty()

                if all_works:
                    st.session_state.works = all_works
                    st.success(f"β Fetched {len(all_works):,} works from OpenAlex")
                    # Offer to download the JSON
                    json_data = json.dumps(all_works, indent=2)
                    st.download_button(
                        label="πΎ Download Raw Data (JSON)",
                        data=json_data,
                        file_name="openalex_data.json",
                        mime="application/json"
                    )
                else:
                    st.warning("No works found. Try a different URL or check your filters.")
            except Exception as e:
                st.error(f"β Error fetching data: {str(e)}")
                st.info("Make sure your URL is valid and try again.")
# Tab 2: load works from a previously saved JSON file.
with tab2:
    st.subheader("Upload Existing Data File")
    uploaded_file = st.file_uploader(
        "Upload your OpenAlex data (JSON)",
        type=['json'],
        help="Upload a previously saved JSON file"
    )
    if uploaded_file:
        try:
            data = json.load(uploaded_file)

            # Accepted shapes:
            #   - API response dict with a 'results' list -> transform each work
            #   - list of raw API works (detected by a 'title' key) -> transform
            #   - list of already-simplified works -> use as-is
            if isinstance(data, dict) and 'results' in data:
                raw_works = data['results'] or []
                needs_transform = True
            elif isinstance(data, list):
                raw_works = data
                # Probe the first non-empty entry, and only if it is a dict.
                first_entry = next((w for w in data if w), None)
                needs_transform = isinstance(first_entry, dict) and 'title' in first_entry
            else:
                st.error("β Unexpected JSON format")
                st.stop()

            if needs_transform:
                converted = (transform_openalex_api_to_excel_format(w) for w in raw_works)
                # Drop records the transform rejected (None) so later
                # processing never sees a null work.
                st.session_state.works = [w for w in converted if w]
                st.info(f"βΉοΈ Transformed {len(st.session_state.works)} works from OpenAlex API format")
            else:
                st.session_state.works = [w for w in raw_works if w]

            st.success(f"β Loaded {len(st.session_state.works):,} works from file")
        except json.JSONDecodeError:
            st.error("β Invalid JSON file")
        except Exception as e:
            st.error(f"β Error processing file: {str(e)}")
# Tab 3: build an OpenAlex works query from form fields and fetch the results.
with tab3:
    from datetime import datetime
    import urllib.parse
    import urllib.request

    st.subheader("Direct API Search")
    st.markdown("Search OpenAlex directly without needing a pre-built URL")

    # Upper bound for the year inputs: never below the former fixed value
    # (2025), but follows the calendar so later years stay selectable.
    latest_year = max(2025, datetime.now().year)

    # Search filters. Each input is created exactly once; a second, un-keyed
    # copy of these widgets used to follow and overwrote the values read here.
    col1, col2 = st.columns(2)
    with col1:
        api_author_name = st.text_input(
            "Author Name",
            placeholder="e.g., John Smith",
            help="Search for works by a specific author",
            key="api_author_name"
        )
        api_institution = st.text_input(
            "Institution",
            placeholder="e.g., Harvard University",
            help="Filter by institution/affiliation",
            key="api_institution"
        )
        api_topic = st.text_input(
            "Topic/Keyword",
            placeholder="e.g., machine learning",
            help="Search by topic or keyword",
            key="api_topic"
        )
        api_journals = st.text_area(
            "Journal(s)",
            placeholder="Nature\nScience\nCell",
            help="Enter journal names, one per line. Leave blank for all journals.",
            key="api_journals"
        )
    with col2:
        api_year_from = st.number_input(
            "Publication Year From",
            min_value=1900,
            max_value=latest_year,
            value=2020,
            help="Start year for publication range",
            key="api_year_from"
        )
        api_year_to = st.number_input(
            "Publication Year To",
            min_value=1900,
            max_value=latest_year,
            value=latest_year,
            help="End year for publication range",
            key="api_year_to"
        )
        api_min_citations = st.number_input(
            "Minimum Citations",
            min_value=0,
            max_value=10000,
            value=0,
            help="Filter works with at least this many citations",
            key="api_min_citations"
        )
        api_max_citations = st.number_input(
            "Maximum Citations",
            min_value=0,
            max_value=100000,
            value=0,
            help="Filter works with at most this many citations (0 = no limit)",
            key="api_max_citations"
        )
        api_max_results = st.number_input(
            "Maximum Results",
            min_value=100,
            max_value=50000,
            value=1000,
            step=100,
            help="Maximum number of works to retrieve (Warning: >10,000 may be slow)",
            key="api_max_results"
        )

    # Warning for large requests
    if api_max_results > 10000:
        st.warning(f"β οΈ Requesting {api_max_results:,} results may take several minutes and could cause memory issues on free hosting.")

    if st.button("π Search OpenAlex API", type="primary"):
        try:
            # Record search start time
            search_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            # Build the list of filter clauses
            filters = []
            if api_author_name:
                filters.append(f'author.search:{api_author_name}')
            if api_institution:
                filters.append(f'authorships.institutions.display_name.search:{api_institution}')
            if api_topic:
                filters.append(f'title_and_abstract.search:{api_topic}')
            if api_year_from and api_year_to:
                filters.append(f'publication_year:{api_year_from}-{api_year_to}')

            # Citation bounds are inclusive, expressed with strict > and <.
            if api_min_citations > 0:
                filters.append(f'cited_by_count:>{api_min_citations - 1}')
            if api_max_citations > 0:
                filters.append(f'cited_by_count:<{api_max_citations + 1}')

            # Journal names, one per line, combined with '|' (OR).
            journal_list = [j.strip() for j in api_journals.split('\n') if j.strip()] if api_journals else []
            if journal_list:
                journal_search = '|'.join(journal_list)
                filters.append(f'primary_location.source.display_name.search:{journal_search}')

            # Restrict to article/review/letter work types
            filters.append('type:article|review|letter')

            if not filters or filters == ['type:article|review|letter']:
                st.warning("Please enter at least one search criterion")
                st.stop()

            # Build API URL. The filter string holds free text typed by the
            # user; it must be percent-encoded because a raw space in the URL
            # makes urlopen raise before any request is sent. ':', ',' and '|'
            # are filter syntax and are left readable.
            filter_string = ','.join(filters)
            api_url = "https://api.openalex.org/works?" + urllib.parse.urlencode(
                {'filter': filter_string, 'per-page': 200},
                safe=':,|', quote_via=urllib.parse.quote
            )

            # Store search parameters for summary
            search_params = {
                'author': api_author_name if api_author_name else 'Any',
                'institution': api_institution if api_institution else 'Any',
                'topic': api_topic if api_topic else 'Any',
                # An input containing only blank lines counts as "Any".
                'journals': journal_list or ['Any'],
                'years': f'{api_year_from}-{api_year_to}',
                'min_citations': api_min_citations,
                'max_citations': api_max_citations if api_max_citations > 0 else 'No limit',
                'search_date': search_datetime
            }

            st.info("π‘ Searching OpenAlex API...")
            st.code(api_url, language=None)

            all_works = []
            max_pages = (api_max_results // 200) + 1
            progress_bar = st.progress(0)
            status_text = st.empty()

            # Cursor paging: start with cursor=* and follow meta.next_cursor.
            # Page-number paging is limited to the first 10,000 results, while
            # this form allows requesting up to 50,000.
            cursor = '*'
            for page in range(1, max_pages + 1):
                page_url = f"{api_url}&cursor={urllib.parse.quote(cursor, safe='*')}"
                status_text.text(f"Fetching page {page}/{max_pages}... ({len(all_works)} works so far)")

                req = urllib.request.Request(page_url)
                req.add_header('User-Agent', f'Mozilla/5.0 (mailto:{st.session_state.user_email})')
                # Timeout so a stalled connection cannot hang the app indefinitely.
                with urllib.request.urlopen(req, timeout=60) as response:
                    data = json.loads(response.read().decode())

                results = data.get('results', [])
                if not results:
                    break

                for work in results:
                    transformed = transform_openalex_api_to_excel_format(work)
                    if transformed:  # Only add if transformation succeeded
                        all_works.append(transformed)

                progress_bar.progress(min(page / max_pages, 1.0))

                if len(all_works) >= api_max_results:
                    all_works = all_works[:api_max_results]
                    break
                if len(results) < 200:
                    break

                cursor = (data.get('meta') or {}).get('next_cursor')
                if not cursor:
                    break

            progress_bar.empty()
            status_text.empty()

            if all_works:
                st.session_state.works = all_works
                st.session_state.search_params = search_params  # Store search parameters
                st.success(f"β Found {len(all_works):,} works from OpenAlex")

                # Display search summary
                st.markdown("---")
                st.subheader("π Search Summary")
                summary_col1, summary_col2 = st.columns(2)
                with summary_col1:
                    st.markdown(f"**Search Date:** {search_params['search_date']}")
                    st.markdown(f"**Author:** {search_params['author']}")
                    st.markdown(f"**Institution:** {search_params['institution']}")
                    st.markdown(f"**Topic:** {search_params['topic']}")
                with summary_col2:
                    st.markdown(f"**Years:** {search_params['years']}")
                    st.markdown(f"**Min Citations:** {search_params['min_citations']}")
                    st.markdown(f"**Max Citations:** {search_params['max_citations']}")
                    if search_params['journals'] != ['Any']:
                        # Show at most three journal names, with an ellipsis if more.
                        shown_journals = ', '.join(search_params['journals'][:3])
                        ellipsis = '...' if len(search_params['journals']) > 3 else ''
                        st.markdown(f"**Journals:** {shown_journals}{ellipsis}")
                    else:
                        st.markdown("**Journals:** Any")
                st.markdown(f"**Total Works Retrieved:** {len(all_works):,}")
                st.markdown("---")

                # Offer to download the JSON
                json_data = json.dumps(all_works, indent=2)
                st.download_button(
                    label="πΎ Download Raw Data (JSON)",
                    data=json_data,
                    file_name=f"openalex_api_search_{search_datetime.replace(':', '-').replace(' ', '_')}.json",
                    mime="application/json"
                )
            else:
                st.warning("No works found. Try different search criteria.")
        except Exception as e:
            st.error(f"β Error searching API: {str(e)}")
            st.info("Make sure your search criteria are valid and try again.")
| # Only show search interface if we have data | |
| if st.session_state.works: | |
| st.markdown("---") | |
| st.header("π Search Authors") | |
| # Search criteria in columns | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| topic_search = st.text_input( | |
| "π¬ Search by Topic", | |
| placeholder="e.g., neuroscience", | |
| help="Filter works by topic keyword (case-insensitive)" | |
| ) | |
| author_search = st.text_input( | |
| "π€ Search by Author Name", | |
| placeholder="e.g., Smith", | |
| help="Filter authors by name (partial match)" | |
| ) | |
| with col2: | |
| journal_search = st.text_input( | |
| "π Search by Journal", | |
| placeholder="e.g., Nature", | |
| help="Filter works by journal name" | |
| ) | |
| country_search = st.text_input( | |
| "π Search by Country", | |
| placeholder="e.g., United States or US", | |
| help="Filter authors by country (name or code)" | |
| ) | |
| # Additional options | |
| col3, col4, col5 = st.columns(3) | |
| with col3: | |
| min_articles = st.number_input( | |
| "Minimum Articles", | |
| min_value=1, | |
| max_value=100, | |
| value=3, | |
| help="Minimum number of publications" | |
| ) | |
| with col4: | |
| max_results = st.number_input( | |
| "Maximum Results", | |
| min_value=1, | |
| max_value=500, | |
| value=50, | |
| help="Maximum number of authors to display" | |
| ) | |
| with col5: | |
| sort_by = st.selectbox( | |
| "Sort By", | |
| ["Count", "Average Citations", "Median Citations"], | |
| help="How to sort the results" | |
| ) | |
| # Additional display options | |
| st.markdown("**Display Options:**") | |
| col6, col7 = st.columns(2) | |
| with col6: | |
| link_type = st.selectbox( | |
| "Link Author Names To", | |
| ["None", "ORCID", "OpenAlex"], | |
| help="Make author names clickable links" | |
| ) | |
| with col7: | |
| link_topics = st.checkbox( | |
| "Link Topics to OpenAlex", | |
| value=True, | |
| help="Make topic names clickable" | |
| ) | |
# Search button: build one row per matching author profile, then render the
# HTML table, the summary metrics and the Excel download from the SAME
# sorted and truncated row set.
# NOTE(review): the leading glyphs in the button / success / download labels
# look mis-encoded in this copy of the file; the literals are left untouched.
if st.button("π Search Authors", type="primary"):
    # Local imports keep this handler self-contained (stdlib only).
    import html
    from statistics import median
    from urllib.parse import quote_plus

    with st.spinner("Processing author profiles..."):
        # Process works
        profiles = process_works_to_author_profiles(
            st.session_state.works,
            topic_filter=topic_search.lower() if topic_search else None,
            journal_filter=journal_search.lower() if journal_search else None,
            country_filter=country_search.lower() if country_search else None,
        )

        # Lower-case the author filter once instead of once per profile.
        author_needle = author_search.lower() if author_search else ''

        # Each entry is (plain_row, html_row, has_author_link):
        #   plain_row -> raw text values, used for the Excel export
        #   html_row  -> HTML-escaped values plus <a> links, used for display
        rows = []
        for normalized_name, profile in profiles.items():
            if profile['count'] < min_articles:
                continue

            # Author name filter: match either the normalized key or the
            # display name.
            if author_needle and (
                author_needle not in normalized_name.lower()
                and author_needle not in profile['display_name'].lower()
            ):
                continue

            citations = profile['citations']
            # True median: averages the two middle values for even-length
            # lists instead of silently taking the upper one.
            median_cites = median(citations) if citations else 0
            avg_cites = round(sum(citations) / len(citations), 1) if citations else 0

            most_common_country = profile['countries'].most_common(1)
            country_code = most_common_country[0][0] if most_common_country else ''
            country_name = get_country_name(country_code) if country_code else ''
            continent = get_continent(country_code) if country_code else ''

            topic_names = [t for t, _ in profile['topics'].most_common(5)]
            top_coauthors = ', '.join(c for c, _ in profile['coauthors'].most_common(5))
            top_journals = ', '.join(j for j, _ in profile['journals'].most_common(5))

            # Topic cells for display: escaped text, optionally wrapped in a link.
            topic_cells = []
            for topic in topic_names:
                topic_label = html.escape(topic)
                if not link_topics:
                    topic_cells.append(topic_label)
                    continue
                topic_id = profile['topic_ids'].get(topic, '')
                if topic_id:
                    # Keep only the trailing ID (e.g. "T10316" from a full
                    # URL) and lower-case it for the works filter.
                    topic_id = topic_id.split('/')[-1].lower()
                    topic_url = (
                        'https://openalex.org/works?filter='
                        f'primary_topic.id:{topic_id},'
                        'primary_location.source.type:source-types/journal'
                        '&group_by=publication_year,open_access.is_oa,primary_topic.id,'
                        'authorships.institutions.lineage,type,authorships.author.id,'
                        'primary_location.source.id,primary_location.source.type&page=1'
                    )
                else:
                    # Fallback to a topic search if no ID is available;
                    # quote_plus encodes every reserved character, not just spaces.
                    topic_url = f'https://openalex.org/topics?search={quote_plus(topic)}'
                topic_cells.append(
                    f'<a href="{html.escape(topic_url, quote=True)}" target="_blank">{topic_label}</a>'
                )

            # Author cell for display, with an optional profile link.
            display_name = profile['display_name']
            author_url = ''
            if link_type == "ORCID" and profile['orcid']:
                author_url = profile['orcid']
            elif link_type == "OpenAlex" and profile['openalex_id']:
                # Extract just the ID from the full URL if needed.
                author_url = 'https://openalex.org/authors/' + profile['openalex_id'].split('/')[-1]

            author_cell = html.escape(display_name)
            if author_url:
                author_cell = (
                    f'<a href="{html.escape(author_url, quote=True)}" target="_blank">{author_cell}</a>'
                )

            plain_row = {
                'Author': display_name,
                'Count': profile['count'],
                'Median Citations': median_cites,
                'Average Citations': avg_cites,
                'Country': country_name,
                'Continent': continent,
                'Top Topics': ', '.join(topic_names),
                'Top Co-authors': top_coauthors,
                'Top Journals': top_journals,
            }
            # The table is rendered with escape=False, so every text cell
            # must be escaped here; numeric cells pass through unchanged.
            html_row = {
                column: html.escape(value) if isinstance(value, str) else value
                for column, value in plain_row.items()
            }
            html_row['Author'] = author_cell
            html_row['Top Topics'] = ', '.join(topic_cells)

            rows.append((plain_row, html_row, bool(author_url)))

        # Sort results (the "Sort By" options are the column names), then
        # limit. Doing this before either DataFrame is built keeps the
        # on-screen table and the Excel export identical.
        if sort_by in ("Count", "Average Citations", "Median Citations"):
            rows.sort(key=lambda row: row[0][sort_by], reverse=True)
        rows = rows[:max_results]

    if rows:
        st.success(f"β Found {len(rows)} matching authors")

        # Display frame (HTML cells) and export frame (plain text cells).
        df = pd.DataFrame([html_row for _, html_row, _ in rows])
        df_export = pd.DataFrame([plain_row for plain_row, _, _ in rows])

        # Custom CSS for better table display
        st.markdown("""
        <style>
        /* Style the HTML table */
        table {
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
            font-size: 14px;
        }
        table thead tr {
            background-color: #164A78;
            color: white;
            text-align: left;
            font-weight: bold;
        }
        table th, table td {
            padding: 12px 15px;
            border: 1px solid #ddd;
        }
        table tbody tr {
            border-bottom: 1px solid #dddddd;
        }
        table tbody tr:nth-of-type(even) {
            background-color: #f3f3f3;
        }
        table tbody tr:hover {
            background-color: #e8f4f8;
        }
        /* Center numeric columns - Count, Median, Average */
        table td:nth-child(2),
        table td:nth-child(3),
        table td:nth-child(4) {
            text-align: center;
        }
        /* Enable text wrapping */
        table td {
            white-space: normal;
            word-wrap: break-word;
            max-width: 300px;
        }
        /* Make links blue and underlined */
        table a {
            color: #1e88e5;
            text-decoration: underline;
        }
        table a:hover {
            color: #0d47a1;
        }
        </style>
        """, unsafe_allow_html=True)

        # Display results as HTML table with clickable links; cells were
        # escaped above, so escape=False only lets the generated <a> tags through.
        st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)

        # Summary statistics
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Total Authors", len(rows))
        with col2:
            st.metric("Avg Publications", f"{df_export['Count'].mean():.1f}")
        with col3:
            st.metric("Avg Citations", f"{df_export['Average Citations'].mean():.1f}")
        with col4:
            linked_count = sum(1 for _, _, has_link in rows if has_link)
            st.metric("With Links", linked_count if link_type != "None" else "N/A")

        # Download button - plain-text rows, same order and limit as the table
        output = BytesIO()
        with pd.ExcelWriter(output, engine='openpyxl') as writer:
            df_export.to_excel(writer, index=False, sheet_name='Author Search Results')
        st.download_button(
            label="π₯ Download Results (Excel)",
            data=output.getvalue(),
            file_name="author_search_results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )
    else:
        st.warning("No authors match your search criteria. Try adjusting your filters.")
| else: | |
| st.info("π Fetch data from OpenAlex or upload a JSON file to get started") |