# Spaces: INLEXIO / openalex-search (Sleeping)
# File size: 44,936 Bytes

import streamlit as st
import json
import pandas as pd
from io import BytesIO
from collections import defaultdict, Counter
import unicodedata

# Page-level settings; this is the first Streamlit call issued by the script.
_PAGE_OPTIONS = dict(
    page_title="OpenAlex Author Search",
    page_icon="🔍",
    layout="wide",
)
st.set_page_config(**_PAGE_OPTIONS)

# App-wide stylesheet: main padding, button colours and heading colour.
_CUSTOM_CSS = """
<style>
    .main {
        padding: 2rem;
    }
    .stButton>button {
        background-color: #164A78;
        color: white;
        font-size: 16px;
        padding: 0.5rem 2rem;
        border-radius: 5px;
        border: none;
    }
    .stButton>button:hover {
        background-color: #0d3050;
    }
    h1 {
        color: #164A78;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

# Unicode dash variants that should read as a plain ASCII hyphen in a name.
_DASH_TRANSLATION = str.maketrans({
    '\u2013': '-',  # en dash
    '\u2014': '-',  # em dash
    '\u2212': '-',  # minus sign
    '\u2010': '-',  # hyphen
    '\u2011': '-',  # non-breaking hyphen
})


def normalize_author_name(name):
    """Return an ASCII form of *name* with accents folded and dashes unified.

    Steps, in order:
      1. Unicode dash variants are mapped to '-'.
      2. The text is NFD-decomposed and non-ASCII code points are dropped,
         which strips combining accents ("é" -> "e").
      3. Runs of whitespace collapse to one space; the ends are trimmed.

    Falsy input (``None`` or ``""``) is returned unchanged.
    """
    if not name:
        return name

    # Dashes must be mapped *before* the ASCII fold: they have no canonical
    # decomposition, so encoding with errors='ignore' would delete them.
    hyphenated = name.translate(_DASH_TRANSLATION)

    decomposed = unicodedata.normalize('NFD', hyphenated)
    ascii_name = decomposed.encode('ascii', 'ignore').decode('ascii')

    # split()/join collapses internal whitespace and trims both ends.
    return ' '.join(ascii_name.split())

# Two-letter country code -> display name, used to turn codes into names.
# Entries are laid out in regional groups; both 'GB' and 'UK' map to
# 'United Kingdom'.
COUNTRY_CODES = {
    # Europe (this group also lists Armenia and Georgia)
    'AD': 'Andorra', 'AL': 'Albania', 'AM': 'Armenia', 'AT': 'Austria',
    'AX': 'Åland Islands', 'BA': 'Bosnia and Herzegovina', 'BE': 'Belgium',
    'BG': 'Bulgaria', 'BY': 'Belarus', 'CH': 'Switzerland', 'CY': 'Cyprus',
    'CZ': 'Czech Republic', 'DE': 'Germany', 'DK': 'Denmark', 'EE': 'Estonia',
    'ES': 'Spain', 'FI': 'Finland', 'FO': 'Faroe Islands', 'FR': 'France',
    'GB': 'United Kingdom', 'UK': 'United Kingdom', 'GE': 'Georgia',
    'GG': 'Guernsey', 'GI': 'Gibraltar', 'GR': 'Greece', 'HR': 'Croatia',
    'HU': 'Hungary', 'IE': 'Ireland', 'IM': 'Isle of Man', 'IS': 'Iceland',
    'IT': 'Italy', 'JE': 'Jersey', 'LI': 'Liechtenstein', 'LT': 'Lithuania',
    'LU': 'Luxembourg', 'LV': 'Latvia', 'MC': 'Monaco', 'MD': 'Moldova',
    'ME': 'Montenegro', 'MK': 'North Macedonia', 'MT': 'Malta', 'NL': 'Netherlands',
    'NO': 'Norway', 'PL': 'Poland', 'PT': 'Portugal', 'RO': 'Romania',
    'RS': 'Serbia', 'RU': 'Russia', 'SE': 'Sweden', 'SI': 'Slovenia',
    'SJ': 'Svalbard and Jan Mayen', 'SK': 'Slovakia', 'SM': 'San Marino',
    'UA': 'Ukraine', 'VA': 'Vatican City', 'XK': 'Kosovo',
    # Asia
    'AE': 'United Arab Emirates', 'AF': 'Afghanistan', 'AZ': 'Azerbaijan',
    'BD': 'Bangladesh', 'BH': 'Bahrain', 'BN': 'Brunei', 'BT': 'Bhutan',
    'CN': 'China', 'HK': 'Hong Kong', 'ID': 'Indonesia', 'IL': 'Israel',
    'IN': 'India', 'IQ': 'Iraq', 'IR': 'Iran', 'JO': 'Jordan', 'JP': 'Japan',
    'KG': 'Kyrgyzstan', 'KH': 'Cambodia', 'KP': 'North Korea', 'KR': 'South Korea',
    'KW': 'Kuwait', 'KZ': 'Kazakhstan', 'LA': 'Laos', 'LB': 'Lebanon',
    'LK': 'Sri Lanka', 'MM': 'Myanmar', 'MN': 'Mongolia', 'MO': 'Macau',
    'MV': 'Maldives', 'MY': 'Malaysia', 'NP': 'Nepal', 'OM': 'Oman',
    'PH': 'Philippines', 'PK': 'Pakistan', 'PS': 'Palestine', 'QA': 'Qatar',
    'SA': 'Saudi Arabia', 'SG': 'Singapore', 'SY': 'Syria', 'TH': 'Thailand',
    'TJ': 'Tajikistan', 'TL': 'Timor-Leste', 'TM': 'Turkmenistan', 'TR': 'Turkey',
    'TW': 'Taiwan', 'UZ': 'Uzbekistan', 'VN': 'Vietnam', 'YE': 'Yemen',
    # Africa
    'AO': 'Angola', 'BF': 'Burkina Faso', 'BI': 'Burundi', 'BJ': 'Benin',
    'BW': 'Botswana', 'CD': 'Democratic Republic of the Congo',
    'CF': 'Central African Republic', 'CG': 'Republic of the Congo',
    'CI': 'Ivory Coast', 'CM': 'Cameroon', 'CV': 'Cape Verde', 'DJ': 'Djibouti',
    'DZ': 'Algeria', 'EG': 'Egypt', 'EH': 'Western Sahara', 'ER': 'Eritrea',
    'ET': 'Ethiopia', 'GA': 'Gabon', 'GH': 'Ghana', 'GM': 'Gambia',
    'GN': 'Guinea', 'GQ': 'Equatorial Guinea', 'GW': 'Guinea-Bissau',
    'KE': 'Kenya', 'KM': 'Comoros', 'LR': 'Liberia', 'LS': 'Lesotho',
    'LY': 'Libya', 'MA': 'Morocco', 'MG': 'Madagascar', 'ML': 'Mali',
    'MR': 'Mauritania', 'MU': 'Mauritius', 'MW': 'Malawi', 'MZ': 'Mozambique',
    'NA': 'Namibia', 'NE': 'Niger', 'NG': 'Nigeria', 'RE': 'Réunion',
    'RW': 'Rwanda', 'SC': 'Seychelles', 'SD': 'Sudan', 'SL': 'Sierra Leone',
    'SN': 'Senegal', 'SO': 'Somalia', 'SS': 'South Sudan',
    'ST': 'São Tomé and Príncipe', 'SZ': 'Eswatini', 'TD': 'Chad', 'TG': 'Togo',
    'TN': 'Tunisia', 'TZ': 'Tanzania', 'UG': 'Uganda', 'YT': 'Mayotte',
    'ZA': 'South Africa', 'ZM': 'Zambia', 'ZW': 'Zimbabwe',
    # North America, Central America and the Caribbean
    'AG': 'Antigua and Barbuda', 'AI': 'Anguilla', 'AW': 'Aruba',
    'BB': 'Barbados', 'BL': 'Saint Barthélemy', 'BM': 'Bermuda',
    'BQ': 'Caribbean Netherlands', 'BS': 'Bahamas', 'BZ': 'Belize',
    'CA': 'Canada', 'CR': 'Costa Rica', 'CU': 'Cuba', 'CW': 'Curaçao',
    'DM': 'Dominica', 'DO': 'Dominican Republic', 'GD': 'Grenada',
    'GL': 'Greenland', 'GP': 'Guadeloupe', 'GT': 'Guatemala', 'HN': 'Honduras',
    'HT': 'Haiti', 'JM': 'Jamaica', 'KN': 'Saint Kitts and Nevis',
    'KY': 'Cayman Islands', 'LC': 'Saint Lucia', 'MF': 'Saint Martin',
    'MQ': 'Martinique', 'MS': 'Montserrat', 'MX': 'Mexico', 'NI': 'Nicaragua',
    'PA': 'Panama', 'PM': 'Saint Pierre and Miquelon', 'PR': 'Puerto Rico',
    'SV': 'El Salvador', 'SX': 'Sint Maarten', 'TC': 'Turks and Caicos Islands',
    'TT': 'Trinidad and Tobago', 'US': 'United States',
    'VC': 'Saint Vincent and the Grenadines', 'VG': 'British Virgin Islands',
    'VI': 'U.S. Virgin Islands',
    # South America
    'AR': 'Argentina', 'BO': 'Bolivia', 'BR': 'Brazil', 'CL': 'Chile',
    'CO': 'Colombia', 'EC': 'Ecuador', 'FK': 'Falkland Islands',
    'GF': 'French Guiana', 'GY': 'Guyana', 'PE': 'Peru', 'PY': 'Paraguay',
    'SR': 'Suriname', 'UY': 'Uruguay', 'VE': 'Venezuela',
    # Oceania
    'AS': 'American Samoa', 'AU': 'Australia', 'CK': 'Cook Islands',
    'FJ': 'Fiji', 'FM': 'Micronesia', 'GU': 'Guam', 'KI': 'Kiribati',
    'MH': 'Marshall Islands', 'MP': 'Northern Mariana Islands',
    'NC': 'New Caledonia', 'NF': 'Norfolk Island', 'NR': 'Nauru', 'NU': 'Niue',
    'NZ': 'New Zealand', 'PF': 'French Polynesia', 'PG': 'Papua New Guinea',
    'PN': 'Pitcairn Islands', 'PW': 'Palau', 'SB': 'Solomon Islands',
    'TK': 'Tokelau', 'TO': 'Tonga', 'TV': 'Tuvalu',
    'UM': 'U.S. Minor Outlying Islands', 'VU': 'Vanuatu',
    'WF': 'Wallis and Futuna', 'WS': 'Samoa'
}

# Continent name -> list of two-letter country codes assigned to it.
# Each code appears under exactly one continent. 'RU' is filed under Europe
# so that Russian affiliations no longer resolve to 'Unknown'.
CONTINENT_MAP = {
    'Europe': ['AD', 'AL', 'AT', 'AX', 'BA', 'BE', 'BG', 'BY', 'CH', 'CY',
               'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FO', 'FR', 'GB', 'UK',
               'GG', 'GI', 'GR', 'HR', 'HU', 'IE', 'IM', 'IS', 'IT', 'JE',
               'LI', 'LT', 'LU', 'LV', 'MC', 'MD', 'ME', 'MK', 'MT', 'NL',
               'NO', 'PL', 'PT', 'RO', 'RS', 'RU', 'SE', 'SI', 'SJ', 'SK',
               'SM', 'UA', 'VA', 'XK'],
    'Asia': ['AE', 'AF', 'AM', 'AZ', 'BD', 'BH', 'BN', 'BT', 'CN', 'GE',
             'HK', 'ID', 'IL', 'IN', 'IQ', 'IR', 'JO', 'JP', 'KG', 'KH',
             'KP', 'KR', 'KW', 'KZ', 'LA', 'LB', 'LK', 'MM', 'MN', 'MO',
             'MV', 'MY', 'NP', 'OM', 'PH', 'PK', 'PS', 'QA', 'SA', 'SG',
             'SY', 'TH', 'TJ', 'TL', 'TM', 'TR', 'TW', 'UZ', 'VN', 'YE'],
    'Africa': ['AO', 'BF', 'BI', 'BJ', 'BW', 'CD', 'CF', 'CG', 'CI', 'CM',
               'CV', 'DJ', 'DZ', 'EG', 'EH', 'ER', 'ET', 'GA', 'GH', 'GM',
               'GN', 'GQ', 'GW', 'KE', 'KM', 'LR', 'LS', 'LY', 'MA', 'MG',
               'ML', 'MR', 'MU', 'MW', 'MZ', 'NA', 'NE', 'NG', 'RE', 'RW',
               'SC', 'SD', 'SL', 'SN', 'SO', 'SS', 'ST', 'SZ', 'TD', 'TG',
               'TN', 'TZ', 'UG', 'YT', 'ZA', 'ZM', 'ZW'],
    'North America': ['AG', 'AI', 'AW', 'BB', 'BL', 'BM', 'BQ', 'BS', 'BZ',
                      'CA', 'CR', 'CU', 'CW', 'DM', 'DO', 'GD', 'GL', 'GP',
                      'GT', 'HN', 'HT', 'JM', 'KN', 'KY', 'LC', 'MF', 'MQ',
                      'MS', 'MX', 'NI', 'PA', 'PM', 'PR', 'SV', 'SX', 'TC',
                      'TT', 'US', 'VC', 'VG', 'VI'],
    'South America': ['AR', 'BO', 'BR', 'CL', 'CO', 'EC', 'FK', 'GF', 'GY',
                      'PE', 'PY', 'SR', 'UY', 'VE'],
    'Oceania': ['AS', 'AU', 'CK', 'FJ', 'FM', 'GU', 'KI', 'MH', 'MP', 'NC',
                'NF', 'NR', 'NU', 'NZ', 'PF', 'PG', 'PN', 'PW', 'SB', 'TK',
                'TO', 'TV', 'UM', 'VU', 'WF', 'WS']
}

def get_country_name(code):
    """Return the display name for a country code, or *code* itself if unknown.

    The lookup in COUNTRY_CODES is done on the upper-cased code.
    """
    lookup_key = code.upper()
    try:
        return COUNTRY_CODES[lookup_key]
    except KeyError:
        # Unknown codes are passed through unchanged (original casing).
        return code

def get_continent(country_code):
    """Return the continent whose CONTINENT_MAP list holds *country_code*.

    The code is upper-cased before matching; 'Unknown' is returned when no
    continent lists it.
    """
    wanted = country_code.upper()
    matches = (
        continent_name
        for continent_name, member_codes in CONTINENT_MAP.items()
        if wanted in member_codes
    )
    # First hit in CONTINENT_MAP order wins.
    return next(matches, 'Unknown')

def _work_matches_country(work, country_filter):
    """Return True if any author country of *work* matches the lower-cased filter."""
    for authorship in work.get('authorships') or []:
        if not authorship:
            continue
        for country_code in authorship.get('countries') or []:
            if not country_code:
                continue
            # Match against both the display name and the raw code.
            if (country_filter in get_country_name(country_code).lower()
                    or country_filter in country_code.lower()):
                return True
    return False


def process_works_to_author_profiles(works, topic_filter=None, journal_filter=None, country_filter=None):
    """Aggregate a list of simplified work dicts into per-author profiles.

    Args:
        works: Iterable of work dicts with the keys read below
            ('cited_by_count', 'primary_location', 'primary_topic',
            'authorships'). Falsy entries (e.g. ``None``) are skipped.
        topic_filter: Optional substring; a work is kept only if it occurs
            in the work's primary topic name (case-insensitive).
        journal_filter: Optional substring matched case-insensitively
            against the primary location's source name.
        country_filter: Optional substring matched case-insensitively
            against the country name or code of any author of the work.

    Returns:
        A ``defaultdict`` keyed by the normalized author name. Each value
        holds 'count', 'citations' (one entry per work), Counters for
        'topics', 'coauthors', 'journals' and 'countries', a 'topic_ids'
        mapping of topic name to id, and the first seen 'orcid',
        'openalex_id' and 'display_name'.
    """
    author_profiles = defaultdict(lambda: {
        'count': 0,
        'citations': [],
        'topics': Counter(),
        'topic_ids': {},  # topic name -> topic id
        'coauthors': Counter(),
        'journals': Counter(),
        'countries': Counter(),
        'orcid': '',
        'openalex_id': '',
        'display_name': ''
    })

    # Lower-case the filters here so matching does not depend on the caller.
    topic_filter = topic_filter.lower() if topic_filter else None
    journal_filter = journal_filter.lower() if journal_filter else None
    country_filter = country_filter.lower() if country_filter else None

    for work in works or []:
        if not work:
            continue

        citations = work.get('cited_by_count') or 0

        # `or` fallbacks cover keys that are present but hold None or ''.
        primary_loc = work.get('primary_location') or {}
        source = primary_loc.get('source') or {}
        journal = source.get('display_name') or 'Unknown'

        topic = work.get('primary_topic') or {}
        topic_name = topic.get('display_name') or 'Unknown'
        topic_id = topic.get('id') or ''

        # Apply filters
        if topic_filter and topic_filter not in topic_name.lower():
            continue
        if journal_filter and journal_filter not in journal.lower():
            continue
        if country_filter and not _work_matches_country(work, country_filter):
            continue

        authorships = [a for a in (work.get('authorships') or []) if a]

        # Display names of everyone on the work, collected once per work.
        names_on_work = [
            (a.get('author') or {}).get('display_name', '') for a in authorships
        ]

        for authorship in authorships:
            author_info = authorship.get('author') or {}
            author_name = author_info.get('display_name')

            if not author_name or author_name == 'Unknown':
                continue

            profile = author_profiles[normalize_author_name(author_name)]

            # Keep the first spelling encountered as the display name.
            if not profile['display_name']:
                profile['display_name'] = author_name

            profile['count'] += 1
            profile['citations'].append(citations)

            if author_info.get('orcid') and not profile['orcid']:
                profile['orcid'] = author_info['orcid']
            if author_info.get('id') and not profile['openalex_id']:
                profile['openalex_id'] = author_info['id']

            if topic_name != 'Unknown':
                profile['topics'][topic_name] += 1
                if topic_id and topic_name not in profile['topic_ids']:
                    profile['topic_ids'][topic_name] = topic_id

            for other_name in names_on_work:
                if other_name and other_name != author_name:
                    profile['coauthors'][other_name] += 1

            if journal != 'Unknown':
                profile['journals'][journal] += 1

            for country_code in authorship.get('countries') or []:
                if country_code:
                    profile['countries'][country_code] += 1

    return author_profiles

def _tail_of_id(entity):
    """Return the last '/'-separated segment of entity['id'], or '' if absent."""
    raw_id = entity.get('id')
    return raw_id.split('/')[-1] if raw_id else ''


def _id_and_name(entity):
    """Reduce an entity dict to {'id', 'display_name'}; blanks if it is missing."""
    if not entity:
        return {'id': '', 'display_name': ''}
    return {
        'id': _tail_of_id(entity),
        'display_name': entity.get('display_name', ''),
    }


def transform_openalex_api_to_excel_format(api_work):
    """Convert one OpenAlex API work dict into the simplified export format.

    Args:
        api_work: A work dict as returned by the OpenAlex API. Nested values
            that are missing or ``None`` are treated as empty.

    Returns:
        ``None`` when *api_work* is falsy; otherwise a dict with the keys
        'doi', 'publication_year', 'type', 'cited_by_count',
        'primary_location', 'biblio', 'primary_topic', 'mesh' (always an
        empty list) and 'authorships'. Ids are reduced to their last URL
        segment, and each authorship carries the de-duplicated country codes
        of its institutions in first-seen order.
    """
    if not api_work:
        return None

    # The first entry of 'topics' is used as the primary topic; the work's own
    # 'primary_topic' is only a fallback when 'topics' gives nothing.
    topics = api_work.get('topics') or []
    topic = (topics[0] if topics else None) or api_work.get('primary_topic')

    primary_topic = None
    if topic:
        primary_topic = _id_and_name(topic)
        for level in ('subfield', 'field', 'domain'):
            primary_topic[level] = _id_and_name(topic.get(level))

    authorships = []
    for authorship in api_work.get('authorships') or []:
        author = (authorship or {}).get('author')
        if not author:
            continue

        institution_codes = (
            institution.get('country_code')
            for institution in (authorship.get('institutions') or [])
            if institution
        )
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output is stable from run to run (a set would not be).
        countries = list(dict.fromkeys(code for code in institution_codes if code))

        authorships.append({
            'author': {
                'id': _tail_of_id(author),
                'display_name': author.get('display_name', ''),
                'orcid': author.get('orcid', '')
            },
            'countries': countries
        })

    primary_location = api_work.get('primary_location') or {}
    source = primary_location.get('source') or {}
    source_name = source.get('display_name') or ''

    biblio = api_work.get('biblio') or {}

    return {
        'doi': api_work.get('doi', ''),
        'publication_year': api_work.get('publication_year', ''),
        'type': api_work.get('type', ''),
        'cited_by_count': api_work.get('cited_by_count', 0),
        'primary_location': {
            'source': {
                'display_name': source_name
            }
        },
        'biblio': {
            'issue': biblio.get('issue', '')
        },
        'primary_topic': primary_topic,
        'mesh': [],
        'authorships': authorships
    }

# ============================================================================
# STREAMLIT UI
# ============================================================================

# Page heading and one-line description shown above the tabs.
st.title("🔍 OpenAlex Author Search")
st.markdown("Search and analyze author data from OpenAlex")

# Sidebar for instructions
with st.sidebar:
    st.header("📖 How to Use")
    # One numbered walkthrough per input tab created further down.
    st.markdown("""
    **Option 1: Fetch from OpenAlex**
    1. Paste your OpenAlex URL
    2. Click "Fetch Data"
    3. Search and download results

    **Option 2: Upload File**
    1. Upload JSON file
    2. Search and download results
    
    **Option 3: Direct API Search**
    1. Enter search criteria
    2. Click "Search"
    3. Analyze results
    """)
    
    st.markdown("---")
    
    st.subheader("⚙️ API Settings")
    st.caption("💡 Email included in API requests for OpenAlex 'polite pool' (faster access)")
    # The address is displayed read-only; there is no input to change it.
    st.code("halozen@pm.me", language=None)
    
    # Hardcoded email; the fetch code reads it back from session state to
    # build the User-Agent header of each API request.
    st.session_state.user_email = "halozen@pm.me"

# Initialize session state for works
# (`works` holds the currently loaded list of works, or None before any load.)
if 'works' not in st.session_state:
    st.session_state.works = None

# Create tabs for different input methods
tab1, tab2, tab3 = st.tabs(["🌐 Fetch from OpenAlex", "📁 Upload File", "🔍 Direct API Search"])

# Tab 1: turn a pasted OpenAlex URL (web or API form) into paged API requests.
with tab1:
    st.subheader("Fetch Data from OpenAlex")

    openalex_url = st.text_input(
        "Paste OpenAlex URL (web or API format)",
        placeholder="https://openalex.org/works?filter=...",
        help="Paste either the web URL or API URL from OpenAlex"
    )

    col1, col2 = st.columns([1, 3])
    with col1:
        max_pages = st.number_input(
            "Max Pages",
            min_value=1,
            max_value=50,
            value=5,
            help="Each page has up to 200 works. 5 pages = 1,000 works max"
        )

    if st.button("🔍 Fetch Data from OpenAlex", type="primary"):
        if not openalex_url:
            st.error("Please enter an OpenAlex URL")
        else:
            try:
                import urllib.parse
                import urllib.request

                # Parse the URL
                parsed = urllib.parse.urlparse(openalex_url.strip())
                query_params = urllib.parse.parse_qs(parsed.query)

                # Build API URL - OpenAlex API format.
                # parse_qs returns *decoded* values, so each one is re-encoded
                # before going back into a URL; a raw space would make
                # urlopen reject the request. ':', ',' and '|' stay literal
                # to keep the displayed URL readable.
                # 'search' and 'sort' are carried over with 'filter' so the
                # API query selects the same works as the pasted URL.
                query_parts = []
                for param in ('filter', 'search', 'sort'):
                    values = query_params.get(param)
                    if values:
                        encoded = urllib.parse.quote(values[0], safe=':,|')
                        query_parts.append(f"{param}={encoded}")

                # Add per-page parameter
                query_parts.append('per-page=200')
                api_url = "https://api.openalex.org/works?" + '&'.join(query_parts)

                st.info(f"📡 Fetching from OpenAlex API... (up to {max_pages} pages)")
                st.code(api_url, language=None)  # Show the API URL being used

                all_works = []

                # Progress bar
                progress_bar = st.progress(0)
                status_text = st.empty()

                try:
                    for page in range(1, max_pages + 1):
                        page_url = f"{api_url}&page={page}"

                        status_text.text(f"Fetching page {page}/{max_pages}...")

                        # Fetch data with user agent
                        req = urllib.request.Request(page_url)
                        req.add_header('User-Agent', f'Mozilla/5.0 (mailto:{st.session_state.user_email})')

                        # Timeout so a stalled connection cannot hang the app.
                        with urllib.request.urlopen(req, timeout=30) as response:
                            data = json.loads(response.read().decode('utf-8'))

                        results = data.get('results') or []
                        if not results:
                            break

                        # Transform to Excel format; the transform returns
                        # None for empty entries, which must not be stored.
                        for work in results:
                            transformed = transform_openalex_api_to_excel_format(work)
                            if transformed:
                                all_works.append(transformed)

                        progress_bar.progress(page / max_pages)

                        # Stop if we got fewer than 200 results (last page)
                        if len(results) < 200:
                            break
                finally:
                    # Clear the progress widgets on failure as well.
                    progress_bar.empty()
                    status_text.empty()

                if all_works:
                    st.session_state.works = all_works
                    st.success(f"✅ Fetched {len(all_works):,} works from OpenAlex")

                    # Offer to download the JSON
                    json_data = json.dumps(all_works, indent=2)
                    st.download_button(
                        label="💾 Download Raw Data (JSON)",
                        data=json_data,
                        file_name="openalex_data.json",
                        mime="application/json"
                    )
                else:
                    st.warning("No works found. Try a different URL or check your filters.")

            except Exception as e:
                # UI boundary: report any failure to the user instead of crashing.
                st.error(f"❌ Error fetching data: {str(e)}")
                st.info("Make sure your URL is valid and try again.")

# Tab 2: load works from an uploaded JSON file.
with tab2:
    st.subheader("Upload Existing Data File")

    uploaded_file = st.file_uploader(
        "Upload your OpenAlex data (JSON)",
        type=['json'],
        help="Upload a previously saved JSON file"
    )

    if uploaded_file:
        try:
            # Load data
            data = json.load(uploaded_file)

            # Handle both formats:
            #  - a dict with 'results' is a raw API response page;
            #  - a list whose first item has 'title' is a list of raw API works;
            #  - any other list is taken as already-simplified works.
            if isinstance(data, dict) and 'results' in data:
                raw_works = data['results'] or []
                needs_transform = True
            elif isinstance(data, list):
                raw_works = data
                first = data[0] if data else None
                needs_transform = isinstance(first, dict) and 'title' in first
            else:
                st.error("❌ Unexpected JSON format")
                st.stop()

            if needs_transform:
                # The transform returns None for empty entries; drop those so
                # later processing only ever sees work dicts.
                transformed = (transform_openalex_api_to_excel_format(w) for w in raw_works)
                st.session_state.works = [w for w in transformed if w]
                st.info(f"ℹ️ Transformed {len(st.session_state.works)} works from OpenAlex API format")
            else:
                # Drop null/empty entries from pre-simplified data as well.
                st.session_state.works = [w for w in raw_works if w]

            st.success(f"✅ Loaded {len(st.session_state.works):,} works from file")

        except json.JSONDecodeError:
            st.error("❌ Invalid JSON file")
        except Exception as e:
            # UI boundary: report any failure to the user instead of crashing.
            st.error(f"❌ Error processing file: {str(e)}")

# Tab 3: build an OpenAlex works query from form fields and fetch the results.
with tab3:
    from datetime import datetime
    import urllib.parse
    import urllib.request

    st.subheader("Direct API Search")
    st.markdown("Search OpenAlex directly without needing a pre-built URL")

    # Upper bound for the year inputs follows the clock, but never drops
    # below the previously fixed cap of 2025.
    latest_year = max(datetime.now().year, 2025)

    # Search filters. Each widget is declared exactly once; a second, unkeyed
    # copy of these inputs used to follow and override the values read here.
    col1, col2 = st.columns(2)

    with col1:
        api_author_name = st.text_input(
            "Author Name",
            placeholder="e.g., John Smith",
            help="Search for works by a specific author",
            key="api_author_name"
        )

        api_institution = st.text_input(
            "Institution",
            placeholder="e.g., Harvard University",
            help="Filter by institution/affiliation",
            key="api_institution"
        )

        api_topic = st.text_input(
            "Topic/Keyword",
            placeholder="e.g., machine learning",
            help="Search by topic or keyword",
            key="api_topic"
        )

        api_journals = st.text_area(
            "Journal(s)",
            placeholder="Nature\nScience\nCell",
            help="Enter journal names, one per line. Leave blank for all journals.",
            key="api_journals"
        )

    with col2:
        api_year_from = st.number_input(
            "Publication Year From",
            min_value=1900,
            max_value=latest_year,
            value=2020,
            help="Start year for publication range",
            key="api_year_from"
        )

        api_year_to = st.number_input(
            "Publication Year To",
            min_value=1900,
            max_value=latest_year,
            value=latest_year,
            help="End year for publication range",
            key="api_year_to"
        )

        api_min_citations = st.number_input(
            "Minimum Citations",
            min_value=0,
            max_value=10000,
            value=0,
            help="Filter works with at least this many citations",
            key="api_min_citations"
        )

        api_max_citations = st.number_input(
            "Maximum Citations",
            min_value=0,
            max_value=100000,
            value=0,
            help="Filter works with at most this many citations (0 = no limit)",
            key="api_max_citations"
        )

        api_max_results = st.number_input(
            "Maximum Results",
            min_value=100,
            max_value=50000,
            value=1000,
            step=100,
            help="Maximum number of works to retrieve (Warning: >10,000 may be slow)",
            key="api_max_results"
        )

    # Warning for large requests
    if api_max_results > 10000:
        st.warning(f"⚠️ Requesting {api_max_results:,} results may take several minutes and could cause memory issues on free hosting.")

    if st.button("🔍 Search OpenAlex API", type="primary"):
        try:
            # Record search start time
            search_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            # Build API query
            filters = []

            if api_author_name:
                filters.append(f'author.search:{api_author_name}')

            if api_institution:
                filters.append(f'authorships.institutions.display_name.search:{api_institution}')

            if api_topic:
                filters.append(f'title_and_abstract.search:{api_topic}')

            if api_year_from and api_year_to:
                filters.append(f'publication_year:{api_year_from}-{api_year_to}')

            # Citation bounds are inclusive, hence the -1 / +1 around > and <.
            if api_min_citations > 0:
                filters.append(f'cited_by_count:>{api_min_citations - 1}')

            if api_max_citations > 0:
                filters.append(f'cited_by_count:<{api_max_citations + 1}')

            # Journal names, one per line, OR-ed together with '|'.
            journal_list = [j.strip() for j in api_journals.splitlines() if j.strip()]
            if journal_list:
                journal_search = '|'.join(journal_list)
                filters.append(f'primary_location.source.display_name.search:{journal_search}')

            # The fixed type filter below does not count as a criterion.
            if not filters:
                st.warning("Please enter at least one search criterion")
                st.stop()

            # Add article/review/letter filter
            filters.append('type:article|review|letter')

            # Build API URL. The filter string holds free text (names with
            # spaces, '<' / '>'), so it is percent-encoded; ':', ',' and '|'
            # stay literal to keep the displayed URL readable.
            filter_string = ','.join(filters)
            encoded_filter = urllib.parse.quote(filter_string, safe=':,|')
            api_url = f"https://api.openalex.org/works?filter={encoded_filter}&per-page=200"

            # Store search parameters for summary
            search_params = {
                'author': api_author_name if api_author_name else 'Any',
                'institution': api_institution if api_institution else 'Any',
                'topic': api_topic if api_topic else 'Any',
                'journals': journal_list or ['Any'],
                'years': f'{api_year_from}-{api_year_to}',
                'min_citations': api_min_citations,
                'max_citations': api_max_citations if api_max_citations > 0 else 'No limit',
                'search_date': search_datetime
            }

            st.info("📡 Searching OpenAlex API...")
            st.code(api_url, language=None)

            all_works = []
            # Pages needed for the requested number of works (ceiling division).
            max_pages = -(-api_max_results // 200)

            progress_bar = st.progress(0)
            status_text = st.empty()

            # Cursor paging: numbered pages stop working past the first
            # 10,000 results, while this form allows up to 50,000.
            cursor = '*'
            page = 0
            try:
                while cursor and len(all_works) < api_max_results:
                    page += 1
                    page_url = f"{api_url}&cursor={urllib.parse.quote(cursor, safe='*')}"
                    status_text.text(f"Fetching page {page}/{max_pages}... ({len(all_works)} works so far)")

                    req = urllib.request.Request(page_url)
                    req.add_header('User-Agent', f'Mozilla/5.0 (mailto:{st.session_state.user_email})')

                    # Timeout so a stalled connection cannot hang the app.
                    with urllib.request.urlopen(req, timeout=30) as response:
                        data = json.loads(response.read().decode('utf-8'))

                    results = data.get('results') or []
                    if not results:
                        break

                    for work in results:
                        transformed = transform_openalex_api_to_excel_format(work)
                        if transformed:  # Only add if transformation succeeded
                            all_works.append(transformed)

                    progress_bar.progress(min(page / max_pages, 1.0))

                    # A short page means this was the last one.
                    if len(results) < 200:
                        break

                    # The response reports the cursor for the next page;
                    # a missing value ends the loop.
                    cursor = (data.get('meta') or {}).get('next_cursor')
            finally:
                # Clear the progress widgets on failure as well.
                progress_bar.empty()
                status_text.empty()

            # The final page may overshoot the requested maximum.
            all_works = all_works[:api_max_results]

            if all_works:
                st.session_state.works = all_works
                st.session_state.search_params = search_params  # Store search parameters

                st.success(f"✅ Found {len(all_works):,} works from OpenAlex")

                # Display search summary
                st.markdown("---")
                st.subheader("📊 Search Summary")

                summary_col1, summary_col2 = st.columns(2)

                with summary_col1:
                    st.markdown(f"**Search Date:** {search_params['search_date']}")
                    st.markdown(f"**Author:** {search_params['author']}")
                    st.markdown(f"**Institution:** {search_params['institution']}")
                    st.markdown(f"**Topic:** {search_params['topic']}")

                with summary_col2:
                    st.markdown(f"**Years:** {search_params['years']}")
                    st.markdown(f"**Min Citations:** {search_params['min_citations']}")
                    st.markdown(f"**Max Citations:** {search_params['max_citations']}")
                    if search_params['journals'] != ['Any']:
                        st.markdown(f"**Journals:** {', '.join(search_params['journals'][:3])}{'...' if len(search_params['journals']) > 3 else ''}")
                    else:
                        st.markdown("**Journals:** Any")

                st.markdown(f"**Total Works Retrieved:** {len(all_works):,}")
                st.markdown("---")

                # Offer to download the JSON
                json_data = json.dumps(all_works, indent=2)
                st.download_button(
                    label="💾 Download Raw Data (JSON)",
                    data=json_data,
                    file_name=f"openalex_api_search_{search_datetime.replace(':', '-').replace(' ', '_')}.json",
                    mime="application/json"
                )
            else:
                st.warning("No works found. Try different search criteria.")

        except Exception as e:
            # UI boundary: report any failure to the user instead of crashing.
            st.error(f"❌ Error searching API: {str(e)}")
            st.info("Make sure your search criteria are valid and try again.")

# Only show the author-search interface once works have been loaded into the session
if st.session_state.works:
    # These standard-library helpers are needed only by this section, so they
    # are imported locally; the underscore aliases avoid clashing with other
    # module-level names.
    from html import escape as _html_escape
    from urllib.parse import quote_plus as _quote_plus

    def _esc(value):
        """Return ``value`` as text that is safe inside HTML content or a quoted attribute."""
        return _html_escape(str(value), quote=True)

    st.markdown("---")
    st.header("🔎 Search Authors")

    # Search criteria in columns
    col1, col2 = st.columns(2)

    with col1:
        topic_search = st.text_input(
            "🔬 Search by Topic",
            placeholder="e.g., neuroscience",
            help="Filter works by topic keyword (case-insensitive)"
        )

        author_search = st.text_input(
            "👤 Search by Author Name",
            placeholder="e.g., Smith",
            help="Filter authors by name (partial match)"
        )

    with col2:
        journal_search = st.text_input(
            "📄 Search by Journal",
            placeholder="e.g., Nature",
            help="Filter works by journal name"
        )

        country_search = st.text_input(
            "🌍 Search by Country",
            placeholder="e.g., United States or US",
            help="Filter authors by country (name or code)"
        )

    # Additional options
    col3, col4, col5 = st.columns(3)

    with col3:
        min_articles = st.number_input(
            "Minimum Articles",
            min_value=1,
            max_value=100,
            value=3,
            help="Minimum number of publications"
        )

    with col4:
        max_results = st.number_input(
            "Maximum Results",
            min_value=1,
            max_value=500,
            value=50,
            help="Maximum number of authors to display"
        )

    with col5:
        # The option labels double as the result-column names used for sorting below.
        sort_by = st.selectbox(
            "Sort By",
            ["Count", "Average Citations", "Median Citations"],
            help="How to sort the results"
        )

    # Additional display options
    st.markdown("**Display Options:**")
    col6, col7 = st.columns(2)
    with col6:
        link_type = st.selectbox(
            "Link Author Names To",
            ["None", "ORCID", "OpenAlex"],
            help="Make author names clickable links"
        )
    with col7:
        link_topics = st.checkbox(
            "Link Topics to OpenAlex",
            value=True,
            help="Make topic names clickable"
        )

    # Search button
    if st.button("🔍 Search Authors", type="primary"):
        with st.spinner("Processing author profiles..."):

            # Aggregate the loaded works into per-author profiles; the optional
            # topic / journal / country filters are passed lower-cased.
            profiles = process_works_to_author_profiles(
                st.session_state.works,
                topic_filter=topic_search.lower() if topic_search else None,
                journal_filter=journal_search.lower() if journal_search else None,
                country_filter=country_search.lower() if country_search else None
            )

            author_query = author_search.lower() if author_search else None

            # Each entry is a (plain_row, html_row) pair: the plain row feeds
            # sorting and the Excel export, the HTML row feeds the on-screen
            # table. Keeping them separate means the export never has to strip
            # markup back out of the rendered cells.
            rows = []
            for normalized_name, profile in profiles.items():
                if profile['count'] < min_articles:
                    continue

                display_name = profile['display_name']

                # Author name filter (substring match on either name form)
                if author_query:
                    if (author_query not in normalized_name.lower()
                            and author_query not in display_name.lower()):
                        continue

                citations = profile['citations']
                # NOTE: for an even number of works this picks the upper of the
                # two middle values rather than their mean.
                median_cites = sorted(citations)[len(citations) // 2] if citations else 0
                avg_cites = round(sum(citations) / len(citations), 1) if citations else 0

                most_common_country = profile['countries'].most_common(1)
                country_code = most_common_country[0][0] if most_common_country else ''
                country_name = get_country_name(country_code) if country_code else ''
                continent = get_continent(country_code) if country_code else ''

                topic_names = [t for t, _ in profile['topics'].most_common(5)]
                top_topics = ', '.join(topic_names)
                top_coauthors = ', '.join([c for c, _ in profile['coauthors'].most_common(5)])
                top_journals = ', '.join([j for j, _ in profile['journals'].most_common(5)])

                # Create clickable topic links if enabled
                if link_topics and top_topics:
                    linked_topics = []
                    for topic in topic_names:
                        topic_label = _esc(topic)
                        topic_id = profile['topic_ids'].get(topic, '')
                        if topic_id:
                            # Keep only the trailing ID (e.g. "T10316" from a full URL)
                            # and lower-case it for the filter (T10316 -> t10316).
                            topic_id_lower = topic_id.split('/')[-1].lower()
                            # Link to works filtered by this topic with additional parameters
                            link = f'<a href="https://openalex.org/works?filter=primary_topic.id:{_esc(topic_id_lower)},primary_location.source.type:source-types/journal&group_by=publication_year,open_access.is_oa,primary_topic.id,authorships.institutions.lineage,type,authorships.author.id,primary_location.source.id,primary_location.source.type&page=1" target="_blank">{topic_label}</a>'
                        else:
                            # Fallback to a topic search when no ID is available;
                            # quote_plus encodes spaces as '+' and percent-encodes
                            # every other reserved character.
                            link = f'<a href="https://openalex.org/topics?search={_quote_plus(topic)}" target="_blank">{topic_label}</a>'
                        linked_topics.append(link)
                    top_topics_display = ', '.join(linked_topics)
                else:
                    top_topics_display = _esc(top_topics)

                # Create author name with optional link. All dynamic text is
                # escaped because the table is rendered as raw HTML below.
                name_html = _esc(display_name)
                author_display = name_html
                if link_type == "ORCID" and profile['orcid']:
                    author_display = f'<a href="{_esc(profile["orcid"])}" target="_blank">{name_html}</a>'
                elif link_type == "OpenAlex" and profile['openalex_id']:
                    # Extract just the ID from the full URL if needed
                    openalex_id = profile['openalex_id'].split('/')[-1]
                    author_display = f'<a href="https://openalex.org/authors/{_esc(openalex_id)}" target="_blank">{name_html}</a>'

                plain_row = {
                    'Author': display_name,
                    'Count': profile['count'],
                    'Median Citations': median_cites,
                    'Average Citations': avg_cites,
                    'Country': country_name,
                    'Continent': continent,
                    'Top Topics': top_topics,
                    'Top Co-authors': top_coauthors,
                    'Top Journals': top_journals,
                }
                # Same columns in the same order, with text cells replaced by
                # their HTML-safe / linked versions.
                html_row = {
                    **plain_row,
                    'Author': author_display,
                    'Country': _esc(country_name),
                    'Continent': _esc(continent),
                    'Top Topics': top_topics_display,
                    'Top Co-authors': _esc(top_coauthors),
                    'Top Journals': _esc(top_journals),
                }
                rows.append((plain_row, html_row))

            # Sort before deriving either view so the table and the export agree
            # on ordering.
            rows.sort(key=lambda pair: pair[0][sort_by], reverse=True)

            # The export keeps every matching author; only the on-screen table
            # is truncated to "Maximum Results".
            export_rows = [plain for plain, _ in rows]
            results = [rendered for _, rendered in rows[:max_results]]

            if results:
                st.success(f"✅ Found {len(results)} matching authors")

                # Convert to DataFrame
                df = pd.DataFrame(results)

                # Custom CSS for better table display
                st.markdown("""
                <style>
                    /* Style the HTML table */
                    table {
                        width: 100%;
                        border-collapse: collapse;
                        margin: 20px 0;
                        font-size: 14px;
                    }
                    
                    table thead tr {
                        background-color: #164A78;
                        color: white;
                        text-align: left;
                        font-weight: bold;
                    }
                    
                    table th, table td {
                        padding: 12px 15px;
                        border: 1px solid #ddd;
                    }
                    
                    table tbody tr {
                        border-bottom: 1px solid #dddddd;
                    }
                    
                    table tbody tr:nth-of-type(even) {
                        background-color: #f3f3f3;
                    }
                    
                    table tbody tr:hover {
                        background-color: #e8f4f8;
                    }
                    
                    /* Center numeric columns - Count, Median, Average */
                    table td:nth-child(2),
                    table td:nth-child(3),
                    table td:nth-child(4) {
                        text-align: center;
                    }
                    
                    /* Enable text wrapping */
                    table td {
                        white-space: normal;
                        word-wrap: break-word;
                        max-width: 300px;
                    }
                    
                    /* Make links blue and underlined */
                    table a {
                        color: #1e88e5;
                        text-decoration: underline;
                    }
                    
                    table a:hover {
                        color: #0d47a1;
                    }
                </style>
                """, unsafe_allow_html=True)

                # Display results as HTML table with clickable links.
                # escape=False is required for the anchors; every cell was
                # escaped individually when the rows were built.
                st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)

                # Summary statistics (computed over the displayed rows)
                col1, col2, col3, col4 = st.columns(4)
                with col1:
                    st.metric("Total Authors", len(results))
                with col2:
                    st.metric("Avg Publications", f"{df['Count'].mean():.1f}")
                with col3:
                    st.metric("Avg Citations", f"{df['Average Citations'].mean():.1f}")
                with col4:
                    if link_type == "None":
                        st.metric("With Links", "N/A")
                    else:
                        # Names are escaped, so only a generated anchor can
                        # start with this prefix.
                        linked_count = sum(
                            1 for r in results if r['Author'].startswith('<a href=')
                        )
                        st.metric("With Links", linked_count)

                # Download button - built from the plain (HTML-free) rows
                df_export = pd.DataFrame(export_rows)
                output = BytesIO()
                with pd.ExcelWriter(output, engine='openpyxl') as writer:
                    df_export.to_excel(writer, index=False, sheet_name='Author Search Results')

                st.download_button(
                    label="📥 Download Results (Excel)",
                    data=output.getvalue(),
                    file_name="author_search_results.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )

            else:
                st.warning("No authors match your search criteria. Try adjusting your filters.")

else:
    st.info("👆 Fetch data from OpenAlex or upload a JSON file to get started")