"""Streamlit app: batch lookup of h-indices and publication metrics via the OpenAlex API.

Accepts author names, ORCID iDs, or OpenAlex author IDs (mixed freely in one CSV),
re-ranks ambiguous name matches with an optional institution hint, and exports results.
"""

import streamlit as st
import pandas as pd
import urllib.request
import urllib.parse
import urllib.error
import json
import time
from typing import Dict, Optional, List
from io import StringIO

st.set_page_config(page_title="OpenAlex H-Index Lookup", page_icon="📚", layout="wide")

# Initialize session state to fix Hugging Face connection issues.
# NOTE(review): the one-shot st.rerun() forces a single immediate rerun on first
# load; kept as-is since it was added deliberately as a hosting workaround.
if 'initialized' not in st.session_state:
    st.session_state.initialized = True
    st.rerun()

# API Configuration
BASE_URL = "https://api.openalex.org"
RATE_LIMIT_DELAY = 0.15  # seconds between requests (polite-pool friendly)
POLITE_EMAIL = "halozen@pm.me"  # identifies us to OpenAlex's polite pool


def get_author_by_id(author_id: str) -> Optional[Dict]:
    """Fetch author information by OpenAlex ID.

    Args:
        author_id: OpenAlex author ID, with or without the leading 'A'.

    Returns:
        The author record as a dict, or None on any failure (a Streamlit
        warning is shown instead of raising).
    """
    if not author_id.upper().startswith('A'):
        author_id = f"A{author_id}"

    params = urllib.parse.urlencode({'mailto': POLITE_EMAIL})
    url = f"{BASE_URL}/authors/{author_id}?{params}"

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            if response.status == 200:
                data = response.read()
                return json.loads(data.decode('utf-8'))
    except Exception as e:
        st.warning(f"Error fetching author {author_id}: {str(e)}")
    return None


def _matches_hint(hint_lower: str, display_name: str, country_code: str) -> bool:
    """Return True if an institution name/country matches the affiliation hint.

    Matches in either direction (hint in name, or name in hint) so that both
    'MIT' vs 'Massachusetts Institute of Technology' style mismatches and
    country-code hints work.
    """
    name_lower = (display_name or '').lower()
    country_lower = (country_code or '').lower()
    return (
        hint_lower in name_lower
        or hint_lower in country_lower
        or name_lower in hint_lower
    )


def search_author_by_name(name: str, affiliation_hint: Optional[str] = None,
                          max_results: int = 5) -> List[Dict]:
    """Search for authors by name, using affiliation hint to re-rank results.

    Over-fetches (4x max_results) so that re-ranking has candidates to work
    with, then sorts by (affiliation match, has ORCID, works count) descending.

    Args:
        name: Author name to search for.
        affiliation_hint: Optional institution name or country code to prefer.
        max_results: Maximum number of candidates to return.

    Returns:
        Up to max_results author records, best match first; [] on failure.
    """
    params = {
        'search': name,
        'per-page': max_results * 4,  # over-fetch so re-ranking can promote matches
        'mailto': POLITE_EMAIL
    }
    url = f"{BASE_URL}/authors?{urllib.parse.urlencode(params)}"

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            if response.status == 200:
                data = response.read()
                json_data = json.loads(data.decode('utf-8'))
                results = json_data.get('results', [])

                def sort_key(author):
                    has_orcid = 1 if author.get('orcid') else 0
                    works_count = author.get('works_count', 0)
                    affiliation_match = 0

                    if affiliation_hint:
                        hint_lower = affiliation_hint.lower()

                        # First try the most recent institutions.
                        for inst in author.get('last_known_institutions', []) or []:
                            if inst and _matches_hint(
                                hint_lower,
                                inst.get('display_name', ''),
                                inst.get('country_code', ''),
                            ):
                                affiliation_match = 1
                                break

                        # Fall back to the full affiliation history.
                        if affiliation_match == 0:
                            for aff in author.get('affiliations', []) or []:
                                if not aff:
                                    continue
                                inst = aff.get('institution', {}) or {}
                                if _matches_hint(
                                    hint_lower,
                                    inst.get('display_name', ''),
                                    inst.get('country_code', ''),
                                ):
                                    affiliation_match = 1
                                    break

                    return (affiliation_match, has_orcid, works_count)

                results.sort(key=sort_key, reverse=True)
                return results[:max_results]
    except Exception as e:
        st.warning(f"Error searching {name}: {str(e)}")
    return []


def get_top_journals(author_data: Dict, max_journals: int = 5) -> str:
    """Get the top journals where the author has published most frequently.

    Uses the OpenAlex works group_by facet on primary_location.source.id,
    filtered to journal-type sources.

    Args:
        author_data: An OpenAlex author record (must contain 'id').
        max_journals: How many journals to include.

    Returns:
        Comma-separated "Journal Name (count)" entries, or "N/A".
    """
    if not author_data or 'id' not in author_data:
        return "N/A"

    author_id = author_data['id']
    params = urllib.parse.urlencode({
        'filter': f'authorships.author.id:{author_id},primary_location.source.type:journal',
        'group_by': 'primary_location.source.id',
        'mailto': POLITE_EMAIL
    })
    url = f"{BASE_URL}/works?{params}"

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            if response.status == 200:
                data = response.read()
                json_data = json.loads(data.decode('utf-8'))
                group_by_results = json_data.get('group_by', [])

                journals = []
                for item in group_by_results[:max_journals]:
                    key_display_name = item.get('key_display_name')
                    count = item.get('count', 0)
                    # OpenAlex uses the literal string 'unknown' for unresolved sources.
                    if key_display_name and key_display_name != 'unknown':
                        journals.append(f"{key_display_name} ({count})")

                return ", ".join(journals) if journals else "N/A"
    except Exception as e:
        st.warning(f"Error fetching journals: {str(e)}")
    return "N/A"


def detect_input_type(input_str: str) -> tuple:
    """
    Detect if input is a name, ORCID, or OpenAlex ID.

    Returns: (type, cleaned_value) where type is 'name', 'orcid', or 'openalex_id'
    """
    input_str = input_str.strip()

    # Check for ORCID format: 0000-0000-0000-0000 or URLs
    if 'orcid.org/' in input_str.lower():
        # Extract ORCID from URL
        orcid = input_str.split('orcid.org/')[-1].strip('/')
        return ('orcid', orcid)

    # Raw ORCID format: 0000-0002-1825-0097. The ORCID checksum character
    # may be 'X', so the last digit is checked separately (fixes rejection
    # of valid ORCIDs such as 0000-0002-1694-233X).
    bare = input_str.replace('-', '')
    if len(bare) == 16 and bare[:15].isdigit() and (bare[15].isdigit() or bare[15] in 'Xx'):
        return ('orcid', input_str)

    # Check for OpenAlex ID format: A1234567890 or URLs
    if 'openalex.org/A' in input_str or 'openalex.org/authors/A' in input_str:
        # Extract ID from URL
        openalex_id = input_str.split('/')[-1].strip()
        if openalex_id.startswith('A'):
            return ('openalex_id', openalex_id)
    elif input_str.upper().startswith('A') and len(input_str) > 5 and input_str[1:].isdigit():
        # Raw OpenAlex ID format: A5023888391
        return ('openalex_id', input_str.upper())

    # Otherwise treat as a name
    return ('name', input_str)


def get_author_by_orcid(orcid: str) -> Optional[Dict]:
    """Fetch author information by ORCID.

    Args:
        orcid: Bare ORCID or full orcid.org URL.

    Returns:
        The first matching author record, or None if not found / on error.
    """
    # Clean ORCID: strip any URL prefix.
    orcid = orcid.replace('https://orcid.org/', '').replace('http://orcid.org/', '').strip('/')

    params = urllib.parse.urlencode({
        'filter': f'orcid:{orcid}',
        'mailto': POLITE_EMAIL
    })
    url = f"{BASE_URL}/authors?{params}"

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            if response.status == 200:
                data = response.read()
                json_data = json.loads(data.decode('utf-8'))
                results = json_data.get('results', [])
                if results:
                    return results[0]
    except Exception as e:
        st.warning(f"Error fetching ORCID {orcid}: {str(e)}")
    return None


def _empty_result(name, orcid, warning) -> Dict:
    """Build a result row with all metric fields blank (author not resolved)."""
    return {
        'Name': name,
        'ORCID': orcid,
        'H-Index': None,
        'Works Count': None,
        'Cited By Count': None,
        '2yr Mean Citedness': None,
        'i10 Index': None,
        'Top Topic': None,
        'Top Topic Count': None,
        'Top 5 Journals': None,
        'Last Known Institution': None,
        'Warning': warning
    }


def process_author(name_or_id: str, hint: Optional[str] = None) -> Dict:
    """
    Process a single author and return their data.

    Accepts: author name, ORCID, or OpenAlex ID

    Args:
        name_or_id: Author name, ORCID, or OpenAlex ID.
        hint: Optional institution hint used only for name searches.

    Returns:
        A dict of display-ready metrics; blank metrics with a 'Warning'
        message when the author could not be resolved.
    """
    # Detect what type of input we have
    input_type, cleaned_input = detect_input_type(name_or_id)

    author = None
    display_name = name_or_id
    warning = ""

    if input_type == 'orcid':
        # Look up by ORCID
        author = get_author_by_orcid(cleaned_input)
        if not author:
            return _empty_result(name_or_id, cleaned_input,
                                 f'ORCID not found: {cleaned_input}')
        display_name = author.get('display_name', name_or_id)

    elif input_type == 'openalex_id':
        # Look up by OpenAlex ID
        author = get_author_by_id(cleaned_input)
        if not author:
            return _empty_result(name_or_id, None,
                                 f'OpenAlex ID not found: {cleaned_input}')
        display_name = author.get('display_name', name_or_id)

    else:  # input_type == 'name'
        results = search_author_by_name(cleaned_input, affiliation_hint=hint, max_results=3)
        if not results:
            return _empty_result(cleaned_input, None, 'Not found')

        author = results[0]
        display_name = author.get('display_name', cleaned_input)

        # Check for disambiguation issues (only for name searches)
        if len(results) > 1:
            if not author.get('orcid'):
                warning = "⚠️ Multiple matches, no ORCID"
            else:
                similar_names = [
                    r.get('display_name', '') for r in results[1:]
                    if r.get('display_name', '').lower() == display_name.lower()
                ]
                if similar_names:
                    warning = f"⚠️ {len(similar_names)+1} exact name matches"

        if display_name.lower() != cleaned_input.lower():
            if warning:
                warning += f" | Matched to: {display_name}"
            else:
                warning = f"⚠️ Matched to: {display_name}"

    # Extract data (same for all input types)
    summary_stats = author.get('summary_stats', {})
    h_index = summary_stats.get('h_index')
    works_count = author.get('works_count')
    cited_by_count = author.get('cited_by_count')
    two_yr_mean = summary_stats.get('2yr_mean_citedness')
    i10_index = summary_stats.get('i10_index')
    orcid = author.get('orcid', '')

    topics = author.get('topics', [])
    top_topic_name = topics[0].get('display_name') if topics else None
    top_topic_count = topics[0].get('count') if topics else None

    top_journals = get_top_journals(author)

    last_institutions = author.get('last_known_institutions', [])
    institution_names = (
        [inst.get('display_name', '') for inst in last_institutions]
        if last_institutions else []
    )
    last_institution = ", ".join(institution_names) if institution_names else None

    return {
        'Name': display_name,
        'ORCID': orcid,
        'H-Index': h_index,
        'Works Count': works_count,
        'Cited By Count': cited_by_count,
        # Compare against None (not truthiness) so a real 0.0 is kept.
        '2yr Mean Citedness': round(two_yr_mean, 2) if two_yr_mean is not None else None,
        'i10 Index': i10_index,
        'Top Topic': top_topic_name,
        'Top Topic Count': top_topic_count,
        'Top 5 Journals': top_journals,
        'Last Known Institution': last_institution,
        # Warnings are only meaningful for ambiguous name searches.
        'Warning': warning if input_type == 'name' and warning else None
    }


def process_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Process a dataframe of authors and return results.

    Expects a 'Name' column; 'Institution_Hint' is optional. Shows a
    progress bar and status text while working.
    """
    # Work on a copy so the caller's dataframe is not mutated.
    df = df.copy()

    # Add Institution_Hint column if it doesn't exist
    if 'Institution_Hint' not in df.columns:
        df['Institution_Hint'] = None

    results = []
    progress_bar = st.progress(0)
    status_text = st.empty()
    total = len(df)

    # Use a positional counter rather than the dataframe index: uploaded CSVs
    # can carry a non-default index, which would break the progress fraction.
    for pos, (_, row) in enumerate(df.iterrows(), start=1):
        name = row['Name']
        hint = row.get('Institution_Hint')

        if pd.notna(name) and str(name).strip():
            status_text.text(f"Processing {pos}/{total}: {name}")
            result = process_author(
                str(name).strip(),
                str(hint).strip() if pd.notna(hint) else None
            )
            results.append(result)
            # Rate limiting
            time.sleep(RATE_LIMIT_DELAY)

        progress_bar.progress(pos / total)

    status_text.text("✅ Processing complete!")
    return pd.DataFrame(results)


def display_results(results_df: pd.DataFrame):
    """Display results with statistics and download button."""
    st.subheader("📊 Results")
    st.dataframe(results_df, use_container_width=True)

    # Statistics
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        found = results_df['H-Index'].notna().sum()
        st.metric("Found", f"{found}/{len(results_df)}")
    with col2:
        avg_h = results_df['H-Index'].mean()
        st.metric("Avg H-Index", f"{avg_h:.1f}" if pd.notna(avg_h) else "N/A")
    with col3:
        with_orcid = results_df['ORCID'].notna().sum()
        st.metric("With ORCID", f"{with_orcid}/{len(results_df)}")
    with col4:
        warnings = results_df['Warning'].notna().sum()
        st.metric("Warnings", warnings)

    # Download button
    csv = results_df.to_csv(index=False)
    st.download_button(
        label="📥 Download Results as CSV",
        data=csv,
        file_name="openalex_results.csv",
        mime="text/csv",
        type="primary"
    )


# ============================================================================
# MAIN APP
# ============================================================================

st.title("📚 OpenAlex H-Index Lookup Tool")
st.markdown("""
Batch lookup h-indices and publication metrics for researchers using the OpenAlex API.
""")

# Sidebar
with st.sidebar:
    st.header("ℹ️ How to Use")
    st.markdown("""
    1. **Choose input method:**
       - Upload CSV file
       - Paste CSV data
       - Run test with sample data
    2. **CSV format:**
       - `Name` column (required) - accepts:
         - Author names (e.g., "John Smith")
         - ORCID IDs (e.g., "0000-0002-1825-0097")
         - OpenAlex IDs (e.g., "A5023888391")
       - `Institution_Hint` column (optional)
    3. **Click Process** to retrieve data
    4. **Download** results as CSV

    **Tips:**
    - Mix names and IDs in the same file!
    - Institution hints improve name matching
    - ORCIDs and OpenAlex IDs = 100% accurate
    - Processing ~6-7 authors per second
    """)
    st.divider()
    st.markdown("**Data source:** [OpenAlex](https://openalex.org)")
    st.markdown("**Rate limit:** ~0.15s per author")

# Test Mode Button
st.subheader("🧪 Quick Test")
if st.button("Run with Sample Data", help="Test with Einstein, Curie, and Newton"):
    test_data = pd.DataFrame({
        'Name': ['Albert Einstein', 'Marie Curie', 'Isaac Newton'],
        'Institution_Hint': ['Princeton', 'Paris', 'Cambridge']
    })
    with st.spinner("Processing sample data..."):
        results_df = process_dataframe(test_data)
        display_results(results_df)

st.divider()

# Main Input Section with Tabs
st.subheader("📋 Input Your Data")
tab1, tab2, tab3 = st.tabs(["📤 Upload CSV", "📝 Paste CSV", "📥 Download Template"])

with tab1:
    st.markdown("Upload a CSV file with author names:")
    uploaded_file = st.file_uploader(
        "Choose a CSV file",
        type=['csv'],
        help="CSV must have a 'Name' column. 'Institution_Hint' is optional.",
        key="csv_uploader"
    )

    if uploaded_file is not None:
        try:
            df = pd.read_csv(uploaded_file)

            # Validate columns
            if 'Name' not in df.columns:
                st.error("❌ CSV must have a 'Name' column")
            else:
                st.success(f"✅ Loaded {len(df)} names")

                # Preview
                with st.expander("📋 Preview uploaded data"):
                    st.dataframe(df.head(10))

                # Process button
                if st.button("🚀 Process Authors", type="primary", key="process_upload"):
                    results_df = process_dataframe(df)
                    display_results(results_df)
        except Exception as e:
            st.error(f"Error reading file: {str(e)}")

with tab2:
    st.markdown("Paste CSV data directly (useful if file upload doesn't work):")
    csv_text = st.text_area(
        "Paste your CSV data here:",
        height=200,
        placeholder="Name,Institution_Hint\nAlbert Einstein,Princeton\nMarie Curie,Paris\nJohn Smith,MIT",
        help="Include headers in first row. Separate columns with commas."
    )

    if st.button("🚀 Process Pasted Data", type="primary", key="process_paste") and csv_text:
        try:
            df = pd.read_csv(StringIO(csv_text))

            # Validate columns
            if 'Name' not in df.columns:
                st.error("❌ CSV must have a 'Name' column")
            else:
                st.success(f"✅ Parsed {len(df)} names")

                # Preview
                with st.expander("📋 Preview pasted data"):
                    st.dataframe(df.head(10))

                # Process
                results_df = process_dataframe(df)
                display_results(results_df)
        except Exception as e:
            st.error(f"Error parsing CSV: {str(e)}")
            st.info("Make sure your data is in valid CSV format with headers.")

with tab3:
    st.markdown("Download a template CSV to get started:")
    st.info("💡 **Pro tip:** You can mix names, ORCIDs, and OpenAlex IDs in the same file!")

    example_df = pd.DataFrame({
        'Name': [
            'Albert Einstein',
            '0000-0002-1825-0097',  # Example ORCID
            'A5023888391',          # Example OpenAlex ID
            'Marie Curie'
        ],
        'Institution_Hint': ['Princeton', 'Optional', 'Optional', 'Paris']
    })
    st.dataframe(example_df)

    st.markdown("""
    **Accepted formats in Name column:**
    - Regular name: `John Smith`
    - ORCID: `0000-0002-1825-0097` or `https://orcid.org/0000-0002-1825-0097`
    - OpenAlex ID: `A5023888391` or `https://openalex.org/A5023888391`
    """)

    template_csv = example_df.to_csv(index=False)
    st.download_button(
        label="📥 Download Template CSV",
        data=template_csv,
        file_name="openalex_template.csv",
        mime="text/csv",
        help="Download this template and fill in with your data"
    )