# OpenAlex H-Index Lookup — Streamlit app (deployed as a Hugging Face Space)
| import streamlit as st | |
| import pandas as pd | |
| import urllib.request | |
| import urllib.parse | |
| import urllib.error | |
| import json | |
| import time | |
| from typing import Dict, Optional, List | |
| from io import StringIO | |
# Page must be configured before any other Streamlit call.
st.set_page_config(page_title="OpenAlex H-Index Lookup", page_icon="π", layout="wide")

# Initialize session state to fix Hugging Face connection issues:
# on the first run of a browser session we set a flag and force one
# st.rerun(); the flag guarantees the extra rerun happens only once.
if 'initialized' not in st.session_state:
    st.session_state.initialized = True
    st.rerun()

# API Configuration
BASE_URL = "https://api.openalex.org"  # root of the OpenAlex REST API
RATE_LIMIT_DELAY = 0.15                # seconds slept between requests (see process_dataframe)
POLITE_EMAIL = "halozen@pm.me"         # sent as ?mailto= to join OpenAlex's polite pool
def get_author_by_id(author_id: str) -> Optional[Dict]:
    """Fetch an author record from OpenAlex by OpenAlex author ID.

    Args:
        author_id: An OpenAlex author ID such as "A5023888391". Surrounding
            whitespace and lowercase are normalized, and a missing "A"
            prefix is added.

    Returns:
        The parsed author JSON dict, or None on any request failure
        (a Streamlit warning is shown in that case).
    """
    # Normalize before building the URL: the original only uppercased for
    # the prefix *check* but sent the raw string to the API.
    author_id = author_id.strip().upper()
    if not author_id.startswith('A'):
        author_id = f"A{author_id}"
    params = urllib.parse.urlencode({'mailto': POLITE_EMAIL})  # polite-pool id
    url = f"{BASE_URL}/authors/{author_id}?{params}"
    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            if response.status == 200:
                return json.loads(response.read().decode('utf-8'))
    except Exception as e:
        st.warning(f"Error fetching author {author_id}: {str(e)}")
    return None
def _institution_matches_hint(inst: Optional[Dict], hint_lower: str) -> bool:
    """Return True if an OpenAlex institution dict matches the lowercase hint."""
    if not inst:
        return False
    inst_name = (inst.get('display_name') or '').lower()
    country = (inst.get('country_code') or '').lower()
    # Bidirectional substring match so "MIT" matches a longer official name and
    # vice versa. Guard the reverse test against an empty display_name:
    # '' in hint_lower is always True in Python and caused spurious matches.
    return (hint_lower in inst_name
            or hint_lower in country
            or (bool(inst_name) and inst_name in hint_lower))


def search_author_by_name(name: str, affiliation_hint: Optional[str] = None, max_results: int = 5) -> List[Dict]:
    """Search for authors by name, using an affiliation hint to re-rank results.

    Args:
        name: Author name to search for.
        affiliation_hint: Optional institution name or country code; matching
            candidates are ranked first.
        max_results: Maximum number of candidates to return.

    Returns:
        Up to ``max_results`` author dicts, best match first; empty list on error.
    """
    params = {
        'search': name,
        'per-page': max_results * 4,  # over-fetch so re-ranking has candidates
        'mailto': POLITE_EMAIL
    }
    url = f"{BASE_URL}/authors?{urllib.parse.urlencode(params)}"
    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            if response.status == 200:
                json_data = json.loads(response.read().decode('utf-8'))
                results = json_data.get('results', [])

                def sort_key(author):
                    # Rank by: affiliation hit, then having an ORCID, then output.
                    has_orcid = 1 if author.get('orcid') else 0
                    works_count = author.get('works_count', 0)
                    affiliation_match = 0
                    if affiliation_hint:
                        hint_lower = affiliation_hint.lower()
                        # Prefer the author's most recent institutions; fall back
                        # to the full affiliation history.
                        recent = author.get('last_known_institutions', []) or []
                        if any(_institution_matches_hint(i, hint_lower) for i in recent):
                            affiliation_match = 1
                        else:
                            history = author.get('affiliations', []) or []
                            if any(_institution_matches_hint((aff or {}).get('institution'), hint_lower)
                                   for aff in history):
                                affiliation_match = 1
                    return (affiliation_match, has_orcid, works_count)

                results.sort(key=sort_key, reverse=True)
                return results[:max_results]
    except Exception as e:
        st.warning(f"Error searching {name}: {str(e)}")
    return []
def get_top_journals(author_data: Dict, max_journals: int = 5) -> str:
    """Return the journals where the author has published most often.

    Args:
        author_data: An OpenAlex author dict (must contain an 'id' key).
        max_journals: Maximum number of journals to include.

    Returns:
        A comma-separated string like "Nature (12), Science (8)", or "N/A"
        when nothing usable is found or the request fails.
    """
    if not author_data or 'id' not in author_data:
        return "N/A"
    author_id = author_data['id']
    params = urllib.parse.urlencode({
        'filter': f'authorships.author.id:{author_id},primary_location.source.type:journal',
        'group_by': 'primary_location.source.id',
        'mailto': POLITE_EMAIL
    })
    url = f"{BASE_URL}/works?{params}"
    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            if response.status == 200:
                json_data = json.loads(response.read().decode('utf-8'))
                journals = []
                # group_by buckets come back ordered by count (per the OpenAlex
                # API). Skip the 'unknown' bucket BEFORE taking the top N so it
                # cannot crowd a real journal out of the list (the original
                # sliced first, then filtered).
                for item in json_data.get('group_by', []):
                    journal_name = item.get('key_display_name')
                    if journal_name and journal_name != 'unknown':
                        journals.append(f"{journal_name} ({item.get('count', 0)})")
                        if len(journals) >= max_journals:
                            break
                return ", ".join(journals) if journals else "N/A"
    except Exception as e:
        st.warning(f"Error fetching journals: {str(e)}")
    return "N/A"
def detect_input_type(input_str: str) -> tuple:
    """
    Detect whether input is an author name, an ORCID, or an OpenAlex ID.

    Returns:
        (type, cleaned_value) where type is 'name', 'orcid', or 'openalex_id'.
    """
    input_str = input_str.strip()

    # ORCID in URL form: https://orcid.org/0000-0002-1825-0097
    if 'orcid.org/' in input_str.lower():
        orcid = input_str.split('orcid.org/')[-1].strip('/')
        return ('orcid', orcid)

    # ORCID in raw form: 16 characters after removing dashes. The final
    # checksum character may legitimately be 'X' (ISO 7064 mod 11-2), e.g.
    # 0000-0002-1825-009X — the original isdigit() check rejected those.
    compact = input_str.replace('-', '').upper()
    if (len(compact) == 16
            and compact[:15].isdigit()
            and (compact[15].isdigit() or compact[15] == 'X')):
        return ('orcid', input_str)

    # OpenAlex ID in URL form: https://openalex.org/A... or .../authors/A...
    if 'openalex.org/A' in input_str or 'openalex.org/authors/A' in input_str:
        # rstrip('/') so a trailing slash doesn't yield an empty last segment.
        openalex_id = input_str.rstrip('/').split('/')[-1].strip()
        if openalex_id.startswith('A'):
            return ('openalex_id', openalex_id)
    # OpenAlex ID in raw form: A5023888391 (case-insensitive on input)
    elif input_str.upper().startswith('A') and len(input_str) > 5 and input_str[1:].isdigit():
        return ('openalex_id', input_str.upper())

    # Otherwise treat as an author name.
    return ('name', input_str)
def get_author_by_orcid(orcid: str) -> Optional[Dict]:
    """Look up a single OpenAlex author record via an ORCID identifier.

    Accepts a bare ORCID or an orcid.org URL; returns the first matching
    author dict, or None when nothing matches or the request fails.
    """
    # Reduce any URL form down to the bare ORCID.
    for prefix in ('https://orcid.org/', 'http://orcid.org/'):
        orcid = orcid.replace(prefix, '')
    orcid = orcid.strip('/')

    query = urllib.parse.urlencode({
        'filter': f'orcid:{orcid}',
        'mailto': POLITE_EMAIL
    })
    request_url = f"{BASE_URL}/authors?{query}"
    try:
        with urllib.request.urlopen(request_url, timeout=10) as resp:
            if resp.status == 200:
                payload = json.loads(resp.read().decode('utf-8'))
                matches = payload.get('results', [])
                if matches:
                    return matches[0]
    except Exception as exc:
        st.warning(f"Error fetching ORCID {orcid}: {str(exc)}")
    return None
def _empty_row(name, orcid, warning) -> Dict:
    """Build a one-author result row with every metric empty (lookup failed)."""
    return {
        'Name': name,
        'ORCID': orcid,
        'H-Index': None,
        'Works Count': None,
        'Cited By Count': None,
        '2yr Mean Citedness': None,
        'i10 Index': None,
        'Top Topic': None,
        'Top Topic Count': None,
        'Top 5 Journals': None,
        'Last Known Institution': None,
        'Warning': warning
    }


def process_author(name_or_id: str, hint: str = None) -> Dict:
    """
    Process a single author and return their metrics row.

    Accepts an author name, an ORCID, or an OpenAlex ID in ``name_or_id``.
    ``hint`` (institution name or country) is only used for name searches.
    Returns a dict of display-ready columns; on lookup failure all metric
    fields are None and 'Warning' explains why.
    """
    input_type, cleaned_input = detect_input_type(name_or_id)
    warning = ""

    if input_type == 'orcid':
        author = get_author_by_orcid(cleaned_input)
        if not author:
            return _empty_row(name_or_id, cleaned_input, f'ORCID not found: {cleaned_input}')
        display_name = author.get('display_name', name_or_id)
    elif input_type == 'openalex_id':
        author = get_author_by_id(cleaned_input)
        if not author:
            return _empty_row(name_or_id, None, f'OpenAlex ID not found: {cleaned_input}')
        display_name = author.get('display_name', name_or_id)
    else:  # input_type == 'name'
        results = search_author_by_name(cleaned_input, affiliation_hint=hint, max_results=3)
        if not results:
            return _empty_row(cleaned_input, None, 'Not found')
        author = results[0]
        display_name = author.get('display_name', cleaned_input)
        # Disambiguation warnings only make sense for fuzzy name matches.
        if len(results) > 1:
            if not author.get('orcid'):
                warning = "β οΈ Multiple matches, no ORCID"
            else:
                similar_names = [r.get('display_name', '') for r in results[1:]
                                 if r.get('display_name', '').lower() == display_name.lower()]
                if similar_names:
                    warning = f"β οΈ {len(similar_names)+1} exact name matches"
        if display_name.lower() != cleaned_input.lower():
            if warning:
                warning += f" | Matched to: {display_name}"
            else:
                warning = f"β οΈ Matched to: {display_name}"

    # Extract metrics (identical for all input types). Every branch above
    # either returned early or set `author` and `display_name`, so the old
    # trailing "if not author" fallback and locals() check were dead code.
    summary_stats = author.get('summary_stats', {})
    two_yr_mean = summary_stats.get('2yr_mean_citedness')
    topics = author.get('topics', [])
    last_institutions = author.get('last_known_institutions', [])
    institution_names = [inst.get('display_name', '') for inst in last_institutions] if last_institutions else []

    return {
        'Name': display_name,
        'ORCID': author.get('orcid', ''),
        'H-Index': summary_stats.get('h_index'),
        'Works Count': author.get('works_count'),
        'Cited By Count': author.get('cited_by_count'),
        # `is not None` so a legitimate 0.0 is reported instead of blanked out.
        '2yr Mean Citedness': round(two_yr_mean, 2) if two_yr_mean is not None else None,
        'i10 Index': summary_stats.get('i10_index'),
        'Top Topic': topics[0].get('display_name') if topics else None,
        'Top Topic Count': topics[0].get('count') if topics else None,
        'Top 5 Journals': get_top_journals(author),
        'Last Known Institution': ", ".join(institution_names) if institution_names else None,
        'Warning': warning if input_type == 'name' and warning else None
    }
def process_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Process a dataframe of authors and return a dataframe of result rows.

    Expects a 'Name' column; an optional 'Institution_Hint' column supplies
    per-row hints for name disambiguation. Rows with a blank name are
    skipped, so the result may have fewer rows than the input.
    """
    results = []
    progress_bar = st.progress(0)
    status_text = st.empty()
    total = len(df)
    # Use a positional counter, not the frame's index labels: on a filtered
    # or re-indexed frame `idx + 1` could exceed len(df) and push the
    # progress value past 1.0. (Series.get also returns None for a missing
    # 'Institution_Hint', so we no longer mutate the caller's df by adding
    # the column.)
    for pos, (_, row) in enumerate(df.iterrows(), start=1):
        name = row['Name']
        hint = row.get('Institution_Hint')
        if pd.notna(name) and str(name).strip():
            status_text.text(f"Processing {pos}/{total}: {name}")
            result = process_author(
                str(name).strip(),
                str(hint).strip() if pd.notna(hint) else None
            )
            results.append(result)
            # Rate limiting: stay inside OpenAlex's polite-pool expectations.
            time.sleep(RATE_LIMIT_DELAY)
        progress_bar.progress(pos / total)
    status_text.text("β Processing complete!")
    return pd.DataFrame(results)
def display_results(results_df: pd.DataFrame):
    """Render the results table, four summary metrics, and a CSV download button."""
    st.subheader("π Results")
    st.dataframe(results_df, use_container_width=True)

    # Batch-level statistics.
    total = len(results_df)
    metric_cols = st.columns(4)
    with metric_cols[0]:
        found = results_df['H-Index'].notna().sum()
        st.metric("Found", f"{found}/{total}")
    with metric_cols[1]:
        mean_h = results_df['H-Index'].mean()
        st.metric("Avg H-Index", f"{mean_h:.1f}" if pd.notna(mean_h) else "N/A")
    with metric_cols[2]:
        orcid_count = results_df['ORCID'].notna().sum()
        st.metric("With ORCID", f"{orcid_count}/{total}")
    with metric_cols[3]:
        st.metric("Warnings", results_df['Warning'].notna().sum())

    # Offer the full table for download.
    st.download_button(
        label="π₯ Download Results as CSV",
        data=results_df.to_csv(index=False),
        file_name="openalex_results.csv",
        mime="text/csv",
        type="primary"
    )
# ============================================================================
# MAIN APP
# ============================================================================

# Page title and one-line description shown above the input tabs.
st.title("π OpenAlex H-Index Lookup Tool")
st.markdown("""
Batch lookup h-indices and publication metrics for researchers using the OpenAlex API.
""")
# Sidebar: usage instructions, data-source credits, and a one-click smoke test.
with st.sidebar:
    st.header("βΉοΈ How to Use")
    st.markdown("""
    1. **Choose input method:**
       - Upload CSV file
       - Paste CSV data
       - Run test with sample data
    2. **CSV format:**
       - `Name` column (required) - accepts:
         - Author names (e.g., "John Smith")
         - ORCID IDs (e.g., "0000-0002-1825-0097")
         - OpenAlex IDs (e.g., "A5023888391")
       - `Institution_Hint` column (optional)
    3. **Click Process** to retrieve data
    4. **Download** results as CSV

    **Tips:**
    - Mix names and IDs in the same file!
    - Institution hints improve name matching
    - ORCIDs and OpenAlex IDs = 100% accurate
    - Processing ~6-7 authors per second
    """)
    st.divider()
    st.markdown("**Data source:** [OpenAlex](https://openalex.org)")
    st.markdown("**Rate limit:** ~0.15s per author")

    # Test Mode Button: runs the full pipeline on three well-known names so a
    # user can verify the app works before uploading their own data.
    st.subheader("π§ͺ Quick Test")
    if st.button("Run with Sample Data", help="Test with Einstein, Curie, and Newton"):
        test_data = pd.DataFrame({
            'Name': ['Albert Einstein', 'Marie Curie', 'Isaac Newton'],
            'Institution_Hint': ['Princeton', 'Paris', 'Cambridge']
        })
        with st.spinner("Processing sample data..."):
            results_df = process_dataframe(test_data)
        display_results(results_df)
st.divider()

# Main Input Section with Tabs: upload, paste, or grab a template.
st.subheader("π Input Your Data")
tab1, tab2, tab3 = st.tabs(["π€ Upload CSV", "π Paste CSV", "π₯ Download Template"])

# Tab 1: upload a CSV file from disk.
with tab1:
    st.markdown("Upload a CSV file with author names:")
    uploaded_file = st.file_uploader(
        "Choose a CSV file",
        type=['csv'],
        help="CSV must have a 'Name' column. 'Institution_Hint' is optional.",
        key="csv_uploader"
    )
    if uploaded_file is not None:
        try:
            df = pd.read_csv(uploaded_file)
            # Validate the required column before doing any network work.
            if 'Name' not in df.columns:
                st.error("β CSV must have a 'Name' column")
            else:
                st.success(f"β Loaded {len(df)} names")
                # Preview the first rows so the user can sanity-check parsing.
                with st.expander("π Preview uploaded data"):
                    st.dataframe(df.head(10))
                # Process button — lookups start only on explicit click.
                if st.button("π Process Authors", type="primary", key="process_upload"):
                    results_df = process_dataframe(df)
                    display_results(results_df)
        except Exception as e:
            st.error(f"Error reading file: {str(e)}")
# Tab 2: paste raw CSV text (fallback for when file upload is blocked).
with tab2:
    st.markdown("Paste CSV data directly (useful if file upload doesn't work):")
    csv_text = st.text_area(
        "Paste your CSV data here:",
        height=200,
        placeholder="Name,Institution_Hint\nAlbert Einstein,Princeton\nMarie Curie,Paris\nJohn Smith,MIT",
        help="Include headers in first row. Separate columns with commas."
    )
    if st.button("π Process Pasted Data", type="primary", key="process_paste") and csv_text:
        try:
            df = pd.read_csv(StringIO(csv_text))
            # Validate the required column before doing any network work.
            if 'Name' not in df.columns:
                st.error("β CSV must have a 'Name' column")
            else:
                st.success(f"β Parsed {len(df)} names")
                # Preview the parsed frame so CSV mistakes are caught early.
                with st.expander("π Preview pasted data"):
                    st.dataframe(df.head(10))
                # Process immediately (unlike the upload tab, no second button).
                results_df = process_dataframe(df)
                display_results(results_df)
        except Exception as e:
            st.error(f"Error parsing CSV: {str(e)}")
            st.info("Make sure your data is in valid CSV format with headers.")
# Tab 3: downloadable starter template demonstrating all accepted ID formats.
with tab3:
    st.markdown("Download a template CSV to get started:")
    st.info("π‘ **Pro tip:** You can mix names, ORCIDs, and OpenAlex IDs in the same file!")
    example_df = pd.DataFrame({
        'Name': [
            'Albert Einstein',
            '0000-0002-1825-0097',  # Example ORCID
            'A5023888391',          # Example OpenAlex ID
            'Marie Curie'
        ],
        'Institution_Hint': ['Princeton', 'Optional', 'Optional', 'Paris']
    })
    st.dataframe(example_df)
    st.markdown("""
    **Accepted formats in Name column:**
    - Regular name: `John Smith`
    - ORCID: `0000-0002-1825-0097` or `https://orcid.org/0000-0002-1825-0097`
    - OpenAlex ID: `A5023888391` or `https://openalex.org/A5023888391`
    """)
    template_csv = example_df.to_csv(index=False)
    st.download_button(
        label="π₯ Download Template CSV",
        data=template_csv,
        file_name="openalex_template.csv",
        mime="text/csv",
        help="Download this template and fill in with your data"
    )