"""Streamlit app: batch lookup of h-indices and publication metrics via the OpenAlex API.

Accepts author names, ORCID iDs, or OpenAlex author IDs (mixed freely in one CSV),
re-ranks ambiguous name matches with an optional institution hint, and exports results.
"""

import streamlit as st
import pandas as pd
import urllib.request
import urllib.parse
import urllib.error
import json
import time
from typing import Dict, Optional, List
from io import StringIO

st.set_page_config(page_title="OpenAlex H-Index Lookup", page_icon="📚", layout="wide")

# Initialize session state to fix Hugging Face connection issues.
# NOTE(review): the one-shot st.rerun() forces a single immediate rerun on first
# load; kept as-is since it was added deliberately as a hosting workaround.
if 'initialized' not in st.session_state:
    st.session_state.initialized = True
    st.rerun()

# API Configuration
BASE_URL = "https://api.openalex.org"
RATE_LIMIT_DELAY = 0.15  # seconds between requests (polite-pool friendly)
POLITE_EMAIL = "halozen@pm.me"  # identifies us to OpenAlex's polite pool


def get_author_by_id(author_id: str) -> Optional[Dict]:
    """Fetch author information by OpenAlex ID.

    Args:
        author_id: OpenAlex author ID, with or without the leading 'A'.

    Returns:
        The author record as a dict, or None on any failure (a Streamlit
        warning is shown instead of raising).
    """
    if not author_id.upper().startswith('A'):
        author_id = f"A{author_id}"

    params = urllib.parse.urlencode({'mailto': POLITE_EMAIL})
    url = f"{BASE_URL}/authors/{author_id}?{params}"

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            if response.status == 200:
                data = response.read()
                return json.loads(data.decode('utf-8'))
    except Exception as e:
        st.warning(f"Error fetching author {author_id}: {str(e)}")
    return None


def _matches_hint(hint_lower: str, display_name: str, country_code: str) -> bool:
    """Return True if an institution name/country matches the affiliation hint.

    Matches in either direction (hint in name, or name in hint) so that both
    'MIT' vs 'Massachusetts Institute of Technology' style mismatches and
    country-code hints work.
    """
    name_lower = (display_name or '').lower()
    country_lower = (country_code or '').lower()
    return (
        hint_lower in name_lower
        or hint_lower in country_lower
        or name_lower in hint_lower
    )


def search_author_by_name(name: str, affiliation_hint: Optional[str] = None,
                          max_results: int = 5) -> List[Dict]:
    """Search for authors by name, using affiliation hint to re-rank results.

    Over-fetches (4x max_results) so that re-ranking has candidates to work
    with, then sorts by (affiliation match, has ORCID, works count) descending.

    Args:
        name: Author name to search for.
        affiliation_hint: Optional institution name or country code to prefer.
        max_results: Maximum number of candidates to return.

    Returns:
        Up to max_results author records, best match first; [] on failure.
    """
    params = {
        'search': name,
        'per-page': max_results * 4,  # over-fetch so re-ranking can promote matches
        'mailto': POLITE_EMAIL
    }
    url = f"{BASE_URL}/authors?{urllib.parse.urlencode(params)}"

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            if response.status == 200:
                data = response.read()
                json_data = json.loads(data.decode('utf-8'))
                results = json_data.get('results', [])

                def sort_key(author):
                    has_orcid = 1 if author.get('orcid') else 0
                    works_count = author.get('works_count', 0)
                    affiliation_match = 0

                    if affiliation_hint:
                        hint_lower = affiliation_hint.lower()

                        # First try the most recent institutions.
                        for inst in author.get('last_known_institutions', []) or []:
                            if inst and _matches_hint(
                                hint_lower,
                                inst.get('display_name', ''),
                                inst.get('country_code', ''),
                            ):
                                affiliation_match = 1
                                break

                        # Fall back to the full affiliation history.
                        if affiliation_match == 0:
                            for aff in author.get('affiliations', []) or []:
                                if not aff:
                                    continue
                                inst = aff.get('institution', {}) or {}
                                if _matches_hint(
                                    hint_lower,
                                    inst.get('display_name', ''),
                                    inst.get('country_code', ''),
                                ):
                                    affiliation_match = 1
                                    break

                    return (affiliation_match, has_orcid, works_count)

                results.sort(key=sort_key, reverse=True)
                return results[:max_results]
    except Exception as e:
        st.warning(f"Error searching {name}: {str(e)}")
    return []


def get_top_journals(author_data: Dict, max_journals: int = 5) -> str:
    """Get the top journals where the author has published most frequently.

    Uses the OpenAlex works group_by facet on primary_location.source.id,
    filtered to journal-type sources.

    Args:
        author_data: An OpenAlex author record (must contain 'id').
        max_journals: How many journals to include.

    Returns:
        Comma-separated "Journal Name (count)" entries, or "N/A".
    """
    if not author_data or 'id' not in author_data:
        return "N/A"

    author_id = author_data['id']
    params = urllib.parse.urlencode({
        'filter': f'authorships.author.id:{author_id},primary_location.source.type:journal',
        'group_by': 'primary_location.source.id',
        'mailto': POLITE_EMAIL
    })
    url = f"{BASE_URL}/works?{params}"

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            if response.status == 200:
                data = response.read()
                json_data = json.loads(data.decode('utf-8'))
                group_by_results = json_data.get('group_by', [])

                journals = []
                for item in group_by_results[:max_journals]:
                    key_display_name = item.get('key_display_name')
                    count = item.get('count', 0)
                    # OpenAlex uses the literal string 'unknown' for unresolved sources.
                    if key_display_name and key_display_name != 'unknown':
                        journals.append(f"{key_display_name} ({count})")

                return ", ".join(journals) if journals else "N/A"
    except Exception as e:
        st.warning(f"Error fetching journals: {str(e)}")
    return "N/A"


def detect_input_type(input_str: str) -> tuple:
    """
    Detect if input is a name, ORCID, or OpenAlex ID.

    Returns: (type, cleaned_value) where type is 'name', 'orcid', or 'openalex_id'
    """
    input_str = input_str.strip()

    # Check for ORCID format: 0000-0000-0000-0000 or URLs
    if 'orcid.org/' in input_str.lower():
        # Extract ORCID from URL
        orcid = input_str.split('orcid.org/')[-1].strip('/')
        return ('orcid', orcid)

    # Raw ORCID format: 0000-0002-1825-0097. The ORCID checksum character
    # may be 'X', so the last digit is checked separately (fixes rejection
    # of valid ORCIDs such as 0000-0002-1694-233X).
    bare = input_str.replace('-', '')
    if len(bare) == 16 and bare[:15].isdigit() and (bare[15].isdigit() or bare[15] in 'Xx'):
        return ('orcid', input_str)

    # Check for OpenAlex ID format: A1234567890 or URLs
    if 'openalex.org/A' in input_str or 'openalex.org/authors/A' in input_str:
        # Extract ID from URL
        openalex_id = input_str.split('/')[-1].strip()
        if openalex_id.startswith('A'):
            return ('openalex_id', openalex_id)
    elif input_str.upper().startswith('A') and len(input_str) > 5 and input_str[1:].isdigit():
        # Raw OpenAlex ID format: A5023888391
        return ('openalex_id', input_str.upper())

    # Otherwise treat as a name
    return ('name', input_str)


def get_author_by_orcid(orcid: str) -> Optional[Dict]:
    """Fetch author information by ORCID.

    Args:
        orcid: Bare ORCID or full orcid.org URL.

    Returns:
        The first matching author record, or None if not found / on error.
    """
    # Clean ORCID: strip any URL prefix.
    orcid = orcid.replace('https://orcid.org/', '').replace('http://orcid.org/', '').strip('/')

    params = urllib.parse.urlencode({
        'filter': f'orcid:{orcid}',
        'mailto': POLITE_EMAIL
    })
    url = f"{BASE_URL}/authors?{params}"

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            if response.status == 200:
                data = response.read()
                json_data = json.loads(data.decode('utf-8'))
                results = json_data.get('results', [])
                if results:
                    return results[0]
    except Exception as e:
        st.warning(f"Error fetching ORCID {orcid}: {str(e)}")
    return None


def _empty_result(name, orcid, warning) -> Dict:
    """Build a result row with all metric fields blank (author not resolved)."""
    return {
        'Name': name,
        'ORCID': orcid,
        'H-Index': None,
        'Works Count': None,
        'Cited By Count': None,
        '2yr Mean Citedness': None,
        'i10 Index': None,
        'Top Topic': None,
        'Top Topic Count': None,
        'Top 5 Journals': None,
        'Last Known Institution': None,
        'Warning': warning
    }


def process_author(name_or_id: str, hint: Optional[str] = None) -> Dict:
    """
    Process a single author and return their data.

    Accepts: author name, ORCID, or OpenAlex ID

    Args:
        name_or_id: Author name, ORCID, or OpenAlex ID.
        hint: Optional institution hint used only for name searches.

    Returns:
        A dict of display-ready metrics; blank metrics with a 'Warning'
        message when the author could not be resolved.
    """
    # Detect what type of input we have
    input_type, cleaned_input = detect_input_type(name_or_id)

    author = None
    display_name = name_or_id
    warning = ""

    if input_type == 'orcid':
        # Look up by ORCID
        author = get_author_by_orcid(cleaned_input)
        if not author:
            return _empty_result(name_or_id, cleaned_input,
                                 f'ORCID not found: {cleaned_input}')
        display_name = author.get('display_name', name_or_id)

    elif input_type == 'openalex_id':
        # Look up by OpenAlex ID
        author = get_author_by_id(cleaned_input)
        if not author:
            return _empty_result(name_or_id, None,
                                 f'OpenAlex ID not found: {cleaned_input}')
        display_name = author.get('display_name', name_or_id)

    else:  # input_type == 'name'
        results = search_author_by_name(cleaned_input, affiliation_hint=hint, max_results=3)
        if not results:
            return _empty_result(cleaned_input, None, 'Not found')

        author = results[0]
        display_name = author.get('display_name', cleaned_input)

        # Check for disambiguation issues (only for name searches)
        if len(results) > 1:
            if not author.get('orcid'):
                warning = "⚠️ Multiple matches, no ORCID"
            else:
                similar_names = [
                    r.get('display_name', '') for r in results[1:]
                    if r.get('display_name', '').lower() == display_name.lower()
                ]
                if similar_names:
                    warning = f"⚠️ {len(similar_names)+1} exact name matches"

        if display_name.lower() != cleaned_input.lower():
            if warning:
                warning += f" | Matched to: {display_name}"
            else:
                warning = f"⚠️ Matched to: {display_name}"

    # Extract data (same for all input types)
    summary_stats = author.get('summary_stats', {})
    h_index = summary_stats.get('h_index')
    works_count = author.get('works_count')
    cited_by_count = author.get('cited_by_count')
    two_yr_mean = summary_stats.get('2yr_mean_citedness')
    i10_index = summary_stats.get('i10_index')
    orcid = author.get('orcid', '')

    topics = author.get('topics', [])
    top_topic_name = topics[0].get('display_name') if topics else None
    top_topic_count = topics[0].get('count') if topics else None

    top_journals = get_top_journals(author)

    last_institutions = author.get('last_known_institutions', [])
    institution_names = (
        [inst.get('display_name', '') for inst in last_institutions]
        if last_institutions else []
    )
    last_institution = ", ".join(institution_names) if institution_names else None

    return {
        'Name': display_name,
        'ORCID': orcid,
        'H-Index': h_index,
        'Works Count': works_count,
        'Cited By Count': cited_by_count,
        # Compare against None (not truthiness) so a real 0.0 is kept.
        '2yr Mean Citedness': round(two_yr_mean, 2) if two_yr_mean is not None else None,
        'i10 Index': i10_index,
        'Top Topic': top_topic_name,
        'Top Topic Count': top_topic_count,
        'Top 5 Journals': top_journals,
        'Last Known Institution': last_institution,
        # Warnings are only meaningful for ambiguous name searches.
        'Warning': warning if input_type == 'name' and warning else None
    }


def process_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Process a dataframe of authors and return results.

    Expects a 'Name' column; 'Institution_Hint' is optional. Shows a
    progress bar and status text while working.
    """
    # Work on a copy so the caller's dataframe is not mutated.
    df = df.copy()

    # Add Institution_Hint column if it doesn't exist
    if 'Institution_Hint' not in df.columns:
        df['Institution_Hint'] = None

    results = []
    progress_bar = st.progress(0)
    status_text = st.empty()
    total = len(df)

    # Use a positional counter rather than the dataframe index: uploaded CSVs
    # can carry a non-default index, which would break the progress fraction.
    for pos, (_, row) in enumerate(df.iterrows(), start=1):
        name = row['Name']
        hint = row.get('Institution_Hint')

        if pd.notna(name) and str(name).strip():
            status_text.text(f"Processing {pos}/{total}: {name}")
            result = process_author(
                str(name).strip(),
                str(hint).strip() if pd.notna(hint) else None
            )
            results.append(result)
            # Rate limiting
            time.sleep(RATE_LIMIT_DELAY)

        progress_bar.progress(pos / total)

    status_text.text("✅ Processing complete!")
    return pd.DataFrame(results)


def display_results(results_df: pd.DataFrame):
    """Display results with statistics and download button."""
    st.subheader("📊 Results")
    st.dataframe(results_df, use_container_width=True)

    # Statistics
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        found = results_df['H-Index'].notna().sum()
        st.metric("Found", f"{found}/{len(results_df)}")
    with col2:
        avg_h = results_df['H-Index'].mean()
        st.metric("Avg H-Index", f"{avg_h:.1f}" if pd.notna(avg_h) else "N/A")
    with col3:
        with_orcid = results_df['ORCID'].notna().sum()
        st.metric("With ORCID", f"{with_orcid}/{len(results_df)}")
    with col4:
        warnings = results_df['Warning'].notna().sum()
        st.metric("Warnings", warnings)

    # Download button
    csv = results_df.to_csv(index=False)
    st.download_button(
        label="📥 Download Results as CSV",
        data=csv,
        file_name="openalex_results.csv",
        mime="text/csv",
        type="primary"
    )


# ============================================================================
# MAIN APP
# ============================================================================

st.title("📚 OpenAlex H-Index Lookup Tool")
st.markdown("""
Batch lookup h-indices and publication metrics for researchers using the OpenAlex API.
""")

# Sidebar
with st.sidebar:
    st.header("ℹ️ How to Use")
    st.markdown("""
    1. **Choose input method:**
       - Upload CSV file
       - Paste CSV data
       - Run test with sample data
    2. **CSV format:**
       - `Name` column (required) - accepts:
         - Author names (e.g., "John Smith")
         - ORCID IDs (e.g., "0000-0002-1825-0097")
         - OpenAlex IDs (e.g., "A5023888391")
       - `Institution_Hint` column (optional)
    3. **Click Process** to retrieve data
    4. **Download** results as CSV

    **Tips:**
    - Mix names and IDs in the same file!
    - Institution hints improve name matching
    - ORCIDs and OpenAlex IDs = 100% accurate
    - Processing ~6-7 authors per second
    """)
    st.divider()
    st.markdown("**Data source:** [OpenAlex](https://openalex.org)")
    st.markdown("**Rate limit:** ~0.15s per author")

# Test Mode Button
st.subheader("🧪 Quick Test")
if st.button("Run with Sample Data", help="Test with Einstein, Curie, and Newton"):
    test_data = pd.DataFrame({
        'Name': ['Albert Einstein', 'Marie Curie', 'Isaac Newton'],
        'Institution_Hint': ['Princeton', 'Paris', 'Cambridge']
    })
    with st.spinner("Processing sample data..."):
        results_df = process_dataframe(test_data)
        display_results(results_df)

st.divider()

# Main Input Section with Tabs
st.subheader("📋 Input Your Data")
tab1, tab2, tab3 = st.tabs(["📤 Upload CSV", "📝 Paste CSV", "📥 Download Template"])

with tab1:
    st.markdown("Upload a CSV file with author names:")
    uploaded_file = st.file_uploader(
        "Choose a CSV file",
        type=['csv'],
        help="CSV must have a 'Name' column. 'Institution_Hint' is optional.",
        key="csv_uploader"
    )

    if uploaded_file is not None:
        try:
            df = pd.read_csv(uploaded_file)

            # Validate columns
            if 'Name' not in df.columns:
                st.error("❌ CSV must have a 'Name' column")
            else:
                st.success(f"✅ Loaded {len(df)} names")

                # Preview
                with st.expander("📋 Preview uploaded data"):
                    st.dataframe(df.head(10))

                # Process button
                if st.button("🚀 Process Authors", type="primary", key="process_upload"):
                    results_df = process_dataframe(df)
                    display_results(results_df)
        except Exception as e:
            st.error(f"Error reading file: {str(e)}")

with tab2:
    st.markdown("Paste CSV data directly (useful if file upload doesn't work):")
    csv_text = st.text_area(
        "Paste your CSV data here:",
        height=200,
        placeholder="Name,Institution_Hint\nAlbert Einstein,Princeton\nMarie Curie,Paris\nJohn Smith,MIT",
        help="Include headers in first row. Separate columns with commas."
    )

    if st.button("🚀 Process Pasted Data", type="primary", key="process_paste") and csv_text:
        try:
            df = pd.read_csv(StringIO(csv_text))

            # Validate columns
            if 'Name' not in df.columns:
                st.error("❌ CSV must have a 'Name' column")
            else:
                st.success(f"✅ Parsed {len(df)} names")

                # Preview
                with st.expander("📋 Preview pasted data"):
                    st.dataframe(df.head(10))

                # Process
                results_df = process_dataframe(df)
                display_results(results_df)
        except Exception as e:
            st.error(f"Error parsing CSV: {str(e)}")
            st.info("Make sure your data is in valid CSV format with headers.")

with tab3:
    st.markdown("Download a template CSV to get started:")
    st.info("💡 **Pro tip:** You can mix names, ORCIDs, and OpenAlex IDs in the same file!")

    example_df = pd.DataFrame({
        'Name': [
            'Albert Einstein',
            '0000-0002-1825-0097',  # Example ORCID
            'A5023888391',          # Example OpenAlex ID
            'Marie Curie'
        ],
        'Institution_Hint': ['Princeton', 'Optional', 'Optional', 'Paris']
    })
    st.dataframe(example_df)

    st.markdown("""
    **Accepted formats in Name column:**
    - Regular name: `John Smith`
    - ORCID: `0000-0002-1825-0097` or `https://orcid.org/0000-0002-1825-0097`
    - OpenAlex ID: `A5023888391` or `https://openalex.org/A5023888391`
    """)

    template_csv = example_df.to_csv(index=False)
    st.download_button(
        label="📥 Download Template CSV",
        data=template_csv,
        file_name="openalex_template.csv",
        mime="text/csv",
        help="Download this template and fill in with your data"
    )