INLEXIO committed on
Commit
d8d5f96
Β·
verified Β·
1 Parent(s): e8c6cf8

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +390 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,392 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
1
  import streamlit as st
2
+ import requests
3
+ from sentence_transformers import SentenceTransformer
4
+ import numpy as np
5
+ from collections import defaultdict
6
+ import time
7
+
8
# Page config: must run before any other st.* call in the script.
st.set_page_config(
    page_title="OpenAlex Semantic Search",
    page_icon="πŸ”¬",  # NOTE(review): mojibake in source — presumably an emoji; confirm encoding
    layout="wide"
)
14
+
15
# Cache the model loading so the encoder is built once per server process.
@st.cache_resource
def load_model():
    """Load and cache the sentence-transformer encoder used for embeddings."""
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return encoder
20
+
21
@st.cache_data(ttl=3600)
def search_openalex_papers(query, num_results=50):
    """
    Search OpenAlex for works matching the query.

    Args:
        query: free-text search string.
        num_results: number of works to request in one page.

    Returns:
        List of work dicts from the API, or [] on any request failure
        (the error is surfaced to the UI via st.error).
    """
    base_url = "https://api.openalex.org/works"

    params = {
        "search": query,
        "per_page": num_results,
        # NOTE(review): OpenAlex documents the page-size parameter as
        # "per-page"; confirm the underscore spelling is honored by the API.
        "select": "id,title,abstract_inverted_index,authorships,publication_year,cited_by_count,display_name",
        "mailto": "user@example.com"  # Polite pool
    }

    try:
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        return data.get("results", [])
    except (requests.RequestException, ValueError) as e:
        # Narrowed from a bare `except Exception`: only network/HTTP failures
        # and malformed JSON are expected here; programming errors propagate.
        st.error(f"Error fetching papers: {str(e)}")
        return []
43
+
44
def reconstruct_abstract(inverted_index):
    """Rebuild a plain-text abstract from OpenAlex's inverted-index format.

    The inverted index maps each word to the list of positions where it
    occurs; emitting words in position order recovers the original text.
    Returns "" when no index is available.
    """
    if not inverted_index:
        return ""

    # Flatten to (position, word) pairs, then order by position only —
    # the stable sort preserves insertion order for any duplicate positions.
    pairs = [(pos, word)
             for word, positions in inverted_index.items()
             for pos in positions]
    pairs.sort(key=lambda pair: pair[0])
    return " ".join(word for _, word in pairs)
60
+
61
@st.cache_data(ttl=3600)
def get_author_details(author_id):
    """
    Fetch detailed author information from OpenAlex.

    Args:
        author_id: bare OpenAlex author ID (e.g. "A1234567890").

    Returns:
        Parsed JSON dict for the author, or None when the request fails —
        callers treat None as "metrics unavailable".
    """
    base_url = f"https://api.openalex.org/authors/{author_id}"

    params = {
        "mailto": "user@example.com"  # joins OpenAlex's polite pool
    }

    try:
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError):
        # Narrowed from `except Exception as e` (with `e` unused): only
        # network/HTTP failures and malformed JSON are swallowed here.
        return None
78
+
79
def calculate_semantic_similarity(query_embedding, paper_embeddings):
    """
    Calculate cosine similarity between the query and each paper embedding.

    Args:
        query_embedding: 1-D numpy array.
        paper_embeddings: 2-D numpy array, one row per paper.

    Returns:
        1-D numpy array of cosine similarities. Fix over the original:
        a zero-norm query or zero-norm paper row yields 0.0 instead of
        dividing by zero and producing NaN.
    """
    query_magnitude = np.linalg.norm(query_embedding)
    if query_magnitude == 0:
        # A zero query vector has no direction; nothing is similar to it.
        return np.zeros(len(paper_embeddings))
    query_norm = query_embedding / query_magnitude

    row_norms = np.linalg.norm(paper_embeddings, axis=1, keepdims=True)
    # Substitute 1.0 for zero norms so degenerate rows divide safely
    # (their dot product with the query is 0 anyway).
    safe_norms = np.where(row_norms == 0, 1.0, row_norms)
    paper_norms = paper_embeddings / safe_norms

    # Cosine similarity = dot product of unit vectors
    similarities = np.dot(paper_norms, query_norm)
    return similarities
90
+
91
def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
    """
    Extract authors from papers and rank them by a composite of:
    - semantic relevance (mean of their papers' similarity scores, 50%)
    - h-index (30%)
    - total citations, log-scaled (20%)

    Args:
        papers: list of OpenAlex work dicts.
        paper_scores: similarity score per paper, parallel to `papers`.
        model, query_embedding: unused here; kept so the caller's
            positional call signature remains compatible.
        min_papers: minimum relevant papers an author needs to be ranked.

    Returns:
        List of author summary dicts sorted by composite score, descending;
        [] when no author meets the `min_papers` threshold.
    """
    author_data = defaultdict(lambda: {
        'name': '',
        'id': '',
        'paper_scores': [],
        'paper_ids': [],
        'total_citations': 0,
        'works_count': 0,
        'h_index': 0,
        'institution': ''
    })

    # Collect author information from papers
    for paper, score in zip(papers, paper_scores):
        for authorship in paper.get('authorships', []):
            author = authorship.get('author', {})
            author_id = author.get('id', '').split('/')[-1] if author.get('id') else None

            # OpenAlex author IDs start with 'A'; skip malformed entries.
            if author_id and author_id.startswith('A'):
                author_data[author_id]['name'] = author.get('display_name', 'Unknown')
                author_data[author_id]['id'] = author_id
                author_data[author_id]['paper_scores'].append(score)
                author_data[author_id]['paper_ids'].append(paper.get('id', ''))

                # Get institution (keep the first one seen for this author)
                institutions = authorship.get('institutions', [])
                if institutions and not author_data[author_id]['institution']:
                    author_data[author_id]['institution'] = institutions[0].get('display_name', '')

    # Filter authors with minimum paper count
    filtered_authors = {
        aid: data for aid, data in author_data.items()
        if len(data['paper_scores']) >= min_papers
    }

    # Fix: bail out early so the UI doesn't show a spinner and an empty
    # progress bar when no author qualifies (same [] result as before).
    if not filtered_authors:
        return []

    # Fetch detailed metrics for each author
    with st.spinner(f"Fetching metrics for {len(filtered_authors)} authors..."):
        progress_bar = st.progress(0)
        for idx, (author_id, data) in enumerate(filtered_authors.items()):
            author_details = get_author_details(author_id)
            if author_details:
                data['h_index'] = author_details.get('summary_stats', {}).get('h_index', 0)
                data['total_citations'] = author_details.get('cited_by_count', 0)
                data['works_count'] = author_details.get('works_count', 0)

            progress_bar.progress((idx + 1) / len(filtered_authors))
            time.sleep(0.1)  # Rate limiting between author API calls

        progress_bar.empty()

    # Calculate composite score for ranking
    ranked_authors = []
    for author_id, data in filtered_authors.items():
        avg_relevance = np.mean(data['paper_scores'])

        # Normalize metrics (using log scale for citations)
        normalized_h_index = data['h_index'] / 100.0  # Assume max h-index of 100
        normalized_citations = np.log1p(data['total_citations']) / 15.0  # Log scale

        # Composite score: weighted combination
        composite_score = (
            0.5 * avg_relevance +        # 50% semantic relevance
            0.3 * normalized_h_index +   # 30% h-index
            0.2 * normalized_citations   # 20% citations
        )

        ranked_authors.append({
            'author_id': author_id,
            'name': data['name'],
            'institution': data['institution'],
            'h_index': data['h_index'],
            'total_citations': data['total_citations'],
            'works_count': data['works_count'],
            'num_relevant_papers': len(data['paper_scores']),
            'avg_relevance_score': avg_relevance,
            'composite_score': composite_score,
            'openalex_url': f"https://openalex.org/{author_id}"
        })

    # Sort by composite score
    ranked_authors.sort(key=lambda x: x['composite_score'], reverse=True)

    return ranked_authors
180
+
181
def main():
    """Render the Streamlit UI: search OpenAlex for papers, rank them by
    semantic similarity to the query, rank their authors, and offer a CSV
    download of the author rankings.

    NOTE(review): emoji literals below appear mojibake'd in the source
    (e.g. "πŸ”¬") — presumably an encoding artifact of the scrape; confirm
    against the original file before relying on them.
    """
    st.title("πŸ”¬ OpenAlex Semantic Search")
    st.markdown("""
    Search for academic papers and discover top researchers using semantic search powered by OpenAlex.

    **How it works:**
    1. Enter your search terms (e.g., "machine learning for drug discovery")
    2. The app finds relevant papers using semantic similarity
    3. Authors are ranked by relevance, h-index, and citation metrics
    """)

    # Sidebar controls
    st.sidebar.header("Search Settings")

    # Page size passed through to the OpenAlex works request.
    num_papers = st.sidebar.slider(
        "Number of papers to fetch",
        min_value=20,
        max_value=100,
        value=50,
        step=10
    )

    top_papers_display = st.sidebar.slider(
        "Top papers to display",
        min_value=5,
        max_value=30,
        value=10,
        step=5
    )

    top_authors_display = st.sidebar.slider(
        "Top authors to display",
        min_value=5,
        max_value=50,
        value=20,
        step=5
    )

    min_papers_per_author = st.sidebar.slider(
        "Minimum papers per author",
        min_value=1,
        max_value=5,
        value=2,
        step=1,
        help="Minimum number of relevant papers an author must have to be included"
    )

    # Main search input
    query = st.text_input(
        "Enter your search query:",
        placeholder="e.g., 'graph neural networks for protein structure prediction'",
        help="Enter keywords or a description of what you're looking for"
    )

    search_button = st.button("πŸ” Search", type="primary")

    # Everything below runs only on an explicit search with a non-empty query.
    if search_button and query:
        # Load model (cached across reruns by @st.cache_resource)
        with st.spinner("Loading semantic model..."):
            model = load_model()

        # Search papers (cached for an hour by @st.cache_data)
        with st.spinner(f"Searching OpenAlex for papers about '{query}'..."):
            papers = search_openalex_papers(query, num_papers)

        if not papers:
            st.warning("No papers found. Try different search terms.")
            return

        st.success(f"Found {len(papers)} papers!")

        # Prepare papers for semantic search
        with st.spinner("Analyzing papers with semantic search..."):
            paper_texts = []
            valid_papers = []

            for paper in papers:
                title = paper.get('display_name', '') or paper.get('title', '')
                abstract = reconstruct_abstract(paper.get('abstract_inverted_index', {}))

                # Combine title and abstract (title weighted more)
                text = f"{title} {title} {abstract}"  # Title appears twice for emphasis

                # Drop works that have neither title nor abstract text.
                if text.strip():
                    paper_texts.append(text)
                    valid_papers.append(paper)

            if not paper_texts:
                st.error("No valid paper content found.")
                return

            # Generate embeddings
            query_embedding = model.encode(query, convert_to_tensor=False)
            paper_embeddings = model.encode(paper_texts, convert_to_tensor=False, show_progress_bar=True)

            # Calculate similarities
            similarities = calculate_semantic_similarity(query_embedding, paper_embeddings)

            # Sort papers by similarity, highest first
            sorted_indices = np.argsort(similarities)[::-1]
            sorted_papers = [valid_papers[i] for i in sorted_indices]
            sorted_scores = [similarities[i] for i in sorted_indices]

        # Display top papers
        st.header(f"πŸ“„ Top {top_papers_display} Most Relevant Papers")

        for idx, (paper, score) in enumerate(zip(sorted_papers[:top_papers_display], sorted_scores[:top_papers_display])):
            with st.expander(f"**{idx+1}. {paper.get('display_name', 'Untitled')}** (Relevance: {score:.3f})"):
                col1, col2 = st.columns([3, 1])

                with col1:
                    abstract = reconstruct_abstract(paper.get('abstract_inverted_index', {}))
                    if abstract:
                        # Truncate long abstracts to 500 chars with an ellipsis.
                        st.markdown(f"**Abstract:** {abstract[:500]}{'...' if len(abstract) > 500 else ''}")
                    else:
                        st.markdown("*No abstract available*")

                    # Authors (first five, with an ellipsis if more)
                    authors = [a.get('author', {}).get('display_name', 'Unknown')
                               for a in paper.get('authorships', [])]
                    if authors:
                        st.markdown(f"**Authors:** {', '.join(authors[:5])}{'...' if len(authors) > 5 else ''}")

                with col2:
                    st.metric("Year", paper.get('publication_year', 'N/A'))
                    st.metric("Citations", paper.get('cited_by_count', 0))

                    # Bare work ID (last URL segment) for the OpenAlex link.
                    paper_id = paper.get('id', '').split('/')[-1]
                    if paper_id:
                        st.markdown(f"[View on OpenAlex](https://openalex.org/{paper_id})")

        # Rank authors
        st.header(f"πŸ‘¨β€πŸ”¬ Top {top_authors_display} Researchers")

        ranked_authors = rank_authors(
            sorted_papers,
            sorted_scores,
            model,
            query_embedding,
            min_papers=min_papers_per_author
        )

        if not ranked_authors:
            st.warning(f"No authors found with at least {min_papers_per_author} relevant papers.")
            return

        # Display authors in a table-like layout
        st.markdown(f"Found {len(ranked_authors)} researchers with at least {min_papers_per_author} relevant papers.")

        for idx, author in enumerate(ranked_authors[:top_authors_display], 1):
            with st.container():
                col1, col2, col3, col4 = st.columns([3, 1, 1, 1])

                with col1:
                    st.markdown(f"**{idx}. [{author['name']}]({author['openalex_url']})**")
                    if author['institution']:
                        st.caption(author['institution'])

                with col2:
                    st.metric("H-Index", author['h_index'])

                with col3:
                    st.metric("Citations", f"{author['total_citations']:,}")

                with col4:
                    st.metric("Relevance", f"{author['avg_relevance_score']:.3f}")

                st.caption(f"Total works: {author['works_count']} | Relevant papers: {author['num_relevant_papers']}")
                st.divider()

        # Download results
        st.header("πŸ“₯ Download Results")

        # Prepare CSV data for authors (function-local imports, as in the original)
        import io
        import csv

        csv_buffer = io.StringIO()
        csv_writer = csv.writer(csv_buffer)

        # Write header
        csv_writer.writerow([
            'Rank', 'Name', 'Institution', 'H-Index', 'Total Citations',
            'Total Works', 'Relevant Papers', 'Avg Relevance Score', 'Composite Score', 'OpenAlex URL'
        ])

        # Write data: one row per ranked author (all of them, not just displayed)
        for idx, author in enumerate(ranked_authors, 1):
            csv_writer.writerow([
                idx,
                author['name'],
                author['institution'],
                author['h_index'],
                author['total_citations'],
                author['works_count'],
                author['num_relevant_papers'],
                f"{author['avg_relevance_score']:.4f}",
                f"{author['composite_score']:.4f}",
                author['openalex_url']
            ])

        csv_data = csv_buffer.getvalue()

        st.download_button(
            label="Download Author Rankings (CSV)",
            data=csv_data,
            # File name derived from the query, spaces underscored, capped at 30 chars.
            file_name=f"openalex_authors_{query.replace(' ', '_')[:30]}.csv",
            mime="text/csv"
        )
390
 
391
# Script entry point: run the Streamlit app.
if __name__ == "__main__":
    main()