Spaces:

INLEXIO
/

openalex-search

Sleeping

App Files Community

INLEXIO committed on Oct 20, 2025

Commit

b579bd3

verified ·

1 Parent(s): 9542306

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +159 -25

src/streamlit_app.py CHANGED Viewed

@@ -263,6 +263,10 @@ def process_works_to_author_profiles(works, topic_filter=None, journal_filter=No
 def transform_openalex_api_to_excel_format(api_work):
     """Convert OpenAlex API format to match Excel export format"""
     # Extract primary topic
     primary_topic = None
@@ -275,28 +279,31 @@ def transform_openalex_api_to_excel_format(api_work):
             'subfield': {
                 'id': topic.get('subfield', {}).get('id', '').split('/')[-1] if topic.get('subfield', {}).get('id') else '',
                 'display_name': topic.get('subfield', {}).get('display_name', '')
-            },
             'field': {
                 'id': topic.get('field', {}).get('id', '').split('/')[-1] if topic.get('field', {}).get('id') else '',
                 'display_name': topic.get('field', {}).get('display_name', '')
-            },
             'domain': {
                 'id': topic.get('domain', {}).get('id', '').split('/')[-1] if topic.get('domain', {}).get('id') else '',
                 'display_name': topic.get('domain', {}).get('display_name', '')
-            }
         }
     # Extract authorships
     authorships = []
     for authorship in api_work.get('authorships', []):
         author = authorship.get('author', {})
         # Extract countries from institutions
         countries = []
         for institution in authorship.get('institutions', []):
-            country_code = institution.get('country_code', '')
-            if country_code:
-                countries.append(country_code)
         # Remove duplicates
         countries = list(set(countries))
@@ -310,6 +317,12 @@ def transform_openalex_api_to_excel_format(api_work):
             'countries': countries
         })
     # Build simplified work object
     return {
         'doi': api_work.get('doi', ''),
@@ -318,11 +331,11 @@ def transform_openalex_api_to_excel_format(api_work):
         'cited_by_count': api_work.get('cited_by_count', 0),
         'primary_location': {
             'source': {
-                'display_name': api_work.get('primary_location', {}).get('source', {}).get('display_name', '')
             }
         },
         'biblio': {
-            'issue': api_work.get('biblio', {}).get('issue', '')
         },
         'primary_topic': primary_topic,
         'mesh': [],
@@ -358,19 +371,11 @@ with st.sidebar:
     st.markdown("---")
     st.subheader("⚙️ API Settings")
-    user_email = st.text_input(
-        "Your Email (Optional)",
-        placeholder="your@email.com",
-        help="Add your email to join OpenAlex's 'polite pool' for faster API access"
-    )
-    # Store email in session state
-    if user_email:
-        st.session_state.user_email = user_email
-    elif 'user_email' not in st.session_state:
-        st.session_state.user_email = "research@example.com"  # Default fallback
-    st.caption("💡 Adding your email gets you 100,000 requests/day instead of 100,000/day throttled")
 # Initialize session state for works
 if 'works' not in st.session_state:
@@ -546,6 +551,12 @@ with tab3:
             placeholder="e.g., machine learning",
             help="Search by topic or keyword"
         )
     with col2:
         api_year_from = st.number_input(
@@ -564,17 +575,85 @@ with tab3:
             help="End year for publication range"
         )
         api_max_results = st.number_input(
             "Maximum Results",
             min_value=100,
-            max_value=10000,
             value=1000,
             step=100,
-            help="Maximum number of works to retrieve"
         )
     if st.button("🔍 Search OpenAlex API", type="primary"):
         try:
             # Build API query
             filters = []
@@ -590,10 +669,25 @@ with tab3:
             if api_year_from and api_year_to:
                 filters.append(f'publication_year:{api_year_from}-{api_year_to}')
             # Add article/review/letter filter
             filters.append('type:article|review|letter')
-            if not filters:
                 st.warning("Please enter at least one search criterion")
                 st.stop()
@@ -601,6 +695,18 @@ with tab3:
             filter_string = ','.join(filters)
             api_url = f"https://api.openalex.org/works?filter={filter_string}&per-page=200"
             st.info(f"📡 Searching OpenAlex API...")
             st.code(api_url, language=None)
@@ -626,7 +732,9 @@ with tab3:
                         break
                     for work in results:
-                        all_works.append(transform_openalex_api_to_excel_format(work))
                 progress_bar.progress(min(page / max_pages, 1.0))
@@ -642,14 +750,40 @@ with tab3:
             if all_works:
                 st.session_state.works = all_works
                 st.success(f"✅ Found {len(all_works):,} works from OpenAlex")
                 # Offer to download the JSON
                 json_data = json.dumps(all_works, indent=2)
                 st.download_button(
                     label="💾 Download Raw Data (JSON)",
                     data=json_data,
-                    file_name="openalex_api_search.json",
                     mime="application/json"
                 )
             else:

 def transform_openalex_api_to_excel_format(api_work):
     """Convert OpenAlex API format to match Excel export format"""
+    # Safety check
+    if not api_work:
+        return None
     # Extract primary topic
     primary_topic = None
             'subfield': {
                 'id': topic.get('subfield', {}).get('id', '').split('/')[-1] if topic.get('subfield', {}).get('id') else '',
                 'display_name': topic.get('subfield', {}).get('display_name', '')
+            } if topic.get('subfield') else {'id': '', 'display_name': ''},
             'field': {
                 'id': topic.get('field', {}).get('id', '').split('/')[-1] if topic.get('field', {}).get('id') else '',
                 'display_name': topic.get('field', {}).get('display_name', '')
+            } if topic.get('field') else {'id': '', 'display_name': ''},
             'domain': {
                 'id': topic.get('domain', {}).get('id', '').split('/')[-1] if topic.get('domain', {}).get('id') else '',
                 'display_name': topic.get('domain', {}).get('display_name', '')
+            } if topic.get('domain') else {'id': '', 'display_name': ''}
         }
     # Extract authorships
     authorships = []
     for authorship in api_work.get('authorships', []):
         author = authorship.get('author', {})
+        if not author:
+            continue
         # Extract countries from institutions
         countries = []
         for institution in authorship.get('institutions', []):
+            if institution:
+                country_code = institution.get('country_code', '')
+                if country_code:
+                    countries.append(country_code)
         # Remove duplicates
         countries = list(set(countries))
             'countries': countries
         })
+    # Safely extract primary location source
+    primary_location = api_work.get('primary_location', {})
+    source_name = ''
+    if primary_location and primary_location.get('source'):
+        source_name = primary_location['source'].get('display_name', '')
     # Build simplified work object
     return {
         'doi': api_work.get('doi', ''),
         'cited_by_count': api_work.get('cited_by_count', 0),
         'primary_location': {
             'source': {
+                'display_name': source_name
             }
         },
         'biblio': {
+            'issue': api_work.get('biblio', {}).get('issue', '') if api_work.get('biblio') else ''
         },
         'primary_topic': primary_topic,
         'mesh': [],
     st.markdown("---")
     st.subheader("⚙️ API Settings")
+    st.caption("💡 Email included in API requests for OpenAlex 'polite pool' (faster access)")
+    st.code("halozen@pm.me", language=None)
+    # Hardcoded email
+    st.session_state.user_email = "halozen@pm.me"
 # Initialize session state for works
 if 'works' not in st.session_state:
             placeholder="e.g., machine learning",
             help="Search by topic or keyword"
         )
+        api_journals = st.text_area(
+            "Journal(s)",
+            placeholder="Nature\nScience\nCell",
+            help="Enter journal names, one per line. Leave blank for all journals."
+        )
     with col2:
         api_year_from = st.number_input(
             help="End year for publication range"
         )
+        api_min_citations = st.number_input(
+            "Minimum Citations",
+            min_value=0,
+            max_value=10000,
+            value=0,
+            help="Filter works with at least this many citations"
+        )
+        api_max_citations = st.number_input(
+            "Maximum Citations",
+            min_value=0,
+            max_value=100000,
+            value=0,
+            help="Filter works with at most this many citations (0 = no limit)"
+        )
         api_max_results = st.number_input(
             "Maximum Results",
             min_value=100,
+            max_value=50000,
+            value=1000,
+            step=100,
+            help="Maximum number of works to retrieve (Warning: >10,000 may be slow)"
+        ):
+        api_author_name = st.text_input(
+            "Author Name",
+            placeholder="e.g., John Smith",
+            help="Search for works by a specific author"
+        )
+        api_institution = st.text_input(
+            "Institution",
+            placeholder="e.g., Harvard University",
+            help="Filter by institution/affiliation"
+        )
+        api_topic = st.text_input(
+            "Topic/Keyword",
+            placeholder="e.g., machine learning",
+            help="Search by topic or keyword"
+        )
+    with col2:
+        api_year_from = st.number_input(
+            "Publication Year From",
+            min_value=1900,
+            max_value=2025,
+            value=2020,
+            help="Start year for publication range"
+        )
+        api_year_to = st.number_input(
+            "Publication Year To",
+            min_value=1900,
+            max_value=2025,
+            value=2025,
+            help="End year for publication range"
+        )
+        api_max_results = st.number_input(
+            "Maximum Results",
+            min_value=100,
+            max_value=50000,
             value=1000,
             step=100,
+            help="Maximum number of works to retrieve (Warning: >10,000 may be slow)"
         )
+    # Warning for large requests
+    if api_max_results > 10000:
+        st.warning(f"⚠️ Requesting {api_max_results:,} results may take several minutes and could cause memory issues on free hosting.")
     if st.button("🔍 Search OpenAlex API", type="primary"):
         try:
+            from datetime import datetime
+            # Record search start time
+            search_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
             # Build API query
             filters = []
             if api_year_from and api_year_to:
                 filters.append(f'publication_year:{api_year_from}-{api_year_to}')
+            # Add citation filters
+            if api_min_citations > 0:
+                filters.append(f'cited_by_count:>{api_min_citations - 1}')
+            if api_max_citations > 0:
+                filters.append(f'cited_by_count:<{api_max_citations + 1}')
+            # Add journal filters
+            if api_journals:
+                journal_list = [j.strip() for j in api_journals.split('\n') if j.strip()]
+                if journal_list:
+                    # Use primary_location.source.display_name for each journal
+                    journal_filters = '|'.join([f'primary_location.source.display_name.search:{j}' for j in journal_list])
+                    filters.append(f'({journal_filters})')
             # Add article/review/letter filter
             filters.append('type:article|review|letter')
+            if not filters or filters == ['type:article|review|letter']:
                 st.warning("Please enter at least one search criterion")
                 st.stop()
             filter_string = ','.join(filters)
             api_url = f"https://api.openalex.org/works?filter={filter_string}&per-page=200"
+            # Store search parameters for summary
+            search_params = {
+                'author': api_author_name if api_author_name else 'Any',
+                'institution': api_institution if api_institution else 'Any',
+                'topic': api_topic if api_topic else 'Any',
+                'journals': journal_list if api_journals else ['Any'],
+                'years': f'{api_year_from}-{api_year_to}',
+                'min_citations': api_min_citations,
+                'max_citations': api_max_citations if api_max_citations > 0 else 'No limit',
+                'search_date': search_datetime
+            }
             st.info(f"📡 Searching OpenAlex API...")
             st.code(api_url, language=None)
                         break
                     for work in results:
+                        transformed = transform_openalex_api_to_excel_format(work)
+                        if transformed:  # Only add if transformation succeeded
+                            all_works.append(transformed)
                 progress_bar.progress(min(page / max_pages, 1.0))
             if all_works:
                 st.session_state.works = all_works
+                st.session_state.search_params = search_params  # Store search parameters
                 st.success(f"✅ Found {len(all_works):,} works from OpenAlex")
+                # Display search summary
+                st.markdown("---")
+                st.subheader("📊 Search Summary")
+                summary_col1, summary_col2 = st.columns(2)
+                with summary_col1:
+                    st.markdown(f"**Search Date:** {search_params['search_date']}")
+                    st.markdown(f"**Author:** {search_params['author']}")
+                    st.markdown(f"**Institution:** {search_params['institution']}")
+                    st.markdown(f"**Topic:** {search_params['topic']}")
+                with summary_col2:
+                    st.markdown(f"**Years:** {search_params['years']}")
+                    st.markdown(f"**Min Citations:** {search_params['min_citations']}")
+                    st.markdown(f"**Max Citations:** {search_params['max_citations']}")
+                    if search_params['journals'] != ['Any']:
+                        st.markdown(f"**Journals:** {', '.join(search_params['journals'][:3])}{'...' if len(search_params['journals']) > 3 else ''}")
+                    else:
+                        st.markdown(f"**Journals:** Any")
+                st.markdown(f"**Total Works Retrieved:** {len(all_works):,}")
+                st.markdown("---")
                 # Offer to download the JSON
                 json_data = json.dumps(all_works, indent=2)
                 st.download_button(
                     label="💾 Download Raw Data (JSON)",
                     data=json_data,
+                    file_name=f"openalex_api_search_{search_datetime.replace(':', '-').replace(' ', '_')}.json",
                     mime="application/json"
                 )
             else: