INLEXIO committed on
Commit
b579bd3
·
verified ·
1 Parent(s): 9542306

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +159 -25
src/streamlit_app.py CHANGED
@@ -263,6 +263,10 @@ def process_works_to_author_profiles(works, topic_filter=None, journal_filter=No
263
 
264
  def transform_openalex_api_to_excel_format(api_work):
265
  """Convert OpenAlex API format to match Excel export format"""
 
 
 
 
266
 
267
  # Extract primary topic
268
  primary_topic = None
@@ -275,28 +279,31 @@ def transform_openalex_api_to_excel_format(api_work):
275
  'subfield': {
276
  'id': topic.get('subfield', {}).get('id', '').split('/')[-1] if topic.get('subfield', {}).get('id') else '',
277
  'display_name': topic.get('subfield', {}).get('display_name', '')
278
- },
279
  'field': {
280
  'id': topic.get('field', {}).get('id', '').split('/')[-1] if topic.get('field', {}).get('id') else '',
281
  'display_name': topic.get('field', {}).get('display_name', '')
282
- },
283
  'domain': {
284
  'id': topic.get('domain', {}).get('id', '').split('/')[-1] if topic.get('domain', {}).get('id') else '',
285
  'display_name': topic.get('domain', {}).get('display_name', '')
286
- }
287
  }
288
 
289
  # Extract authorships
290
  authorships = []
291
  for authorship in api_work.get('authorships', []):
292
  author = authorship.get('author', {})
 
 
293
 
294
  # Extract countries from institutions
295
  countries = []
296
  for institution in authorship.get('institutions', []):
297
- country_code = institution.get('country_code', '')
298
- if country_code:
299
- countries.append(country_code)
 
300
 
301
  # Remove duplicates
302
  countries = list(set(countries))
@@ -310,6 +317,12 @@ def transform_openalex_api_to_excel_format(api_work):
310
  'countries': countries
311
  })
312
 
 
 
 
 
 
 
313
  # Build simplified work object
314
  return {
315
  'doi': api_work.get('doi', ''),
@@ -318,11 +331,11 @@ def transform_openalex_api_to_excel_format(api_work):
318
  'cited_by_count': api_work.get('cited_by_count', 0),
319
  'primary_location': {
320
  'source': {
321
- 'display_name': api_work.get('primary_location', {}).get('source', {}).get('display_name', '')
322
  }
323
  },
324
  'biblio': {
325
- 'issue': api_work.get('biblio', {}).get('issue', '')
326
  },
327
  'primary_topic': primary_topic,
328
  'mesh': [],
@@ -358,19 +371,11 @@ with st.sidebar:
358
  st.markdown("---")
359
 
360
  st.subheader("⚙️ API Settings")
361
- user_email = st.text_input(
362
- "Your Email (Optional)",
363
- placeholder="your@email.com",
364
- help="Add your email to join OpenAlex's 'polite pool' for faster API access"
365
- )
366
-
367
- # Store email in session state
368
- if user_email:
369
- st.session_state.user_email = user_email
370
- elif 'user_email' not in st.session_state:
371
- st.session_state.user_email = "research@example.com" # Default fallback
372
 
373
- st.caption("💡 Adding your email gets you 100,000 requests/day instead of 100,000/day throttled")
 
374
 
375
  # Initialize session state for works
376
  if 'works' not in st.session_state:
@@ -546,6 +551,12 @@ with tab3:
546
  placeholder="e.g., machine learning",
547
  help="Search by topic or keyword"
548
  )
 
 
 
 
 
 
549
 
550
  with col2:
551
  api_year_from = st.number_input(
@@ -564,17 +575,85 @@ with tab3:
564
  help="End year for publication range"
565
  )
566
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
  api_max_results = st.number_input(
568
  "Maximum Results",
569
  min_value=100,
570
- max_value=10000,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
  value=1000,
572
  step=100,
573
- help="Maximum number of works to retrieve"
574
  )
575
 
 
 
 
 
576
  if st.button("🔍 Search OpenAlex API", type="primary"):
577
  try:
 
 
 
 
 
578
  # Build API query
579
  filters = []
580
 
@@ -590,10 +669,25 @@ with tab3:
590
  if api_year_from and api_year_to:
591
  filters.append(f'publication_year:{api_year_from}-{api_year_to}')
592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
  # Add article/review/letter filter
594
  filters.append('type:article|review|letter')
595
 
596
- if not filters:
597
  st.warning("Please enter at least one search criterion")
598
  st.stop()
599
 
@@ -601,6 +695,18 @@ with tab3:
601
  filter_string = ','.join(filters)
602
  api_url = f"https://api.openalex.org/works?filter={filter_string}&per-page=200"
603
 
 
 
 
 
 
 
 
 
 
 
 
 
604
  st.info(f"📡 Searching OpenAlex API...")
605
  st.code(api_url, language=None)
606
 
@@ -626,7 +732,9 @@ with tab3:
626
  break
627
 
628
  for work in results:
629
- all_works.append(transform_openalex_api_to_excel_format(work))
 
 
630
 
631
  progress_bar.progress(min(page / max_pages, 1.0))
632
 
@@ -642,14 +750,40 @@ with tab3:
642
 
643
  if all_works:
644
  st.session_state.works = all_works
 
 
645
  st.success(f"✅ Found {len(all_works):,} works from OpenAlex")
646
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
647
  # Offer to download the JSON
648
  json_data = json.dumps(all_works, indent=2)
649
  st.download_button(
650
  label="💾 Download Raw Data (JSON)",
651
  data=json_data,
652
- file_name="openalex_api_search.json",
653
  mime="application/json"
654
  )
655
  else:
 
263
 
264
  def transform_openalex_api_to_excel_format(api_work):
265
  """Convert OpenAlex API format to match Excel export format"""
266
+
267
+ # Safety check
268
+ if not api_work:
269
+ return None
270
 
271
  # Extract primary topic
272
  primary_topic = None
 
279
  'subfield': {
280
  'id': topic.get('subfield', {}).get('id', '').split('/')[-1] if topic.get('subfield', {}).get('id') else '',
281
  'display_name': topic.get('subfield', {}).get('display_name', '')
282
+ } if topic.get('subfield') else {'id': '', 'display_name': ''},
283
  'field': {
284
  'id': topic.get('field', {}).get('id', '').split('/')[-1] if topic.get('field', {}).get('id') else '',
285
  'display_name': topic.get('field', {}).get('display_name', '')
286
+ } if topic.get('field') else {'id': '', 'display_name': ''},
287
  'domain': {
288
  'id': topic.get('domain', {}).get('id', '').split('/')[-1] if topic.get('domain', {}).get('id') else '',
289
  'display_name': topic.get('domain', {}).get('display_name', '')
290
+ } if topic.get('domain') else {'id': '', 'display_name': ''}
291
  }
292
 
293
  # Extract authorships
294
  authorships = []
295
  for authorship in api_work.get('authorships', []):
296
  author = authorship.get('author', {})
297
+ if not author:
298
+ continue
299
 
300
  # Extract countries from institutions
301
  countries = []
302
  for institution in authorship.get('institutions', []):
303
+ if institution:
304
+ country_code = institution.get('country_code', '')
305
+ if country_code:
306
+ countries.append(country_code)
307
 
308
  # Remove duplicates
309
  countries = list(set(countries))
 
317
  'countries': countries
318
  })
319
 
320
+ # Safely extract primary location source
321
+ primary_location = api_work.get('primary_location', {})
322
+ source_name = ''
323
+ if primary_location and primary_location.get('source'):
324
+ source_name = primary_location['source'].get('display_name', '')
325
+
326
  # Build simplified work object
327
  return {
328
  'doi': api_work.get('doi', ''),
 
331
  'cited_by_count': api_work.get('cited_by_count', 0),
332
  'primary_location': {
333
  'source': {
334
+ 'display_name': source_name
335
  }
336
  },
337
  'biblio': {
338
+ 'issue': api_work.get('biblio', {}).get('issue', '') if api_work.get('biblio') else ''
339
  },
340
  'primary_topic': primary_topic,
341
  'mesh': [],
 
371
  st.markdown("---")
372
 
373
  st.subheader("⚙️ API Settings")
374
+ st.caption("💡 Email included in API requests for OpenAlex 'polite pool' (faster access)")
375
+ st.code("halozen@pm.me", language=None)
 
 
 
 
 
 
 
 
 
376
 
377
+ # Hardcoded email
378
+ st.session_state.user_email = "halozen@pm.me"
379
 
380
  # Initialize session state for works
381
  if 'works' not in st.session_state:
 
551
  placeholder="e.g., machine learning",
552
  help="Search by topic or keyword"
553
  )
554
+
555
+ api_journals = st.text_area(
556
+ "Journal(s)",
557
+ placeholder="Nature\nScience\nCell",
558
+ help="Enter journal names, one per line. Leave blank for all journals."
559
+ )
560
 
561
  with col2:
562
  api_year_from = st.number_input(
 
575
  help="End year for publication range"
576
  )
577
 
578
+ api_min_citations = st.number_input(
579
+ "Minimum Citations",
580
+ min_value=0,
581
+ max_value=10000,
582
+ value=0,
583
+ help="Filter works with at least this many citations"
584
+ )
585
+
586
+ api_max_citations = st.number_input(
587
+ "Maximum Citations",
588
+ min_value=0,
589
+ max_value=100000,
590
+ value=0,
591
+ help="Filter works with at most this many citations (0 = no limit)"
592
+ )
593
+
594
  api_max_results = st.number_input(
595
  "Maximum Results",
596
  min_value=100,
597
+ max_value=50000,
598
+ value=1000,
599
+ step=100,
600
+ help="Maximum number of works to retrieve (Warning: >10,000 may be slow)"
601
+ ):
602
+ api_author_name = st.text_input(
603
+ "Author Name",
604
+ placeholder="e.g., John Smith",
605
+ help="Search for works by a specific author"
606
+ )
607
+
608
+ api_institution = st.text_input(
609
+ "Institution",
610
+ placeholder="e.g., Harvard University",
611
+ help="Filter by institution/affiliation"
612
+ )
613
+
614
+ api_topic = st.text_input(
615
+ "Topic/Keyword",
616
+ placeholder="e.g., machine learning",
617
+ help="Search by topic or keyword"
618
+ )
619
+
620
+ with col2:
621
+ api_year_from = st.number_input(
622
+ "Publication Year From",
623
+ min_value=1900,
624
+ max_value=2025,
625
+ value=2020,
626
+ help="Start year for publication range"
627
+ )
628
+
629
+ api_year_to = st.number_input(
630
+ "Publication Year To",
631
+ min_value=1900,
632
+ max_value=2025,
633
+ value=2025,
634
+ help="End year for publication range"
635
+ )
636
+
637
+ api_max_results = st.number_input(
638
+ "Maximum Results",
639
+ min_value=100,
640
+ max_value=50000,
641
  value=1000,
642
  step=100,
643
+ help="Maximum number of works to retrieve (Warning: >10,000 may be slow)"
644
  )
645
 
646
+ # Warning for large requests
647
+ if api_max_results > 10000:
648
+ st.warning(f"⚠️ Requesting {api_max_results:,} results may take several minutes and could cause memory issues on free hosting.")
649
+
650
  if st.button("🔍 Search OpenAlex API", type="primary"):
651
  try:
652
+ from datetime import datetime
653
+
654
+ # Record search start time
655
+ search_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
656
+
657
  # Build API query
658
  filters = []
659
 
 
669
  if api_year_from and api_year_to:
670
  filters.append(f'publication_year:{api_year_from}-{api_year_to}')
671
 
672
+ # Add citation filters
673
+ if api_min_citations > 0:
674
+ filters.append(f'cited_by_count:>{api_min_citations - 1}')
675
+
676
+ if api_max_citations > 0:
677
+ filters.append(f'cited_by_count:<{api_max_citations + 1}')
678
+
679
+ # Add journal filters
680
+ if api_journals:
681
+ journal_list = [j.strip() for j in api_journals.split('\n') if j.strip()]
682
+ if journal_list:
683
+ # Use primary_location.source.display_name for each journal
684
+ journal_filters = '|'.join([f'primary_location.source.display_name.search:{j}' for j in journal_list])
685
+ filters.append(f'({journal_filters})')
686
+
687
  # Add article/review/letter filter
688
  filters.append('type:article|review|letter')
689
 
690
+ if not filters or filters == ['type:article|review|letter']:
691
  st.warning("Please enter at least one search criterion")
692
  st.stop()
693
 
 
695
  filter_string = ','.join(filters)
696
  api_url = f"https://api.openalex.org/works?filter={filter_string}&per-page=200"
697
 
698
+ # Store search parameters for summary
699
+ search_params = {
700
+ 'author': api_author_name if api_author_name else 'Any',
701
+ 'institution': api_institution if api_institution else 'Any',
702
+ 'topic': api_topic if api_topic else 'Any',
703
+ 'journals': journal_list if api_journals else ['Any'],
704
+ 'years': f'{api_year_from}-{api_year_to}',
705
+ 'min_citations': api_min_citations,
706
+ 'max_citations': api_max_citations if api_max_citations > 0 else 'No limit',
707
+ 'search_date': search_datetime
708
+ }
709
+
710
  st.info(f"📡 Searching OpenAlex API...")
711
  st.code(api_url, language=None)
712
 
 
732
  break
733
 
734
  for work in results:
735
+ transformed = transform_openalex_api_to_excel_format(work)
736
+ if transformed: # Only add if transformation succeeded
737
+ all_works.append(transformed)
738
 
739
  progress_bar.progress(min(page / max_pages, 1.0))
740
 
 
750
 
751
  if all_works:
752
  st.session_state.works = all_works
753
+ st.session_state.search_params = search_params # Store search parameters
754
+
755
  st.success(f"✅ Found {len(all_works):,} works from OpenAlex")
756
 
757
+ # Display search summary
758
+ st.markdown("---")
759
+ st.subheader("📊 Search Summary")
760
+
761
+ summary_col1, summary_col2 = st.columns(2)
762
+
763
+ with summary_col1:
764
+ st.markdown(f"**Search Date:** {search_params['search_date']}")
765
+ st.markdown(f"**Author:** {search_params['author']}")
766
+ st.markdown(f"**Institution:** {search_params['institution']}")
767
+ st.markdown(f"**Topic:** {search_params['topic']}")
768
+
769
+ with summary_col2:
770
+ st.markdown(f"**Years:** {search_params['years']}")
771
+ st.markdown(f"**Min Citations:** {search_params['min_citations']}")
772
+ st.markdown(f"**Max Citations:** {search_params['max_citations']}")
773
+ if search_params['journals'] != ['Any']:
774
+ st.markdown(f"**Journals:** {', '.join(search_params['journals'][:3])}{'...' if len(search_params['journals']) > 3 else ''}")
775
+ else:
776
+ st.markdown(f"**Journals:** Any")
777
+
778
+ st.markdown(f"**Total Works Retrieved:** {len(all_works):,}")
779
+ st.markdown("---")
780
+
781
  # Offer to download the JSON
782
  json_data = json.dumps(all_works, indent=2)
783
  st.download_button(
784
  label="💾 Download Raw Data (JSON)",
785
  data=json_data,
786
+ file_name=f"openalex_api_search_{search_datetime.replace(':', '-').replace(' ', '_')}.json",
787
  mime="application/json"
788
  )
789
  else: