evijit HF Staff commited on
Commit
f30a36e
·
1 Parent(s): 813c7cf

add paperverse code

Browse files
Files changed (3) hide show
  1. README.md +4 -4
  2. app.py +766 -66
  3. integrated_ml_taxonomy.json +488 -0
README.md CHANGED
@@ -1,14 +1,14 @@
1
  ---
2
- title: DataVerse Explorer
3
  emoji: 😻
4
- colorFrom: green
5
- colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.36.2
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
- short_description: Hub Organization Stats
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: PaperVerse Explorer
3
  emoji: 😻
4
+ colorFrom: red
5
+ colorTo: red
6
  sdk: gradio
7
  sdk_version: 5.36.2
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ short_description: Research Paper Stats on Hugging Face
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -2,46 +2,366 @@ import gradio as gr
2
  import pandas as pd
3
  import plotly.express as px
4
  import time
 
 
 
 
 
5
  from datasets import load_dataset
 
 
6
 
7
  # --- Constants ---
8
  TOP_K_CHOICES = list(range(5, 51, 5))
9
- HF_DATASET_ID = "evijit/dataverse_daily_data"
10
- TAG_FILTER_CHOICES = [
11
- "None", "Audio & Speech", "Time series", "Robotics", "Music",
12
- "Video", "Images", "Text", "Biomedical", "Sciences"
 
 
 
 
 
 
 
13
  ]
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def load_datasets_data():
16
- """Load the processed datasets data from the Hugging Face Hub."""
17
  start_time = time.time()
18
  print(f"Attempting to load dataset from Hugging Face Hub: {HF_DATASET_ID}")
19
  try:
20
- dataset_dict = load_dataset(HF_DATASET_ID)
21
- df = dataset_dict[list(dataset_dict.keys())[0]].to_pandas()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  msg = f"Successfully loaded dataset in {time.time() - start_time:.2f}s."
23
  print(msg)
24
  return df, True, msg
25
  except Exception as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  err_msg = f"Failed to load dataset. Error: {e}"
27
  print(err_msg)
28
  return pd.DataFrame(), False, err_msg
29
 
30
- def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_cats=None):
31
  """
32
  Filter data and prepare it for a multi-level treemap.
33
  - Preserves individual datasets for the top K organizations.
34
  - Groups all other organizations into a single "Other" category.
 
35
  """
36
  if df is None or df.empty:
37
  return pd.DataFrame()
38
 
39
  filtered_df = df.copy()
40
 
41
- col_map = {
42
- "Audio & Speech": "is_audio_speech", "Music": "has_music", "Robotics": "has_robot",
43
- "Biomedical": "is_biomed", "Time series": "has_series", "Sciences": "has_science",
44
- "Video": "has_video", "Images": "has_image", "Text": "has_text"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  }
46
 
47
  if tag_filter and tag_filter != "None" and tag_filter in col_map:
@@ -55,63 +375,209 @@ def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_cats=None):
55
  filtered_df[count_by] = 0.0
56
  filtered_df[count_by] = pd.to_numeric(filtered_df[count_by], errors='coerce').fillna(0.0)
57
 
58
- all_org_totals = filtered_df.groupby("organization")[count_by].sum()
59
- top_org_names = all_org_totals.nlargest(top_k, keep='first').index.tolist()
 
60
 
61
- top_orgs_df = filtered_df[filtered_df['organization'].isin(top_org_names)].copy()
62
- other_total = all_org_totals[~all_org_totals.index.isin(top_org_names)].sum()
63
-
64
- final_df_for_plot = top_orgs_df
65
-
66
- if other_total > 0:
67
- other_row = pd.DataFrame([{'organization': 'Other', 'id': 'Other', count_by: other_total}])
68
- final_df_for_plot = pd.concat([final_df_for_plot, other_row], ignore_index=True)
 
 
 
 
 
 
 
 
 
 
 
69
 
70
- if skip_cats and len(skip_cats) > 0:
71
- final_df_for_plot = final_df_for_plot[~final_df_for_plot['organization'].isin(skip_cats)]
72
 
73
- final_df_for_plot["root"] = "datasets"
74
- return final_df_for_plot
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- def create_treemap(treemap_data, count_by, title=None):
77
  """Generate the Plotly treemap figure from the prepared data."""
78
  if treemap_data.empty or treemap_data[count_by].sum() <= 0:
79
  fig = px.treemap(names=["No data matches filters"], parents=[""], values=[1])
80
  fig.update_layout(title="No data matches the selected filters", margin=dict(t=50, l=25, r=25, b=25))
81
  return fig
82
-
83
- fig = px.treemap(treemap_data, path=["root", "organization", "id"], values=count_by,
84
- title=title, color_discrete_sequence=px.colors.qualitative.Plotly)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
 
 
 
86
  fig.update_traces(
87
- textinfo="label+value+percent root",
88
- hovertemplate="<b>%{label}</b><br>%{value:,} " + count_by + "<br>%{percentRoot:.2%} of total<extra></extra>"
 
 
 
 
 
 
 
 
 
89
  )
90
  return fig
91
 
92
  # --- Gradio UI Blocks ---
93
- with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
 
 
 
 
 
 
 
 
 
 
94
  datasets_data_state = gr.State(pd.DataFrame())
95
  loading_complete_state = gr.State(False)
 
96
 
97
  with gr.Row():
98
- gr.Markdown("# 🤗 Dataverse Explorer")
99
 
100
- with gr.Row():
101
- with gr.Column(scale=1):
102
- count_by_dropdown = gr.Dropdown(label="Metric", choices=[("Downloads (last 30 days)", "downloads"), ("Downloads (All Time)", "downloadsAllTime"), ("Likes", "likes")], value="downloads")
103
- tag_filter_dropdown = gr.Dropdown(label="Filter by Tag", choices=TAG_FILTER_CHOICES, value="None")
104
- top_k_dropdown = gr.Dropdown(label="Number of Top Organizations", choices=TOP_K_CHOICES, value=25)
105
- skip_cats_textbox = gr.Textbox(label="Organizations to Skip from the plot", value="Other")
106
- generate_plot_button = gr.Button(value="Generate Plot", variant="primary", interactive=False)
107
-
108
- with gr.Column(scale=3):
109
- plot_output = gr.Plot()
110
- status_message_md = gr.Markdown("Initializing...")
111
- data_info_md = gr.Markdown("")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  def _update_button_interactivity(is_loaded_flag):
114
  return gr.update(interactive=is_loaded_flag)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  ## CHANGE: New combined function to load data and generate the initial plot on startup.
117
  def load_and_generate_initial_plot(progress=gr.Progress()):
@@ -126,57 +592,140 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
126
  ts = pd.to_datetime(current_df['data_download_timestamp'].iloc[0], utc=True)
127
  date_display = ts.strftime('%B %d, %Y, %H:%M:%S %Z')
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  data_info_text = (f"### Data Information\n- Source: `{HF_DATASET_ID}`\n"
130
  f"- Status: {status_msg_from_load}\n"
131
- f"- Total datasets loaded: {len(current_df):,}\n"
132
  f"- Data as of: {date_display}\n")
133
  else:
134
  data_info_text = f"### Data Load Failed\n- {status_msg_from_load}"
 
 
 
 
135
  except Exception as e:
136
  status_msg_from_load = f"An unexpected error occurred: {str(e)}"
137
  data_info_text = f"### Critical Error\n- {status_msg_from_load}"
138
  load_success_flag = False
139
  current_df = pd.DataFrame() # Ensure df is empty on failure
 
 
 
 
140
  print(f"Critical error in load_and_generate_initial_plot: {e}")
141
 
142
  # --- Part 2: Generate Initial Plot ---
143
  progress(0.6, desc="Generating initial plot...")
144
- # Get default values directly from the UI component definitions
145
- default_metric = "downloads"
146
  default_tag = "None"
147
  default_k = 25
148
- default_skip_cats = "Other"
 
 
 
 
 
 
149
 
150
- # Reuse the existing controller function for plotting
151
  initial_plot, initial_status = ui_generate_plot_controller(
152
- default_metric, default_tag, default_k, default_skip_cats, current_df, progress
153
  )
154
 
155
- return current_df, load_success_flag, data_info_text, initial_status, initial_plot
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
- def ui_generate_plot_controller(metric_choice, tag_choice, k_orgs,
158
- skip_cats_input, df_current_datasets, progress=gr.Progress()):
 
 
159
  if df_current_datasets is None or df_current_datasets.empty:
160
  return create_treemap(pd.DataFrame(), metric_choice), "Dataset data is not loaded. Cannot generate plot."
161
 
162
  progress(0.1, desc="Aggregating data...")
163
  cats_to_skip = [cat.strip() for cat in skip_cats_input.split(',') if cat.strip()]
164
 
165
- treemap_df = make_treemap_data(df_current_datasets, metric_choice, k_orgs, tag_choice, cats_to_skip)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
  progress(0.7, desc="Generating plot...")
168
- title_labels = {"downloads": "Downloads (last 30 days)", "downloadsAllTime": "Downloads (All Time)", "likes": "Likes"}
169
- chart_title = f"HuggingFace Datasets - {title_labels.get(metric_choice, metric_choice)} by Organization"
170
- plotly_fig = create_treemap(treemap_df, metric_choice, chart_title)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  if treemap_df.empty:
173
  plot_stats_md = "No data matches the selected filters. Please try different options."
174
  else:
175
  total_value_in_plot = treemap_df[metric_choice].sum()
176
- total_datasets_in_plot = treemap_df[treemap_df['id'] != 'Other']['id'].nunique()
177
- plot_stats_md = (f"## Plot Statistics\n- **Organizations/Categories Shown**: {treemap_df['organization'].nunique():,}\n"
178
- f"- **Individual Datasets Shown**: {total_datasets_in_plot:,}\n"
179
- f"- **Total {metric_choice} in plot**: {int(total_value_in_plot):,}")
 
 
 
 
 
 
 
180
 
181
  return plotly_fig, plot_stats_md
182
 
@@ -186,7 +735,19 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
186
  demo.load(
187
  fn=load_and_generate_initial_plot,
188
  inputs=[],
189
- outputs=[datasets_data_state, loading_complete_state, data_info_md, status_message_md, plot_output]
 
 
 
 
 
 
 
 
 
 
 
 
190
  )
191
 
192
  loading_complete_state.change(
@@ -195,12 +756,151 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
195
  outputs=generate_plot_button
196
  )
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  generate_plot_button.click(
199
  fn=ui_generate_plot_controller,
200
- inputs=[count_by_dropdown, tag_filter_dropdown, top_k_dropdown,
201
- skip_cats_textbox, datasets_data_state],
 
 
 
 
 
 
 
 
 
 
 
 
202
  outputs=[plot_output, status_message_md]
203
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
  if __name__ == "__main__":
206
  print("Application starting...")
 
2
  import pandas as pd
3
  import plotly.express as px
4
  import time
5
+ import os
6
+ import tempfile
7
+ import requests
8
+ import duckdb
9
+ import json
10
  from datasets import load_dataset
11
+ from huggingface_hub import logout as hf_logout
12
+ from gradio_rangeslider import RangeSlider
13
 
14
  # --- Constants ---
15
  TOP_K_CHOICES = list(range(5, 51, 5))
16
+ HF_DATASET_ID = "evijit/paperverse_daily_data"
17
+ # Direct parquet file URL (public)
18
+ PARQUET_URL = "https://huggingface.co/datasets/evijit/paperverse_daily_data/resolve/main/papers_with_semantic_taxonomy.parquet"
19
+ TAXONOMY_JSON_PATH = "integrated_ml_taxonomy.json"
20
+
21
+ # Simple content filters derived from the new dataset
22
+ TAG_FILTER_CHOICES = [
23
+ "None",
24
+ "Has Code",
25
+ "Has Media",
26
+ "Has Organization",
27
  ]
28
 
29
+ # Load taxonomy from JSON file
30
def load_taxonomy():
    """Load the ML taxonomy hierarchy from JSON and derive dropdown choices.

    Returns a dict with 'categories', 'subcategories' and 'topics' lists
    (each prefixed with "All") plus the raw 'taxonomy' mapping. Falls back
    to empty choices when the file cannot be read or parsed.
    """
    try:
        with open(TAXONOMY_JSON_PATH, 'r') as fh:
            tax = json.load(fh)

        # Flatten the two-level hierarchy into unique subcategory/topic names.
        subcat_names = set()
        topic_names = set()
        for subcat_map in tax.values():
            for subcat_name, topic_list in subcat_map.items():
                subcat_names.add(subcat_name)
                topic_names.update(topic_list)

        return {
            'categories': ["All"] + sorted(tax.keys()),
            'subcategories': ["All"] + sorted(subcat_names),
            'topics': ["All"] + sorted(topic_names),
            'taxonomy': tax,
        }
    except Exception as e:
        print(f"Error loading taxonomy from JSON: {e}")
        # Degrade gracefully: dropdowns show only "All" and no taxonomy data.
        return {
            'categories': ["All"],
            'subcategories': ["All"],
            'topics': ["All"],
            'taxonomy': {},
        }
62
+
63
+ TAXONOMY_DATA = load_taxonomy()
64
+
65
+ def _first_non_null(*values):
66
+ for v in values:
67
+ if v is None:
68
+ continue
69
+ # treat empty strings as null-ish
70
+ if isinstance(v, str) and v.strip() == "":
71
+ continue
72
+ return v
73
+ return None
74
+
75
+
76
+ def _get_nested(row, *paths):
77
+ """Try multiple dotted paths in a row that may contain dicts; return first non-null."""
78
+ for path in paths:
79
+ cur = row
80
+ ok = True
81
+ for key in path.split('.'):
82
+ if isinstance(cur, dict) and key in cur:
83
+ cur = cur[key]
84
+ else:
85
+ ok = False
86
+ break
87
+ if ok and cur is not None:
88
+ return cur
89
+ return None
90
+
91
+
92
def load_datasets_data():
    """Load the PaperVerse Daily dataset from the Hugging Face Hub and normalize columns used by the app.

    Tries a direct (anonymous) parquet download first, then falls back to the
    `datasets` loader with any cached auth tokens cleared.

    Returns:
        tuple: (df, success_flag, status_message); df is empty and the flag is
        False on failure.
    """
    start_time = time.time()
    print(f"Attempting to load dataset from Hugging Face Hub: {HF_DATASET_ID}")
    try:
        # First try: direct parquet download (avoids any auth header issues)
        try:
            print(f"Trying direct parquet download: {PARQUET_URL}")
            # Stream to a temp file so large parquet payloads are not held in memory.
            with requests.get(PARQUET_URL, stream=True, timeout=120) as resp:
                resp.raise_for_status()
                with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmpf:
                    for chunk in resp.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            tmpf.write(chunk)
                    tmp_path = tmpf.name
            try:
                # Use DuckDB to read parquet to avoid pyarrow decoding issues
                df = duckdb.query(f"SELECT * FROM read_parquet('{tmp_path}')").df()
            finally:
                # Best-effort cleanup of the temp file (delete=False above).
                try:
                    os.remove(tmp_path)
                except Exception:
                    pass
            print("Loaded DataFrame from direct parquet download via DuckDB.")
        except Exception as direct_e:
            print(f"Direct parquet load failed: {direct_e}. Falling back to datasets loader...")
            # Force anonymous access in case an invalid cached token is present
            # Clear any token environment variables that could inject a bad Authorization header
            for env_key in ("HF_TOKEN", "HUGGINGFACE_HUB_TOKEN", "HF_HUB_TOKEN"):
                if os.environ.pop(env_key, None) is not None:
                    print(f"Cleared env var: {env_key}")

            # Prefer explicit train split when available
            try:
                dataset_obj = load_dataset(HF_DATASET_ID, split="train", token=None)
            except TypeError:
                # Older `datasets` versions use use_auth_token instead of token.
                dataset_obj = load_dataset(HF_DATASET_ID, split="train", use_auth_token=False)
            except Exception:
                # Fallback: load all splits and pick the first available
                try:
                    dataset_obj = load_dataset(HF_DATASET_ID, token=None)
                except TypeError:
                    dataset_obj = load_dataset(HF_DATASET_ID, use_auth_token=False)

            # Handle both Dataset and DatasetDict
            try:
                # If it's a Dataset (single split), this will work
                df = dataset_obj.to_pandas()
            except AttributeError:
                # Otherwise assume DatasetDict and take the first split
                first_split = list(dataset_obj.keys())[0]
                df = dataset_obj[first_split].to_pandas()

        # --- Normalize expected columns for the visualization ---
        # organization: prefer top-level organization_name, then paper_organization.name/fullname, else Unknown
        if 'organization_name' in df.columns:
            org_series = df['organization_name']
        else:
            # try nested dicts commonly produced by HF datasets
            org_series = df.apply(
                lambda r: _first_non_null(
                    _get_nested(r, 'paper_organization.name'),
                    _get_nested(r, 'paper_organization.fullname'),
                    _get_nested(r, 'organization.name'),
                    _get_nested(r, 'organization.fullname')
                ), axis=1
            )
        df['organization'] = org_series.fillna('Unknown')

        # Extract organization avatar/logo
        # NOTE(review): the avatar lookup only runs when a flat organization_name
        # column exists, yet it reads nested structs — confirm both are present
        # together in the source data.
        if 'organization_name' in df.columns:
            # Try to get avatar from paper_organization or organization struct
            def _get_avatar(row):
                for path in ['paper_organization.avatar', 'organization.avatar']:
                    av = _get_nested(row, path)
                    if av and isinstance(av, str) and av.strip():
                        return av
                return None
            org_avatar_series = df.apply(_get_avatar, axis=1)
        else:
            org_avatar_series = pd.Series([None] * len(df))
        df['organization_avatar'] = org_avatar_series

        # id for each paper row: first matching candidate column wins.
        cand_cols = [
            'paper_id', 'paper_discussionId', 'key'
        ]
        id_val = None
        for c in cand_cols:
            if c in df.columns:
                id_val = df[c]
                break
        if id_val is None:
            # fallback to title + index (index suffix keeps duplicate titles unique)
            if 'paper_title' in df.columns:
                df['id'] = df['paper_title'].astype(str) + '_' + df.reset_index().index.astype(str)
            elif 'title' in df.columns:
                df['id'] = df['title'].astype(str) + '_' + df.reset_index().index.astype(str)
            else:
                df['id'] = df.reset_index().index.astype(str)
        else:
            df['id'] = id_val.astype(str)

        # numeric metrics used for aggregation; missing columns become all-zero.
        def _to_num(col_name):
            if col_name in df.columns:
                return pd.to_numeric(df[col_name], errors='coerce').fillna(0.0)
            return pd.Series([0.0] * len(df))

        df['paper_upvotes'] = _to_num('paper_upvotes')
        df['numComments'] = _to_num('numComments')
        df['paper_githubStars'] = _to_num('paper_githubStars')

        # computed boolean filters
        def _has_code(row):
            # True when the row has a non-empty GitHub repo or project page.
            # Check for GitHub repo
            try:
                gh = row['paper_githubRepo'] if 'paper_githubRepo' in row and pd.notna(row['paper_githubRepo']) else None
                if isinstance(gh, str) and len(gh.strip()) > 0:
                    return True
            except Exception:
                pass
            # Check for project page
            try:
                pp = row.get('paper_projectPage') if isinstance(row, dict) else row.get('paper_projectPage', None)
                if isinstance(pp, str) and len(str(pp).strip()) > 0 and str(pp).strip().lower() != 'n/a':
                    return True
            except Exception:
                pass
            return False

        def _has_media(row):
            # True when a media-URL field holds a non-empty list (or a string
            # that looks like a serialized non-empty list).
            for c in ['paper_mediaUrls', 'mediaUrls']:
                try:
                    v = row[c]
                    if isinstance(v, list) and len(v) > 0:
                        return True
                    # some providers store arrays as strings like "[... ]"
                    if isinstance(v, str) and v.strip().startswith('[') and len(v.strip()) > 2:
                        return True
                except Exception:
                    continue
            return False

        df['has_code'] = df.apply(_has_code, axis=1)
        df['has_media'] = df.apply(_has_media, axis=1)
        df['has_organization'] = df['organization'].astype(str).str.strip().ne('Unknown')

        # Process publishedAt field for date filtering (NaT when absent/unparseable)
        if 'publishedAt' in df.columns:
            df['publishedAt_dt'] = pd.to_datetime(df['publishedAt'], errors='coerce')
        else:
            df['publishedAt_dt'] = pd.NaT

        # Ensure topic hierarchy columns exist and are strings
        for col_name, default_val in [
            ('primary_category', 'Unknown'),
            ('primary_subcategory', 'Unknown'),
            ('primary_topic', 'Unknown'),
        ]:
            if col_name not in df.columns:
                df[col_name] = default_val
            else:
                df[col_name] = df[col_name].fillna(default_val).astype(str).replace({'': default_val})

        # Create a human-friendly paper label for treemap leaves: "<title> — <topic>"
        def _pick_title(row):
            # NOTE(review): this first assignment is immediately overwritten
            # below — dead code left in place.
            t1 = row.get('paper_title') if isinstance(row, dict) else None
            try:
                t1 = row['paper_title'] if 'paper_title' in row and pd.notna(row['paper_title']) and str(row['paper_title']).strip() != '' else None
            except Exception:
                pass
            if t1 is not None:
                return str(t1)
            try:
                t2 = row['title'] if 'title' in row and pd.notna(row['title']) and str(row['title']).strip() != '' else None
            except Exception:
                t2 = None
            return str(t2) if t2 is not None else 'Untitled'

        # NOTE(review): _pick_topic is defined but never called below — the
        # label uses titles only and topic_chain is built from columns directly.
        def _pick_topic(row):
            # Prefer primary_topic, else first of taxonomy_topics
            try:
                pt = row['primary_topic'] if 'primary_topic' in row and pd.notna(row['primary_topic']) and str(row['primary_topic']).strip() != '' else None
            except Exception:
                pt = None
            if pt is not None:
                return str(pt)
            try:
                tt = row['taxonomy_topics'] if 'taxonomy_topics' in row else None
                if isinstance(tt, list) and len(tt) > 0:
                    return str(tt[0])
                # Sometimes arrays are serialized as strings like "[ ... ]"
                if isinstance(tt, str) and tt.strip().startswith('[') and len(tt.strip()) > 2:
                    # naive parse for first quoted token
                    inner = tt.strip().lstrip('[').rstrip(']')
                    first = inner.split(',')[0].strip().strip('"\'')
                    return first if first else 'No topic'
            except Exception:
                pass
            return 'No topic'

        titles = df.apply(_pick_title, axis=1)
        df['paper_label'] = titles.astype(str)
        # Build a Topic Chain for hover details
        df['topic_chain'] = (
            df['primary_category'].astype(str) + ' > ' +
            df['primary_subcategory'].astype(str) + ' > ' +
            df['primary_topic'].astype(str)
        )

        # Ensure link fields exist for hover details
        for link_col in ['paper_githubRepo', 'paper_projectPage']:
            if link_col not in df.columns:
                df[link_col] = 'N/A'
            else:
                df[link_col] = df[link_col].fillna('N/A').replace({'': 'N/A'})

        msg = f"Successfully loaded dataset in {time.time() - start_time:.2f}s."
        print(msg)
        return df, True, msg
    except Exception as e:
        # If we encountered invalid credentials, try logging out programmatically and retry once anonymously
        if "Invalid credentials" in str(e) or "401 Client Error" in str(e):
            try:
                print("Encountered auth error; attempting to clear cached token and retry anonymously...")
                hf_logout()
                try:
                    dataset_dict = load_dataset(HF_DATASET_ID, token=None)
                except TypeError:
                    dataset_dict = load_dataset(HF_DATASET_ID, use_auth_token=False)
                df = dataset_dict[list(dataset_dict.keys())[0]].to_pandas()
                msg = f"Successfully loaded dataset after clearing token in {time.time() - start_time:.2f}s."
                print(msg)
                return df, True, msg
            except Exception as e2:
                err_msg = f"Failed to load dataset after retry. Error: {e2} (initial: {e})"
                print(err_msg)
                return pd.DataFrame(), False, err_msg
        err_msg = f"Failed to load dataset. Error: {e}"
        print(err_msg)
        return pd.DataFrame(), False, err_msg
334
 
335
+ def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_cats=None, group_by='organization', date_range=None):
336
  """
337
  Filter data and prepare it for a multi-level treemap.
338
  - Preserves individual datasets for the top K organizations.
339
  - Groups all other organizations into a single "Other" category.
340
+ - date_range: tuple of (min_timestamp, max_timestamp) in seconds since epoch
341
  """
342
  if df is None or df.empty:
343
  return pd.DataFrame()
344
 
345
  filtered_df = df.copy()
346
 
347
+ # Apply date range filter
348
+ if date_range is not None and 'publishedAt_dt' in filtered_df.columns:
349
+ min_ts, max_ts = date_range
350
+ min_date = pd.to_datetime(min_ts, unit='s')
351
+ max_date = pd.to_datetime(max_ts, unit='s')
352
+ # Remove timezone info for comparison if publishedAt_dt is tz-naive
353
+ if filtered_df['publishedAt_dt'].dt.tz is None:
354
+ min_date = min_date.tz_localize(None)
355
+ max_date = max_date.tz_localize(None)
356
+ filtered_df = filtered_df[
357
+ (filtered_df['publishedAt_dt'] >= min_date) &
358
+ (filtered_df['publishedAt_dt'] <= max_date)
359
+ ]
360
+
361
+ col_map = {
362
+ "Has Code": "has_code",
363
+ "Has Media": "has_media",
364
+ "Has Organization": "has_organization",
365
  }
366
 
367
  if tag_filter and tag_filter != "None" and tag_filter in col_map:
 
375
  filtered_df[count_by] = 0.0
376
  filtered_df[count_by] = pd.to_numeric(filtered_df[count_by], errors='coerce').fillna(0.0)
377
 
378
+ if group_by == 'organization':
379
+ all_org_totals = filtered_df.groupby("organization")[count_by].sum()
380
+ top_org_names = all_org_totals.nlargest(top_k, keep='first').index.tolist()
381
 
382
+ top_orgs_df = filtered_df[filtered_df['organization'].isin(top_org_names)].copy()
383
+ other_total = all_org_totals[~all_org_totals.index.isin(top_org_names)].sum()
384
+
385
+ final_df_for_plot = top_orgs_df
386
+
387
+ if other_total > 0:
388
+ other_row = pd.DataFrame([{
389
+ 'organization': 'Other',
390
+ 'paper_label': 'Other',
391
+ 'primary_category': 'Other',
392
+ 'primary_subcategory': 'Other',
393
+ 'primary_topic': 'Other',
394
+ 'topic_chain': 'Other > Other > Other',
395
+ 'paper_githubRepo': 'N/A',
396
+ 'paper_projectPage': 'N/A',
397
+ 'organization_avatar': None,
398
+ count_by: other_total
399
+ }])
400
+ final_df_for_plot = pd.concat([final_df_for_plot, other_row], ignore_index=True)
401
 
402
+ if skip_cats and len(skip_cats) > 0:
403
+ final_df_for_plot = final_df_for_plot[~final_df_for_plot['organization'].isin(skip_cats)]
404
 
405
+ final_df_for_plot["root"] = "papers"
406
+ return final_df_for_plot
407
+ else:
408
+ # Topic grouping: apply top-k to topic combinations and handle skip list
409
+ topic_totals = filtered_df.groupby(['primary_category', 'primary_subcategory', 'primary_topic'])[count_by].sum()
410
+ top_topics = topic_totals.nlargest(top_k, keep='first').index.tolist()
411
+
412
+ # Filter to top topics
413
+ top_topics_df = filtered_df[
414
+ filtered_df.apply(
415
+ lambda r: (r['primary_category'], r['primary_subcategory'], r['primary_topic']) in top_topics,
416
+ axis=1
417
+ )
418
+ ].copy()
419
+
420
+ # Apply skip filter (skip by primary_topic name)
421
+ if skip_cats and len(skip_cats) > 0:
422
+ top_topics_df = top_topics_df[~top_topics_df['primary_topic'].isin(skip_cats)]
423
+
424
+ top_topics_df["root"] = "papers"
425
+ return top_topics_df
426
 
427
def create_treemap(treemap_data, count_by, title=None, path=None, metric_label=None):
    """Generate the Plotly treemap figure from the prepared data.

    Args:
        treemap_data: DataFrame produced by make_treemap_data.
        count_by: numeric column used for tile sizing.
        title: optional figure title.
        path: optional hierarchy column list; defaults to root > organization > paper_label.
        metric_label: optional human-readable metric name for the hover text.
    """
    if treemap_data.empty or treemap_data[count_by].sum() <= 0:
        # Placeholder figure when nothing survives the filters.
        fig = px.treemap(names=["No data matches filters"], parents=[""], values=[1])
        fig.update_layout(title="No data matches the selected filters", margin=dict(t=50, l=25, r=25, b=25))
        return fig
    if path is None:
        path = ["root", "organization", "paper_label"]
    # Add custom data columns as regular columns for Plotly to access
    # This ensures all nodes (including intermediate hierarchy nodes) have these fields
    # Ensure organization_avatar column exists (for search details, not hover)
    # NOTE(review): this writes into the caller's DataFrame in place — confirm
    # no caller depends on the frame being untouched.
    if 'organization_avatar' not in treemap_data.columns:
        treemap_data['organization_avatar'] = None

    # hover_data order fixes the customdata indices used in the hovertemplate below.
    fig = px.treemap(
        treemap_data,
        path=path,
        values=count_by,
        hover_data={
            'primary_category': True,
            'primary_subcategory': True,
            'primary_topic': True,
            'paper_githubRepo': True,
            'paper_projectPage': True,
        },
        title=title,
        color_discrete_sequence=px.colors.qualitative.Plotly
    )
    fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    display_metric = metric_label if metric_label else count_by

    # Clean hover without organization avatar (images shown in search details instead)
    fig.update_traces(
        textinfo="label+value",
        hovertemplate=(
            "<b>%{label}</b><br>"
            + "%{value:,} " + display_metric +
            "<br><br><b>Topic Hierarchy:</b><br>"
            + "%{customdata[0]} > %{customdata[1]} > %{customdata[2]}<br>"
            + "<br><b>Links:</b><br>"
            + "GitHub: %{customdata[3]}<br>"
            + "Project: %{customdata[4]}"
            + "<extra></extra>"
        ),
    )
    return fig
473
 
474
  # --- Gradio UI Blocks ---
475
+ with gr.Blocks(
476
+ title="📚 PaperVerse Daily Explorer",
477
+ fill_width=True,
478
+ css="""
479
+ /* Hide the timestamp numbers on the range slider */
480
+ #date-range-slider-wrapper .head,
481
+ #date-range-slider-wrapper div[data-testid="range-slider"] > span {
482
+ display: none !important;
483
+ }
484
+ """
485
+ ) as demo:
486
  datasets_data_state = gr.State(pd.DataFrame())
487
  loading_complete_state = gr.State(False)
488
+ date_range_state = gr.State(None) # Store min/max timestamps
489
 
490
  with gr.Row():
491
+ gr.Markdown("# 📚 PaperVerse Daily Explorer")
492
 
493
+ with gr.Tabs():
494
+ with gr.Tab("📊 Treemap Visualization"):
495
+ with gr.Row():
496
+ with gr.Column(scale=1):
497
+ count_by_dropdown = gr.Dropdown(
498
+ label="Metric",
499
+ choices=[
500
+ ("Upvotes", "paper_upvotes"),
501
+ ("Comments", "numComments"),
502
+ ],
503
+ value="paper_upvotes",
504
+ )
505
+ group_by_dropdown = gr.Dropdown(
506
+ label="Group by",
507
+ choices=[("Organization", "organization"), ("Topic", "topic")],
508
+ value="organization",
509
+ )
510
+ gr.Markdown("**Filters**")
511
+ filter_code = gr.Checkbox(label="Has Code", value=False)
512
+ filter_media = gr.Checkbox(label="Has Media", value=False)
513
+ filter_org = gr.Checkbox(label="Has Organization", value=False)
514
+
515
+ gr.Markdown("**Date Range**")
516
+ date_range_slider = RangeSlider(
517
+ minimum=0,
518
+ maximum=100,
519
+ value=(0, 100),
520
+ label="Paper Release Date Range",
521
+ interactive=True,
522
+ elem_id="date-range-slider-wrapper"
523
+ )
524
+ date_range_display = gr.Markdown("Loading date range...")
525
+
526
+ top_k_dropdown = gr.Dropdown(label="Number of Top Organizations", choices=TOP_K_CHOICES, value=25)
527
+ category_filter_dropdown = gr.Dropdown(label="Primary Category", choices=["All"], value="All")
528
+ subcategory_filter_dropdown = gr.Dropdown(label="Primary Subcategory", choices=["All"], value="All")
529
+ topic_filter_dropdown = gr.Dropdown(label="Primary Topic", choices=["All"], value="All")
530
+ skip_cats_textbox = gr.Textbox(label="Organizations to Skip", value="unaffiliated, Other")
531
+ generate_plot_button = gr.Button(value="Generate Plot", variant="primary", interactive=False)
532
+
533
+ with gr.Column(scale=3):
534
+ plot_output = gr.Plot()
535
+ status_message_md = gr.Markdown("Initializing...")
536
+ data_info_md = gr.Markdown("")
537
+
538
+ with gr.Tab("🔍 Paper Search"):
539
+ with gr.Column():
540
+ gr.Markdown("### 🔍 Search Papers and Organizations")
541
+ with gr.Row():
542
+ search_item = gr.Textbox(
543
+ label="Search Organization or Paper",
544
+ placeholder="Type organization name or paper title to see details...",
545
+ scale=4
546
+ )
547
+ search_button = gr.Button("Show Details", scale=1, variant="secondary")
548
+ selected_info_html = gr.HTML(value="<p style='color: gray;'>Enter an organization name or paper title above to see details</p>")
549
 
550
def _update_button_interactivity(is_loaded_flag):
    """Mirror the data-load flag onto the Generate Plot button's interactive state."""
    button_update = gr.update(interactive=is_loaded_flag)
    return button_update
552
+
553
def _format_date_range(date_range_tuple, date_range_value):
    """Render the currently selected slider range as human-readable Markdown.

    Args:
        date_range_tuple: (min_ts, max_ts) bounds of the full data range, or
            None when no publication dates are available.
        date_range_value: (start_ts, end_ts) currently selected on the slider;
            the slider values are Unix timestamps in seconds.

    Returns:
        A Markdown string describing the selected range, or a fallback message
        when no date range is available.
    """
    if date_range_tuple is None:
        return "Date range unavailable"

    # Only the selected slider values are rendered; the full-range bounds are
    # used solely as an availability check above. (The previous version also
    # unpacked date_range_tuple into unused locals.)
    selected_min, selected_max = date_range_value

    # Slider values are already Unix timestamps (seconds).
    min_date = pd.to_datetime(selected_min, unit='s')
    max_date = pd.to_datetime(selected_max, unit='s')

    return f"**Selected Range:** {min_date.strftime('%B %d, %Y')} to {max_date.strftime('%B %d, %Y')}"
566
+
567
def _toggle_labels_by_grouping(group_by_value):
    """Relabel the top-K dropdown and the skip textbox to match the grouping mode.

    Grouping by topic clears the skip list; grouping by organization restores
    the default organizations to exclude.
    """
    if group_by_value == 'topic':
        top_k_label = "Number of Top Topics"
        skip_label = "Topics to Skip"
        skip_value = ""
    else:
        top_k_label = "Number of Top Organizations"
        skip_label = "Organizations to Skip"
        skip_value = "unaffiliated, Other"

    return (
        gr.update(label=top_k_label),
        gr.update(label=skip_label, value=skip_value),
    )
581
 
582
  ## CHANGE: New combined function to load data and generate the initial plot on startup.
583
  def load_and_generate_initial_plot(progress=gr.Progress()):
 
592
  ts = pd.to_datetime(current_df['data_download_timestamp'].iloc[0], utc=True)
593
  date_display = ts.strftime('%B %d, %Y, %H:%M:%S %Z')
594
 
595
+ # Calculate date range from publishedAt_dt
596
+ min_ts = 0
597
+ max_ts = 100
598
+ date_range_text = "Date range unavailable"
599
+ date_range_tuple = None
600
+
601
+ if 'publishedAt_dt' in current_df.columns:
602
+ valid_dates = current_df['publishedAt_dt'].dropna()
603
+ if len(valid_dates) > 0:
604
+ min_date = valid_dates.min()
605
+ max_date = valid_dates.max()
606
+ min_ts = int(min_date.timestamp())
607
+ max_ts = int(max_date.timestamp())
608
+ date_range_tuple = (min_ts, max_ts)
609
+ date_range_text = f"**Full Range:** {min_date.strftime('%B %d, %Y')} to {max_date.strftime('%B %d, %Y')}"
610
+
611
  data_info_text = (f"### Data Information\n- Source: `{HF_DATASET_ID}`\n"
612
  f"- Status: {status_msg_from_load}\n"
613
+ f"- Total records loaded: {len(current_df):,}\n"
614
  f"- Data as of: {date_display}\n")
615
  else:
616
  data_info_text = f"### Data Load Failed\n- {status_msg_from_load}"
617
+ min_ts = 0
618
+ max_ts = 100
619
+ date_range_text = "Date range unavailable"
620
+ date_range_tuple = None
621
  except Exception as e:
622
  status_msg_from_load = f"An unexpected error occurred: {str(e)}"
623
  data_info_text = f"### Critical Error\n- {status_msg_from_load}"
624
  load_success_flag = False
625
  current_df = pd.DataFrame() # Ensure df is empty on failure
626
+ min_ts = 0
627
+ max_ts = 100
628
+ date_range_text = "Date range unavailable"
629
+ date_range_tuple = None
630
  print(f"Critical error in load_and_generate_initial_plot: {e}")
631
 
632
  # --- Part 2: Generate Initial Plot ---
633
  progress(0.6, desc="Generating initial plot...")
634
+ # Defaults matching UI definitions
635
+ default_metric = "paper_upvotes"
636
  default_tag = "None"
637
  default_k = 25
638
+ default_group_by = "organization"
639
+ default_skip_cats = "unaffiliated, Other"
640
+
641
+ # Use taxonomy from JSON instead of calculating from dataset
642
+ cat_choices = TAXONOMY_DATA['categories']
643
+ subcat_choices = TAXONOMY_DATA['subcategories']
644
+ topic_choices = TAXONOMY_DATA['topics']
645
 
646
+ # Reuse the existing controller function for plotting (with date range set to None for initial load)
647
  initial_plot, initial_status = ui_generate_plot_controller(
648
+ default_metric, False, False, False, default_k, default_group_by, "All", "All", "All", default_skip_cats, None, current_df, progress
649
  )
650
 
651
+ # Also update taxonomy dropdown choices
652
+ return (
653
+ current_df,
654
+ load_success_flag,
655
+ data_info_text,
656
+ initial_status,
657
+ initial_plot,
658
+ gr.update(choices=cat_choices, value="All"),
659
+ gr.update(choices=subcat_choices, value="All"),
660
+ gr.update(choices=topic_choices, value="All"),
661
+ gr.update(minimum=min_ts, maximum=max_ts, value=(min_ts, max_ts)),
662
+ date_range_text,
663
+ date_range_tuple,
664
+ )
665
 
666
def ui_generate_plot_controller(metric_choice, has_code, has_media, has_org,
                                k_orgs, group_by_choice,
                                category_choice, subcategory_choice, topic_choice,
                                skip_cats_input, date_range, df_current_datasets, progress=gr.Progress()):
    """Filter the papers DataFrame and build the treemap plus a stats summary.

    Args:
        metric_choice: Column to aggregate ("paper_upvotes" or "numComments").
        has_code / has_media / has_org: Checkbox filters over the corresponding
            boolean columns.
        k_orgs: Number of top groups to keep.
        group_by_choice: "topic" or "organization" — controls the treemap path.
        category_choice / subcategory_choice / topic_choice: Taxonomy filters;
            "All" leaves a level unconstrained.
        skip_cats_input: Comma-separated names to exclude from grouping.
        date_range: (start_ts, end_ts) slider selection, or None.
        df_current_datasets: The loaded papers DataFrame.
        progress: Gradio progress reporter.

    Returns:
        (plotly figure, stats markdown). When no data is loaded, an empty
        figure and an explanatory message are returned instead.
    """
    if df_current_datasets is None or df_current_datasets.empty:
        return create_treemap(pd.DataFrame(), metric_choice), "Dataset data is not loaded. Cannot generate plot."

    progress(0.1, desc="Aggregating data...")
    skip_list = [name.strip() for name in skip_cats_input.split(',') if name.strip()]

    # Boolean content filters from the checkboxes; each column is presumably a
    # boolean flag — confirm against the data-preparation pipeline.
    filtered = df_current_datasets.copy()
    for enabled, column in ((has_code, 'has_code'),
                            (has_media, 'has_media'),
                            (has_org, 'has_organization')):
        if enabled:
            filtered = filtered[filtered[column]]

    # Taxonomy filters; "All" disables a level.
    for choice, column in ((category_choice, 'primary_category'),
                           (subcategory_choice, 'primary_subcategory'),
                           (topic_choice, 'primary_topic')):
        if choice and choice != 'All':
            filtered = filtered[filtered[column] == choice]

    treemap_df = make_treemap_data(filtered, metric_choice, k_orgs, None, skip_list, group_by_choice, date_range)

    progress(0.7, desc="Generating plot...")
    metric_labels = {
        "paper_upvotes": "Upvotes",
        "numComments": "Comments",
    }
    metric_label = metric_labels.get(metric_choice, metric_choice)
    if group_by_choice == "topic":
        chart_title = f"PaperVerse Daily - {metric_label} by Topic"
        hierarchy = ["root", "primary_category", "primary_subcategory", "primary_topic", "paper_label"]
    else:
        chart_title = f"PaperVerse Daily - {metric_label} by Organization"
        hierarchy = ["root", "organization", "paper_label"]
    fig = create_treemap(
        treemap_df,
        metric_choice,
        chart_title,
        path=hierarchy,
        metric_label=metric_label,
    )

    if treemap_df.empty:
        stats_md = "No data matches the selected filters. Please try different options."
    else:
        metric_total = treemap_df[metric_choice].sum()
        # 'Other' rows are rollup buckets produced by the aggregation, not
        # individual papers, so exclude them from the paper count.
        paper_count = treemap_df[treemap_df['paper_label'] != 'Other']['paper_label'].nunique()
        if group_by_choice == "topic":
            triplet_count = treemap_df[["primary_category", "primary_subcategory", "primary_topic"]].drop_duplicates().shape[0]
            group_summary = f"**Topics Shown**: {triplet_count:,} unique triplets"
        else:
            group_summary = f"**Organizations Shown**: {treemap_df['organization'].nunique():,}"
        stats_md = (
            f"## Plot Statistics\n- {group_summary}\n"
            f"- **Individual Papers Shown**: {paper_count:,}\n"
            f"- **Total {metric_label} in plot**: {int(metric_total):,}"
        )

    return fig, stats_md
731
 
 
735
  demo.load(
736
  fn=load_and_generate_initial_plot,
737
  inputs=[],
738
+ outputs=[
739
+ datasets_data_state,
740
+ loading_complete_state,
741
+ data_info_md,
742
+ status_message_md,
743
+ plot_output,
744
+ category_filter_dropdown,
745
+ subcategory_filter_dropdown,
746
+ topic_filter_dropdown,
747
+ date_range_slider,
748
+ date_range_display,
749
+ date_range_state,
750
+ ]
751
  )
752
 
753
  loading_complete_state.change(
 
756
  outputs=generate_plot_button
757
  )
758
 
759
+ # Update labels based on grouping mode
760
+ group_by_dropdown.change(
761
+ fn=_toggle_labels_by_grouping,
762
+ inputs=group_by_dropdown,
763
+ outputs=[top_k_dropdown, skip_cats_textbox],
764
+ )
765
+
766
+ # Update date range display when slider changes
767
+ date_range_slider.change(
768
+ fn=_format_date_range,
769
+ inputs=[date_range_state, date_range_slider],
770
+ outputs=date_range_display,
771
+ show_progress="hidden"
772
+ )
773
+
774
def handle_search_details(search_text, df_current):
    """Search papers/organizations and render the matches as an HTML panel.

    Matches `search_text` as a literal, case-insensitive substring against the
    'organization', 'paper_label' and (when present) 'paper_title' columns.

    Args:
        search_text: Free-text query from the search box (untrusted input).
        df_current: The loaded papers DataFrame, or None when no data exists.

    Returns:
        An HTML string; at most the first 20 matching rows are rendered.
    """
    # Stdlib import kept local so the handler is self-contained.
    import html as _html

    def _usable(value, placeholders=('none', 'null', 'n/a', '')):
        # A field is usable when it is a non-empty string and not a null placeholder.
        return (isinstance(value, str) and value.strip()
                and value.strip().lower() not in placeholders)

    if not search_text or not search_text.strip():
        return "<p style='color: gray;'>Please enter a search term</p>"

    if df_current is None or df_current.empty:
        return "<p style='color: gray;'>No data available</p>"

    search_text = search_text.strip()
    # Escape once: user input must never reach the HTML panel unescaped
    # (reflected-XSS vector in the previous version).
    safe_query = _html.escape(search_text)

    try:
        # regex=False: treat the query literally so metacharacters such as
        # '(' or 'C++' don't raise re.error (previously surfaced as an error panel).
        mask = (
            df_current['organization'].str.contains(search_text, case=False, regex=False, na=False)
            | df_current['paper_label'].str.contains(search_text, case=False, regex=False, na=False)
        )
        if 'paper_title' in df_current.columns:
            mask |= df_current['paper_title'].str.contains(search_text, case=False, regex=False, na=False)
        matching_rows = df_current[mask]

        if matching_rows.empty:
            return (
                f"<p style='color: orange;'>No results found for: <b>{safe_query}</b></p>"
                "<p style='color: gray;'>Try searching for an organization name (e.g., 'Qwen', 'Meta') or paper title keyword</p>"
            )

        num_results = len(matching_rows)
        html_parts = [
            "<div style='padding: 15px; border: 1px solid #ddd; border-radius: 8px; background: #f9f9f9; max-height: 600px; overflow-y: auto;'>",
            f"<h3 style='margin: 0 0 15px 0; color: #333;'>🔍 Found {num_results} result{'s' if num_results > 1 else ''} for: <span style='color: #0366d6;'>{safe_query}</span></h3>"
        ]

        # Cap at 20 rendered rows to keep the HTML payload small.
        display_rows = matching_rows.head(20)

        for idx, (_, row) in enumerate(display_rows.iterrows()):
            if idx > 0:
                html_parts.append("<hr style='margin: 15px 0; border: none; border-top: 1px solid #ddd;'/>")

            html_parts.append("<div style='margin-bottom: 10px; overflow: auto;'>")

            # Organization logo (precomputed column), floated left.
            org_avatar = row.get('organization_avatar')
            if _usable(org_avatar):
                html_parts.append(f"<img src='{_html.escape(org_avatar.strip())}' style='max-width: 60px; max-height: 60px; border-radius: 50%; margin-bottom: 8px; float: left; margin-right: 12px; border: 2px solid #ddd;' onerror=\"this.style.display='none'\"/>")

            # Paper thumbnail (direct field from the schema), floated right.
            paper_thumbnail = row.get('thumbnail')
            if _usable(paper_thumbnail):
                html_parts.append(f"<img src='{_html.escape(paper_thumbnail.strip())}' style='max-width: 120px; max-height: 120px; border-radius: 8px; margin-bottom: 8px; float: right; margin-left: 12px; border: 1px solid #ddd;' onerror=\"this.style.display='none'\"/>")

            org_name = row.get('organization', 'Unknown')
            html_parts.append(f"<p style='margin: 0 0 5px 0; font-weight: bold; color: #333;'>🏢 {_html.escape(str(org_name))}</p>")

            paper_title = row.get('paper_title', row.get('title', 'Untitled'))
            html_parts.append(f"<p style='margin: 0 0 5px 0; color: #555; font-size: 0.95em;'>📄 {_html.escape(str(paper_title))}</p>")

            # Taxonomy hierarchy: category → subcategory → topic.
            category = row.get('primary_category', 'Unknown')
            subcategory = row.get('primary_subcategory', 'Unknown')
            topic = row.get('primary_topic', 'Unknown')
            html_parts.append(f"<p style='margin: 0 0 5px 0; font-size: 0.9em; color: #666;'><b>Topics:</b> {_html.escape(str(category))} → {_html.escape(str(subcategory))} → {_html.escape(str(topic))}</p>")

            upvotes = row.get('paper_upvotes', 0)
            comments = row.get('numComments', 0)
            html_parts.append(f"<p style='margin: 0 0 5px 0; font-size: 0.9em;'><b>Metrics:</b> ⬆️ {upvotes:,} upvotes | 💬 {comments:,} comments</p>")

            github = row.get('paper_githubRepo')
            project = row.get('paper_projectPage')

            links = []
            if _usable(github, placeholders=('n/a', 'none')):
                links.append(f"<a href='{_html.escape(github.strip())}' target='_blank' style='color: #0366d6; margin-right: 15px;'>🔗 GitHub</a>")
            if _usable(project, placeholders=('n/a', 'none')):
                links.append(f"<a href='{_html.escape(project.strip())}' target='_blank' style='color: #0366d6;'>🔗 Project</a>")
            if links:
                html_parts.append(f"<p style='margin: 0; font-size: 0.9em;'>{' '.join(links)}</p>")

            html_parts.append("<div style='clear: both;'></div>")
            html_parts.append("</div>")

        if num_results > 20:
            html_parts.append(f"<p style='margin-top: 15px; color: #666; font-style: italic;'>Showing first 20 of {num_results} results. Refine your search for fewer results.</p>")

        html_parts.append("</div>")
        return "".join(html_parts)

    except Exception as e:
        return f"<p style='color: red;'>Error displaying details: {str(e)}</p>"
871
+
872
  generate_plot_button.click(
873
  fn=ui_generate_plot_controller,
874
+ inputs=[
875
+ count_by_dropdown,
876
+ filter_code,
877
+ filter_media,
878
+ filter_org,
879
+ top_k_dropdown,
880
+ group_by_dropdown,
881
+ category_filter_dropdown,
882
+ subcategory_filter_dropdown,
883
+ topic_filter_dropdown,
884
+ skip_cats_textbox,
885
+ date_range_slider,
886
+ datasets_data_state,
887
+ ],
888
  outputs=[plot_output, status_message_md]
889
  )
890
+
891
+ # Handle search button for showing details
892
+ search_button.click(
893
+ fn=handle_search_details,
894
+ inputs=[search_item, datasets_data_state],
895
+ outputs=[selected_info_html]
896
+ )
897
+
898
+ # Also trigger on Enter key in search box
899
+ search_item.submit(
900
+ fn=handle_search_details,
901
+ inputs=[search_item, datasets_data_state],
902
+ outputs=[selected_info_html]
903
+ )
904
 
905
  if __name__ == "__main__":
906
  print("Application starting...")
integrated_ml_taxonomy.json ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Algorithms and Learning Methods": {
3
+ "Supervised Learning": [
4
+ "Classification",
5
+ "Regression",
6
+ "Structured Prediction",
7
+ "Ranking and Preference Learning"
8
+ ],
9
+ "Unsupervised Learning": [
10
+ "Clustering",
11
+ "Density Estimation",
12
+ "Unsupervised Representation Learning"
13
+ ],
14
+ "Semi-Supervised and Self-Supervised Learning": [
15
+ "Semi-Supervised Learning",
16
+ "Self-Supervised Learning"
17
+ ],
18
+ "Reinforcement Learning and Planning": [
19
+ "Reinforcement Learning",
20
+ "Reinforcement Learning with Human Feedback (RLHF)",
21
+ "Markov Decision Processes",
22
+ "Model-Based RL",
23
+ "Multi-Agent RL",
24
+ "Hierarchical RL",
25
+ "Exploration",
26
+ "Decision and Control",
27
+ "Planning",
28
+ "Planning Algorithms",
29
+ "Navigation"
30
+ ],
31
+ "Transfer and Adaptation": [
32
+ "Transfer Learning",
33
+ "Meta-Learning",
34
+ "Multitask Learning",
35
+ "Lifelong Learning",
36
+ "Continual Learning",
37
+ "Few-Shot Learning",
38
+ "Domain Adaptation",
39
+ "Model Mixing Methods"
40
+ ],
41
+ "Representation Learning": [
42
+ "Representation Learning",
43
+ "Embedding Approaches",
44
+ "Metric Learning",
45
+ "Similarity and Distance Learning",
46
+ "Nonlinear Dimensionality Reduction and Manifold Learning",
47
+ "Components Analysis (CCA, ICA, LDA, PCA)",
48
+ "Sparse Coding and Dimensionality Expansion",
49
+ "Sparsity and Compressed Sensing"
50
+ ],
51
+ "Model Alignment and Adaptation": [
52
+ "Fine-Tuning",
53
+ "Instruction-Tuning",
54
+ "Prompt Tuning",
55
+ "In-Context Learning",
56
+ "Value Alignment and Human Feedback",
57
+ "Alignment Methods"
58
+ ],
59
+ "Adversarial and Robust Learning": [
60
+ "Adversarial Learning",
61
+ "AI Red Teaming and Adversarial Testing",
62
+ "Adversarial Attacks and Defenses",
63
+ "Threat Models and Mitigations"
64
+ ],
65
+ "Active and Interactive Learning": [
66
+ "Active Learning",
67
+ "Online Learning",
68
+ "Interactive Learning",
69
+ "Bandit Algorithms",
70
+ "Dialog- or Communication-Based Learning"
71
+ ],
72
+ "Ensemble and Boosting Methods": [
73
+ "Boosting and Ensemble Methods"
74
+ ],
75
+ "Specialized Learning Paradigms": [
76
+ "AutoML",
77
+ "Multimodal Learning",
78
+ "Relational Learning",
79
+ "Collaborative Filtering",
80
+ "Adaptive Data Analysis",
81
+ "Communication- or Memory-Bounded Learning",
82
+ "Large Scale Learning",
83
+ "Program Induction",
84
+ "Learning and Unlearning"
85
+ ],
86
+ "Data Handling": [
87
+ "Missing Data",
88
+ "Data Compression",
89
+ "Model Selection and Structure Learning"
90
+ ]
91
+ },
92
+ "Deep Learning": {
93
+ "Architectures": [
94
+ "CNN Architectures",
95
+ "Recurrent Networks",
96
+ "Attention Models",
97
+ "Transformer Architectures",
98
+ "Memory-Augmented Neural Networks",
99
+ "Interaction-Based Deep Networks",
100
+ "Biologically Plausible Deep Networks"
101
+ ],
102
+ "Model Types": [
103
+ "Deep Autoencoders",
104
+ "Adversarial Networks",
105
+ "Generative Models",
106
+ "Predictive Models",
107
+ "Supervised Deep Networks"
108
+ ],
109
+ "Training and Optimization": [
110
+ "Efficient Training Methods",
111
+ "Distributed Training and Inference",
112
+ "Training Dynamics",
113
+ "Optimization Instability",
114
+ "Efficient Inference Methods",
115
+ "Optimization for Deep Networks"
116
+ ],
117
+ "Model Efficiency": [
118
+ "Model Distillation",
119
+ "Model Compression",
120
+ "Quantization",
121
+ "Sample Efficient Methods",
122
+ "Memory Efficient Methods"
123
+ ],
124
+ "Inference and Decoding": [
125
+ "Decoding Algorithms",
126
+ "Reasoning Algorithms",
127
+ "Search Algorithms"
128
+ ],
129
+ "Analysis and Interpretation": [
130
+ "Analysis and Understanding of Deep Networks",
131
+ "Visualization or Exposition Techniques for Deep Networks",
132
+ "Interpretability and Explainability",
133
+ "Demystification",
134
+ "Scaling Laws",
135
+ "Emergent Capabilities",
136
+ "Grokking"
137
+ ]
138
+ },
139
+ "Probabilistic Methods": {
140
+ "Bayesian Methods": [
141
+ "Bayesian Theory",
142
+ "Bayesian Nonparametrics",
143
+ "Gaussian Processes"
144
+ ],
145
+ "Inference": [
146
+ "Variational Inference",
147
+ "MCMC",
148
+ "Belief Propagation",
149
+ "Distributed Inference",
150
+ "Uncertainty Estimation"
151
+ ],
152
+ "Models": [
153
+ "Graphical Models",
154
+ "Hierarchical Models",
155
+ "Latent Variable Models",
156
+ "Topic Models",
157
+ "Causal Inference",
158
+ "Causal Reasoning"
159
+ ],
160
+ "Probabilistic Programming": [
161
+ "Probabilistic Programming"
162
+ ]
163
+ },
164
+ "Optimization": {
165
+ "Continuous Optimization": [
166
+ "Convex Optimization",
167
+ "Non-Convex Optimization",
168
+ "Stochastic Optimization",
169
+ "Stochastic Methods"
170
+ ],
171
+ "Discrete Optimization": [
172
+ "Discrete Optimization",
173
+ "Submodular Optimization"
174
+ ],
175
+ "Evolutionary Methods": [
176
+ "Evolutionary Computation"
177
+ ]
178
+ },
179
+ "Theory": {
180
+ "Learning Theory": [
181
+ "Computational Learning Theory",
182
+ "Statistical Learning Theory",
183
+ "Models of Learning and Generalization",
184
+ "Hardness of Learning and Approximations",
185
+ "Regularization",
186
+ "Fundamental Limitations of Learning",
187
+ "Complexity of Learning Systems"
188
+ ],
189
+ "Statistical Theory": [
190
+ "Frequentist Statistics",
191
+ "High-Dimensional Inference",
192
+ "Large Deviations and Asymptotic Analysis"
193
+ ],
194
+ "Mathematical Foundations": [
195
+ "Information Theory",
196
+ "Control Theory",
197
+ "Game Theory and Computational Economics",
198
+ "Statistical Physics of Learning",
199
+ "Spaces of Functions and Kernels",
200
+ "Spectral Methods",
201
+ "Kernel Methods",
202
+ "Large Margin Methods"
203
+ ],
204
+ "Algorithmic Theory": [
205
+ "Data-driven Algorithm Design"
206
+ ]
207
+ },
208
+ "Knowledge and Reasoning": {
209
+ "Knowledge Representation": [
210
+ "Knowledge Models",
211
+ "World Models",
212
+ "Factuality"
213
+ ],
214
+ "Reasoning": [
215
+ "Commonsense Reasoning",
216
+ "Theory of Mind",
217
+ "Social Norms Understanding",
218
+ "Pragmatics"
219
+ ],
220
+ "Knowledge Integration": [
221
+ "Retrieval-Augmented Models",
222
+ "Tool Use and API Integration",
223
+ "Neurosymbolic and Hybrid AI Systems (Physics-Informed, Logic, Formal Reasoning)"
224
+ ]
225
+ },
226
+ "Evaluation and Benchmarking": {
227
+ "Evaluation Methods": [
228
+ "Benchmarks",
229
+ "Evaluation Protocols and Metrics",
230
+ "Human Evaluation",
231
+ "Machine Evaluation",
232
+ "Scalable Oversight"
233
+ ],
234
+ "Simulation and Testing": [
235
+ "Simulation Environments",
236
+ "Assurance Testing and Deployment Policies"
237
+ ]
238
+ },
239
+ "Applications": {
240
+ "Vision": [
241
+ "Computer Vision",
242
+ "Object Detection",
243
+ "Object Recognition",
244
+ "Image Segmentation",
245
+ "Body Pose, Face, and Gesture Analysis",
246
+ "Tracking and Motion in Video",
247
+ "Video Analysis",
248
+ "Visual Question Answering",
249
+ "Visual Scene Analysis and Interpretation",
250
+ "Computational Photography",
251
+ "Denoising"
252
+ ],
253
+ "Language": [
254
+ "Natural Language Processing",
255
+ "Language Representation Learning",
256
+ "Dialog Systems",
257
+ "Conversational AI"
258
+ ],
259
+ "Audio and Speech": [
260
+ "Audio and Speech Processing",
261
+ "Speech Recognition",
262
+ "Music Modeling and Analysis"
263
+ ],
264
+ "Multimodal": [
265
+ "Multimodal Models",
266
+ "Vision-Language Models",
267
+ "Audio-Visual Learning",
268
+ "Cross-Modal Learning"
269
+ ],
270
+ "Robotics and Embodied AI": [
271
+ "Robotics",
272
+ "Motor Control",
273
+ "Autonomous Systems",
274
+ "Perception and Action",
275
+ "Embodied AI"
276
+ ],
277
+ "Code and Software": [
278
+ "Program Understanding and Generation",
279
+ "Code Generation",
280
+ "Software Engineering with AI",
281
+ "Automated Reasoning and Formal Methods"
282
+ ],
283
+ "Science and Engineering": [
284
+ "Computational Biology and Bioinformatics",
285
+ "Physical Sciences (Physics, Chemistry, Biology)",
286
+ "Scientific Discovery",
287
+ "Quantum Learning"
288
+ ],
289
+ "Mathematics": [
290
+ "Mathematical Reasoning",
291
+ "Theorem Proving",
292
+ "Symbolic Mathematics"
293
+ ],
294
+ "Health and Medicine": [
295
+ "Medical Applications",
296
+ "Clinical Decision Support",
297
+ "Drug Discovery",
298
+ "Healthcare AI"
299
+ ],
300
+ "Education": [
301
+ "Educational Applications",
302
+ "Intelligent Tutoring Systems",
303
+ "Educational Technology"
304
+ ],
305
+ "Social and Web": [
306
+ "Computational Social Science",
307
+ "Recommender Systems",
308
+ "Information Retrieval",
309
+ "Web Applications and Internet Data",
310
+ "Network Analysis"
311
+ ],
312
+ "Interactive Systems": [
313
+ "Game Playing",
314
+ "Multi-Agent Systems",
315
+ "Human-AI Interaction"
316
+ ],
317
+ "Data and Signals": [
318
+ "Signal Processing",
319
+ "Time Series Analysis",
320
+ "Matrix and Tensor Factorization",
321
+ "Database Applications"
322
+ ],
323
+ "Finance and Economics": [
324
+ "Quantitative Finance and Econometrics",
325
+ "Economic Modeling"
326
+ ],
327
+ "Activity and Recognition": [
328
+ "Activity and Event Recognition"
329
+ ],
330
+ "Infrastructure": [
331
+ "Hardware and Systems",
332
+ "Sustainability"
333
+ ]
334
+ },
335
+ "Data": {
336
+ "Data Collection and Curation": [
337
+ "Pre-Training Data",
338
+ "Data Curation and Analysis",
339
+ "Manual and Algorithmic Data Processing",
340
+ "Responsible Data Management"
341
+ ],
342
+ "Data Generation": [
343
+ "Synthetic Data Generation",
344
+ "Data Augmentation"
345
+ ],
346
+ "Data Resources": [
347
+ "Benchmarks",
348
+ "Data Sets or Data Repositories",
349
+ "Datasets and Benchmarks"
350
+ ]
351
+ },
352
+ "Infrastructure and Tools": {
353
+ "Software and Libraries": [
354
+ "Software Toolkits",
355
+ "Infrastructure, Software Libraries",
356
+ "Virtual Environments"
357
+ ],
358
+ "Hardware and Systems": [
359
+ "Hardware Setups for Large-Scale Training",
360
+ "Distributed Systems",
361
+ "Specialized Hardware"
362
+ ]
363
+ },
364
+ "Neuroscience and Cognitive Science": {
365
+ "Brain Studies": [
366
+ "Brain Imaging",
367
+ "Brain Mapping",
368
+ "Brain Segmentation",
369
+ "Connectomics",
370
+ "Neural Coding",
371
+ "Spike Train Generation",
372
+ "Synaptic Modulation"
373
+ ],
374
+ "Cognitive Functions": [
375
+ "Cognitive Science",
376
+ "Memory",
377
+ "Perception",
378
+ "Visual Perception",
379
+ "Auditory Perception",
380
+ "Problem Solving",
381
+ "Reasoning",
382
+ "Linguistics",
383
+ "Psycholinguistics"
384
+ ],
385
+ "Learning and Adaptation": [
386
+ "Human or Animal Learning",
387
+ "Plasticity and Adaptation",
388
+ "Neuropsychology"
389
+ ],
390
+ "Brain-Computer Interfaces": [
391
+ "Brain-Computer Interfaces and Neural Prostheses"
392
+ ],
393
+ "Philosophy": [
394
+ "Philosophical Perspectives on AI",
395
+ "Philosophy of Mind and Language",
396
+ "Cognitive Philosophy"
397
+ ]
398
+ },
399
+ "Structured Data": {
400
+ "Graphs and Geometry": [
401
+ "Learning on Graphs",
402
+ "Geometric Deep Learning",
403
+ "Topology and Manifold Learning"
404
+ ]
405
+ },
406
+ "Societal Considerations": {
407
+ "Fairness and Equity": [
408
+ "Algorithmic Fairness and Bias",
409
+ "Bias in AI Systems",
410
+ "Equity",
411
+ "Algorithmic Recourse",
412
+ "Justice, Power, and Inequality"
413
+ ],
414
+ "Safety and Security": [
415
+ "AI Safety",
416
+ "Security",
417
+ "Adversarial Robustness",
418
+ "Risks, Harms, and Failures",
419
+ "Safe and Trustworthy AI"
420
+ ],
421
+ "Privacy": [
422
+ "Privacy, Anonymity, and Security",
423
+ "Data Protection",
424
+ "Privacy-Preserving Methods"
425
+ ],
426
+ "Misinformation and Content": [
427
+ "Misinformation and Disinformation",
428
+ "Content Moderation",
429
+ "Information Integrity"
430
+ ],
431
+ "Transparency and Accountability": [
432
+ "Fairness, Accountability, and Transparency",
433
+ "Transparency Documentation",
434
+ "Audits of AI Systems",
435
+ "Explainability for Accountability"
436
+ ],
437
+ "Human Factors": [
438
+ "Human-AI Interaction",
439
+ "Trust in AI Systems",
440
+ "Human-Centered AI",
441
+ "Participatory and Deliberative Methods"
442
+ ],
443
+ "Design and Development": [
444
+ "Sociotechnical Design and Development",
445
+ "Value-Sensitive Design",
446
+ "Diversity in Design and Development",
447
+ "Responsible Development Practices"
448
+ ],
449
+ "Societal Impacts": [
450
+ "Cultural Impacts",
451
+ "Environmental Impacts and Climate Change",
452
+ "Labor and Economic Impacts",
453
+ "Job Displacement and Automation",
454
+ "Misuse of AI Systems"
455
+ ],
456
+ "Governance and Policy": [
457
+ "Regulation and Governance",
458
+ "Legal Topics in AI",
459
+ "Policy and Law",
460
+ "Licensing and Liability",
461
+ "Organizational Factors"
462
+ ],
463
+ "Critical Perspectives": [
464
+ "Critical and Sociotechnical Foresight",
465
+ "Historical and Humanistic Perspectives",
466
+ "Social Scientific Perspectives",
467
+ "Resistance and Contestation",
468
+ "Social Epistemology"
469
+ ],
470
+ "Values and Ethics": [
471
+ "Moral and Political Philosophy of AI",
472
+ "Ethics in AI",
473
+ "Values in Technology Design"
474
+ ],
475
+ "Cross-Cultural and Multilingual": [
476
+ "Multi-Linguality",
477
+ "Low-Resource Languages",
478
+ "Vernacular Languages",
479
+ "Multiculturalism",
480
+ "Value Pluralism",
481
+ "Cross-Cultural AI"
482
+ ],
483
+ "Interdisciplinary Approaches": [
484
+ "Interdisciplinarity and Cross-Functional Teams",
485
+ "Industry, Government, and Civil Society Collaboration"
486
+ ]
487
+ }
488
+ }