evijit HF Staff commited on
Commit
f30a36e
·
1 Parent(s): 813c7cf

add paperverse code

Browse files
Files changed (3) hide show
  1. README.md +4 -4
  2. app.py +766 -66
  3. integrated_ml_taxonomy.json +488 -0
README.md CHANGED
@@ -1,14 +1,14 @@
1
  ---
2
- title: DataVerse Explorer
3
  emoji: 😻
4
- colorFrom: green
5
- colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.36.2
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
- short_description: Hub Organization Stats
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: PaperVerse Explorer
3
  emoji: 😻
4
+ colorFrom: red
5
+ colorTo: red
6
  sdk: gradio
7
  sdk_version: 5.36.2
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ short_description: Research Paper Stats on Hugging Face
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -2,46 +2,366 @@ import gradio as gr
2
  import pandas as pd
3
  import plotly.express as px
4
  import time
 
 
 
 
 
5
  from datasets import load_dataset
 
 
6
 
7
  # --- Constants ---
8
  TOP_K_CHOICES = list(range(5, 51, 5))
9
- HF_DATASET_ID = "evijit/dataverse_daily_data"
10
- TAG_FILTER_CHOICES = [
11
- "None", "Audio & Speech", "Time series", "Robotics", "Music",
12
- "Video", "Images", "Text", "Biomedical", "Sciences"
 
 
 
 
 
 
 
13
  ]
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def load_datasets_data():
16
- """Load the processed datasets data from the Hugging Face Hub."""
17
  start_time = time.time()
18
  print(f"Attempting to load dataset from Hugging Face Hub: {HF_DATASET_ID}")
19
  try:
20
- dataset_dict = load_dataset(HF_DATASET_ID)
21
- df = dataset_dict[list(dataset_dict.keys())[0]].to_pandas()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  msg = f"Successfully loaded dataset in {time.time() - start_time:.2f}s."
23
  print(msg)
24
  return df, True, msg
25
  except Exception as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  err_msg = f"Failed to load dataset. Error: {e}"
27
  print(err_msg)
28
  return pd.DataFrame(), False, err_msg
29
 
30
- def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_cats=None):
31
  """
32
  Filter data and prepare it for a multi-level treemap.
33
  - Preserves individual datasets for the top K organizations.
34
  - Groups all other organizations into a single "Other" category.
 
35
  """
36
  if df is None or df.empty:
37
  return pd.DataFrame()
38
 
39
  filtered_df = df.copy()
40
 
41
- col_map = {
42
- "Audio & Speech": "is_audio_speech", "Music": "has_music", "Robotics": "has_robot",
43
- "Biomedical": "is_biomed", "Time series": "has_series", "Sciences": "has_science",
44
- "Video": "has_video", "Images": "has_image", "Text": "has_text"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  }
46
 
47
  if tag_filter and tag_filter != "None" and tag_filter in col_map:
@@ -55,63 +375,209 @@ def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_cats=None):
55
  filtered_df[count_by] = 0.0
56
  filtered_df[count_by] = pd.to_numeric(filtered_df[count_by], errors='coerce').fillna(0.0)
57
 
58
- all_org_totals = filtered_df.groupby("organization")[count_by].sum()
59
- top_org_names = all_org_totals.nlargest(top_k, keep='first').index.tolist()
 
60
 
61
- top_orgs_df = filtered_df[filtered_df['organization'].isin(top_org_names)].copy()
62
- other_total = all_org_totals[~all_org_totals.index.isin(top_org_names)].sum()
63
-
64
- final_df_for_plot = top_orgs_df
65
-
66
- if other_total > 0:
67
- other_row = pd.DataFrame([{'organization': 'Other', 'id': 'Other', count_by: other_total}])
68
- final_df_for_plot = pd.concat([final_df_for_plot, other_row], ignore_index=True)
 
 
 
 
 
 
 
 
 
 
 
69
 
70
- if skip_cats and len(skip_cats) > 0:
71
- final_df_for_plot = final_df_for_plot[~final_df_for_plot['organization'].isin(skip_cats)]
72
 
73
- final_df_for_plot["root"] = "datasets"
74
- return final_df_for_plot
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- def create_treemap(treemap_data, count_by, title=None):
77
  """Generate the Plotly treemap figure from the prepared data."""
78
  if treemap_data.empty or treemap_data[count_by].sum() <= 0:
79
  fig = px.treemap(names=["No data matches filters"], parents=[""], values=[1])
80
  fig.update_layout(title="No data matches the selected filters", margin=dict(t=50, l=25, r=25, b=25))
81
  return fig
82
-
83
- fig = px.treemap(treemap_data, path=["root", "organization", "id"], values=count_by,
84
- title=title, color_discrete_sequence=px.colors.qualitative.Plotly)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
 
 
 
86
  fig.update_traces(
87
- textinfo="label+value+percent root",
88
- hovertemplate="<b>%{label}</b><br>%{value:,} " + count_by + "<br>%{percentRoot:.2%} of total<extra></extra>"
 
 
 
 
 
 
 
 
 
89
  )
90
  return fig
91
 
92
  # --- Gradio UI Blocks ---
93
- with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
 
 
 
 
 
 
 
 
 
 
94
  datasets_data_state = gr.State(pd.DataFrame())
95
  loading_complete_state = gr.State(False)
 
96
 
97
  with gr.Row():
98
- gr.Markdown("# 🤗 Dataverse Explorer")
99
 
100
- with gr.Row():
101
- with gr.Column(scale=1):
102
- count_by_dropdown = gr.Dropdown(label="Metric", choices=[("Downloads (last 30 days)", "downloads"), ("Downloads (All Time)", "downloadsAllTime"), ("Likes", "likes")], value="downloads")
103
- tag_filter_dropdown = gr.Dropdown(label="Filter by Tag", choices=TAG_FILTER_CHOICES, value="None")
104
- top_k_dropdown = gr.Dropdown(label="Number of Top Organizations", choices=TOP_K_CHOICES, value=25)
105
- skip_cats_textbox = gr.Textbox(label="Organizations to Skip from the plot", value="Other")
106
- generate_plot_button = gr.Button(value="Generate Plot", variant="primary", interactive=False)
107
-
108
- with gr.Column(scale=3):
109
- plot_output = gr.Plot()
110
- status_message_md = gr.Markdown("Initializing...")
111
- data_info_md = gr.Markdown("")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  def _update_button_interactivity(is_loaded_flag):
114
  return gr.update(interactive=is_loaded_flag)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  ## CHANGE: New combined function to load data and generate the initial plot on startup.
117
  def load_and_generate_initial_plot(progress=gr.Progress()):
@@ -126,57 +592,140 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
126
  ts = pd.to_datetime(current_df['data_download_timestamp'].iloc[0], utc=True)
127
  date_display = ts.strftime('%B %d, %Y, %H:%M:%S %Z')
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  data_info_text = (f"### Data Information\n- Source: `{HF_DATASET_ID}`\n"
130
  f"- Status: {status_msg_from_load}\n"
131
- f"- Total datasets loaded: {len(current_df):,}\n"
132
  f"- Data as of: {date_display}\n")
133
  else:
134
  data_info_text = f"### Data Load Failed\n- {status_msg_from_load}"
 
 
 
 
135
  except Exception as e:
136
  status_msg_from_load = f"An unexpected error occurred: {str(e)}"
137
  data_info_text = f"### Critical Error\n- {status_msg_from_load}"
138
  load_success_flag = False
139
  current_df = pd.DataFrame() # Ensure df is empty on failure
 
 
 
 
140
  print(f"Critical error in load_and_generate_initial_plot: {e}")
141
 
142
  # --- Part 2: Generate Initial Plot ---
143
  progress(0.6, desc="Generating initial plot...")
144
- # Get default values directly from the UI component definitions
145
- default_metric = "downloads"
146
  default_tag = "None"
147
  default_k = 25
148
- default_skip_cats = "Other"
 
 
 
 
 
 
149
 
150
- # Reuse the existing controller function for plotting
151
  initial_plot, initial_status = ui_generate_plot_controller(
152
- default_metric, default_tag, default_k, default_skip_cats, current_df, progress
153
  )
154
 
155
- return current_df, load_success_flag, data_info_text, initial_status, initial_plot
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
- def ui_generate_plot_controller(metric_choice, tag_choice, k_orgs,
158
- skip_cats_input, df_current_datasets, progress=gr.Progress()):
 
 
159
  if df_current_datasets is None or df_current_datasets.empty:
160
  return create_treemap(pd.DataFrame(), metric_choice), "Dataset data is not loaded. Cannot generate plot."
161
 
162
  progress(0.1, desc="Aggregating data...")
163
  cats_to_skip = [cat.strip() for cat in skip_cats_input.split(',') if cat.strip()]
164
 
165
- treemap_df = make_treemap_data(df_current_datasets, metric_choice, k_orgs, tag_choice, cats_to_skip)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
  progress(0.7, desc="Generating plot...")
168
- title_labels = {"downloads": "Downloads (last 30 days)", "downloadsAllTime": "Downloads (All Time)", "likes": "Likes"}
169
- chart_title = f"HuggingFace Datasets - {title_labels.get(metric_choice, metric_choice)} by Organization"
170
- plotly_fig = create_treemap(treemap_df, metric_choice, chart_title)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  if treemap_df.empty:
173
  plot_stats_md = "No data matches the selected filters. Please try different options."
174
  else:
175
  total_value_in_plot = treemap_df[metric_choice].sum()
176
- total_datasets_in_plot = treemap_df[treemap_df['id'] != 'Other']['id'].nunique()
177
- plot_stats_md = (f"## Plot Statistics\n- **Organizations/Categories Shown**: {treemap_df['organization'].nunique():,}\n"
178
- f"- **Individual Datasets Shown**: {total_datasets_in_plot:,}\n"
179
- f"- **Total {metric_choice} in plot**: {int(total_value_in_plot):,}")
 
 
 
 
 
 
 
180
 
181
  return plotly_fig, plot_stats_md
182
 
@@ -186,7 +735,19 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
186
  demo.load(
187
  fn=load_and_generate_initial_plot,
188
  inputs=[],
189
- outputs=[datasets_data_state, loading_complete_state, data_info_md, status_message_md, plot_output]
 
 
 
 
 
 
 
 
 
 
 
 
190
  )
191
 
192
  loading_complete_state.change(
@@ -195,12 +756,151 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
195
  outputs=generate_plot_button
196
  )
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  generate_plot_button.click(
199
  fn=ui_generate_plot_controller,
200
- inputs=[count_by_dropdown, tag_filter_dropdown, top_k_dropdown,
201
- skip_cats_textbox, datasets_data_state],
 
 
 
 
 
 
 
 
 
 
 
 
202
  outputs=[plot_output, status_message_md]
203
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
  if __name__ == "__main__":
206
  print("Application starting...")
 
2
  import pandas as pd
3
  import plotly.express as px
4
  import time
5
+ import os
6
+ import tempfile
7
+ import requests
8
+ import duckdb
9
+ import json
10
  from datasets import load_dataset
11
+ from huggingface_hub import logout as hf_logout
12
+ from gradio_rangeslider import RangeSlider
13
 
14
  # --- Constants ---
15
  TOP_K_CHOICES = list(range(5, 51, 5))
16
+ HF_DATASET_ID = "evijit/paperverse_daily_data"
17
+ # Direct parquet file URL (public)
18
+ PARQUET_URL = "https://huggingface.co/datasets/evijit/paperverse_daily_data/resolve/main/papers_with_semantic_taxonomy.parquet"
19
+ TAXONOMY_JSON_PATH = "integrated_ml_taxonomy.json"
20
+
21
+ # Simple content filters derived from the new dataset
22
+ TAG_FILTER_CHOICES = [
23
+ "None",
24
+ "Has Code",
25
+ "Has Media",
26
+ "Has Organization",
27
  ]
28
 
29
+ # Load taxonomy from JSON file
30
def load_taxonomy():
    """Load the ML taxonomy hierarchy from JSON and derive dropdown choices.

    Returns a dict with 'categories', 'subcategories' and 'topics' lists
    (each prefixed with "All") plus the raw 'taxonomy' mapping. Falls back
    to empty choices when the file cannot be read or parsed.
    """
    try:
        with open(TAXONOMY_JSON_PATH, 'r') as fh:
            tax = json.load(fh)

        # Flatten the two-level hierarchy into unique subcategory/topic names.
        subcat_names = set()
        topic_names = set()
        for subcat_map in tax.values():
            for subcat_name, topic_list in subcat_map.items():
                subcat_names.add(subcat_name)
                topic_names.update(topic_list)

        return {
            'categories': ["All"] + sorted(tax.keys()),
            'subcategories': ["All"] + sorted(subcat_names),
            'topics': ["All"] + sorted(topic_names),
            'taxonomy': tax,
        }
    except Exception as e:
        print(f"Error loading taxonomy from JSON: {e}")
        # Degrade gracefully: dropdowns show only "All" and no taxonomy data.
        return {
            'categories': ["All"],
            'subcategories': ["All"],
            'topics': ["All"],
            'taxonomy': {},
        }
62
+
63
+ TAXONOMY_DATA = load_taxonomy()
64
+
65
+ def _first_non_null(*values):
66
+ for v in values:
67
+ if v is None:
68
+ continue
69
+ # treat empty strings as null-ish
70
+ if isinstance(v, str) and v.strip() == "":
71
+ continue
72
+ return v
73
+ return None
74
+
75
+
76
+ def _get_nested(row, *paths):
77
+ """Try multiple dotted paths in a row that may contain dicts; return first non-null."""
78
+ for path in paths:
79
+ cur = row
80
+ ok = True
81
+ for key in path.split('.'):
82
+ if isinstance(cur, dict) and key in cur:
83
+ cur = cur[key]
84
+ else:
85
+ ok = False
86
+ break
87
+ if ok and cur is not None:
88
+ return cur
89
+ return None
90
+
91
+
92
def load_datasets_data():
    """Load the PaperVerse Daily dataset from the Hugging Face Hub and normalize columns used by the app.

    Tries a direct (anonymous) parquet download first, then falls back to the
    `datasets` loader with any cached auth tokens cleared.

    Returns:
        tuple: (df, success_flag, status_message); df is empty and the flag is
        False on failure.
    """
    start_time = time.time()
    print(f"Attempting to load dataset from Hugging Face Hub: {HF_DATASET_ID}")
    try:
        # First try: direct parquet download (avoids any auth header issues)
        try:
            print(f"Trying direct parquet download: {PARQUET_URL}")
            # Stream to a temp file so large parquet payloads are not held in memory.
            with requests.get(PARQUET_URL, stream=True, timeout=120) as resp:
                resp.raise_for_status()
                with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmpf:
                    for chunk in resp.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            tmpf.write(chunk)
                    tmp_path = tmpf.name
            try:
                # Use DuckDB to read parquet to avoid pyarrow decoding issues
                df = duckdb.query(f"SELECT * FROM read_parquet('{tmp_path}')").df()
            finally:
                # Best-effort cleanup of the temp file (delete=False above).
                try:
                    os.remove(tmp_path)
                except Exception:
                    pass
            print("Loaded DataFrame from direct parquet download via DuckDB.")
        except Exception as direct_e:
            print(f"Direct parquet load failed: {direct_e}. Falling back to datasets loader...")
            # Force anonymous access in case an invalid cached token is present
            # Clear any token environment variables that could inject a bad Authorization header
            for env_key in ("HF_TOKEN", "HUGGINGFACE_HUB_TOKEN", "HF_HUB_TOKEN"):
                if os.environ.pop(env_key, None) is not None:
                    print(f"Cleared env var: {env_key}")

            # Prefer explicit train split when available
            try:
                dataset_obj = load_dataset(HF_DATASET_ID, split="train", token=None)
            except TypeError:
                # Older `datasets` versions use use_auth_token instead of token.
                dataset_obj = load_dataset(HF_DATASET_ID, split="train", use_auth_token=False)
            except Exception:
                # Fallback: load all splits and pick the first available
                try:
                    dataset_obj = load_dataset(HF_DATASET_ID, token=None)
                except TypeError:
                    dataset_obj = load_dataset(HF_DATASET_ID, use_auth_token=False)

            # Handle both Dataset and DatasetDict
            try:
                # If it's a Dataset (single split), this will work
                df = dataset_obj.to_pandas()
            except AttributeError:
                # Otherwise assume DatasetDict and take the first split
                first_split = list(dataset_obj.keys())[0]
                df = dataset_obj[first_split].to_pandas()

        # --- Normalize expected columns for the visualization ---
        # organization: prefer top-level organization_name, then paper_organization.name/fullname, else Unknown
        if 'organization_name' in df.columns:
            org_series = df['organization_name']
        else:
            # try nested dicts commonly produced by HF datasets
            org_series = df.apply(
                lambda r: _first_non_null(
                    _get_nested(r, 'paper_organization.name'),
                    _get_nested(r, 'paper_organization.fullname'),
                    _get_nested(r, 'organization.name'),
                    _get_nested(r, 'organization.fullname')
                ), axis=1
            )
        df['organization'] = org_series.fillna('Unknown')

        # Extract organization avatar/logo
        # NOTE(review): the avatar lookup only runs when a flat organization_name
        # column exists, yet it reads nested structs — confirm both are present
        # together in the source data.
        if 'organization_name' in df.columns:
            # Try to get avatar from paper_organization or organization struct
            def _get_avatar(row):
                for path in ['paper_organization.avatar', 'organization.avatar']:
                    av = _get_nested(row, path)
                    if av and isinstance(av, str) and av.strip():
                        return av
                return None
            org_avatar_series = df.apply(_get_avatar, axis=1)
        else:
            org_avatar_series = pd.Series([None] * len(df))
        df['organization_avatar'] = org_avatar_series

        # id for each paper row: first matching candidate column wins.
        cand_cols = [
            'paper_id', 'paper_discussionId', 'key'
        ]
        id_val = None
        for c in cand_cols:
            if c in df.columns:
                id_val = df[c]
                break
        if id_val is None:
            # fallback to title + index (index suffix keeps duplicate titles unique)
            if 'paper_title' in df.columns:
                df['id'] = df['paper_title'].astype(str) + '_' + df.reset_index().index.astype(str)
            elif 'title' in df.columns:
                df['id'] = df['title'].astype(str) + '_' + df.reset_index().index.astype(str)
            else:
                df['id'] = df.reset_index().index.astype(str)
        else:
            df['id'] = id_val.astype(str)

        # numeric metrics used for aggregation; missing columns become all-zero.
        def _to_num(col_name):
            if col_name in df.columns:
                return pd.to_numeric(df[col_name], errors='coerce').fillna(0.0)
            return pd.Series([0.0] * len(df))

        df['paper_upvotes'] = _to_num('paper_upvotes')
        df['numComments'] = _to_num('numComments')
        df['paper_githubStars'] = _to_num('paper_githubStars')

        # computed boolean filters
        def _has_code(row):
            # True when the row has a non-empty GitHub repo or project page.
            # Check for GitHub repo
            try:
                gh = row['paper_githubRepo'] if 'paper_githubRepo' in row and pd.notna(row['paper_githubRepo']) else None
                if isinstance(gh, str) and len(gh.strip()) > 0:
                    return True
            except Exception:
                pass
            # Check for project page
            try:
                pp = row.get('paper_projectPage') if isinstance(row, dict) else row.get('paper_projectPage', None)
                if isinstance(pp, str) and len(str(pp).strip()) > 0 and str(pp).strip().lower() != 'n/a':
                    return True
            except Exception:
                pass
            return False

        def _has_media(row):
            # True when a media-URL field holds a non-empty list (or a string
            # that looks like a serialized non-empty list).
            for c in ['paper_mediaUrls', 'mediaUrls']:
                try:
                    v = row[c]
                    if isinstance(v, list) and len(v) > 0:
                        return True
                    # some providers store arrays as strings like "[... ]"
                    if isinstance(v, str) and v.strip().startswith('[') and len(v.strip()) > 2:
                        return True
                except Exception:
                    continue
            return False

        df['has_code'] = df.apply(_has_code, axis=1)
        df['has_media'] = df.apply(_has_media, axis=1)
        df['has_organization'] = df['organization'].astype(str).str.strip().ne('Unknown')

        # Process publishedAt field for date filtering (NaT when absent/unparseable)
        if 'publishedAt' in df.columns:
            df['publishedAt_dt'] = pd.to_datetime(df['publishedAt'], errors='coerce')
        else:
            df['publishedAt_dt'] = pd.NaT

        # Ensure topic hierarchy columns exist and are strings
        for col_name, default_val in [
            ('primary_category', 'Unknown'),
            ('primary_subcategory', 'Unknown'),
            ('primary_topic', 'Unknown'),
        ]:
            if col_name not in df.columns:
                df[col_name] = default_val
            else:
                df[col_name] = df[col_name].fillna(default_val).astype(str).replace({'': default_val})

        # Create a human-friendly paper label for treemap leaves: "<title> — <topic>"
        def _pick_title(row):
            # NOTE(review): this first assignment is immediately overwritten
            # below — dead code left in place.
            t1 = row.get('paper_title') if isinstance(row, dict) else None
            try:
                t1 = row['paper_title'] if 'paper_title' in row and pd.notna(row['paper_title']) and str(row['paper_title']).strip() != '' else None
            except Exception:
                pass
            if t1 is not None:
                return str(t1)
            try:
                t2 = row['title'] if 'title' in row and pd.notna(row['title']) and str(row['title']).strip() != '' else None
            except Exception:
                t2 = None
            return str(t2) if t2 is not None else 'Untitled'

        # NOTE(review): _pick_topic is defined but never called below — the
        # label uses titles only and topic_chain is built from columns directly.
        def _pick_topic(row):
            # Prefer primary_topic, else first of taxonomy_topics
            try:
                pt = row['primary_topic'] if 'primary_topic' in row and pd.notna(row['primary_topic']) and str(row['primary_topic']).strip() != '' else None
            except Exception:
                pt = None
            if pt is not None:
                return str(pt)
            try:
                tt = row['taxonomy_topics'] if 'taxonomy_topics' in row else None
                if isinstance(tt, list) and len(tt) > 0:
                    return str(tt[0])
                # Sometimes arrays are serialized as strings like "[ ... ]"
                if isinstance(tt, str) and tt.strip().startswith('[') and len(tt.strip()) > 2:
                    # naive parse for first quoted token
                    inner = tt.strip().lstrip('[').rstrip(']')
                    first = inner.split(',')[0].strip().strip('"\'')
                    return first if first else 'No topic'
            except Exception:
                pass
            return 'No topic'

        titles = df.apply(_pick_title, axis=1)
        df['paper_label'] = titles.astype(str)
        # Build a Topic Chain for hover details
        df['topic_chain'] = (
            df['primary_category'].astype(str) + ' > ' +
            df['primary_subcategory'].astype(str) + ' > ' +
            df['primary_topic'].astype(str)
        )

        # Ensure link fields exist for hover details
        for link_col in ['paper_githubRepo', 'paper_projectPage']:
            if link_col not in df.columns:
                df[link_col] = 'N/A'
            else:
                df[link_col] = df[link_col].fillna('N/A').replace({'': 'N/A'})

        msg = f"Successfully loaded dataset in {time.time() - start_time:.2f}s."
        print(msg)
        return df, True, msg
    except Exception as e:
        # If we encountered invalid credentials, try logging out programmatically and retry once anonymously
        if "Invalid credentials" in str(e) or "401 Client Error" in str(e):
            try:
                print("Encountered auth error; attempting to clear cached token and retry anonymously...")
                hf_logout()
                try:
                    dataset_dict = load_dataset(HF_DATASET_ID, token=None)
                except TypeError:
                    dataset_dict = load_dataset(HF_DATASET_ID, use_auth_token=False)
                df = dataset_dict[list(dataset_dict.keys())[0]].to_pandas()
                msg = f"Successfully loaded dataset after clearing token in {time.time() - start_time:.2f}s."
                print(msg)
                return df, True, msg
            except Exception as e2:
                err_msg = f"Failed to load dataset after retry. Error: {e2} (initial: {e})"
                print(err_msg)
                return pd.DataFrame(), False, err_msg
        err_msg = f"Failed to load dataset. Error: {e}"
        print(err_msg)
        return pd.DataFrame(), False, err_msg
334
 
335
+ def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_cats=None, group_by='organization', date_range=None):
336
  """
337
  Filter data and prepare it for a multi-level treemap.
338
  - Preserves individual datasets for the top K organizations.
339
  - Groups all other organizations into a single "Other" category.
340
+ - date_range: tuple of (min_timestamp, max_timestamp) in seconds since epoch
341
  """
342
  if df is None or df.empty:
343
  return pd.DataFrame()
344
 
345
  filtered_df = df.copy()
346
 
347
+ # Apply date range filter
348
+ if date_range is not None and 'publishedAt_dt' in filtered_df.columns:
349
+ min_ts, max_ts = date_range
350
+ min_date = pd.to_datetime(min_ts, unit='s')
351
+ max_date = pd.to_datetime(max_ts, unit='s')
352
+ # Remove timezone info for comparison if publishedAt_dt is tz-naive
353
+ if filtered_df['publishedAt_dt'].dt.tz is None:
354
+ min_date = min_date.tz_localize(None)
355
+ max_date = max_date.tz_localize(None)
356
+ filtered_df = filtered_df[
357
+ (filtered_df['publishedAt_dt'] >= min_date) &
358
+ (filtered_df['publishedAt_dt'] <= max_date)
359
+ ]
360
+
361
+ col_map = {
362
+ "Has Code": "has_code",
363
+ "Has Media": "has_media",
364
+ "Has Organization": "has_organization",
365
  }
366
 
367
  if tag_filter and tag_filter != "None" and tag_filter in col_map:
 
375
  filtered_df[count_by] = 0.0
376
  filtered_df[count_by] = pd.to_numeric(filtered_df[count_by], errors='coerce').fillna(0.0)
377
 
378
+ if group_by == 'organization':
379
+ all_org_totals = filtered_df.groupby("organization")[count_by].sum()
380
+ top_org_names = all_org_totals.nlargest(top_k, keep='first').index.tolist()
381
 
382
+ top_orgs_df = filtered_df[filtered_df['organization'].isin(top_org_names)].copy()
383
+ other_total = all_org_totals[~all_org_totals.index.isin(top_org_names)].sum()
384
+
385
+ final_df_for_plot = top_orgs_df
386
+
387
+ if other_total > 0:
388
+ other_row = pd.DataFrame([{
389
+ 'organization': 'Other',
390
+ 'paper_label': 'Other',
391
+ 'primary_category': 'Other',
392
+ 'primary_subcategory': 'Other',
393
+ 'primary_topic': 'Other',
394
+ 'topic_chain': 'Other > Other > Other',
395
+ 'paper_githubRepo': 'N/A',
396
+ 'paper_projectPage': 'N/A',
397
+ 'organization_avatar': None,
398
+ count_by: other_total
399
+ }])
400
+ final_df_for_plot = pd.concat([final_df_for_plot, other_row], ignore_index=True)
401
 
402
+ if skip_cats and len(skip_cats) > 0:
403
+ final_df_for_plot = final_df_for_plot[~final_df_for_plot['organization'].isin(skip_cats)]
404
 
405
+ final_df_for_plot["root"] = "papers"
406
+ return final_df_for_plot
407
+ else:
408
+ # Topic grouping: apply top-k to topic combinations and handle skip list
409
+ topic_totals = filtered_df.groupby(['primary_category', 'primary_subcategory', 'primary_topic'])[count_by].sum()
410
+ top_topics = topic_totals.nlargest(top_k, keep='first').index.tolist()
411
+
412
+ # Filter to top topics
413
+ top_topics_df = filtered_df[
414
+ filtered_df.apply(
415
+ lambda r: (r['primary_category'], r['primary_subcategory'], r['primary_topic']) in top_topics,
416
+ axis=1
417
+ )
418
+ ].copy()
419
+
420
+ # Apply skip filter (skip by primary_topic name)
421
+ if skip_cats and len(skip_cats) > 0:
422
+ top_topics_df = top_topics_df[~top_topics_df['primary_topic'].isin(skip_cats)]
423
+
424
+ top_topics_df["root"] = "papers"
425
+ return top_topics_df
426
 
427
def create_treemap(treemap_data, count_by, title=None, path=None, metric_label=None):
    """Generate the Plotly treemap figure from the prepared data.

    Args:
        treemap_data: DataFrame produced by make_treemap_data.
        count_by: numeric column used for tile sizing.
        title: optional figure title.
        path: optional hierarchy column list; defaults to root > organization > paper_label.
        metric_label: optional human-readable metric name for the hover text.
    """
    if treemap_data.empty or treemap_data[count_by].sum() <= 0:
        # Placeholder figure when nothing survives the filters.
        fig = px.treemap(names=["No data matches filters"], parents=[""], values=[1])
        fig.update_layout(title="No data matches the selected filters", margin=dict(t=50, l=25, r=25, b=25))
        return fig
    if path is None:
        path = ["root", "organization", "paper_label"]
    # Add custom data columns as regular columns for Plotly to access
    # This ensures all nodes (including intermediate hierarchy nodes) have these fields
    # Ensure organization_avatar column exists (for search details, not hover)
    # NOTE(review): this writes into the caller's DataFrame in place — confirm
    # no caller depends on the frame being untouched.
    if 'organization_avatar' not in treemap_data.columns:
        treemap_data['organization_avatar'] = None

    # hover_data order fixes the customdata indices used in the hovertemplate below.
    fig = px.treemap(
        treemap_data,
        path=path,
        values=count_by,
        hover_data={
            'primary_category': True,
            'primary_subcategory': True,
            'primary_topic': True,
            'paper_githubRepo': True,
            'paper_projectPage': True,
        },
        title=title,
        color_discrete_sequence=px.colors.qualitative.Plotly
    )
    fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    display_metric = metric_label if metric_label else count_by

    # Clean hover without organization avatar (images shown in search details instead)
    fig.update_traces(
        textinfo="label+value",
        hovertemplate=(
            "<b>%{label}</b><br>"
            + "%{value:,} " + display_metric +
            "<br><br><b>Topic Hierarchy:</b><br>"
            + "%{customdata[0]} > %{customdata[1]} > %{customdata[2]}<br>"
            + "<br><b>Links:</b><br>"
            + "GitHub: %{customdata[3]}<br>"
            + "Project: %{customdata[4]}"
            + "<extra></extra>"
        ),
    )
    return fig
473
 
474
  # --- Gradio UI Blocks ---
475
+ with gr.Blocks(
476
+ title="📚 PaperVerse Daily Explorer",
477
+ fill_width=True,
478
+ css="""
479
+ /* Hide the timestamp numbers on the range slider */
480
+ #date-range-slider-wrapper .head,
481
+ #date-range-slider-wrapper div[data-testid="range-slider"] > span {
482
+ display: none !important;
483
+ }
484
+ """
485
+ ) as demo:
486
  datasets_data_state = gr.State(pd.DataFrame())
487
  loading_complete_state = gr.State(False)
488
+ date_range_state = gr.State(None) # Store min/max timestamps
489
 
490
  with gr.Row():
491
+ gr.Markdown("# 📚 PaperVerse Daily Explorer")
492
 
493
+ with gr.Tabs():
494
+ with gr.Tab("📊 Treemap Visualization"):
495
+ with gr.Row():
496
+ with gr.Column(scale=1):
497
+ count_by_dropdown = gr.Dropdown(
498
+ label="Metric",
499
+ choices=[
500
+ ("Upvotes", "paper_upvotes"),
501
+ ("Comments", "numComments"),
502
+ ],
503
+ value="paper_upvotes",
504
+ )
505
+ group_by_dropdown = gr.Dropdown(
506
+ label="Group by",
507
+ choices=[("Organization", "organization"), ("Topic", "topic")],
508
+ value="organization",
509
+ )
510
+ gr.Markdown("**Filters**")
511
+ filter_code = gr.Checkbox(label="Has Code", value=False)
512
+ filter_media = gr.Checkbox(label="Has Media", value=False)
513
+ filter_org = gr.Checkbox(label="Has Organization", value=False)
514
+
515
+ gr.Markdown("**Date Range**")
516
+ date_range_slider = RangeSlider(
517
+ minimum=0,
518
+ maximum=100,
519
+ value=(0, 100),
520
+ label="Paper Release Date Range",
521
+ interactive=True,
522
+ elem_id="date-range-slider-wrapper"
523
+ )
524
+ date_range_display = gr.Markdown("Loading date range...")
525
+
526
+ top_k_dropdown = gr.Dropdown(label="Number of Top Organizations", choices=TOP_K_CHOICES, value=25)
527
+ category_filter_dropdown = gr.Dropdown(label="Primary Category", choices=["All"], value="All")
528
+ subcategory_filter_dropdown = gr.Dropdown(label="Primary Subcategory", choices=["All"], value="All")
529
+ topic_filter_dropdown = gr.Dropdown(label="Primary Topic", choices=["All"], value="All")
530
+ skip_cats_textbox = gr.Textbox(label="Organizations to Skip", value="unaffiliated, Other")
531
+ generate_plot_button = gr.Button(value="Generate Plot", variant="primary", interactive=False)
532
+
533
+ with gr.Column(scale=3):
534
+ plot_output = gr.Plot()
535
+ status_message_md = gr.Markdown("Initializing...")
536
+ data_info_md = gr.Markdown("")
537
+
538
+ with gr.Tab("🔍 Paper Search"):
539
+ with gr.Column():
540
+ gr.Markdown("### 🔍 Search Papers and Organizations")
541
+ with gr.Row():
542
+ search_item = gr.Textbox(
543
+ label="Search Organization or Paper",
544
+ placeholder="Type organization name or paper title to see details...",
545
+ scale=4
546
+ )
547
+ search_button = gr.Button("Show Details", scale=1, variant="secondary")
548
+ selected_info_html = gr.HTML(value="<p style='color: gray;'>Enter an organization name or paper title above to see details</p>")
549
 
550
def _update_button_interactivity(is_loaded_flag):
    """Mirror the data-load flag onto the Generate Plot button's interactive state."""
    button_update = gr.update(interactive=is_loaded_flag)
    return button_update
552
+
553
def _format_date_range(date_range_tuple, date_range_value):
    """Render the currently selected slider range as human-readable Markdown.

    Args:
        date_range_tuple: (min_ts, max_ts) bounds of the full data range, or
            None when no publication dates are available.
        date_range_value: (start_ts, end_ts) currently selected on the slider;
            the slider values are Unix timestamps in seconds.

    Returns:
        A Markdown string describing the selected range, or a fallback message
        when no date range is available.
    """
    if date_range_tuple is None:
        return "Date range unavailable"

    # Only the selected slider values are rendered; the full-range bounds are
    # used solely as an availability check above. (The previous version also
    # unpacked date_range_tuple into unused locals.)
    selected_min, selected_max = date_range_value

    # Slider values are already Unix timestamps (seconds).
    min_date = pd.to_datetime(selected_min, unit='s')
    max_date = pd.to_datetime(selected_max, unit='s')

    return f"**Selected Range:** {min_date.strftime('%B %d, %Y')} to {max_date.strftime('%B %d, %Y')}"
566
+
567
def _toggle_labels_by_grouping(group_by_value):
    """Relabel the top-K dropdown and the skip textbox to match the grouping mode.

    Grouping by topic clears the skip list; grouping by organization restores
    the default organizations to exclude.
    """
    if group_by_value == 'topic':
        top_k_label = "Number of Top Topics"
        skip_label = "Topics to Skip"
        skip_value = ""
    else:
        top_k_label = "Number of Top Organizations"
        skip_label = "Organizations to Skip"
        skip_value = "unaffiliated, Other"

    return (
        gr.update(label=top_k_label),
        gr.update(label=skip_label, value=skip_value),
    )
581
 
582
  ## CHANGE: New combined function to load data and generate the initial plot on startup.
583
  def load_and_generate_initial_plot(progress=gr.Progress()):
 
592
  ts = pd.to_datetime(current_df['data_download_timestamp'].iloc[0], utc=True)
593
  date_display = ts.strftime('%B %d, %Y, %H:%M:%S %Z')
594
 
595
+ # Calculate date range from publishedAt_dt
596
+ min_ts = 0
597
+ max_ts = 100
598
+ date_range_text = "Date range unavailable"
599
+ date_range_tuple = None
600
+
601
+ if 'publishedAt_dt' in current_df.columns:
602
+ valid_dates = current_df['publishedAt_dt'].dropna()
603
+ if len(valid_dates) > 0:
604
+ min_date = valid_dates.min()
605
+ max_date = valid_dates.max()
606
+ min_ts = int(min_date.timestamp())
607
+ max_ts = int(max_date.timestamp())
608
+ date_range_tuple = (min_ts, max_ts)
609
+ date_range_text = f"**Full Range:** {min_date.strftime('%B %d, %Y')} to {max_date.strftime('%B %d, %Y')}"
610
+
611
  data_info_text = (f"### Data Information\n- Source: `{HF_DATASET_ID}`\n"
612
  f"- Status: {status_msg_from_load}\n"
613
+ f"- Total records loaded: {len(current_df):,}\n"
614
  f"- Data as of: {date_display}\n")
615
  else:
616
  data_info_text = f"### Data Load Failed\n- {status_msg_from_load}"
617
+ min_ts = 0
618
+ max_ts = 100
619
+ date_range_text = "Date range unavailable"
620
+ date_range_tuple = None
621
  except Exception as e:
622
  status_msg_from_load = f"An unexpected error occurred: {str(e)}"
623
  data_info_text = f"### Critical Error\n- {status_msg_from_load}"
624
  load_success_flag = False
625
  current_df = pd.DataFrame() # Ensure df is empty on failure
626
+ min_ts = 0
627
+ max_ts = 100
628
+ date_range_text = "Date range unavailable"
629
+ date_range_tuple = None
630
  print(f"Critical error in load_and_generate_initial_plot: {e}")
631
 
632
  # --- Part 2: Generate Initial Plot ---
633
  progress(0.6, desc="Generating initial plot...")
634
+ # Defaults matching UI definitions
635
+ default_metric = "paper_upvotes"
636
  default_tag = "None"
637
  default_k = 25
638
+ default_group_by = "organization"
639
+ default_skip_cats = "unaffiliated, Other"
640
+
641
+ # Use taxonomy from JSON instead of calculating from dataset
642
+ cat_choices = TAXONOMY_DATA['categories']
643
+ subcat_choices = TAXONOMY_DATA['subcategories']
644
+ topic_choices = TAXONOMY_DATA['topics']
645
 
646
+ # Reuse the existing controller function for plotting (with date range set to None for initial load)
647
  initial_plot, initial_status = ui_generate_plot_controller(
648
+ default_metric, False, False, False, default_k, default_group_by, "All", "All", "All", default_skip_cats, None, current_df, progress
649
  )
650
 
651
+ # Also update taxonomy dropdown choices
652
+ return (
653
+ current_df,
654
+ load_success_flag,
655
+ data_info_text,
656
+ initial_status,
657
+ initial_plot,
658
+ gr.update(choices=cat_choices, value="All"),
659
+ gr.update(choices=subcat_choices, value="All"),
660
+ gr.update(choices=topic_choices, value="All"),
661
+ gr.update(minimum=min_ts, maximum=max_ts, value=(min_ts, max_ts)),
662
+ date_range_text,
663
+ date_range_tuple,
664
+ )
665
 
666
def ui_generate_plot_controller(metric_choice, has_code, has_media, has_org,
                                k_orgs, group_by_choice,
                                category_choice, subcategory_choice, topic_choice,
                                skip_cats_input, date_range, df_current_datasets, progress=gr.Progress()):
    """Filter the papers DataFrame and build the treemap plus a stats summary.

    Args:
        metric_choice: Column to aggregate ("paper_upvotes" or "numComments").
        has_code / has_media / has_org: Checkbox filters over the corresponding
            boolean columns.
        k_orgs: Number of top groups to keep.
        group_by_choice: "topic" or "organization" — controls the treemap path.
        category_choice / subcategory_choice / topic_choice: Taxonomy filters;
            "All" leaves a level unconstrained.
        skip_cats_input: Comma-separated names to exclude from grouping.
        date_range: (start_ts, end_ts) slider selection, or None.
        df_current_datasets: The loaded papers DataFrame.
        progress: Gradio progress reporter.

    Returns:
        (plotly figure, stats markdown). When no data is loaded, an empty
        figure and an explanatory message are returned instead.
    """
    if df_current_datasets is None or df_current_datasets.empty:
        return create_treemap(pd.DataFrame(), metric_choice), "Dataset data is not loaded. Cannot generate plot."

    progress(0.1, desc="Aggregating data...")
    skip_list = [name.strip() for name in skip_cats_input.split(',') if name.strip()]

    # Boolean content filters from the checkboxes; each column is presumably a
    # boolean flag — confirm against the data-preparation pipeline.
    filtered = df_current_datasets.copy()
    for enabled, column in ((has_code, 'has_code'),
                            (has_media, 'has_media'),
                            (has_org, 'has_organization')):
        if enabled:
            filtered = filtered[filtered[column]]

    # Taxonomy filters; "All" disables a level.
    for choice, column in ((category_choice, 'primary_category'),
                           (subcategory_choice, 'primary_subcategory'),
                           (topic_choice, 'primary_topic')):
        if choice and choice != 'All':
            filtered = filtered[filtered[column] == choice]

    treemap_df = make_treemap_data(filtered, metric_choice, k_orgs, None, skip_list, group_by_choice, date_range)

    progress(0.7, desc="Generating plot...")
    metric_labels = {
        "paper_upvotes": "Upvotes",
        "numComments": "Comments",
    }
    metric_label = metric_labels.get(metric_choice, metric_choice)
    if group_by_choice == "topic":
        chart_title = f"PaperVerse Daily - {metric_label} by Topic"
        hierarchy = ["root", "primary_category", "primary_subcategory", "primary_topic", "paper_label"]
    else:
        chart_title = f"PaperVerse Daily - {metric_label} by Organization"
        hierarchy = ["root", "organization", "paper_label"]
    fig = create_treemap(
        treemap_df,
        metric_choice,
        chart_title,
        path=hierarchy,
        metric_label=metric_label,
    )

    if treemap_df.empty:
        stats_md = "No data matches the selected filters. Please try different options."
    else:
        metric_total = treemap_df[metric_choice].sum()
        # 'Other' rows are rollup buckets produced by the aggregation, not
        # individual papers, so exclude them from the paper count.
        paper_count = treemap_df[treemap_df['paper_label'] != 'Other']['paper_label'].nunique()
        if group_by_choice == "topic":
            triplet_count = treemap_df[["primary_category", "primary_subcategory", "primary_topic"]].drop_duplicates().shape[0]
            group_summary = f"**Topics Shown**: {triplet_count:,} unique triplets"
        else:
            group_summary = f"**Organizations Shown**: {treemap_df['organization'].nunique():,}"
        stats_md = (
            f"## Plot Statistics\n- {group_summary}\n"
            f"- **Individual Papers Shown**: {paper_count:,}\n"
            f"- **Total {metric_label} in plot**: {int(metric_total):,}"
        )

    return fig, stats_md
731
 
 
735
  demo.load(
736
  fn=load_and_generate_initial_plot,
737
  inputs=[],
738
+ outputs=[
739
+ datasets_data_state,
740
+ loading_complete_state,
741
+ data_info_md,
742
+ status_message_md,
743
+ plot_output,
744
+ category_filter_dropdown,
745
+ subcategory_filter_dropdown,
746
+ topic_filter_dropdown,
747
+ date_range_slider,
748
+ date_range_display,
749
+ date_range_state,
750
+ ]
751
  )
752
 
753
  loading_complete_state.change(
 
756
  outputs=generate_plot_button
757
  )
758
 
759
+ # Update labels based on grouping mode
760
+ group_by_dropdown.change(
761
+ fn=_toggle_labels_by_grouping,
762
+ inputs=group_by_dropdown,
763
+ outputs=[top_k_dropdown, skip_cats_textbox],
764
+ )
765
+
766
+ # Update date range display when slider changes
767
+ date_range_slider.change(
768
+ fn=_format_date_range,
769
+ inputs=[date_range_state, date_range_slider],
770
+ outputs=date_range_display,
771
+ show_progress="hidden"
772
+ )
773
+
774
def handle_search_details(search_text, df_current):
    """Search papers/organizations and render the matches as an HTML panel.

    Matches `search_text` as a literal, case-insensitive substring against the
    'organization', 'paper_label' and (when present) 'paper_title' columns.

    Args:
        search_text: Free-text query from the search box (untrusted input).
        df_current: The loaded papers DataFrame, or None when no data exists.

    Returns:
        An HTML string; at most the first 20 matching rows are rendered.
    """
    # Stdlib import kept local so the handler is self-contained.
    import html as _html

    def _usable(value, placeholders=('none', 'null', 'n/a', '')):
        # A field is usable when it is a non-empty string and not a null placeholder.
        return (isinstance(value, str) and value.strip()
                and value.strip().lower() not in placeholders)

    if not search_text or not search_text.strip():
        return "<p style='color: gray;'>Please enter a search term</p>"

    if df_current is None or df_current.empty:
        return "<p style='color: gray;'>No data available</p>"

    search_text = search_text.strip()
    # Escape once: user input must never reach the HTML panel unescaped
    # (reflected-XSS vector in the previous version).
    safe_query = _html.escape(search_text)

    try:
        # regex=False: treat the query literally so metacharacters such as
        # '(' or 'C++' don't raise re.error (previously surfaced as an error panel).
        mask = (
            df_current['organization'].str.contains(search_text, case=False, regex=False, na=False)
            | df_current['paper_label'].str.contains(search_text, case=False, regex=False, na=False)
        )
        if 'paper_title' in df_current.columns:
            mask |= df_current['paper_title'].str.contains(search_text, case=False, regex=False, na=False)
        matching_rows = df_current[mask]

        if matching_rows.empty:
            return (
                f"<p style='color: orange;'>No results found for: <b>{safe_query}</b></p>"
                "<p style='color: gray;'>Try searching for an organization name (e.g., 'Qwen', 'Meta') or paper title keyword</p>"
            )

        num_results = len(matching_rows)
        html_parts = [
            "<div style='padding: 15px; border: 1px solid #ddd; border-radius: 8px; background: #f9f9f9; max-height: 600px; overflow-y: auto;'>",
            f"<h3 style='margin: 0 0 15px 0; color: #333;'>🔍 Found {num_results} result{'s' if num_results > 1 else ''} for: <span style='color: #0366d6;'>{safe_query}</span></h3>"
        ]

        # Cap at 20 rendered rows to keep the HTML payload small.
        display_rows = matching_rows.head(20)

        for idx, (_, row) in enumerate(display_rows.iterrows()):
            if idx > 0:
                html_parts.append("<hr style='margin: 15px 0; border: none; border-top: 1px solid #ddd;'/>")

            html_parts.append("<div style='margin-bottom: 10px; overflow: auto;'>")

            # Organization logo (precomputed column), floated left.
            org_avatar = row.get('organization_avatar')
            if _usable(org_avatar):
                html_parts.append(f"<img src='{_html.escape(org_avatar.strip())}' style='max-width: 60px; max-height: 60px; border-radius: 50%; margin-bottom: 8px; float: left; margin-right: 12px; border: 2px solid #ddd;' onerror=\"this.style.display='none'\"/>")

            # Paper thumbnail (direct field from the schema), floated right.
            paper_thumbnail = row.get('thumbnail')
            if _usable(paper_thumbnail):
                html_parts.append(f"<img src='{_html.escape(paper_thumbnail.strip())}' style='max-width: 120px; max-height: 120px; border-radius: 8px; margin-bottom: 8px; float: right; margin-left: 12px; border: 1px solid #ddd;' onerror=\"this.style.display='none'\"/>")

            org_name = row.get('organization', 'Unknown')
            html_parts.append(f"<p style='margin: 0 0 5px 0; font-weight: bold; color: #333;'>🏢 {_html.escape(str(org_name))}</p>")

            paper_title = row.get('paper_title', row.get('title', 'Untitled'))
            html_parts.append(f"<p style='margin: 0 0 5px 0; color: #555; font-size: 0.95em;'>📄 {_html.escape(str(paper_title))}</p>")

            # Taxonomy hierarchy: category → subcategory → topic.
            category = row.get('primary_category', 'Unknown')
            subcategory = row.get('primary_subcategory', 'Unknown')
            topic = row.get('primary_topic', 'Unknown')
            html_parts.append(f"<p style='margin: 0 0 5px 0; font-size: 0.9em; color: #666;'><b>Topics:</b> {_html.escape(str(category))} → {_html.escape(str(subcategory))} → {_html.escape(str(topic))}</p>")

            upvotes = row.get('paper_upvotes', 0)
            comments = row.get('numComments', 0)
            html_parts.append(f"<p style='margin: 0 0 5px 0; font-size: 0.9em;'><b>Metrics:</b> ⬆️ {upvotes:,} upvotes | 💬 {comments:,} comments</p>")

            github = row.get('paper_githubRepo')
            project = row.get('paper_projectPage')

            links = []
            if _usable(github, placeholders=('n/a', 'none')):
                links.append(f"<a href='{_html.escape(github.strip())}' target='_blank' style='color: #0366d6; margin-right: 15px;'>🔗 GitHub</a>")
            if _usable(project, placeholders=('n/a', 'none')):
                links.append(f"<a href='{_html.escape(project.strip())}' target='_blank' style='color: #0366d6;'>🔗 Project</a>")
            if links:
                html_parts.append(f"<p style='margin: 0; font-size: 0.9em;'>{' '.join(links)}</p>")

            html_parts.append("<div style='clear: both;'></div>")
            html_parts.append("</div>")

        if num_results > 20:
            html_parts.append(f"<p style='margin-top: 15px; color: #666; font-style: italic;'>Showing first 20 of {num_results} results. Refine your search for fewer results.</p>")

        html_parts.append("</div>")
        return "".join(html_parts)

    except Exception as e:
        return f"<p style='color: red;'>Error displaying details: {str(e)}</p>"
871
+
872
  generate_plot_button.click(
873
  fn=ui_generate_plot_controller,
874
+ inputs=[
875
+ count_by_dropdown,
876
+ filter_code,
877
+ filter_media,
878
+ filter_org,
879
+ top_k_dropdown,
880
+ group_by_dropdown,
881
+ category_filter_dropdown,
882
+ subcategory_filter_dropdown,
883
+ topic_filter_dropdown,
884
+ skip_cats_textbox,
885
+ date_range_slider,
886
+ datasets_data_state,
887
+ ],
888
  outputs=[plot_output, status_message_md]
889
  )
890
+
891
+ # Handle search button for showing details
892
+ search_button.click(
893
+ fn=handle_search_details,
894
+ inputs=[search_item, datasets_data_state],
895
+ outputs=[selected_info_html]
896
+ )
897
+
898
+ # Also trigger on Enter key in search box
899
+ search_item.submit(
900
+ fn=handle_search_details,
901
+ inputs=[search_item, datasets_data_state],
902
+ outputs=[selected_info_html]
903
+ )
904
 
905
  if __name__ == "__main__":
906
  print("Application starting...")
integrated_ml_taxonomy.json ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Algorithms and Learning Methods": {
3
+ "Supervised Learning": [
4
+ "Classification",
5
+ "Regression",
6
+ "Structured Prediction",
7
+ "Ranking and Preference Learning"
8
+ ],
9
+ "Unsupervised Learning": [
10
+ "Clustering",
11
+ "Density Estimation",
12
+ "Unsupervised Representation Learning"
13
+ ],
14
+ "Semi-Supervised and Self-Supervised Learning": [
15
+ "Semi-Supervised Learning",
16
+ "Self-Supervised Learning"
17
+ ],
18
+ "Reinforcement Learning and Planning": [
19
+ "Reinforcement Learning",
20
+ "Reinforcement Learning with Human Feedback (RLHF)",
21
+ "Markov Decision Processes",
22
+ "Model-Based RL",
23
+ "Multi-Agent RL",
24
+ "Hierarchical RL",
25
+ "Exploration",
26
+ "Decision and Control",
27
+ "Planning",
28
+ "Planning Algorithms",
29
+ "Navigation"
30
+ ],
31
+ "Transfer and Adaptation": [
32
+ "Transfer Learning",
33
+ "Meta-Learning",
34
+ "Multitask Learning",
35
+ "Lifelong Learning",
36
+ "Continual Learning",
37
+ "Few-Shot Learning",
38
+ "Domain Adaptation",
39
+ "Model Mixing Methods"
40
+ ],
41
+ "Representation Learning": [
42
+ "Representation Learning",
43
+ "Embedding Approaches",
44
+ "Metric Learning",
45
+ "Similarity and Distance Learning",
46
+ "Nonlinear Dimensionality Reduction and Manifold Learning",
47
+ "Components Analysis (CCA, ICA, LDA, PCA)",
48
+ "Sparse Coding and Dimensionality Expansion",
49
+ "Sparsity and Compressed Sensing"
50
+ ],
51
+ "Model Alignment and Adaptation": [
52
+ "Fine-Tuning",
53
+ "Instruction-Tuning",
54
+ "Prompt Tuning",
55
+ "In-Context Learning",
56
+ "Value Alignment and Human Feedback",
57
+ "Alignment Methods"
58
+ ],
59
+ "Adversarial and Robust Learning": [
60
+ "Adversarial Learning",
61
+ "AI Red Teaming and Adversarial Testing",
62
+ "Adversarial Attacks and Defenses",
63
+ "Threat Models and Mitigations"
64
+ ],
65
+ "Active and Interactive Learning": [
66
+ "Active Learning",
67
+ "Online Learning",
68
+ "Interactive Learning",
69
+ "Bandit Algorithms",
70
+ "Dialog- or Communication-Based Learning"
71
+ ],
72
+ "Ensemble and Boosting Methods": [
73
+ "Boosting and Ensemble Methods"
74
+ ],
75
+ "Specialized Learning Paradigms": [
76
+ "AutoML",
77
+ "Multimodal Learning",
78
+ "Relational Learning",
79
+ "Collaborative Filtering",
80
+ "Adaptive Data Analysis",
81
+ "Communication- or Memory-Bounded Learning",
82
+ "Large Scale Learning",
83
+ "Program Induction",
84
+ "Learning and Unlearning"
85
+ ],
86
+ "Data Handling": [
87
+ "Missing Data",
88
+ "Data Compression",
89
+ "Model Selection and Structure Learning"
90
+ ]
91
+ },
92
+ "Deep Learning": {
93
+ "Architectures": [
94
+ "CNN Architectures",
95
+ "Recurrent Networks",
96
+ "Attention Models",
97
+ "Transformer Architectures",
98
+ "Memory-Augmented Neural Networks",
99
+ "Interaction-Based Deep Networks",
100
+ "Biologically Plausible Deep Networks"
101
+ ],
102
+ "Model Types": [
103
+ "Deep Autoencoders",
104
+ "Adversarial Networks",
105
+ "Generative Models",
106
+ "Predictive Models",
107
+ "Supervised Deep Networks"
108
+ ],
109
+ "Training and Optimization": [
110
+ "Efficient Training Methods",
111
+ "Distributed Training and Inference",
112
+ "Training Dynamics",
113
+ "Optimization Instability",
114
+ "Efficient Inference Methods",
115
+ "Optimization for Deep Networks"
116
+ ],
117
+ "Model Efficiency": [
118
+ "Model Distillation",
119
+ "Model Compression",
120
+ "Quantization",
121
+ "Sample Efficient Methods",
122
+ "Memory Efficient Methods"
123
+ ],
124
+ "Inference and Decoding": [
125
+ "Decoding Algorithms",
126
+ "Reasoning Algorithms",
127
+ "Search Algorithms"
128
+ ],
129
+ "Analysis and Interpretation": [
130
+ "Analysis and Understanding of Deep Networks",
131
+ "Visualization or Exposition Techniques for Deep Networks",
132
+ "Interpretability and Explainability",
133
+ "Demystification",
134
+ "Scaling Laws",
135
+ "Emergent Capabilities",
136
+ "Grokking"
137
+ ]
138
+ },
139
+ "Probabilistic Methods": {
140
+ "Bayesian Methods": [
141
+ "Bayesian Theory",
142
+ "Bayesian Nonparametrics",
143
+ "Gaussian Processes"
144
+ ],
145
+ "Inference": [
146
+ "Variational Inference",
147
+ "MCMC",
148
+ "Belief Propagation",
149
+ "Distributed Inference",
150
+ "Uncertainty Estimation"
151
+ ],
152
+ "Models": [
153
+ "Graphical Models",
154
+ "Hierarchical Models",
155
+ "Latent Variable Models",
156
+ "Topic Models",
157
+ "Causal Inference",
158
+ "Causal Reasoning"
159
+ ],
160
+ "Probabilistic Programming": [
161
+ "Probabilistic Programming"
162
+ ]
163
+ },
164
+ "Optimization": {
165
+ "Continuous Optimization": [
166
+ "Convex Optimization",
167
+ "Non-Convex Optimization",
168
+ "Stochastic Optimization",
169
+ "Stochastic Methods"
170
+ ],
171
+ "Discrete Optimization": [
172
+ "Discrete Optimization",
173
+ "Submodular Optimization"
174
+ ],
175
+ "Evolutionary Methods": [
176
+ "Evolutionary Computation"
177
+ ]
178
+ },
179
+ "Theory": {
180
+ "Learning Theory": [
181
+ "Computational Learning Theory",
182
+ "Statistical Learning Theory",
183
+ "Models of Learning and Generalization",
184
+ "Hardness of Learning and Approximations",
185
+ "Regularization",
186
+ "Fundamental Limitations of Learning",
187
+ "Complexity of Learning Systems"
188
+ ],
189
+ "Statistical Theory": [
190
+ "Frequentist Statistics",
191
+ "High-Dimensional Inference",
192
+ "Large Deviations and Asymptotic Analysis"
193
+ ],
194
+ "Mathematical Foundations": [
195
+ "Information Theory",
196
+ "Control Theory",
197
+ "Game Theory and Computational Economics",
198
+ "Statistical Physics of Learning",
199
+ "Spaces of Functions and Kernels",
200
+ "Spectral Methods",
201
+ "Kernel Methods",
202
+ "Large Margin Methods"
203
+ ],
204
+ "Algorithmic Theory": [
205
+ "Data-driven Algorithm Design"
206
+ ]
207
+ },
208
+ "Knowledge and Reasoning": {
209
+ "Knowledge Representation": [
210
+ "Knowledge Models",
211
+ "World Models",
212
+ "Factuality"
213
+ ],
214
+ "Reasoning": [
215
+ "Commonsense Reasoning",
216
+ "Theory of Mind",
217
+ "Social Norms Understanding",
218
+ "Pragmatics"
219
+ ],
220
+ "Knowledge Integration": [
221
+ "Retrieval-Augmented Models",
222
+ "Tool Use and API Integration",
223
+ "Neurosymbolic and Hybrid AI Systems (Physics-Informed, Logic, Formal Reasoning)"
224
+ ]
225
+ },
226
+ "Evaluation and Benchmarking": {
227
+ "Evaluation Methods": [
228
+ "Benchmarks",
229
+ "Evaluation Protocols and Metrics",
230
+ "Human Evaluation",
231
+ "Machine Evaluation",
232
+ "Scalable Oversight"
233
+ ],
234
+ "Simulation and Testing": [
235
+ "Simulation Environments",
236
+ "Assurance Testing and Deployment Policies"
237
+ ]
238
+ },
239
+ "Applications": {
240
+ "Vision": [
241
+ "Computer Vision",
242
+ "Object Detection",
243
+ "Object Recognition",
244
+ "Image Segmentation",
245
+ "Body Pose, Face, and Gesture Analysis",
246
+ "Tracking and Motion in Video",
247
+ "Video Analysis",
248
+ "Visual Question Answering",
249
+ "Visual Scene Analysis and Interpretation",
250
+ "Computational Photography",
251
+ "Denoising"
252
+ ],
253
+ "Language": [
254
+ "Natural Language Processing",
255
+ "Language Representation Learning",
256
+ "Dialog Systems",
257
+ "Conversational AI"
258
+ ],
259
+ "Audio and Speech": [
260
+ "Audio and Speech Processing",
261
+ "Speech Recognition",
262
+ "Music Modeling and Analysis"
263
+ ],
264
+ "Multimodal": [
265
+ "Multimodal Models",
266
+ "Vision-Language Models",
267
+ "Audio-Visual Learning",
268
+ "Cross-Modal Learning"
269
+ ],
270
+ "Robotics and Embodied AI": [
271
+ "Robotics",
272
+ "Motor Control",
273
+ "Autonomous Systems",
274
+ "Perception and Action",
275
+ "Embodied AI"
276
+ ],
277
+ "Code and Software": [
278
+ "Program Understanding and Generation",
279
+ "Code Generation",
280
+ "Software Engineering with AI",
281
+ "Automated Reasoning and Formal Methods"
282
+ ],
283
+ "Science and Engineering": [
284
+ "Computational Biology and Bioinformatics",
285
+ "Physical Sciences (Physics, Chemistry, Biology)",
286
+ "Scientific Discovery",
287
+ "Quantum Learning"
288
+ ],
289
+ "Mathematics": [
290
+ "Mathematical Reasoning",
291
+ "Theorem Proving",
292
+ "Symbolic Mathematics"
293
+ ],
294
+ "Health and Medicine": [
295
+ "Medical Applications",
296
+ "Clinical Decision Support",
297
+ "Drug Discovery",
298
+ "Healthcare AI"
299
+ ],
300
+ "Education": [
301
+ "Educational Applications",
302
+ "Intelligent Tutoring Systems",
303
+ "Educational Technology"
304
+ ],
305
+ "Social and Web": [
306
+ "Computational Social Science",
307
+ "Recommender Systems",
308
+ "Information Retrieval",
309
+ "Web Applications and Internet Data",
310
+ "Network Analysis"
311
+ ],
312
+ "Interactive Systems": [
313
+ "Game Playing",
314
+ "Multi-Agent Systems",
315
+ "Human-AI Interaction"
316
+ ],
317
+ "Data and Signals": [
318
+ "Signal Processing",
319
+ "Time Series Analysis",
320
+ "Matrix and Tensor Factorization",
321
+ "Database Applications"
322
+ ],
323
+ "Finance and Economics": [
324
+ "Quantitative Finance and Econometrics",
325
+ "Economic Modeling"
326
+ ],
327
+ "Activity and Recognition": [
328
+ "Activity and Event Recognition"
329
+ ],
330
+ "Infrastructure": [
331
+ "Hardware and Systems",
332
+ "Sustainability"
333
+ ]
334
+ },
335
+ "Data": {
336
+ "Data Collection and Curation": [
337
+ "Pre-Training Data",
338
+ "Data Curation and Analysis",
339
+ "Manual and Algorithmic Data Processing",
340
+ "Responsible Data Management"
341
+ ],
342
+ "Data Generation": [
343
+ "Synthetic Data Generation",
344
+ "Data Augmentation"
345
+ ],
346
+ "Data Resources": [
347
+ "Benchmarks",
348
+ "Data Sets or Data Repositories",
349
+ "Datasets and Benchmarks"
350
+ ]
351
+ },
352
+ "Infrastructure and Tools": {
353
+ "Software and Libraries": [
354
+ "Software Toolkits",
355
+ "Infrastructure, Software Libraries",
356
+ "Virtual Environments"
357
+ ],
358
+ "Hardware and Systems": [
359
+ "Hardware Setups for Large-Scale Training",
360
+ "Distributed Systems",
361
+ "Specialized Hardware"
362
+ ]
363
+ },
364
+ "Neuroscience and Cognitive Science": {
365
+ "Brain Studies": [
366
+ "Brain Imaging",
367
+ "Brain Mapping",
368
+ "Brain Segmentation",
369
+ "Connectomics",
370
+ "Neural Coding",
371
+ "Spike Train Generation",
372
+ "Synaptic Modulation"
373
+ ],
374
+ "Cognitive Functions": [
375
+ "Cognitive Science",
376
+ "Memory",
377
+ "Perception",
378
+ "Visual Perception",
379
+ "Auditory Perception",
380
+ "Problem Solving",
381
+ "Reasoning",
382
+ "Linguistics",
383
+ "Psycholinguistics"
384
+ ],
385
+ "Learning and Adaptation": [
386
+ "Human or Animal Learning",
387
+ "Plasticity and Adaptation",
388
+ "Neuropsychology"
389
+ ],
390
+ "Brain-Computer Interfaces": [
391
+ "Brain-Computer Interfaces and Neural Prostheses"
392
+ ],
393
+ "Philosophy": [
394
+ "Philosophical Perspectives on AI",
395
+ "Philosophy of Mind and Language",
396
+ "Cognitive Philosophy"
397
+ ]
398
+ },
399
+ "Structured Data": {
400
+ "Graphs and Geometry": [
401
+ "Learning on Graphs",
402
+ "Geometric Deep Learning",
403
+ "Topology and Manifold Learning"
404
+ ]
405
+ },
406
+ "Societal Considerations": {
407
+ "Fairness and Equity": [
408
+ "Algorithmic Fairness and Bias",
409
+ "Bias in AI Systems",
410
+ "Equity",
411
+ "Algorithmic Recourse",
412
+ "Justice, Power, and Inequality"
413
+ ],
414
+ "Safety and Security": [
415
+ "AI Safety",
416
+ "Security",
417
+ "Adversarial Robustness",
418
+ "Risks, Harms, and Failures",
419
+ "Safe and Trustworthy AI"
420
+ ],
421
+ "Privacy": [
422
+ "Privacy, Anonymity, and Security",
423
+ "Data Protection",
424
+ "Privacy-Preserving Methods"
425
+ ],
426
+ "Misinformation and Content": [
427
+ "Misinformation and Disinformation",
428
+ "Content Moderation",
429
+ "Information Integrity"
430
+ ],
431
+ "Transparency and Accountability": [
432
+ "Fairness, Accountability, and Transparency",
433
+ "Transparency Documentation",
434
+ "Audits of AI Systems",
435
+ "Explainability for Accountability"
436
+ ],
437
+ "Human Factors": [
438
+ "Human-AI Interaction",
439
+ "Trust in AI Systems",
440
+ "Human-Centered AI",
441
+ "Participatory and Deliberative Methods"
442
+ ],
443
+ "Design and Development": [
444
+ "Sociotechnical Design and Development",
445
+ "Value-Sensitive Design",
446
+ "Diversity in Design and Development",
447
+ "Responsible Development Practices"
448
+ ],
449
+ "Societal Impacts": [
450
+ "Cultural Impacts",
451
+ "Environmental Impacts and Climate Change",
452
+ "Labor and Economic Impacts",
453
+ "Job Displacement and Automation",
454
+ "Misuse of AI Systems"
455
+ ],
456
+ "Governance and Policy": [
457
+ "Regulation and Governance",
458
+ "Legal Topics in AI",
459
+ "Policy and Law",
460
+ "Licensing and Liability",
461
+ "Organizational Factors"
462
+ ],
463
+ "Critical Perspectives": [
464
+ "Critical and Sociotechnical Foresight",
465
+ "Historical and Humanistic Perspectives",
466
+ "Social Scientific Perspectives",
467
+ "Resistance and Contestation",
468
+ "Social Epistemology"
469
+ ],
470
+ "Values and Ethics": [
471
+ "Moral and Political Philosophy of AI",
472
+ "Ethics in AI",
473
+ "Values in Technology Design"
474
+ ],
475
+ "Cross-Cultural and Multilingual": [
476
+ "Multi-Linguality",
477
+ "Low-Resource Languages",
478
+ "Vernacular Languages",
479
+ "Multiculturalism",
480
+ "Value Pluralism",
481
+ "Cross-Cultural AI"
482
+ ],
483
+ "Interdisciplinary Approaches": [
484
+ "Interdisciplinarity and Cross-Functional Teams",
485
+ "Industry, Government, and Civil Society Collaboration"
486
+ ]
487
+ }
488
+ }