Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import json | |
| import requests | |
| import csv | |
| import pandas as pd | |
| import tqdm | |
| import cohere | |
| import os | |
| from topically import Topically | |
| from bertopic import BERTopic | |
| from sklearn.cluster import KMeans | |
| import numpy as np | |
| venue = 'ICLR.cc/2023/Conference' | |
| venue_short = 'iclr2023' | |
| def get_conference_notes(venue, blind_submission=False): | |
| """ | |
| Get all notes of a conference (data) from OpenReview API. | |
| If results are not final, you should set blind_submission=True. | |
| """ | |
| blind_param = '-/Blind_Submission' if blind_submission else '' | |
| offset = 0 | |
| notes = [] | |
| while True: | |
| print('Offset:', offset, 'Data:', len(notes)) | |
| url = f'https://api.openreview.net/notes?invitation={venue}/{blind_param}&offset={offset}' | |
| response = requests.get(url) | |
| data = response.json() | |
| if len(data['notes']) == 0: | |
| break | |
| offset += 1000 | |
| notes.extend(data['notes']) | |
| return notes | |
| raw_notes = get_conference_notes(venue, blind_submission=True) | |
| st.title("ICLR2023 Papers Visualization") | |
| st.write("Number of submissions at ICLR 2023:", len(raw_notes)) | |
| df_raw = pd.json_normalize(raw_notes) | |
| # set index as first column | |
| # df_raw.set_index(df_raw.columns[0], inplace=True) | |
| accepted_venues = ['ICLR 2023 poster', 'ICLR 2023 notable top 5%', 'ICLR 2023 notable top 25%'] | |
| df = df_raw[df_raw["content.venue"].isin(accepted_venues)] | |
| st.write("Number of submissions accepted at ICLR 2023:", len(df)) | |
| df_filtered = df[['content.title', 'content.keywords', 'content.abstract', 'content.venue']] | |
| df = df_filtered | |
| if "CO_API_KEY" not in os.environ: | |
| raise KeyError("CO_API_KEY not found in st.secrets or os.environ. Please set it in " | |
| ".streamlit/secrets.toml or as an environment variable.") | |
| co = cohere.Client(os.environ["CO_API_KEY"]) | |
| def to_html(df: pd.DataFrame, table_header: str) -> str: | |
| table_data = ''.join(df.html_table_content) | |
| html = f''' | |
| <table> | |
| {table_header} | |
| {table_data} | |
| </table>''' | |
| return html | |
| def get_visualizations(): | |
| table_header = ''' | |
| <tr> | |
| <td width="25%">Title</td> | |
| <td width="15%">Keywords</td> | |
| <td width="10%">Venue</td> | |
| <td width="50%">Abstract</td> | |
| </tr>''' | |
| list_of_titles = list(df["content.title"].values) | |
| embeds = co.embed(texts=list_of_titles, | |
| model="small").embeddings | |
| embeds_npy = np.array(embeds) | |
| # Load and initialize BERTopic to use KMeans clustering with 8 clusters only. | |
| cluster_model = KMeans(n_clusters=8) | |
| topic_model = BERTopic(hdbscan_model=cluster_model) | |
| # df is a dataframe. df['title'] is the column of text we're modeling | |
| df['topic'], probabilities = topic_model.fit_transform(df['content.title'], embeds_npy) | |
| app = Topically(os.environ["CO_API_KEY"]) | |
| df['topic_name'], topic_names = app.name_topics((df['content.title'], df['topic']), num_generations=5) | |
| #st.write("Topics extracted are:", topic_names) | |
| topic_model.set_topic_labels(topic_names) | |
| fig1 = topic_model.visualize_documents(df['content.title'].values, | |
| embeddings=embeds_npy, | |
| topics = list(range(8)), | |
| custom_labels=True) | |
| topic_model.set_topic_labels(topic_names) | |
| fig2 = topic_model.visualize_barchart(custom_labels=True) | |
| st.plotly_chart(fig1) | |
| st.plotly_chart(fig2) | |
| st.button("Run Visualization", on_click=get_visualizations) | |