Spaces:
Running
Running
| import streamlit as st | |
| import pandas as pd | |
| import json | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
# ---- Page setup ----
st.set_page_config(layout="wide", page_title="Macrocosmos HF Dataset Explorer", page_icon="π")
# NOTE(review): "π" (and similar emoji below) looks like mojibake from a
# mis-decoded UTF-8 file — confirm the intended emoji and re-save as UTF-8.

# Styling for the app shell, the call-to-action card, and the primary button.
_CUSTOM_CSS = """
<style>
.stApp {
    max-width: 1200px;
    margin: 0 auto;
    font-family: 'Helvetica Neue', Arial, sans-serif;
}
.cta-container {
    background-color: #f0f8ff;
    border-radius: 10px;
    padding: 20px;
    margin-top: 30px;
    margin-bottom: 30px;
    border: 2px solid #1e90ff;
    text-align: center;
}
.cta-title {
    color: #1e90ff;
    font-size: 24px;
    font-weight: bold;
    margin-bottom: 10px;
}
.cta-description {
    color: #333;
    font-size: 16px;
    margin-bottom: 20px;
}
.stButton > button {
    background-color: #1e90ff;
    color: white;
    font-size: 18px;
    font-weight: bold;
    padding: 10px 24px;
    border-radius: 5px;
    border: none;
    transition: all 0.3s ease;
}
.stButton > button:hover {
    background-color: #0066cc;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# ---- Header ----
st.title("π Macrocosmos HF Dataset Explorer")
st.markdown("Explore massive datasets hosted on Hugging Face, totaling approximately 100GB of data.")
# Curated catalogue of Hugging Face dataset repositories.
def load_datasets():
    """Return metadata records for the catalogued Reddit and X datasets.

    Each record is a dict with keys "Source", "DataSet repo link", and
    "Number of rows". Row counts are kept as the curated strings — some
    use thousands separators, some do not.
    """
    hf_prefix = "https://huggingface.co/datasets/"

    reddit_repos = [
        ("PlanAPlanB/reddit_dataset_69", "6000000"),
        ("mgrtsv/reddit_dataset_229", "44,815,182"),
        ("wenknow/reddit_dataset_88", "253,506,882"),
        ("PlanAPlanB/reddit_dataset_218", "562,042"),
        ("PlanAPlanB/reddit_dataset_13", "18,931,749"),
        ("chris241/reddit_dataset_219", "227,599,340"),
        ("icedwind/reddit_dataset_112", "301,588,714"),
        ("dataverse-scraping/reddit_dataset_71", "259,924,884"),
        ("wenknow/reddit_dataset_209", "209,698,975"),
        ("arrmlet/reddit_dataset_218", "7,064,613"),
        ("dataverse-scraping/reddit_dataset_192", "249000000"),
        ("icedwind/reddit_dataset_226", "303000000"),
        ("arrmlet/reddit_dataset_123", "1120000"),
        ("chris241/reddit_dataset_75", "132000000"),
        ("wenknow/reddit_dataset_242", "130000000"),
        ("mgrtsv/reddit_dataset_231", "31200000"),
        ("PlanAPlanB/reddit_dataset_9", "26900000"),
    ]
    x_repos = [
        ("littleGuagua/x_dataset_0", "331,611,777"),
        ("suul999922/x_dataset_71", "8,998,828"),
        ("thayallans/x_dataset_28", "178,669"),
        ("apidojo/x_dataset_242", "499,067"),
        ("icedwind/x_dataset_112", "331,500,777"),
        ("arrmlet/x_dataset_218", "1,753,878"),
        ("SAVE0x0/x_dataset_191", "92,588"),
        ("johnny8188/x_dataset_187", "52,762"),
        ("icedwind/x_dataset_19", "332000000"),
        ("wenknow/x_dataset", "9900"),
        # NOTE(review): this entry points at a reddit_* repo (also listed
        # under Source "Reddit" above) but is labeled "X" — verify the label.
        ("arrmlet/reddit_dataset_123", "89000"),
    ]

    records = []
    for source, repos in (("Reddit", reddit_repos), ("X", x_repos)):
        for repo, row_count in repos:
            records.append({
                "Source": source,
                "DataSet repo link": hf_prefix + repo,
                "Number of rows": row_count,
            })
    return records
# Convert a displayed row count into a numeric value.
def parse_row_count(row_count):
    """Convert a row-count value such as "44,815,182" or "6000000" to a float.

    Accepts strings with or without thousands separators (surrounding
    whitespace is tolerated). Already-numeric inputs are passed through
    as floats, so callers may mix parsed and raw counts.

    Raises:
        ValueError: if the string is not a valid number once separators
            are removed.
    """
    # Generalization: the original crashed on non-string input
    # (no .replace on int/float); pass numbers straight through.
    if isinstance(row_count, (int, float)):
        return float(row_count)
    return float(row_count.replace(',', '').strip())
# ---- Overview: headline metrics and the dataset table ----
datasets = load_datasets()
df = pd.DataFrame(datasets)

# Row counts are curated strings (some with thousands separators);
# normalize each one before summing.
total_rows = sum(parse_row_count(value) for value in df['Number of rows'])

metric_left, metric_right = st.columns(2)
metric_left.metric("Total Rows", f"{total_rows / 1e9:.2f}B")
metric_right.metric("Total Datasets", len(df))

st.subheader("Dataset Overview")
st.dataframe(
    df,
    column_config={
        "Source": st.column_config.TextColumn("Source"),
        "DataSet repo link": st.column_config.LinkColumn("Repository"),
        "Number of rows": st.column_config.TextColumn("Rows"),
    },
    hide_index=True,
    use_container_width=True,
)
# ---- Call-to-action card and centered reveal button ----
_CTA_HTML = """
<div class="cta-container">
<div class="cta-title">π Explore Dataset Insights</div>
<div class="cta-description">
Dive deep into the rich analytics of our dataset. Uncover trends, distributions, and key metrics that will enhance your understanding and guide your research.
</div>
</div>
"""
st.markdown(_CTA_HTML, unsafe_allow_html=True)

# Center the button in the middle column of a 1:2:1 layout.
_, middle_col, _ = st.columns([1, 2, 1])
with middle_col:
    show_analysis = st.button("Reveal Dataset Analysis", use_container_width=True)
# ---- On-demand analysis of a sample Reddit dataset ----
if show_analysis:

    def _load_analysis_results():
        """Read the pre-computed analysis summary from disk."""
        # Explicit UTF-8 so the read is independent of the platform locale.
        with open('analysis_results.json', 'r', encoding='utf-8') as f:
            return json.load(f)

    # Bug fix: the original crashed with a raw traceback when
    # analysis_results.json was missing or malformed; fail gracefully.
    try:
        analysis_results = _load_analysis_results()
    except (FileNotFoundError, json.JSONDecodeError) as exc:
        st.error(f"Could not load analysis results: {exc}")
        st.stop()

    st.subheader("Analysis of a Sample Reddit Dataset")
    st.write("This analysis is based on a sample from one of the Reddit datasets.")

    # Dataset structure: headline numbers plus the full column list.
    st.subheader("Dataset Structure")
    structure = analysis_results['structure']
    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Total Partitions", structure['total_partitions'])
    c2.metric("Total Rows", f"{structure['total_rows']:,}")
    c3.metric("Number of Columns", len(structure['columns']))
    c4.metric("Date Range", f"{structure['date_range'][0]} to {structure['date_range'][1]}")
    with st.expander("Show Columns"):
        st.write(", ".join(structure['columns']))

    # Bar chart of the most active communities.
    st.subheader("Top Communities")
    communities_df = pd.DataFrame(analysis_results['communities'])
    communities_fig = go.Figure(data=[go.Bar(
        x=communities_df['communityName'],
        y=communities_df['count'],
        # 'percentage' is assumed to be a fraction in [0, 1] (rendered with
        # the '%' format) — TODO confirm against the JSON producer.
        text=communities_df['percentage'].apply(lambda p: f'{p:.2%}'),
        textposition='auto',
        marker_color='#1e88e5',
    )])
    communities_fig.update_layout(title_text='Top Communities Distribution')
    communities_fig.update_traces(marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6)
    st.plotly_chart(communities_fig, use_container_width=True)

    # Post counts over time as a line chart.
    st.subheader("Time Distribution")
    time_df = pd.DataFrame(analysis_results['time_distribution'])
    time_df['date'] = pd.to_datetime(time_df['date'])
    time_fig = go.Figure(data=[go.Scatter(x=time_df['date'], y=time_df['count'], mode='lines+markers')])
    time_fig.update_layout(title_text='Posts Over Time')
    st.plotly_chart(time_fig, use_container_width=True)

    # Sentiment breakdown as a pie chart.
    st.subheader("Sentiment Distribution")
    sentiment_df = pd.DataFrame(analysis_results['sentiment_distribution'])
    sentiment_fig = go.Figure(data=[go.Pie(labels=sentiment_df['sentiment'], values=sentiment_df['count'], textinfo='percent+label')])
    sentiment_fig.update_layout(title_text='Sentiment Distribution')
    sentiment_fig.update_traces(marker=dict(colors=['#4CAF50', '#FFC107', '#F44336']))
    st.plotly_chart(sentiment_fig, use_container_width=True)

    # Data-type split as a pie chart.
    st.subheader("Data Type Distribution")
    data_type_df = pd.DataFrame(analysis_results['data_type_distribution'])
    data_type_fig = go.Figure(data=[go.Pie(labels=data_type_df['dataType'], values=data_type_df['count'], textinfo='percent+label')])
    data_type_fig.update_layout(title_text='Data Type Distribution')
    data_type_fig.update_traces(marker=dict(colors=['#2196F3', '#FF9800']))
    st.plotly_chart(data_type_fig, use_container_width=True)

    # Most frequent topics as a plain table.
    st.subheader("Top Topics")
    st.dataframe(pd.DataFrame(analysis_results['top_topics']), use_container_width=True)

    st.metric("Average Text Length", f"{analysis_results['avg_text_length']:.2f} characters")
# ---- Usage instructions ----
st.subheader("How to Use These Datasets")
usage_snippet = '''
from datasets import load_dataset
dataset = load_dataset("username/dataset_name")
'''
st.code(usage_snippet, language='python')
st.markdown("""
1. Click on the dataset link to visit its Hugging Face page.
2. On the dataset page, you'll find information about the dataset's content, structure, and usage.
3. Use the code above to load a dataset, replacing `"username/dataset_name"` with the actual dataset identifier.
4. For these large datasets, consider using streaming or loading specific subsets to manage memory usage.
5. Always check the dataset's license and usage restrictions before incorporating it into your project.
""")

# ---- Footer ----
st.markdown("---")
# NOTE(review): "β€οΈ" looks like a mis-encoded heart emoji (mojibake) —
# confirm the intended character and re-save the file as UTF-8.
st.markdown("Created by Macrocosmos with β€οΈ")