Spaces:
Running
Running
fix billion display
Browse files
app.py
CHANGED
|
@@ -59,9 +59,9 @@ st.markdown("Explore massive datasets hosted on Hugging Face, totaling approxima
|
|
| 59 |
# Function to load dataset information
|
| 60 |
@st.cache_data
|
| 61 |
def load_datasets():
|
| 62 |
-
|
| 63 |
# Reddit datasets
|
| 64 |
-
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_69", "Number of rows": "
|
| 65 |
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/mgrtsv/reddit_dataset_229", "Number of rows": "44,815,182"},
|
| 66 |
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_88", "Number of rows": "253,506,882"},
|
| 67 |
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_218", "Number of rows": "562,042"},
|
|
@@ -71,13 +71,13 @@ def load_datasets():
|
|
| 71 |
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/dataverse-scraping/reddit_dataset_71", "Number of rows": "259,924,884"},
|
| 72 |
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_209", "Number of rows": "209,698,975"},
|
| 73 |
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_218", "Number of rows": "7,064,613"},
|
| 74 |
-
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/dataverse-scraping/reddit_dataset_192", "Number of rows": "
|
| 75 |
-
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/icedwind/reddit_dataset_226", "Number of rows": "
|
| 76 |
-
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "Number of rows": "
|
| 77 |
-
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/chris241/reddit_dataset_75", "Number of rows": "
|
| 78 |
-
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_242", "Number of rows": "
|
| 79 |
-
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/mgrtsv/reddit_dataset_231", "Number of rows": "
|
| 80 |
-
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_9", "Number of rows": "
|
| 81 |
|
| 82 |
# X datasets
|
| 83 |
{"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/littleGuagua/x_dataset_0", "Number of rows": "331,611,777"},
|
|
@@ -88,20 +88,27 @@ def load_datasets():
|
|
| 88 |
{"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/x_dataset_218", "Number of rows": "1,753,878"},
|
| 89 |
{"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/SAVE0x0/x_dataset_191", "Number of rows": "92,588"},
|
| 90 |
{"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/johnny8188/x_dataset_187", "Number of rows": "52,762"},
|
| 91 |
-
{"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/icedwind/x_dataset_19", "Number of rows": "
|
| 92 |
-
{"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/wenknow/x_dataset", "Number of rows": "
|
| 93 |
-
{"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "Number of rows": "
|
| 94 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
# Load datasets
|
| 97 |
datasets = load_datasets()
|
| 98 |
df = pd.DataFrame(datasets)
|
| 99 |
|
|
|
|
|
|
|
|
|
|
| 100 |
# Display statistics
|
| 101 |
-
col1, col2
|
| 102 |
with col1:
|
| 103 |
-
|
| 104 |
-
st.metric("Total Rows", f"{total_rows / 1000:.2f}B")
|
| 105 |
with col2:
|
| 106 |
st.metric("Total Datasets", len(df))
|
| 107 |
|
|
|
|
| 59 |
# Function to load dataset information
|
| 60 |
@st.cache_data
|
| 61 |
def load_datasets():
|
| 62 |
+
datasets = [
|
| 63 |
# Reddit datasets
|
| 64 |
+
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_69", "Number of rows": "6000000"},
|
| 65 |
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/mgrtsv/reddit_dataset_229", "Number of rows": "44,815,182"},
|
| 66 |
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_88", "Number of rows": "253,506,882"},
|
| 67 |
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_218", "Number of rows": "562,042"},
|
|
|
|
| 71 |
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/dataverse-scraping/reddit_dataset_71", "Number of rows": "259,924,884"},
|
| 72 |
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_209", "Number of rows": "209,698,975"},
|
| 73 |
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_218", "Number of rows": "7,064,613"},
|
| 74 |
+
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/dataverse-scraping/reddit_dataset_192", "Number of rows": "249000000"},
|
| 75 |
+
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/icedwind/reddit_dataset_226", "Number of rows": "303000000"},
|
| 76 |
+
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "Number of rows": "1120000"},
|
| 77 |
+
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/chris241/reddit_dataset_75", "Number of rows": "132000000"},
|
| 78 |
+
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_242", "Number of rows": "130000000"},
|
| 79 |
+
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/mgrtsv/reddit_dataset_231", "Number of rows": "31200000"},
|
| 80 |
+
{"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_9", "Number of rows": "26900000"},
|
| 81 |
|
| 82 |
# X datasets
|
| 83 |
{"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/littleGuagua/x_dataset_0", "Number of rows": "331,611,777"},
|
|
|
|
| 88 |
{"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/x_dataset_218", "Number of rows": "1,753,878"},
|
| 89 |
{"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/SAVE0x0/x_dataset_191", "Number of rows": "92,588"},
|
| 90 |
{"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/johnny8188/x_dataset_187", "Number of rows": "52,762"},
|
| 91 |
+
{"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/icedwind/x_dataset_19", "Number of rows": "332000000"},
|
| 92 |
+
{"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/wenknow/x_dataset", "Number of rows": "9900"},
|
| 93 |
+
{"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "Number of rows": "89000"}
|
| 94 |
]
|
| 95 |
+
return datasets
|
| 96 |
+
|
| 97 |
+
# Helper: turn a display-formatted row count into a numeric value
def parse_row_count(row_count):
    """Convert a row-count string (e.g. "44,815,182" or "9900") to a float.

    Thousands separators (commas) are stripped before conversion; strings
    without separators pass through unchanged.
    """
    digits_only = row_count.replace(',', '')
    return float(digits_only)
|
| 100 |
|
| 101 |
# Load the dataset catalog and build a DataFrame for display.
datasets = load_datasets()
df = pd.DataFrame(datasets)

# Total row count across every listed dataset (parsed from the
# comma-formatted "Number of rows" strings).
total_rows = sum(map(parse_row_count, df['Number of rows']))

# Summary metrics: total rows (in billions) and dataset count,
# side by side in two columns.
col1, col2 = st.columns(2)
with col1:
    # 1e9 divisor so the "B" suffix really means billions.
    st.metric("Total Rows", f"{total_rows / 1e9:.2f}B")
with col2:
    st.metric("Total Datasets", len(df))
|
| 114 |
|