Spaces:

librarian-bots
/

new_hub_datasets

Sleeping

App Files Files Community

davanstrien HF Staff committed on Jun 10, 2024

Commit

2ddb7e5

1 Parent(s): 6cf3db7

updates

Browse files

Files changed (2) hide show

.gitignore +1 -0
app.py +6 -14

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .env

app.py CHANGED Viewed

@@ -1,12 +1,10 @@
 import os
-from datetime import datetime, timedelta
-from sys import platform
 from typing import Any, Dict
 import gradio as gr
 import pandas as pd
 from cachetools import TTLCache, cached
-from diskcache import Cache
 from dotenv import load_dotenv
 from httpx import Client
 from huggingface_hub import DatasetCard, hf_hub_url, list_datasets
@@ -15,7 +13,7 @@ from tqdm.contrib.concurrent import thread_map
 load_dotenv()
-LIMIT = 5_000
 CACHE_TIME = 60 * 60 * 12  # 12 hours
 REMOVE_ORGS = {
     "HuggingFaceM4",
@@ -44,18 +42,13 @@ cache = TTLCache(maxsize=10, ttl=CACHE_TIME)
 def get_three_months_ago():
-    now = datetime.now()
     return now - timedelta(days=90)
-def parse_date(date_str):
-    # parse the created date from string 2023-11-17T16:39:54.000Z to datetime
-    return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%fZ")
 def add_created_data(dataset):
     _id = dataset._id
-    created = parse_date(dataset.createdAt)
     dataset_dict = dataset.__dict__
     dataset_dict["createdAt"] = created
     return dataset_dict
@@ -129,7 +122,6 @@ columns_to_drop = [
     "cardData",
     "gated",
     "sha",
-    # "paperswithcode_id",
     "tags",
     "description",
     "siblings",
@@ -137,7 +129,7 @@ columns_to_drop = [
     "_id",
     "private",
     "author",
-    "citation",
     "lastModified",
 ]
@@ -158,7 +150,7 @@ def prep_dataframe(remove_orgs_and_users=REMOVE_ORGS, columns_to_drop=columns_to
 def filter_df_by_max_age(df, max_age_days=None):
     df = df.dropna(subset=["createdAt"])
-    now = datetime.now()
     if max_age_days is not None:
         max_date = now - timedelta(days=max_age_days)
         df = df[df["createdAt"] >= max_date]

 import os
+from datetime import datetime, timedelta, timezone
 from typing import Any, Dict
 import gradio as gr
 import pandas as pd
 from cachetools import TTLCache, cached
 from dotenv import load_dotenv
 from httpx import Client
 from huggingface_hub import DatasetCard, hf_hub_url, list_datasets
 load_dotenv()
+LIMIT = 3_000
 CACHE_TIME = 60 * 60 * 12  # 12 hours
 REMOVE_ORGS = {
     "HuggingFaceM4",
 def get_three_months_ago():
+    now = datetime.now(timezone.utc)
     return now - timedelta(days=90)
 def add_created_data(dataset):
     _id = dataset._id
+    created = dataset.created_at
     dataset_dict = dataset.__dict__
     dataset_dict["createdAt"] = created
     return dataset_dict
     "cardData",
     "gated",
     "sha",
     "tags",
     "description",
     "siblings",
     "_id",
     "private",
     "author",
+  #  "citation",
     "lastModified",
 ]
 def filter_df_by_max_age(df, max_age_days=None):
     df = df.dropna(subset=["createdAt"])
+    now = datetime.now(timezone.utc)
     if max_age_days is not None:
         max_date = now - timedelta(days=max_age_days)
         df = df[df["createdAt"] >= max_date]