Spaces:
Runtime error
Runtime error
Commit
·
75d9f7d
1
Parent(s):
6981528
Update API timeout and filter out invalid data
Browse files
app.py
CHANGED
|
@@ -26,31 +26,28 @@ assert user
|
|
| 26 |
|
| 27 |
headers = {"user-agent": user_agent, "authorization": f"Bearer {token}"}
|
| 28 |
limits = httpx.Limits(max_keepalive_connections=10, max_connections=20)
|
| 29 |
-
client = Client(headers=headers, http2=True, limits=limits, timeout=
|
| 30 |
|
| 31 |
|
| 32 |
@lru_cache(maxsize=None)
|
| 33 |
-
def get_hub_community_activity(user: str
|
| 34 |
with tqdm() as pbar:
|
| 35 |
all_data = []
|
| 36 |
i = 1
|
| 37 |
-
while
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
except Exception as e:
|
| 52 |
-
print(e)
|
| 53 |
-
continue
|
| 54 |
|
| 55 |
return list(concat(all_data))
|
| 56 |
|
|
@@ -97,6 +94,7 @@ def update_data():
|
|
| 97 |
except FileNotFoundError:
|
| 98 |
previous_df = pl.DataFrame()
|
| 99 |
data = get_hub_community_activity(user)
|
|
|
|
| 100 |
data = [parse_pr_data(d) for d in data]
|
| 101 |
update_df = pl.DataFrame(data)
|
| 102 |
df = pl.concat([previous_df, update_df]).unique()
|
|
@@ -115,9 +113,21 @@ def update_data():
|
|
| 115 |
@lru_cache(maxsize=512)
|
| 116 |
def get_pr_status(user: str):
|
| 117 |
all_data = get_hub_community_activity(user)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
pr_data = (
|
| 119 |
-
x
|
|
|
|
|
|
|
| 120 |
)
|
|
|
|
| 121 |
return frequencies(x["status"] for x in pr_data)
|
| 122 |
|
| 123 |
|
|
@@ -129,6 +139,7 @@ def create_pie():
|
|
| 129 |
|
| 130 |
def group_status_by_pr_number():
|
| 131 |
all_data = get_hub_community_activity(user)
|
|
|
|
| 132 |
all_data = [parse_pr_data(d) for d in all_data]
|
| 133 |
return (
|
| 134 |
pl.DataFrame(all_data).groupby("status").agg(pl.mean("pr_number")).to_pandas()
|
|
@@ -137,6 +148,7 @@ def group_status_by_pr_number():
|
|
| 137 |
|
| 138 |
def plot_over_time():
|
| 139 |
all_data = get_hub_community_activity(user)
|
|
|
|
| 140 |
all_data = [parse_pr_data(d) for d in all_data]
|
| 141 |
df = pl.DataFrame(all_data).with_columns(pl.col("createdAt").cast(pl.Date))
|
| 142 |
df = df.pivot(
|
|
@@ -146,7 +158,7 @@ def plot_over_time():
|
|
| 146 |
aggregate_function="count",
|
| 147 |
)
|
| 148 |
df = df.fill_null(0)
|
| 149 |
-
df = df.with_columns(pl.sum(["open", "merged"])).sort("createdAt")
|
| 150 |
df = df.to_pandas().set_index("createdAt").cumsum()
|
| 151 |
return px.line(df, x=df.index, y=[c for c in df.columns if c != "sum"])
|
| 152 |
|
|
|
|
| 26 |
|
| 27 |
headers = {"user-agent": user_agent, "authorization": f"Bearer {token}"}
|
| 28 |
limits = httpx.Limits(max_keepalive_connections=10, max_connections=20)
|
| 29 |
+
client = Client(headers=headers, http2=True, limits=limits, timeout=120.0)
|
| 30 |
|
| 31 |
|
| 32 |
@lru_cache(maxsize=None)
|
| 33 |
+
def get_hub_community_activity(user: str) -> List[Any]:
|
| 34 |
with tqdm() as pbar:
|
| 35 |
all_data = []
|
| 36 |
i = 1
|
| 37 |
+
while True:
|
| 38 |
+
r = httpx.get(
|
| 39 |
+
f"https://huggingface.co/api/recent-activity?limit=100&activityType=discussion&skip={i}&entity={user}&feedType=user",
|
| 40 |
+
headers=headers,
|
| 41 |
+
)
|
| 42 |
+
activity = r.json()["recentActivity"]
|
| 43 |
+
if not activity:
|
| 44 |
+
break
|
| 45 |
+
all_data.append(activity)
|
| 46 |
+
if len(all_data) % 1000 == 0:
|
| 47 |
+
# print(f"Length of all_data: {len(all_data)}")
|
| 48 |
+
pbar.write(f"Length of all_data: {len(all_data)}")
|
| 49 |
+
i += 100
|
| 50 |
+
pbar.update(100)
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
return list(concat(all_data))
|
| 53 |
|
|
|
|
| 94 |
except FileNotFoundError:
|
| 95 |
previous_df = pl.DataFrame()
|
| 96 |
data = get_hub_community_activity(user)
|
| 97 |
+
data = [d for d in data if d.get("discussionData", None) is not None]
|
| 98 |
data = [parse_pr_data(d) for d in data]
|
| 99 |
update_df = pl.DataFrame(data)
|
| 100 |
df = pl.concat([previous_df, update_df]).unique()
|
|
|
|
@lru_cache(maxsize=512)
def get_pr_status(user: str):
    """Count `user`'s Hub pull requests by status (e.g. open/merged/closed).

    Args:
        user: Hub user/entity name whose activity is inspected.

    Returns:
        A mapping of PR status -> count, as produced by `frequencies`.
    """
    all_data = get_hub_community_activity(user)
    # Some activity entries carry no "discussionData" payload, and not every
    # discussion is a pull request — filter both out in a single pass.
    pr_data = (
        x["discussionData"]
        for x in all_data
        if x.get("discussionData") is not None
        and x["discussionData"].get("isPullRequest", False)
    )
    return frequencies(x["status"] for x in pr_data)
|
| 132 |
|
| 133 |
|
|
|
|
| 139 |
|
| 140 |
def group_status_by_pr_number():
|
| 141 |
all_data = get_hub_community_activity(user)
|
| 142 |
+
all_data = [d for d in all_data if d.get("discussionData", None) is not None]
|
| 143 |
all_data = [parse_pr_data(d) for d in all_data]
|
| 144 |
return (
|
| 145 |
pl.DataFrame(all_data).groupby("status").agg(pl.mean("pr_number")).to_pandas()
|
|
|
|
| 148 |
|
| 149 |
def plot_over_time():
|
| 150 |
all_data = get_hub_community_activity(user)
|
| 151 |
+
all_data = [d for d in all_data if d.get("discussionData", None) is not None]
|
| 152 |
all_data = [parse_pr_data(d) for d in all_data]
|
| 153 |
df = pl.DataFrame(all_data).with_columns(pl.col("createdAt").cast(pl.Date))
|
| 154 |
df = df.pivot(
|
|
|
|
| 158 |
aggregate_function="count",
|
| 159 |
)
|
| 160 |
df = df.fill_null(0)
|
| 161 |
+
df = df.with_columns(pl.sum(["open", "closed", "merged"])).sort("createdAt")
|
| 162 |
df = df.to_pandas().set_index("createdAt").cumsum()
|
| 163 |
return px.line(df, x=df.index, y=[c for c in df.columns if c != "sum"])
|
| 164 |
|