Spaces:
Runtime error
Runtime error
| import os | |
| from datetime import datetime, timedelta | |
| from functools import lru_cache | |
| from typing import Any, List | |
| import gradio as gr | |
| import httpx | |
| import pandas as pd | |
| import plotly.express as px | |
| import polars as pl | |
| from cachetools import TTLCache, cached | |
| from datasets import Dataset, load_dataset | |
| from dotenv import load_dotenv | |
| from httpx import Client | |
| from toolz import concat, frequencies | |
| from tqdm.auto import tqdm | |
def _require_env(name: str) -> str:
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: If *name* is unset or set to an empty string.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} must be set to a non-empty value"
        )
    return value


# Load variables from a local .env file (if present) before reading them.
load_dotenv()

# Explicit checks rather than `assert`: assertions are stripped under
# `python -O`, which would let an empty token or user name slip through.
token = _require_env("HUGGINGFACE_TOKEN")
user_agent = _require_env("USER_AGENT")
user = _require_env("USER_TO_TRACK")

# NOTE(review): this flag is set after the Hugging Face libraries were imported
# at the top of the file; if they read it at import time it has no effect
# here -- confirm, and move it above the imports if so.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Shared HTTP configuration: identify the app and authenticate every request.
headers = {"user-agent": user_agent, "authorization": f"Bearer {token}"}
limits = httpx.Limits(max_keepalive_connections=10, max_connections=20)
client = Client(headers=headers, limits=limits, timeout=120.0)
def get_hub_community_activity(user: str) -> List[Any]:
    """Fetch all discussion activity entries for *user* from the Hub.

    Pages through the ``recent-activity`` endpoint 100 entries at a time and
    stops at the first empty page.

    Args:
        user: Hub entity name passed as the ``entity`` query parameter.

    Returns:
        A flat list of the entries found under ``recentActivity`` on each page.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
    """
    page_size = 100
    all_data: List[Any] = []
    # NOTE(review): paging starts at skip=1, as in the original code; if `skip`
    # is a zero-based offset this drops the most recent entry -- confirm.
    skip = 1
    with tqdm() as pbar:
        while True:
            # Use the shared client so the configured timeout, connection
            # limits and auth headers apply; `params` also URL-encodes `user`.
            r = client.get(
                "https://huggingface.co/api/recent-activity",
                params={
                    "limit": page_size,
                    "activityType": "discussion",
                    "skip": skip,
                    "entity": user,
                    "feedType": "user",
                },
            )
            # Fail with a clear HTTP error instead of a KeyError on the body.
            r.raise_for_status()
            activity = r.json()["recentActivity"]
            if not activity:
                break
            # Keep the list flat so its length is the number of entries.
            all_data.extend(activity)
            if len(all_data) % 1000 == 0:
                pbar.write(f"Length of all_data: {len(all_data)}")
            skip += page_size
            pbar.update(len(activity))
    return all_data
| # def get_hub_community_activity(user: str) -> List[Any]: | |
| # all_data = [] | |
| # for i in range(1, 2000, 100): | |
| # r = httpx.get( | |
| # f"https://huggingface.co/api/recent-activity?limit=100&type=discussion&skip={i}&user={user}" | |
| # ) | |
| # activity = r.json()["recentActivity"] | |
| # all_data.append(activity) | |
| # return list(concat(all_data)) | |
def parse_date_time(date_time: str) -> datetime:
    """Parse a timestamp such as ``2023-05-01T12:34:56.789Z``.

    The trailing ``Z`` is matched as a literal character, so the returned
    datetime is naive (no tzinfo). Raises ``ValueError`` when the text does
    not match the expected layout.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    parsed = datetime.strptime(date_time, timestamp_format)
    return parsed
def parse_pr_data(data):
    """Flatten the ``discussionData`` of one activity entry into a row dict.

    Args:
        data: Mapping with a ``discussionData`` entry that provides
            ``createdAt``, ``num``, ``status``, ``isPullRequest`` and a
            ``repo`` mapping with ``name`` and ``type``.

    Returns:
        A dict with keys ``createdAt`` (parsed via ``parse_date_time``),
        ``pr_number``, ``status``, ``repo_id``, ``type`` and ``isPullRequest``.

    Raises:
        KeyError: If any of the fields listed above is missing.
    """
    discussion = data["discussionData"]
    return {
        "createdAt": parse_date_time(discussion["createdAt"]),
        "pr_number": discussion["num"],
        "status": discussion["status"],
        "repo_id": discussion["repo"]["name"],
        "type": discussion["repo"]["type"],
        "isPullRequest": discussion["isPullRequest"],
    }
def update_data():
    """Merge freshly fetched activity into the stored stats and return it.

    Loads the previously stored rows (an empty frame when the dataset cannot
    be found), appends the rows parsed from the current Hub activity of
    ``user``, drops duplicate rows, and pushes the result to the Hub when the
    row count differs from what was stored.

    Returns:
        The de-duplicated polars DataFrame of stored plus new rows.
    """
    try:
        stored = load_dataset(f"librarian-bot/{user}-stats", split="train")
        previous_df = pl.DataFrame(stored.data.table)
    except FileNotFoundError:
        # Nothing stored yet: start from an empty frame.
        previous_df = pl.DataFrame()

    rows = [
        parse_pr_data(entry)
        for entry in get_hub_community_activity(user)
        if entry.get("discussionData", None) is not None
    ]
    update_df = pl.DataFrame(rows)
    df = pl.concat([previous_df, update_df]).unique()

    # Only upload when the merge actually changed the number of rows.
    if len(df) != len(previous_df):
        # NOTE(review): the load above reads "librarian-bot/{user}-stats" but
        # this pushes to "{user}-stats" with no namespace -- confirm both
        # resolve to the same repository for the configured token.
        Dataset(df.to_arrow()).push_to_hub(f"{user}-stats", token=token)
    return df
| # def get_pr_status(): | |
| # df = update_data() | |
| # df = df.filter(pl.col("isPullRequest") is True) | |
| # return df.select(pl.col("status").value_counts()) | |
| # # return frequencies(x["status"] for x in pr_data) | |
def get_pr_status(user: str):
    """Count the pull requests in *user*'s Hub activity by status.

    Args:
        user: Hub entity whose discussion activity is fetched.

    Returns:
        A dict mapping each status value to the number of pull requests
        that have it.
    """
    statuses = []
    for entry in get_hub_community_activity(user):
        discussion = entry.get("discussionData")
        # Skip entries without discussion data and discussions that are not
        # pull requests.
        if discussion and discussion.get("isPullRequest", False):
            statuses.append(discussion["status"])
    return frequencies(statuses)
def create_pie():
    """Build a pie chart of pull-request counts per status for ``user``."""
    # Named so it does not shadow the imported `frequencies` helper.
    status_counts = get_pr_status(user)
    table = pd.DataFrame(
        {"status": status_counts.keys(), "number": status_counts.values()}
    )
    figure = px.pie(table, values="number", names="status", template="seaborn")
    return figure
def group_status_by_pr_number():
    """Return the mean ``pr_number`` per status as a pandas DataFrame.

    Fetches the Hub activity of ``user``, keeps the entries that carry
    discussion data, and averages ``pr_number`` within each ``status`` group.
    """
    records = [
        parse_pr_data(entry)
        for entry in get_hub_community_activity(user)
        if entry.get("discussionData", None) is not None
    ]
    frame = pl.DataFrame(records)
    mean_by_status = frame.groupby("status").agg(pl.mean("pr_number"))
    return mean_by_status.to_pandas()
def plot_over_time():
    """Build a line chart of cumulative per-status counts by creation date.

    Fetches the Hub activity of ``user``, counts entries per day and status,
    and plots the running totals of each status column.
    """
    records = [
        parse_pr_data(entry)
        for entry in get_hub_community_activity(user)
        if entry.get("discussionData", None) is not None
    ]
    # Truncate timestamps to calendar dates so rows group per day.
    daily = pl.DataFrame(records).with_columns(pl.col("createdAt").cast(pl.Date))
    counts = daily.pivot(
        values=["status"],
        index=["createdAt"],
        columns=["status"],
        aggregate_function="count",
    )
    # Days with no entries for a status come back as null; treat them as zero.
    counts = counts.fill_null(0)
    # NOTE(review): the three status columns are referenced by name, so this
    # presumably fails if the data contains no entry with one of them -- confirm.
    counts = counts.with_columns(pl.sum(["open", "closed", "merged"]))
    counts = counts.sort("createdAt")
    cumulative = counts.to_pandas().set_index("createdAt").cumsum()
    # Plot every column except one named "sum".
    plotted = [column for column in cumulative.columns if column != "sum"]
    return px.line(cumulative, x=cumulative.index, y=plotted)
# Build the dashboard. Each figure/table is computed once while the layout is
# constructed. The stray top-level `create_pie()` call that used to precede
# this block was removed: its result was discarded, so it only cost an extra
# full crawl of the activity API at startup.
with gr.Blocks() as demo:
    gr.Markdown(f"# {user} PR Stats")
    gr.Markdown(f"Total prs and issues opened by {user}: {len(update_data()):,}")
    with gr.Column():
        gr.Markdown("## Pull requests status")
        gr.Markdown(
            "The below pie chart shows the percentage of pull requests made by"
            " librarian bot that are open, closed or merged"
        )
        gr.Plot(create_pie())
    with gr.Column():
        gr.Markdown("Pull requests opened, closed and merged over time (cumulative)")
        gr.Plot(plot_over_time())
    with gr.Column():
        gr.Markdown("## Pull requests status by PR number")
        gr.DataFrame(group_status_by_pr_number())

demo.launch(debug=True)