Spaces:

xet-team
/

lfs-analysis

Runtime error

App Files Files Community

jsulz committed on Sep 26, 2024

Commit

f6db30c

1 Parent(s): f68c1d3

added new new chart

Browse files

Files changed (1) hide show

app.py +75 -6

app.py CHANGED Viewed

@@ -21,18 +21,25 @@ def process_dataset():
     """
     file_counts_and_sizes = pd.read_parquet(
-        "hf://datasets/xet-team/lfs-analysis-data/file_counts_and_sizes.parquet"
     )
     repo_by_size_df = pd.read_parquet(
-        "hf://datasets/xet-team/lfs-analysis-data/repo_by_size.parquet"
     )
     unique_files_df = pd.read_parquet(
-        "hf://datasets/xet-team/lfs-analysis-data/repo_by_size_file_dedupe.parquet"
     )
     file_extensions = pd.read_parquet(
-        "hf://datasets/xet-team/lfs-analysis-data/file_extensions.parquet"
     )
     # Convert the total size to petabytes and format to two decimal places
     file_counts_and_sizes = format_dataframe_size_column(
         file_counts_and_sizes, "total_size"
@@ -55,7 +62,13 @@ def process_dataset():
     # drop nas from the extension column
     file_extensions = file_extensions.dropna(subset=["extension"])
-    return repo_by_size_df, unique_files_df, file_counts_and_sizes, file_extensions
 def format_dataframe_size_column(_df, column_name):
@@ -196,9 +209,52 @@ def plot_total_sum(by_type_arr):
     return fig
 # Create a gradio blocks interface and launch a demo
 with gr.Blocks() as demo:
-    df, file_df, by_type, by_extension = process_dataset()
     # Add a heading
     gr.Markdown("# Git LFS Analysis Across the Hub")
@@ -258,5 +314,18 @@ with gr.Blocks() as demo:
     )
     gr.Dataframe(by_extension_size)
 demo.launch()

     """
     file_counts_and_sizes = pd.read_parquet(
+        "hf://datasets/xet-team/lfs-analysis-data/transformed/file_counts_and_sizes.parquet"
     )
     repo_by_size_df = pd.read_parquet(
+        "hf://datasets/xet-team/lfs-analysis-data/transformed/repo_by_size.parquet"
     )
     unique_files_df = pd.read_parquet(
+        "hf://datasets/xet-team/lfs-analysis-data/transformed/repo_by_size_file_dedupe.parquet"
     )
     file_extensions = pd.read_parquet(
+        "hf://datasets/xet-team/lfs-analysis-data/transformed/file_extensions.parquet"
     )
+    # read the file_extensions_by_month.parquet file
+    file_extensions_by_month = pd.read_parquet(
+        "hf://datasets/xet-team/lfs-analysis-data/transformed/file_extensions_by_month.parquet"
+    )
+    # drop any nas
+    file_extensions_by_month = file_extensions_by_month.dropna()
     # Convert the total size to petabytes and format to two decimal places
     file_counts_and_sizes = format_dataframe_size_column(
         file_counts_and_sizes, "total_size"
     # drop nas from the extension column
     file_extensions = file_extensions.dropna(subset=["extension"])
+    return (
+        repo_by_size_df,
+        unique_files_df,
+        file_counts_and_sizes,
+        file_extensions,
+        file_extensions_by_month,
+    )
 def format_dataframe_size_column(_df, column_name):
     return fig
+def filter_by_extension_month(_df, _extension):
+    """
+    Filters the given DataFrame (_df) by the selected extension(s) and creates a line plot using Plotly.
+    Parameters:
+    _df (DataFrame): The input DataFrame; the "extension", "year", "month" and "total_size" columns are read.
+        It is never modified: all work happens on a copy.
+    _extension (list[str] | str | None): The extension(s) to keep. None, an empty selection,
+        or the single value "All" disables filtering. A bare string is treated as a one-item list.
+    Returns:
+    fig (Figure): The Plotly figure with one line per extension; y values are total_size / 1e15
+        (petabytes, assuming total_size is expressed in bytes).
+    """
+    # The dropdown may hand over a bare string (its initial value) or None rather
+    # than a list; normalize so the checks below always operate on a list.
+    if _extension is None:
+        _extension = []
+    elif isinstance(_extension, str):
+        _extension = [_extension]
+    else:
+        _extension = list(_extension)
+    # Always work on a copy so the caller's DataFrame (e.g. the one held in
+    # gr.State) never gains the helper "date" column added below.
+    show_all = not _extension or _extension == ["All"]
+    if show_all:
+        _df = _df.copy()
+    else:
+        _df = _df[_df["extension"].isin(_extension)].copy()
+    # Convert year and month into a datetime column and sort by date
+    _df["date"] = pd.to_datetime(_df[["year", "month"]].assign(day=1))
+    _df = _df.sort_values(by="date")
+    # Pivot the DataFrame to get the total size for each extension and make this plotable as a time series.
+    # NOTE(review): pivot_table aggregates with its default (mean); this presumes one row per
+    # extension per month -- confirm against the dataset.
+    pivot_df = _df.pivot_table(
+        index="date", columns="extension", values="total_size"
+    ).fillna(0)
+    # Plot!!
+    fig = go.Figure()
+    # Cycle through the palette so that having more extensions than colors
+    # no longer raises an IndexError.
+    palette = px.colors.qualitative.Alphabet
+    for i, column in enumerate(pivot_df.columns):
+        if column == "":
+            continue
+        fig.add_trace(
+            go.Scatter(
+                x=pivot_df.index,
+                y=pivot_df[column] / 1e15,  # Convert to petabytes
+                mode="lines",
+                name=str(column).capitalize(),
+                line=dict(color=palette[i % len(palette)]),
+            )
+        )
+    return fig
 # Create a gradio blocks interface and launch a demo
 with gr.Blocks() as demo:
+    df, file_df, by_type, by_extension, by_extension_month = process_dataset()
     # Add a heading
     gr.Markdown("# Git LFS Analysis Across the Hub")
     )
     gr.Dataframe(by_extension_size)
+    gr.Markdown("## File Extension Growth Over Time")
+    gr.Markdown(
+        "Want to dig a little deeper? Select a file extension to see how many bytes of that type were uploaded to the Hub each month."
+    )
+    # build a dropdown using the unique values in the extension column
+    extension = gr.Dropdown(
+        choices=by_extension["extension"].unique().tolist(),
+        value="All",
+        allow_custom_value=True,
+        multiselect=True,
+    )
+    _by_extension_month = gr.State(by_extension_month)
+    gr.Plot(filter_by_extension_month, inputs=[_by_extension_month, extension])
 demo.launch()