Spaces:
Sleeping
Sleeping
Commit
·
a159f5a
1
Parent(s):
1000f2a
Update app.py
Browse files
app.py
CHANGED
|
@@ -14,7 +14,11 @@ pd.options.plotting.backend = "plotly"
|
|
| 14 |
|
| 15 |
|
| 16 |
def download_dataset():
|
| 17 |
-
return load_dataset(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
def _clean_tags(tags: Optional[Union[str, List[str]]]):
|
|
@@ -51,8 +55,8 @@ def prep_dataset():
|
|
| 51 |
df["has_dataset"] = df.datasets.apply(len) > 0
|
| 52 |
df["has_co2"] = df.co2.notnull()
|
| 53 |
df["has_co2"] = df.co2.apply(lambda x: x is not None)
|
| 54 |
-
df[
|
| 55 |
-
df[
|
| 56 |
df = df.drop(columns=["Unnamed: 0"])
|
| 57 |
df.to_parquet("data.parquet")
|
| 58 |
return df
|
|
@@ -160,12 +164,50 @@ def metadata_coverage_by_library(metadata_field):
|
|
| 160 |
return df.groupby("library")[metadata_field].mean().sort_values().plot.barh()
|
| 161 |
|
| 162 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
def metadata_coverage_by_autogenerated(metadata_field):
|
| 164 |
df = load_data()
|
| 165 |
-
subset_df = df[df[
|
| 166 |
subset_df.reset_index()
|
| 167 |
-
subset_df[
|
| 168 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
|
| 171 |
df = load_data()
|
|
@@ -221,11 +263,31 @@ with gr.Blocks() as demo:
|
|
| 221 |
metadata_coverage_by_library, [metadata_field], plot, queue=False
|
| 222 |
)
|
| 223 |
with gr.Tab("Auto generated model cards"):
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
|
|
|
|
|
|
| 228 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
with gr.Tab("Model Cards"):
|
| 231 |
gr.Markdown(
|
|
@@ -249,4 +311,4 @@ with gr.Blocks() as demo:
|
|
| 249 |
model_card_length_by_library, [min_lib_frequency], df, queue=False
|
| 250 |
)
|
| 251 |
|
| 252 |
-
demo.launch(
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
def download_dataset():
    """Load the ``train`` split of ``open-source-metrics/model-repos-stats``.

    Returns:
        Whatever ``load_dataset`` returns for that repository and split.
    """
    repo_id = "open-source-metrics/model-repos-stats"
    # NOTE(review): `ignore_verifications` appears to be deprecated in newer
    # `datasets` releases in favour of `verification_mode` -- confirm against
    # the version this app pins before upgrading the dependency.
    options = {"split": "train", "ignore_verifications": True}
    return load_dataset(repo_id, **options)
|
| 22 |
|
| 23 |
|
| 24 |
def _clean_tags(tags: Optional[Union[str, List[str]]]):
|
|
|
|
| 55 |
df["has_dataset"] = df.datasets.apply(len) > 0
|
| 56 |
df["has_co2"] = df.co2.notnull()
|
| 57 |
df["has_co2"] = df.co2.apply(lambda x: x is not None)
|
| 58 |
+
df["has_license"] = df.license.notnull()
|
| 59 |
+
df["is_generated"] = df.tags.apply(_is_generated_from_tag)
|
| 60 |
df = df.drop(columns=["Unnamed: 0"])
|
| 61 |
df.to_parquet("data.parquet")
|
| 62 |
return df
|
|
|
|
| 164 |
return df.groupby("library")[metadata_field].mean().sort_values().plot.barh()
|
| 165 |
|
| 166 |
|
| 167 |
+
def metatadata_coverage_autogenerated_vs_test():
    """Tabulate mean metadata coverage split by the ``is_generated`` flag.

    Loads the frame via ``load_data()``, groups it on the ``is_generated``
    column and averages every column whose name starts with ``"has"``.

    Returns:
        A DataFrame with one row per ``has*`` column. The former index is
        exposed as ``"Metadata/tag field"`` and the ``True`` / ``False``
        group columns are renamed to ``"From autogenerated"`` /
        ``"Not autogenerated"``. Values are rounded to 6 decimal places.
    """
    df = load_data()
    # The previous version also built a deep copy of the generated-only rows
    # and called ``reset_index()`` on it without using either result; that
    # dead work has been dropped.
    coverage_columns = [c for c in df.columns if c.startswith("has")]
    return (
        df.groupby("is_generated")[coverage_columns]
        .mean()
        .transpose()
        .round(6)
        .reset_index()
        .rename(
            columns={
                True: "From autogenerated",
                False: "Not autogenerated",
                "index": "Metadata/tag field",
            }
        )
    )
|
| 185 |
+
|
| 186 |
+
|
| 187 |
def metadata_coverage_by_autogenerated(metadata_field):
    """Plot the mean of one metadata column per ``autogenerated-from`` value.

    Args:
        metadata_field: Name of the column to average within each group.

    Returns:
        The result of ``.plot.barh()`` on the sorted per-group means.
    """
    df = load_data()
    # Work on a copy so adding the helper column does not write onto a
    # slice of the loaded frame.
    subset_df = df[df["is_generated"]].copy(deep=True)
    # A bare ``subset_df.reset_index()`` used to sit here; its result was
    # discarded, so it had no effect and has been removed.
    subset_df["autogenerated-from"] = subset_df.tags.apply(_parse_tags_for_generated)
    return (
        subset_df.groupby("autogenerated-from")[metadata_field]
        .mean()
        .sort_values()
        .plot.barh()
    )
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def model_card_length_by_autogenerated():
    """Summarise ``text_length`` per ``autogenerated-from`` value.

    Returns:
        A DataFrame holding the rounded ``describe()`` statistics of
        ``text_length`` for each ``autogenerated-from`` group, with the group
        key restored as a regular column.
    """
    df = load_data()
    # Work on a copy so adding the helper column does not write onto a
    # slice of the loaded frame.
    subset_df = df[df["is_generated"]].copy(deep=True)
    # A bare ``subset_df.reset_index()`` used to sit here; its result was
    # discarded, so it had no effect and has been removed.
    subset_df["autogenerated-from"] = subset_df.tags.apply(_parse_tags_for_generated)
    return (
        subset_df.groupby("autogenerated-from")["text_length"]
        .describe()
        .round()
        .reset_index()
    )
|
| 211 |
|
| 212 |
|
| 213 |
df = load_data()
|
|
|
|
| 263 |
metadata_coverage_by_library, [metadata_field], plot, queue=False
|
| 264 |
)
|
| 265 |
with gr.Tab("Auto generated model cards"):
|
| 266 |
+
gr.Markdown(
|
| 267 |
+
"Some libraries/training frameworks automatically generate a model card when pushing models to "
|
| 268 |
+
"the hub. The below dataframe compares the metadata coverage across several tags for models "
|
| 269 |
+
"which are pushed with autogenerated model cards compared to those without. **Note** this "
|
| 270 |
+
"breakdown relies on tags with `autogenerated` in them."
|
| 271 |
+
"As a result some model cards might be in the wrong category. "
|
| 272 |
)
|
| 273 |
+
gr.Dataframe(metatadata_coverage_autogenerated_vs_test())
|
| 274 |
+
with gr.Row():
|
| 275 |
+
metadata_field = gr.Dropdown(choices=metadata_coverage_columns)
|
| 276 |
+
plot = gr.Plot()
|
| 277 |
+
metadata_field.change(
|
| 278 |
+
metadata_coverage_by_autogenerated, [metadata_field], plot, queue=False
|
| 279 |
+
)
|
| 280 |
+
# )
|
| 281 |
+
# with gr.Row():
|
| 282 |
+
#
|
| 283 |
+
# # with gr.Column():
|
| 284 |
+
# # plot = gr.Plot()
|
| 285 |
+
# # min_lib_frequency.change(
|
| 286 |
+
# # model_card_length_by_autogenerated, [min_lib_frequency], plot, queue=False
|
| 287 |
+
# # )
|
| 288 |
+
# with gr.Column():
|
| 289 |
+
# gr.Markdown("Mean length of model card for autogenerated_from * model cards")
|
| 290 |
+
# df = gr.Dataframe(model_card_length_by_autogenerated)
|
| 291 |
|
| 292 |
with gr.Tab("Model Cards"):
|
| 293 |
gr.Markdown(
|
|
|
|
| 311 |
model_card_length_by_library, [min_lib_frequency], df, queue=False
|
| 312 |
)
|
| 313 |
|
| 314 |
+
demo.launch()
|