patrickvonplaten committed on
Commit
c562611
·
1 Parent(s): f04fb0d
Files changed (2) hide show
  1. app.py +101 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from huggingface_hub import HfApi, hf_hub_download
3
+ from huggingface_hub.repocard import metadata_load
4
+ from collections import Counter, ChainMap
5
+ import multiprocessing
6
+
7
+ import pandas as pd
8
+ import streamlit as st
9
+
10
+
11
# Module-level accumulators that retrieve_data() appends to: the raw
# "language" and "license" values found in the model-card metadata.
ALL_LANGUAGES, ALL_LICENSES = [], []
13
+
14
+
15
def get_model_ids_and_tags():
    """Fetch the ids and tags of every model listed on the Hugging Face Hub.

    Returns:
        A ``(model_ids, tags)`` tuple of two parallel lists: ``model_ids[i]``
        is the ``modelId`` of the i-th listed model and ``tags[i]`` is that
        same model's ``tags`` attribute.
    """
    api = HfApi()
    # Materialise the listing once. If ``list_models`` hands back a one-shot
    # iterator, a second pass over it would yield nothing and the two lists
    # below would no longer line up.
    models = list(api.list_models(full=True))
    model_ids = [model.modelId for model in models]
    tags = [model.tags for model in models]
    return model_ids, tags


# Fetched once when the module is executed: get_metadatas() indexes into
# MODEL_IDS and main() counts the entries of TAGS.
MODEL_IDS, TAGS = get_model_ids_and_tags()
24
+
25
+
26
def get_metadatas(i):
    """Load the README.md metadata of the ``i``-th model in ``MODEL_IDS``.

    Takes an index rather than a model id so that it can be mapped over
    ``range(len(MODEL_IDS))`` by a worker pool.

    Args:
        i: Index into the module-level ``MODEL_IDS`` list.

    Returns:
        ``{model_id: metadata}`` when the README could be downloaded and its
        metadata loaded, otherwise an empty dict.
    """
    metadatas = {}
    model_id = MODEL_IDS[i]
    try:
        readme_path = hf_hub_download(model_id, filename="README.md")
        metadatas[model_id] = metadata_load(readme_path)
    except Exception:
        # Best effort: a model whose README cannot be fetched or parsed is
        # skipped. Catching ``Exception`` instead of using a bare ``except``
        # lets KeyboardInterrupt / SystemExit still stop the worker.
        print(model_id + " has no README.md")
    return metadatas
35
+
36
+
37
def retrieve_data(metadatas):
    """Collect the language and license entries of every model card.

    Args:
        metadatas: Mapping whose values are model-card metadata mappings, or
            ``None`` for cards that carry no metadata.

    Side effects:
        Appends each card's ``"language"`` value to ``ALL_LANGUAGES`` and its
        ``"license"`` value to ``ALL_LICENSES`` when the key is present.
    """
    # Same order as the cards are visited: language first, then license.
    destinations = (("language", ALL_LANGUAGES), ("license", ALL_LICENSES))
    for card in metadatas.values():
        if card is not None:
            for key, bucket in destinations:
                if key in card:
                    bucket.append(card[key])
46
+
47
+
48
@st.cache(persist=True)
def main():
    """Build the language, license and tag count tables for all Hub models.

    Reads the module-level ``MODEL_IDS`` and ``TAGS`` lists, downloads the
    model-card metadata of every model in a process pool, and counts how
    often each language, license and tag occurs.

    Returns:
        A ``(lang_data_frame, license_data_frame, tags_data_frame)`` tuple.
        Each frame holds a single row of counts with one column per distinct
        value, columns sorted by name.
    """
    # 1. Retrieve metadatas. The model listing is already available in
    #    MODEL_IDS / TAGS, so it is not fetched from the Hub a second time.
    #    The ``with`` block tears the pool down even if a worker raises.
    with multiprocessing.Pool() as pool:
        per_model = pool.map(get_metadatas, range(len(MODEL_IDS)))
    metadatas = dict(ChainMap(*per_model))

    # 2. Parse to results (fills ALL_LANGUAGES / ALL_LICENSES)
    retrieve_data(metadatas)

    def clean_lists(list_like):
        """Flatten entries that are a string or a list of strings.

        Entries of any other shape are dropped.
        """
        clean_list = []
        for item in list_like:
            if isinstance(item, str):
                clean_list.append(item)
            elif isinstance(item, list) and all(isinstance(x, str) for x in item):
                # extend() avoids re-copying the accumulated list each time.
                clean_list.extend(item)
        return clean_list

    def to_frame(counter):
        """Turn a Counter into a one-row frame with name-sorted columns."""
        frame = pd.DataFrame.from_dict(counter, orient="index")
        return frame.sort_index().transpose()

    # 3. count data
    lang_counter = Counter(clean_lists(ALL_LANGUAGES))
    license_counter = Counter(clean_lists(ALL_LICENSES))

    # 4. count tags in a single pass instead of the quadratic sum(TAGS, []);
    #    a model whose tags entry is empty/None contributes nothing.
    tags_counter = Counter(
        tag for model_tags in TAGS for tag in (model_tags or ())
    )

    # 5. change to frame
    lang_data_frame = to_frame(lang_counter)
    license_data_frame = to_frame(license_counter)
    tags_data_frame = to_frame(tags_counter)

    return lang_data_frame, license_data_frame, tags_data_frame
88
+
89
+
90
# Compute the three count tables.
lang_data_frame, license_data_frame, tags_data_frame = main()


# Render one section per statistic: a title, the counts table, and the number
# of distinct values (each frame has one column per distinct value).
st.title("All Languages")
st.dataframe(lang_data_frame, width=600, height=1200)
st.write("Total num of languages", lang_data_frame.shape[-1])
st.title("All Licenses")
st.dataframe(license_data_frame, width=600, height=1200)
st.write("Total num of licenses", license_data_frame.shape[-1])
st.title("All Tags")
st.dataframe(tags_data_frame, width=600, height=1200)
st.write("Total num of different tags", tags_data_frame.shape[-1])
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ huggingface_hub
2
+ pandas