Spaces:

jsulz
/

spaces-ship

Build error

App Files Files Community

spaces-ship / app.py

jsulz

wrapping

cf047e8 over 1 year ago

raw

history blame

16.9 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	import plotly.express as px
	from datasets import load_dataset

	def _space_link(target, emoji):
	    """
	    Build the clickable HTML label shown in the URL column.

	    Args:
	        target: A space id containing "/" (linked under huggingface.co/spaces),
	            or an indexable value whose first entry is linked as a bare host
	            (presumably the list of custom domains -- TODO confirm against
	            the dataset schema).
	        emoji: Emoji placed in front of the link text.

	    Returns:
	        str: An ``<a>`` element, or only the emoji text when ``target`` is None.
	    """
	    if target is None:
	        # Nothing to link to; indexing None below would raise a TypeError.
	        return str(emoji)
	    # The label is assembled without reusing the double quote inside the
	    # f-string expression, which is a SyntaxError before Python 3.12.
	    if "/" in target:
	        return (
	            f"<a target='_blank' href=https://huggingface.co/spaces/{target}>"
	            f"{emoji} {target}</a>"
	        )
	    domain = target[0]
	    return f"<a target='_blank' href=https://{domain}>{emoji} {domain}</a>"


	def load_transform_data():
	    """
	    Load the space statistics dataset and add the columns used by the app.

	    The ``train`` split of the ``jsulz/space-stats`` dataset is fetched with
	    ``load_dataset`` and converted to a DataFrame. The following columns are
	    then added or rewritten:

	    * ``sdk``: every value wrapped in a one-element array of its string form.
	    * ``licenses``: ``license`` with ``None`` replaced by ``array(["None"])``.
	    * ``sdk_tags``: ``sdk`` and ``tags`` concatenated row by row.
	    * ``emoji``: null values replaced by an empty string.
	    * ``url``: an HTML anchor built from ``custom_domains`` when present,
	      otherwise from ``id``.
	    * ``r_models``, ``r_sdk_tags``, ``r_datasets``, ``r_licenses``:
	      comma-separated text versions of the matching list columns.

	    Returns:
	        pandas.DataFrame: Transformed dataframe.
	    """
	    spaces_dataset = "jsulz/space-stats"
	    dataset = load_dataset(spaces_dataset)
	    df = dataset["train"].to_pandas()

	    # Wrap each sdk value in a one-element array so it can be concatenated
	    # with the tags column below.
	    df["sdk"] = df["sdk"].apply(lambda x: np.array([str(x)]))
	    # A missing license becomes the one-element array ["None"]; any other
	    # value is kept unchanged.
	    df["licenses"] = df["license"].apply(
	        lambda x: np.array([str(x)]) if x is None else x
	    )
	    # Combine the sdk and tags columns so that their elements are together.
	    df["sdk_tags"] = df[["sdk", "tags"]].apply(
	        lambda x: np.concatenate((x.iloc[0], x.iloc[1])), axis=1
	    )

	    # Fill the null emoji values with an empty string.
	    df["emoji"] = np.where(df["emoji"].isnull(), "", df["emoji"])

	    # Where custom_domains is null use the id column as the link target,
	    # otherwise use custom_domains.
	    df["url"] = np.where(
	        df["custom_domains"].isnull(),
	        df["id"],
	        df["custom_domains"],
	    )

	    # Build up a pretty url that's clickable with the emoji.
	    df["url"] = [
	        _space_link(target, emoji) for target, emoji in zip(df["url"], df["emoji"])
	    ]

	    # Prep the models, sdk/tags, datasets, and licenses columns for display.
	    for rendered, source in (
	        ("r_models", "models"),
	        ("r_sdk_tags", "sdk_tags"),
	        ("r_datasets", "datasets"),
	        ("r_licenses", "licenses"),
	    ):
	        df[rendered] = [
	            ", ".join(values) if values is not None else ""
	            for values in df[source]
	        ]
	    return df


	def _has_any(values, wanted):
	    """Return True when ``values`` contains at least one entry of ``wanted``."""
	    if values is None:
	        return False
	    return any(candidate in values for candidate in wanted)


	def filtered_df(
	    filtered_emojis,
	    filtered_likes,
	    filtered_author,
	    filtered_hardware,
	    filtered_tags,
	    filtered_models,
	    filtered_datasets,
	    space_licenses,
	    filtered_devmode,
	):
	    """
	    Filter the module-level dataframe ``df`` based on the given criteria.

	    Every falsy criterion (``None``, empty list, ``0``, ``False``) is skipped.

	    Args:
	        filtered_emojis (list): List of emojis to filter the dataframe by.
	        filtered_likes (int): Minimum number of likes to filter the dataframe by.
	        filtered_author (list): List of authors to filter the dataframe by.
	        filtered_hardware (list): List of hardware to filter the dataframe by.
	        filtered_tags (list): List of tags to filter the dataframe by.
	        filtered_models (list): List of models to filter the dataframe by.
	        filtered_datasets (list): List of datasets to filter the dataframe by.
	        space_licenses (list | str): Licenses to filter the dataframe by. A
	            single license given as a plain string is treated as a
	            one-element list.
	        filtered_devmode (bool): When truthy, keep only rows whose ``devMode``
	            value equals it.

	    Returns:
	        pandas.DataFrame: Filtered dataframe with the following columns:
	        "URL", "Likes", "Models", "Datasets", "Licenses".
	    """
	    # A bare string would otherwise be iterated character by character and
	    # never match a license entry.
	    if isinstance(space_licenses, str):
	        space_licenses = [space_licenses]

	    _df = df
	    if filtered_emojis:
	        _df = _df[_df["emoji"].isin(filtered_emojis)]
	    if filtered_likes:
	        _df = _df[_df["likes"] >= filtered_likes]
	    if filtered_author:
	        _df = _df[_df["author"].isin(filtered_author)]
	    if filtered_hardware:
	        _df = _df[_df["hardware"].isin(filtered_hardware)]

	    # The list-valued columns all use the same "contains any of" test.
	    list_filters = (
	        ("sdk_tags", filtered_tags),
	        ("models", filtered_models),
	        ("datasets", filtered_datasets),
	        ("licenses", space_licenses),
	    )
	    for column, wanted in list_filters:
	        if not wanted:
	            continue
	        mask = _df[column].apply(lambda cell, wanted=wanted: _has_any(cell, wanted))
	        # On an empty frame ``apply`` yields an object-dtype result; coerce it
	        # so it is always used as a boolean row mask.
	        _df = _df[mask.astype(bool)]

	    if filtered_devmode:
	        _df = _df[_df["devMode"] == filtered_devmode]

	    # Rename the column names to make them more readable.
	    _df = _df.rename(
	        columns={
	            "url": "URL",
	            "likes": "Likes",
	            "r_models": "Models",
	            "r_datasets": "Datasets",
	            "r_licenses": "Licenses",
	        }
	    )
	    return _df[["URL", "Likes", "Models", "Datasets", "Licenses"]]


	def count_items(items):
	    """
	    Count the occurrences of items and of their authors in a column of lists.

	    The author of an item is the text before its first "/".

	    Parameters:
	        items (dataframe column): A dataframe column whose rows are lists or
	            arrays of item names. Rows that are ``None`` or any other scalar
	            are ignored.
	    Returns:
	        tuple: A tuple containing two dictionaries. The first dictionary
	        contains the count of each item, and the second dictionary contains
	        the count of each author. Both are empty when no row holds a list.
	    """
	    from collections import Counter

	    item_count = Counter()
	    item_author_count = Counter()
	    for entries in items:
	        # Same scalar test as flatten_column: None and other non-sequences
	        # contribute nothing instead of breaking the flattening step.
	        if np.ndim(entries) == 0:
	            continue
	        for item in entries:
	            item_count[item] += 1
	            item_author_count[item.split("/")[0]] += 1

	    return dict(item_count), dict(item_author_count)

	def flatten_column(_df, column):
	    """
	    Collect the distinct values found in a list-valued DataFrame column.

	    Cells that are scalars (for example ``None``) are represented by the
	    string "None" in the result.

	    Args:
	        _df (pandas.DataFrame): The DataFrame containing the column.
	        column (str): The name of the column to flatten.

	    Returns:
	        list: The sorted unique values across all cells of the column.
	    """
	    placeholder = np.array(["None"])
	    pieces = [
	        placeholder if np.ndim(cell) == 0 else cell for cell in _df[column]
	    ]
	    return np.unique(np.concatenate(pieces)).tolist()


	# Build the UI: a "Spaces Overview" tab of charts and tables, and a
	# "Spaces Search" tab whose widgets feed filtered_df.
	# NOTE(review): the nesting of the `with` blocks below decides the page layout;
	# keep the statement order intact when editing.
	with gr.Blocks(fill_width=True) as demo:
	df = load_transform_data()
	with gr.Tab(label="Spaces Overview"):

	# Plot the growth of spaces (row entries) over time using the created_at column.
	# The x-axis is the date and the y-axis is the cumulative number of spaces created up to that date.
	df = df.sort_values("created_at")
	# rank(method='first') numbers the rows 1..n in created_at order, which is the running total.
	df['cumulative_spaces'] = df['created_at'].rank(method='first').astype(int)
	fig1 = px.line(
	df,
	x="created_at",
	y="cumulative_spaces",
	title="Growth of Spaces Over Time",
	labels={"created_at": "Date", "cumulative_spaces": "Number of Spaces"},
	template="plotly_dark",
	)
	gr.Plot(fig1)

	with gr.Row():
	# Create a pie chart showing the distribution of spaces by SDK
	# NOTE(review): load_transform_data wraps every sdk value in a one-element array;
	# confirm px.pie groups array-valued names as intended.
	fig2 = px.pie(df, names='sdk', title='Distribution of Spaces by SDK', template='plotly_dark')
	gr.Plot(fig2)

	# create a pie chart showing the distribution of spaces by emoji for the top 10 used emojis
	emoji_counts = df['emoji'].value_counts().head(10).reset_index()
	fig3 = px.pie(emoji_counts, names='emoji', values='count', title='Distribution of Spaces by Emoji', template='plotly_dark')
	gr.Plot(fig3)

	# Create a scatter plot showing the relationship between the number of likes and the number of spaces created by an author
	author_likes = df.groupby('author').agg({'likes': 'sum', 'id': 'count'}).reset_index()
	fig4 = px.scatter(
	author_likes,
	x="id",
	y="likes",
	title="Relationship between Number of Spaces Created and Number of Likes",
	labels={"id": "Number of Spaces Created", "likes": "Number of Likes"},
	hover_data={"author": True},
	template="plotly_dark",
	)
	gr.Plot(fig4)

	# Create a scatter plot of total likes against number of spaces per emoji,
	# limited to the 20 emojis with the most likes
	emoji_likes = df.groupby('emoji').agg({'likes': 'sum', 'id': 'count'}).sort_values(by='likes', ascending=False).head(20).reset_index()
	fig10 = px.scatter(
	emoji_likes,
	x="id",
	y="likes",
	title="Relationship between Space Emoji and Number of Likes",
	labels={"id": "Number of Spaces Created", "likes": "Number of Likes"},
	hover_data={"emoji": True},
	template="plotly_dark",
	)
	gr.Plot(fig10)

	# Create a bar chart of hardware in use
	hardware = df['hardware'].value_counts().reset_index()
	hardware.columns = ['Hardware', 'Number of Spaces']
	fig5 = px.bar(
	hardware,
	x="Hardware",
	y="Number of Spaces",
	title="Hardware in Use",
	labels={
	"Hardware": "Hardware",
	"Number of Spaces": "Number of Spaces (log scale)",
	},
	color="Hardware",
	template="plotly_dark",
	)
	fig5.update_layout(yaxis_type="log")
	gr.Plot(fig5)

	# Count references per model and per model author (the text before the first "/")
	model_count, model_author_count = count_items(df['models'])
	model_author_count = pd.DataFrame(model_author_count.items(), columns=['Model Author', 'Number of Spaces'])
	# Bar chart of the 20 most referenced model authors
	# NOTE(review): the labels key "Model" does not match the x column "Model Author" -- confirm intent.
	fig8 = px.bar(
	model_author_count.sort_values("Number of Spaces", ascending=False).head(
	20
	),
	x="Model Author",
	y="Number of Spaces",
	title="Most Popular Model Authors",
	labels={"Model": "Model", "Number of Spaces": "Number of Spaces"},
	template="plotly_dark",
	)
	gr.Plot(fig8)
	model_count = pd.DataFrame(model_count.items(), columns=['Model', 'Number of Spaces'])
	# then make a bar chart of the 20 most used models
	fig6 = px.bar(
	model_count.sort_values("Number of Spaces", ascending=False).head(20),
	x="Model",
	y="Number of Spaces",
	title="Most Used Models",
	labels={"Model": "Model", "Number of Spaces": "Number of Spaces"},
	template="plotly_dark",
	)
	gr.Plot(fig6)

	# Same two charts for datasets: top 20 dataset authors, then top 20 datasets
	dataset_count, dataset_author_count = count_items(df['datasets'])
	dataset_count = pd.DataFrame(dataset_count.items(), columns=['Datasets', 'Number of Spaces'])
	dataset_author_count = pd.DataFrame(dataset_author_count.items(), columns=['Dataset Author', 'Number of Spaces'])
	fig9 = px.bar(
	dataset_author_count.sort_values("Number of Spaces", ascending=False).head(
	20
	),
	x="Dataset Author",
	y="Number of Spaces",
	title="Most Popular Dataset Authors",
	labels={
	"Dataset Author": "Dataset Author",
	"Number of Spaces": "Number of Spaces",
	},
	template="plotly_dark",
	)
	gr.Plot(fig9)
	# then make a bar chart
	fig7 = px.bar(
	dataset_count.sort_values("Number of Spaces", ascending=False).head(20),
	x="Datasets",
	y="Number of Spaces",
	title="Most Used Datasets",
	labels={"Datasets": "Datasets", "Number of Spaces": "Number of Spaces"},
	template="plotly_dark",
	)
	gr.Plot(fig7)

	with gr.Row():
	# Get the 20 most duplicated spaces and render each as a link
	duplicated_spaces = df['duplicated_from'].value_counts().head(20).reset_index()
	duplicated_spaces["duplicated_from"] = duplicated_spaces[
	"duplicated_from"
	].apply(
	lambda x: f"<a target='_blank' href=https://huggingface.co/spaces/{x}>{x}</a>"
	)
	duplicated_spaces.columns = ["Space", "Number of Duplicates"]
	gr.DataFrame(duplicated_spaces, datatype="html" )

	# Get the 20 most liked spaces
	liked_spaces = df[['id', 'likes']].sort_values(by='likes', ascending=False).head(20)
	liked_spaces["id"] = liked_spaces["id"].apply(
	lambda x: f"<a target='_blank' href=https://huggingface.co/spaces/{x}>{x}</a>"
	)
	liked_spaces.columns = ['Space', 'Number of Likes']
	gr.DataFrame(liked_spaces, datatype="html")

	with gr.Row():
	# Create a dataframe with the top 20 authors and the number of spaces they have created
	author_counts = df['author'].value_counts().head(20).reset_index()
	author_counts["author"] = author_counts["author"].apply(
	lambda x: f"<a target='_blank' href=https://huggingface.co/{x}>{x}</a>"
	)
	author_counts.columns = ["Author", "Number of Spaces"]
	gr.DataFrame(author_counts, datatype="html")

	# create a dataframe where we groupby author and sum their likes, keeping the top 20
	author_likes = df.groupby('author').agg({'likes': 'sum'}).reset_index()
	author_likes = author_likes.sort_values(by='likes', ascending=False).head(20)
	author_likes["author"] = author_likes["author"].apply(
	lambda x: f"<a target='_blank' href=https://huggingface.co/{x}>{x}</a>"
	)
	author_likes.columns = ["Author", "Number of Likes"]
	gr.DataFrame(author_likes, datatype="html")


	with gr.Tab(label="Spaces Search"):
	# Only spaces whose stage is RUNNING are searchable. df is rebound here, so the
	# widgets below and filtered_df (which reads the module-level df) see only these rows.
	df = df[df['stage'] == 'RUNNING']

	# Layout
	with gr.Row():
	emoji = gr.Dropdown(
	df["emoji"].unique().tolist(), label="Search by Emoji 🤗", multiselect=True
	) # Dropdown to select the emoji
	likes = gr.Slider(
	minimum=df["likes"].min(),
	maximum=df["likes"].max(),
	step=1,
	label="Filter by Likes",
	) # Slider to filter by likes
	with gr.Row():
	author = gr.Dropdown(
	df["author"].unique().tolist(), label="Search by Author", multiselect=True
	)
	# get the list of unique strings in the sdk_tags column
	sdk_tags = np.unique(np.concatenate(df["sdk_tags"].values))
	# create a dropdown for the sdk_tags (the name is rebound from the array to the widget)
	sdk_tags = gr.Dropdown(
	sdk_tags.tolist(), label="Filter by SDK/Tags", multiselect=True
	)
	with gr.Row():
	# create a gradio checkbox group for hardware
	hardware = gr.CheckboxGroup(
	df["hardware"].unique().tolist(), label="Filter by Hardware"
	)

	# NOTE(review): unlike the other dropdowns this one has no multiselect, so it
	# presumably yields a single string rather than a list -- confirm that
	# filtered_df's space_licenses handling matches.
	licenses = np.unique(np.concatenate(df["licenses"].values))
	space_license = gr.Dropdown(licenses.tolist(), label="Filter by license")

	with gr.Row():
	models = gr.Dropdown(
	flatten_column(df, "models"),
	label="Search by Model",
	multiselect=True,
	)
	datasets = gr.Dropdown(
	flatten_column(df, "datasets"),
	label="Search by Dataset",
	multiselect=True,
	)

	devmode = gr.Checkbox(label="Show Dev Mode Spaces")
	# NOTE(review): the likes slider and the devmode checkbox are not in the list
	# of components this button clears -- confirm that is intended.
	clear = gr.ClearButton(components=[
	emoji,
	author,
	hardware,
	sdk_tags,
	models,
	datasets,
	space_license
	])

	# Keep only the columns that filtered_df filters on or returns
	df = pd.DataFrame(
	df[
	[
	"id",
	"emoji",
	"author",
	"url",
	"likes",
	"hardware",
	"sdk_tags",
	"models",
	"datasets",
	"licenses",
	"r_sdk_tags",
	"r_models",
	"r_datasets",
	"r_licenses",
	'devMode'
	]
	]
	)
	# Results table: filtered_df is passed as the value and the widgets as its inputs,
	# in the same order as filtered_df's parameters. The five column widths line up
	# with the five columns filtered_df returns.
	gr.DataFrame(
	filtered_df,
	inputs=[
	emoji,
	likes,
	author,
	hardware,
	sdk_tags,
	models,
	datasets,
	space_license,
	devmode,
	],
	datatype="html",
	wrap=True,
	column_widths=["25%", "5%", "25%", "25%", "20%"]
	)


	demo.launch()