Spaces:

bigcode
/

the-stack-inspection

Sleeping

App Files Files Community

the-stack-inspection / app.py

lvwerra HF Staff

Update app.py

98c3786 about 2 years ago

raw

history blame

2.91 kB

	import streamlit as st
	import json
	import pandas as pd
	from datasets import load_dataset

	# Page chrome: wide layout, with the same title in the browser tab and the sidebar.
	PAGE_TITLE = "The Stack data Inspection"
	st.set_page_config(page_title=PAGE_TITLE, layout="wide")
	st.sidebar.title(PAGE_TITLE)

	# The CSV supplies the "language" and "extension" columns used below.
	df = pd.read_csv("new_extension_distribution.csv")
	all_extensions = df["extension"].tolist()

	# Group extensions by language, keeping the CSV row order. Extensions are
	# stored as strings because they are later shown in a selectbox and compared
	# against the text "nan" in `load_data` (presumably a missing extension in
	# the CSV -- confirm against the data file).
	tags = {}
	for language_name, extension in zip(df["language"], df["extension"]):
	    tags.setdefault(language_name, []).append(str(extension))
	all_languages = list(tags)


	@st.cache(max_entries=100)
	def load_data(language, ext, min_alphanum, max_line_length, max_mean_line_length, non_lexable):
	    """Load the samples for one language/extension and apply the sidebar filters.

	    Args:
	        language: Language name; used as a directory component of ``data_dir``.
	        ext: Extension name; the text ``"nan"`` is replaced by ``None`` before
	            the path is formatted, so the directory component then reads
	            ``None``.
	        min_alphanum: Keep samples whose ``alphanum_fraction`` is strictly
	            below this value.
	        max_line_length: Keep samples whose ``max_line_length`` is strictly
	            above this value.
	        max_mean_line_length: Keep samples whose ``avg_line_length`` is
	            strictly above this value.
	        non_lexable: When truthy, additionally keep only samples whose
	            ``lexable`` field is falsy.

	    Returns:
	        The filtered ``train`` split returned by ``load_dataset``.
	    """
	    if ext == "nan":
	        ext = None

	    def passes_thresholds(sample):
	        # All three conditions must hold for a sample to be kept.
	        return (
	            sample["alphanum_fraction"] < min_alphanum
	            and sample["max_line_length"] > max_line_length
	            and sample["avg_line_length"] > max_mean_line_length
	        )

	    dataset = load_dataset(
	        "loubnabnl/the-stack-inspection-data",
	        data_dir=f"data/{language}/{ext}",
	        split="train",
	    )
	    dataset = dataset.filter(passes_thresholds)

	    if not non_lexable:
	        return dataset
	    return dataset.filter(lambda sample: not sample["lexable"])

	# Language / extension pickers. These widgets are attached to `st.sidebar`
	# directly, so they need no main-area layout container around them.
	chosen_language = st.sidebar.selectbox(
	    label="Select a programming language", options=all_languages, index=0
	)
	# The extension choices depend on the language selected above.
	chosen_ext = st.sidebar.selectbox(
	    label="Select an extension", options=tags[chosen_language], index=0
	)

	# Filter controls; their values are passed straight to `load_data`.
	st.sidebar.header("Filters")
	not_lexable = st.sidebar.checkbox("Not lexable")
	min_alphanum = st.sidebar.slider("Minimum alphanumeric fraction", 0.0, 1.0, 1.0)
	max_line_length = st.sidebar.slider("Maximum line length", 0, 1200, 0, step=100)
	max_mean_line_length = st.sidebar.slider("Maximum average line length", 0, 500, 0, step=100)
	# Adjacent string literals with an explicit separating space, so the rendered
	# help text does not depend on how the source lines are indented.
	st.sidebar.markdown(
	    "Printed files have `max_line_length` and `average_line_length` larger than the selected values. "
	    "`alphanumeric_fraction` is smaller than the selected value."
	)

	samples = load_data(
	    chosen_language,
	    chosen_ext,
	    min_alphanum,
	    max_line_length,
	    max_mean_line_length,
	    not_lexable,
	)

	# Number of samples left after filtering; drives the picker's upper bound.
	max_docs = len(samples)

	if max_docs > 0:
	    # Keep the index picker to the left half of the page.
	    col_1, _ = st.columns([3, 3])
	    with col_1:
	        index_example = st.number_input(
	            f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
	            min_value=0,
	            max_value=max_docs - 1,
	            value=0,
	            step=1,
	        )

	    example = samples[index_example]

	    st.markdown("#### File content:")
	    # Normalize to text once, then measure and truncate that same string.
	    content = str(example["content"])

	    if len(content) > 10_000:
	        content = content[:10_000] + "\n[MORE CODE, DISPLAYING FIRST 10k CHARACTERS]"

	    if example["lexable"]:
	        st.code(content, language=chosen_language)
	    else:
	        # Fall back to plain text when the file could not be lexed.
	        st.text("File can't be lexed so we remove syntax highlighting.\nContent:\n")
	        st.text(content)
	else:
	    st.text("The dataset is empty after the filtering!")