Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import json | |
| import pandas as pd | |
| from datasets import load_dataset | |
# Page chrome: wide layout, with the app title repeated in the sidebar.
st.set_page_config(page_title="The Stack data Inspection", layout="wide")
st.sidebar.title("The Stack data Inspection")

# The CSV supplies the `language` and `extension` columns used below.
df = pd.read_csv("new_extension_distribution.csv")
all_extensions = df["extension"].tolist()

# Map each language to the list of its extensions, in CSV row order.
# Extensions are stringified, so a missing value becomes the text "nan".
tags = {}
for lang_name, ext_name in zip(df["language"], df["extension"]):
    tags.setdefault(lang_name, []).append(str(ext_name))
all_languages = [*tags]
def load_data(language, ext, min_alphanum, max_line_length, max_mean_line_length, non_lexable):
    """Load the inspection samples for one language/extension and filter them.

    The `train` split of `loubnabnl/the-stack-inspection-data` is read from
    `data/{language}/{ext}`. An `ext` equal to the string "nan" is replaced by
    `None` before the path is formatted.

    A sample is kept only when all three conditions hold:
      * its `alphanum_fraction` is strictly below `min_alphanum`,
      * its `max_line_length` is strictly above `max_line_length`,
      * its `avg_line_length` is strictly above `max_mean_line_length`.

    When `non_lexable` is truthy, samples whose `lexable` field is truthy are
    dropped as well.

    Returns the filtered dataset.
    """
    if ext == "nan":
        ext = None

    def within_selection(sample):
        # Despite the parameter names, each value acts as a strict threshold:
        # an upper bound for the alphanumeric fraction, lower bounds for the
        # two line-length statistics.
        return (
            sample["alphanum_fraction"] < min_alphanum
            and sample["max_line_length"] > max_line_length
            and sample["avg_line_length"] > max_mean_line_length
        )

    dataset = load_dataset(
        "loubnabnl/the-stack-inspection-data",
        data_dir=f"data/{language}/{ext}",
        split="train",
    )
    dataset = dataset.filter(within_selection)

    if not non_lexable:
        return dataset
    return dataset.filter(lambda sample: not sample["lexable"])
# --- Language / extension selection -----------------------------------------
# Both select boxes are created through `st.sidebar`, so that is where they are
# rendered; the column contexts are kept so the main-area layout is unchanged.
col1, col2, _ = st.columns([1, 1, 4])
with col1:
    chosen_language = st.sidebar.selectbox(
        label="Select a programming language", options=all_languages, index=0
    )
with col2:
    # The extension choices depend on the language picked above.
    chosen_ext = st.sidebar.selectbox(
        label="Select an extension", options=tags[chosen_language], index=0
    )

# --- Filter controls ---------------------------------------------------------
st.sidebar.header("Filters")
not_lexable = st.sidebar.checkbox("Not lexable")
min_alphanum = st.sidebar.slider("Minimum alphanumeric fraction", 0.0, 1.0, 1.0)
max_line_length = st.sidebar.slider("Maximum line length", 0, 1200, 0, step=100)
max_mean_line_length = st.sidebar.slider("Maximum average line length", 0, 500, 0, step=100)
# Adjacent literals with an explicit space keep the two sentences separated,
# independent of how the source line is indented.
st.sidebar.markdown(
    "Printed files have `max_line_length` and `average_line_length` larger than the selected values. "
    "`alphanumeric_fraction` is smaller than the selected value."
)

# --- Load, then show one file ------------------------------------------------
samples = load_data(
    chosen_language,
    chosen_ext,
    min_alphanum,
    max_line_length,
    max_mean_line_length,
    not_lexable,
)
max_docs = len(samples)

if max_docs > 0:
    col_1, _ = st.columns([3, 3])
    with col_1:
        index_example = st.number_input(
            f"Extension {chosen_ext} has {max_docs} files, choose one to visualize:",
            min_value=0,
            max_value=max_docs - 1,
            value=0,
            step=1,
        )

    example = samples[index_example]
    st.markdown("#### File content:")

    # Stringify once and truncate that same string, so a non-str `content`
    # value cannot break the concatenation below.
    content = str(example["content"])
    if len(content) > 10_000:
        content = content[:10_000] + "\n[MORE CODE, DISPLAYING FIRST 10k CHARACTERS]"

    if example["lexable"]:
        st.code(content, language=chosen_language)
    else:
        # Files flagged as not lexable are shown as plain text instead.
        st.text("File can't be lexed so we remove syntax highlighting.\nContent:\n")
        st.text(content)
else:
    st.text("The dataset is empty after the filtering!")