Spaces:
Sleeping
Sleeping
cache filtering
Browse files
app.py
CHANGED
|
@@ -17,15 +17,19 @@ for index, row in df.iterrows():
|
|
| 17 |
all_languages = list(tags.keys())
|
| 18 |
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
ds = load_dataset(
|
| 24 |
"loubnabnl/the-stack-inspection-data",
|
| 25 |
data_dir=f"data/{language}/{ext}",
|
| 26 |
split="train",
|
| 27 |
)
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
col1, col2, _ = st.columns([1, 1, 4])
|
| 31 |
with col1:
|
|
@@ -45,15 +49,7 @@ max_mean_line_length = st.sidebar.slider("Maximum average line length", 0, 500,
|
|
| 45 |
st.sidebar.markdown("Printed files have `max_line_length` and `average_line_length` larger than the selected values.\
|
| 46 |
`alphanumeric_fraction` is smaller than the selected value.")
|
| 47 |
|
| 48 |
-
|
| 49 |
-
samples = load_data(chosen_language, chosen_ext)
|
| 50 |
-
|
| 51 |
-
samples = samples.filter(lambda x: x["alphanum_fraction"] < min_alphanum)
|
| 52 |
-
samples = samples.filter(lambda x: x["max_line_length"] > max_line_length)
|
| 53 |
-
samples = samples.filter(lambda x: x["avg_line_length"] > max_mean_line_length)
|
| 54 |
-
|
| 55 |
-
if not_lexable:
|
| 56 |
-
samples = samples.filter(lambda x: not x["lexable"])
|
| 57 |
|
| 58 |
max_docs = len(samples)
|
| 59 |
|
|
|
|
| 17 |
all_languages = list(tags.keys())
|
| 18 |
|
| 19 |
|
| 20 |
+
@st.cache(max_entries=100)
|
| 21 |
+
def load_data(language, ext, min_alphanum, max_line_length, max_mean_line_length, non_lexable):
|
| 22 |
+
samples = load_dataset(
|
|
|
|
| 23 |
"loubnabnl/the-stack-inspection-data",
|
| 24 |
data_dir=f"data/{language}/{ext}",
|
| 25 |
split="train",
|
| 26 |
)
|
| 27 |
+
samples = samples.filter(lambda x: x["alphanum_fraction"] < min_alphanum)
|
| 28 |
+
samples = samples.filter(lambda x: x["max_line_length"] > max_line_length)
|
| 29 |
+
samples = samples.filter(lambda x: x["avg_line_length"] > max_mean_line_length)
|
| 30 |
+
if non_lexable:
|
| 31 |
+
samples = samples.filter(lambda x: not x["lexable"])
|
| 32 |
+
return samples
|
| 33 |
|
| 34 |
col1, col2, _ = st.columns([1, 1, 4])
|
| 35 |
with col1:
|
|
|
|
| 49 |
st.sidebar.markdown("Printed files have `max_line_length` and `average_line_length` larger than the selected values.\
|
| 50 |
`alphanumeric_fraction` is smaller than the selected value.")
|
| 51 |
|
| 52 |
+
samples = load_data(chosen_language, chosen_ext, min_alphanum, max_line_length, max_mean_line_length, not_lexable)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
max_docs = len(samples)
|
| 55 |
|