Spaces:
Running
Running
Deploy (see actual commits on https://github.com/mlcommons/croissant).
Browse files- app.py +0 -1
- core/constants.py +1 -1
- core/state.py +11 -3
- deploy_to_hf.sh +5 -2
- events/record_sets.py +14 -0
- views/foo.py.py +36 -0
- views/overview.py +44 -23
- views/record_sets.py +172 -63
app.py
CHANGED
|
@@ -20,7 +20,6 @@ col1.header("Croissant Editor")
|
|
| 20 |
init_state()
|
| 21 |
|
| 22 |
user = get_cached_user()
|
| 23 |
-
print("USER", user)
|
| 24 |
|
| 25 |
if OAUTH_CLIENT_ID and not user:
|
| 26 |
query_params = st.experimental_get_query_params()
|
|
|
|
| 20 |
init_state()
|
| 21 |
|
| 22 |
user = get_cached_user()
|
|
|
|
| 23 |
|
| 24 |
if OAUTH_CLIENT_ID and not user:
|
| 25 |
query_params = st.experimental_get_query_params()
|
core/constants.py
CHANGED
|
@@ -33,5 +33,5 @@ DF_HEIGHT = 150
|
|
| 33 |
OVERVIEW = "Overview"
|
| 34 |
METADATA = "Metadata"
|
| 35 |
RESOURCES = "Resources"
|
| 36 |
-
RECORD_SETS = "
|
| 37 |
TABS = [OVERVIEW, METADATA, RESOURCES, RECORD_SETS]
|
|
|
|
| 33 |
OVERVIEW = "Overview"
|
| 34 |
METADATA = "Metadata"
|
| 35 |
RESOURCES = "Resources"
|
| 36 |
+
RECORD_SETS = "Record Sets"
|
| 37 |
TABS = [OVERVIEW, METADATA, RESOURCES, RECORD_SETS]
|
core/state.py
CHANGED
|
@@ -168,7 +168,7 @@ class RecordSet:
|
|
| 168 |
"""Record Set analogue for editor"""
|
| 169 |
|
| 170 |
name: str = ""
|
| 171 |
-
data: Any = None
|
| 172 |
description: str | None = None
|
| 173 |
is_enumeration: bool | None = None
|
| 174 |
key: str | list[str] | None = None
|
|
@@ -208,9 +208,14 @@ class Metadata:
|
|
| 208 |
"""Renames a RecordSet by changing all the references to this RecordSet."""
|
| 209 |
for i, record_set in enumerate(self.record_sets):
|
| 210 |
for j, field in enumerate(record_set.fields):
|
|
|
|
| 211 |
# Update source
|
| 212 |
source = field.source
|
| 213 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
new_uid = source.uid.replace(old_name, new_name, 1)
|
| 215 |
self.record_sets[i].fields[j].source.uid = new_uid
|
| 216 |
# Update references
|
|
@@ -218,7 +223,10 @@ class Metadata:
|
|
| 218 |
if (
|
| 219 |
references
|
| 220 |
and references.uid
|
| 221 |
-
and
|
|
|
|
|
|
|
|
|
|
| 222 |
):
|
| 223 |
new_uid = references.uid.replace(old_name, new_name, 1)
|
| 224 |
self.record_sets[i].fields[j].references.uid = new_uid
|
|
|
|
| 168 |
"""Record Set analogue for editor"""
|
| 169 |
|
| 170 |
name: str = ""
|
| 171 |
+
data: list[Any] | None = None
|
| 172 |
description: str | None = None
|
| 173 |
is_enumeration: bool | None = None
|
| 174 |
key: str | list[str] | None = None
|
|
|
|
| 208 |
"""Renames a RecordSet by changing all the references to this RecordSet."""
|
| 209 |
for i, record_set in enumerate(self.record_sets):
|
| 210 |
for j, field in enumerate(record_set.fields):
|
| 211 |
+
possible_uid = f"{old_name}/"
|
| 212 |
# Update source
|
| 213 |
source = field.source
|
| 214 |
+
if (
|
| 215 |
+
source
|
| 216 |
+
and source.uid
|
| 217 |
+
and (source.uid.startswith(possible_uid) or source.uid == old_name)
|
| 218 |
+
):
|
| 219 |
new_uid = source.uid.replace(old_name, new_name, 1)
|
| 220 |
self.record_sets[i].fields[j].source.uid = new_uid
|
| 221 |
# Update references
|
|
|
|
| 223 |
if (
|
| 224 |
references
|
| 225 |
and references.uid
|
| 226 |
+
and (
|
| 227 |
+
references.uid.startswith(possible_uid)
|
| 228 |
+
or references.uid == old_name
|
| 229 |
+
)
|
| 230 |
):
|
| 231 |
new_uid = references.uid.replace(old_name, new_name, 1)
|
| 232 |
self.record_sets[i].fields[j].references.uid = new_uid
|
deploy_to_hf.sh
CHANGED
|
@@ -3,12 +3,15 @@ echo "Deleting $HF_REPO..."
|
|
| 3 |
rm -rf ${HF_REPO}
|
| 4 |
git clone git@hf.co:spaces/marcenacp/croissant-editor ${HF_REPO}
|
| 5 |
echo "Copying files from $PWD to $HF_REPO..."
|
| 6 |
-
rsync -aP --exclude="README.md" --exclude="*node_modules*" --exclude="*__pycache__*" . ${HF_REPO}
|
| 7 |
cd ${HF_REPO}
|
| 8 |
-
|
|
|
|
|
|
|
| 9 |
echo "Warning: if it fails, you may need to follow https://huggingface.co/docs/hub/security-git-ssh#generating-a-new-ssh-keypair"
|
| 10 |
echo "On Hugging Face Spaces, you might have to set the following environment variables:"
|
| 11 |
echo "- REDIRECT_URI"
|
| 12 |
echo "- OAUTH_STATE"
|
| 13 |
echo "- OAUTH_CLIENT_ID"
|
| 14 |
echo "- OAUTH_CLIENT_SECRET"
|
|
|
|
|
|
| 3 |
rm -rf ${HF_REPO}
|
| 4 |
git clone git@hf.co:spaces/marcenacp/croissant-editor ${HF_REPO}
|
| 5 |
echo "Copying files from $PWD to $HF_REPO..."
|
| 6 |
+
rsync -aP --exclude="README.md" --exclude="*node_modules*" --exclude="cypress/*" --exclude="*__pycache__*" . ${HF_REPO}
|
| 7 |
cd ${HF_REPO}
|
| 8 |
+
git add .
|
| 9 |
+
git commit -m "Deploy (see actual commits on https://github.com/mlcommons/croissant)."
|
| 10 |
+
echo "Now push with: 'cd $HF_REPO && git push'."
|
| 11 |
echo "Warning: if it fails, you may need to follow https://huggingface.co/docs/hub/security-git-ssh#generating-a-new-ssh-keypair"
|
| 12 |
echo "On Hugging Face Spaces, you might have to set the following environment variables:"
|
| 13 |
echo "- REDIRECT_URI"
|
| 14 |
echo "- OAUTH_STATE"
|
| 15 |
echo "- OAUTH_CLIENT_ID"
|
| 16 |
echo "- OAUTH_CLIENT_SECRET"
|
| 17 |
+
echo "Visit: https://huggingface.co/spaces/marcenacp/croissant-editor"
|
events/record_sets.py
CHANGED
|
@@ -13,6 +13,8 @@ class RecordSetEvent(enum.Enum):
|
|
| 13 |
NAME = "NAME"
|
| 14 |
DESCRIPTION = "DESCRIPTION"
|
| 15 |
IS_ENUMERATION = "IS_ENUMERATION"
|
|
|
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
def handle_record_set_change(event: RecordSetEvent, record_set: RecordSet, key: str):
|
|
@@ -28,4 +30,16 @@ def handle_record_set_change(event: RecordSetEvent, record_set: RecordSet, key:
|
|
| 28 |
record_set.description = value
|
| 29 |
elif event == RecordSetEvent.IS_ENUMERATION:
|
| 30 |
record_set.is_enumeration = value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
expand_record_set(record_set=record_set)
|
|
|
|
| 13 |
NAME = "NAME"
|
| 14 |
DESCRIPTION = "DESCRIPTION"
|
| 15 |
IS_ENUMERATION = "IS_ENUMERATION"
|
| 16 |
+
HAS_DATA = "HAS_DATA"
|
| 17 |
+
CHANGE_DATA = "CHANGE_DATA"
|
| 18 |
|
| 19 |
|
| 20 |
def handle_record_set_change(event: RecordSetEvent, record_set: RecordSet, key: str):
|
|
|
|
| 30 |
record_set.description = value
|
| 31 |
elif event == RecordSetEvent.IS_ENUMERATION:
|
| 32 |
record_set.is_enumeration = value
|
| 33 |
+
elif event == RecordSetEvent.HAS_DATA:
|
| 34 |
+
if value:
|
| 35 |
+
record_set.data = []
|
| 36 |
+
else:
|
| 37 |
+
record_set.data = None
|
| 38 |
+
elif event == RecordSetEvent.CHANGE_DATA:
|
| 39 |
+
for index, new_value in value["edited_rows"].items():
|
| 40 |
+
record_set.data[index] = {**record_set.data[index], **new_value}
|
| 41 |
+
for row in value["added_rows"]:
|
| 42 |
+
record_set.data.append(row)
|
| 43 |
+
for row in value["deleted_rows"]:
|
| 44 |
+
del record_set.data[row]
|
| 45 |
expand_record_set(record_set=record_set)
|
views/foo.py.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import multiprocessing
|
| 2 |
+
import time
|
| 3 |
+
from typing import TypedDict
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class _RequiredResult(TypedDict):
    """Keys that are always present in the shared result."""

    bar: int  # Counter that `bar` increments once per loop iteration.


class _Result(_RequiredResult, total=False):
    """Result exchanged between `foo` and the `bar` worker process.

    `exception` is optional: `foo` only stores it when it had to kill the
    worker, so it must not be a required key.
    """

    exception: Exception
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def bar(result, limit=5, interval=1):
    """Increment ``result["bar"]`` until it becomes greater than `limit`.

    Every iteration sleeps for `interval` seconds, adds one to the counter
    stored under the key ``"bar"`` and prints the new value.

    Args:
        result: Mutable mapping holding an integer under the key ``"bar"``.
            It is updated in place.
        limit: The loop stops once the counter is strictly greater than this
            value. Defaults to 5, the previously hard-coded threshold.
        interval: Seconds to sleep before each increment. Defaults to 1, the
            previously hard-coded delay.
    """
    while True:
        time.sleep(interval)
        result["bar"] += 1
        print(result["bar"])
        if result["bar"] > limit:
            return
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def foo():
    """Run `bar` in a child process and wait at most three seconds for it.

    Returns:
        A plain-dict snapshot of the shared state. It always contains the
        counter under ``"bar"``; it additionally contains a `TimeoutError`
        under ``"exception"`` when the worker was still running after the
        timeout and had to be killed.
    """
    with multiprocessing.Manager() as manager:
        result: _Result = manager.dict(bar=0)
        process = multiprocessing.Process(target=bar, args=(result,))
        process.start()
        # join() returns as soon as the worker exits, so a fast worker no
        # longer costs the full timeout (a fixed sleep always did).
        process.join(timeout=3)
        if process.is_alive():
            process.kill()
            # Reap the killed child so it does not linger as a zombie.
            process.join()
            result["exception"] = TimeoutError(
                "The generation took too long and was killed."
            )
        # Copy the values out of the manager proxy on every path: the proxy
        # is unusable once the `with` block shuts the manager down.
        return _Result(**result)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
if __name__ == "__main__":
    # `foo` starts a child process. Under the "spawn" start method the child
    # re-imports this module, so the call must not run at import time.
    print("FINAL RESULT", foo().get("bar"))
|
views/overview.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
from typing import Any
|
| 2 |
|
| 3 |
import streamlit as st
|
|
@@ -8,12 +9,22 @@ from utils import needed_field
|
|
| 8 |
from views.metadata import handle_metadata_change
|
| 9 |
from views.metadata import MetadataEvent
|
| 10 |
|
|
|
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
def render_overview():
|
|
@@ -21,7 +32,7 @@ def render_overview():
|
|
| 21 |
col1, col2 = st.columns([1, 1], gap="medium")
|
| 22 |
with col1:
|
| 23 |
key = "metadata-name"
|
| 24 |
-
st.text_input(
|
| 25 |
label=needed_field("Name"),
|
| 26 |
key=key,
|
| 27 |
value=metadata.name,
|
|
@@ -29,8 +40,10 @@ def render_overview():
|
|
| 29 |
on_change=handle_metadata_change,
|
| 30 |
args=(MetadataEvent.NAME, metadata, key),
|
| 31 |
)
|
|
|
|
|
|
|
| 32 |
key = "metadata-url"
|
| 33 |
-
st.text_input(
|
| 34 |
label=needed_field("URL"),
|
| 35 |
key=key,
|
| 36 |
value=metadata.url,
|
|
@@ -38,6 +51,8 @@ def render_overview():
|
|
| 38 |
on_change=handle_metadata_change,
|
| 39 |
args=(MetadataEvent.URL, metadata, key),
|
| 40 |
)
|
|
|
|
|
|
|
| 41 |
key = "metadata-description"
|
| 42 |
st.text_area(
|
| 43 |
label="Description",
|
|
@@ -47,29 +62,35 @@ def render_overview():
|
|
| 47 |
on_change=handle_metadata_change,
|
| 48 |
args=(MetadataEvent.DESCRIPTION, metadata, key),
|
| 49 |
)
|
| 50 |
-
|
| 51 |
-
st.
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
| 57 |
with col2:
|
| 58 |
user_started_editing = metadata.record_sets or metadata.distribution
|
| 59 |
if user_started_editing:
|
| 60 |
-
|
| 61 |
try:
|
| 62 |
issues = metadata.to_canonical().issues
|
| 63 |
if issues.errors:
|
| 64 |
-
|
| 65 |
for error in issues.errors:
|
| 66 |
-
|
| 67 |
if issues.warnings:
|
| 68 |
-
|
| 69 |
for warning in issues.warnings:
|
| 70 |
-
|
| 71 |
-
if not issues.errors and not issues.warnings:
|
| 72 |
-
st.write("No validation issues detected!")
|
| 73 |
except mlc.ValidationError as exception:
|
| 74 |
-
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import dataclasses
|
| 2 |
from typing import Any
|
| 3 |
|
| 4 |
import streamlit as st
|
|
|
|
| 9 |
from views.metadata import handle_metadata_change
|
| 10 |
from views.metadata import MetadataEvent
|
| 11 |
|
| 12 |
+
_NON_RELEVANT_METADATA = ["name", "distribution", "record_sets", "rdf"]
|
| 13 |
|
| 14 |
+
# Markdown shown in the info box of the overview tab.
# `\\:` yields the same backslash-colon characters the original `\:` produced,
# without relying on an invalid escape sequence.
# NOTE(review): the scraped view this was recovered from drops leading
# whitespace; confirm the indentation of the continuation lines against the
# repository.
_INFO_TEXT = """Croissant files are composed of three layers:

- **Metadata** about the dataset covering Responsible AI, licensing and attributes of
[sc\\:Dataset](https://schema.org/Dataset).
- **Resources**: The contents of a dataset as the underlying files
([`FileObject`](https://github.com/mlcommons/croissant/blob/main/docs/croissant-spec.md#fileobject))
and/or sets of files ([`FileSet`](https://github.com/mlcommons/croissant/blob/main/docs/croissant-spec.md#fileset)).
- **RecordSets**: the sets of structured records obtained from one or more resources
(typically a file or set of files) and the structure of these records,
expressed as a set of fields (e.g., the columns of a table).

The next three tabs will guide you through filling those layers. The errors if any will
be displayed on this page. Once you are ready, you can download the dataset by clicking
the export button in the upper right corner."""
|
| 28 |
|
| 29 |
|
| 30 |
def render_overview():
|
|
|
|
| 32 |
col1, col2 = st.columns([1, 1], gap="medium")
|
| 33 |
with col1:
|
| 34 |
key = "metadata-name"
|
| 35 |
+
name = st.text_input(
|
| 36 |
label=needed_field("Name"),
|
| 37 |
key=key,
|
| 38 |
value=metadata.name,
|
|
|
|
| 40 |
on_change=handle_metadata_change,
|
| 41 |
args=(MetadataEvent.NAME, metadata, key),
|
| 42 |
)
|
| 43 |
+
if not name:
|
| 44 |
+
st.stop()
|
| 45 |
key = "metadata-url"
|
| 46 |
+
url = st.text_input(
|
| 47 |
label=needed_field("URL"),
|
| 48 |
key=key,
|
| 49 |
value=metadata.url,
|
|
|
|
| 51 |
on_change=handle_metadata_change,
|
| 52 |
args=(MetadataEvent.URL, metadata, key),
|
| 53 |
)
|
| 54 |
+
if not url:
|
| 55 |
+
st.stop()
|
| 56 |
key = "metadata-description"
|
| 57 |
st.text_area(
|
| 58 |
label="Description",
|
|
|
|
| 62 |
on_change=handle_metadata_change,
|
| 63 |
args=(MetadataEvent.DESCRIPTION, metadata, key),
|
| 64 |
)
|
| 65 |
+
st.divider()
|
| 66 |
+
left, middle, right = st.columns([1, 1, 1])
|
| 67 |
+
fields = [
|
| 68 |
+
field
|
| 69 |
+
for field, value in dataclasses.asdict(metadata).items()
|
| 70 |
+
if value and field not in _NON_RELEVANT_METADATA
|
| 71 |
+
]
|
| 72 |
+
left.metric("Number of metadata", len(fields))
|
| 73 |
+
middle.metric("Number of resources", len(metadata.distribution))
|
| 74 |
+
right.metric("Number of RecordSets", len(metadata.record_sets))
|
| 75 |
with col2:
|
| 76 |
user_started_editing = metadata.record_sets or metadata.distribution
|
| 77 |
if user_started_editing:
|
| 78 |
+
warning = ""
|
| 79 |
try:
|
| 80 |
issues = metadata.to_canonical().issues
|
| 81 |
if issues.errors:
|
| 82 |
+
warning += "**Errors**\n"
|
| 83 |
for error in issues.errors:
|
| 84 |
+
warning += f"{error}\n"
|
| 85 |
if issues.warnings:
|
| 86 |
+
warning += "**Warnings**\n"
|
| 87 |
for warning in issues.warnings:
|
| 88 |
+
warning += f"{warning}\n"
|
|
|
|
|
|
|
| 89 |
except mlc.ValidationError as exception:
|
| 90 |
+
warning += "**Errors**\n"
|
| 91 |
+
warning += f"{str(exception)}\n"
|
| 92 |
+
if warning:
|
| 93 |
+
st.warning(warning, icon="⚠️")
|
| 94 |
+
else:
|
| 95 |
+
st.success("No validation issues detected!", icon="✅")
|
| 96 |
+
st.info(_INFO_TEXT, icon="💡")
|
views/record_sets.py
CHANGED
|
@@ -1,4 +1,7 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import numpy as np
|
| 4 |
import pandas as pd
|
|
@@ -28,6 +31,65 @@ DATA_TYPES = [
|
|
| 28 |
mlc.DataType.URL,
|
| 29 |
]
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
def _handle_close_fields():
|
| 33 |
st.session_state[SelectedRecordSet] = None
|
|
@@ -116,23 +178,22 @@ def _handle_fields_change(record_set_key: int, record_set: RecordSet):
|
|
| 116 |
name=added_row.get(FieldDataFrame.NAME),
|
| 117 |
description=added_row.get(FieldDataFrame.DESCRIPTION),
|
| 118 |
data_types=[added_row.get(FieldDataFrame.DATA_TYPE)],
|
| 119 |
-
source=mlc.Source(
|
| 120 |
-
uid="foo",
|
| 121 |
-
node_type="distribution",
|
| 122 |
-
extract=mlc.Extract(column=""),
|
| 123 |
-
),
|
| 124 |
references=mlc.Source(),
|
| 125 |
)
|
| 126 |
st.session_state[Metadata].add_field(record_set_key, field)
|
| 127 |
for field_key in result["deleted_rows"]:
|
| 128 |
st.session_state[Metadata].remove_field(record_set_key, field_key)
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
|
| 131 |
class FieldDataFrame:
|
| 132 |
"""Names of the columns in the pd.DataFrame for `fields`."""
|
| 133 |
|
| 134 |
-
NAME = "
|
| 135 |
-
DESCRIPTION = "
|
| 136 |
DATA_TYPE = "Data type"
|
| 137 |
SOURCE_UID = "Source"
|
| 138 |
SOURCE_EXTRACT = "Source extract"
|
|
@@ -144,17 +205,14 @@ class FieldDataFrame:
|
|
| 144 |
def render_record_sets():
|
| 145 |
col1, col2 = st.columns([1, 1])
|
| 146 |
with col1:
|
| 147 |
-
|
|
|
|
| 148 |
with col2:
|
| 149 |
_render_right_panel()
|
| 150 |
|
| 151 |
|
| 152 |
def _render_left_panel():
|
| 153 |
"""Left panel: visualization of all RecordSets as expandable forms."""
|
| 154 |
-
distribution = st.session_state[Metadata].distribution
|
| 155 |
-
if not distribution:
|
| 156 |
-
st.markdown("Please add resources first.")
|
| 157 |
-
return
|
| 158 |
record_sets = st.session_state[Metadata].record_sets
|
| 159 |
record_set: RecordSet
|
| 160 |
for record_set_key, record_set in enumerate(record_sets):
|
|
@@ -188,12 +246,20 @@ def _render_left_panel():
|
|
| 188 |
on_change=handle_record_set_change,
|
| 189 |
args=(RecordSetEvent.IS_ENUMERATION, record_set, key),
|
| 190 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
joins = _find_joins(record_set.fields)
|
| 193 |
has_join = st.checkbox(
|
| 194 |
-
"Whether the RecordSet contains joins. To add a new join, add a"
|
| 195 |
-
|
| 196 |
-
" another RecordSet
|
| 197 |
key=f"{prefix}-has-joins",
|
| 198 |
value=bool(joins),
|
| 199 |
disabled=True,
|
|
@@ -248,8 +314,7 @@ def _render_left_panel():
|
|
| 248 |
)
|
| 249 |
st.data_editor(
|
| 250 |
fields,
|
| 251 |
-
|
| 252 |
-
use_container_width=not fields.empty,
|
| 253 |
num_rows="dynamic",
|
| 254 |
key=data_editor_key,
|
| 255 |
column_config={
|
|
@@ -273,6 +338,26 @@ def _render_left_panel():
|
|
| 273 |
on_change=_handle_fields_change,
|
| 274 |
args=(record_set_key, record_set),
|
| 275 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
|
| 277 |
st.button(
|
| 278 |
"Edit fields details",
|
|
@@ -297,56 +382,80 @@ def _render_right_panel():
|
|
| 297 |
record_set = selected.record_set
|
| 298 |
record_set_key = selected.record_set_key
|
| 299 |
with st.expander("**Fields**", expanded=True):
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
key = f"{prefix}-name"
|
| 305 |
-
col1.text_input(
|
| 306 |
-
needed_field("Name"),
|
| 307 |
-
placeholder="Name without special character.",
|
| 308 |
-
key=key,
|
| 309 |
-
value=field.name,
|
| 310 |
-
on_change=handle_field_change,
|
| 311 |
-
args=(FieldEvent.NAME, field, key),
|
| 312 |
)
|
| 313 |
-
key = f"{
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
|
|
|
|
|
|
| 317 |
key=key,
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
)
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
else:
|
| 329 |
data_type_index = None
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
st.divider()
|
| 350 |
|
| 351 |
st.button(
|
| 352 |
"Close",
|
|
|
|
| 1 |
+
import multiprocessing
|
| 2 |
+
import textwrap
|
| 3 |
+
import time
|
| 4 |
+
from typing import TypedDict
|
| 5 |
|
| 6 |
import numpy as np
|
| 7 |
import pandas as pd
|
|
|
|
| 31 |
mlc.DataType.URL,
|
| 32 |
]
|
| 33 |
|
| 34 |
+
_NUM_RECORDS = 3
|
| 35 |
+
_TIMEOUT_SECONDS = 1
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class _Result(TypedDict):
|
| 39 |
+
df: pd.DataFrame | None
|
| 40 |
+
exception: Exception | None
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@st.cache_data(show_spinner="Generating the dataset...")
def _generate_data_with_timeout(record_set: RecordSet) -> _Result:
    """Generates the data and waits at most _TIMEOUT_SECONDS.

    `_generate_data` runs in a separate process and reports back through a
    manager dict, so that a generation that hangs can be killed.

    Args:
        record_set: The RecordSet to preview.

    Returns:
        A plain-dict copy of the shared result: `df` holds what the worker
        stored (None if it stored nothing), `exception` holds the error the
        worker reported or a `TimeoutError` if the worker had to be killed.
    """
    with multiprocessing.Manager() as manager:
        result: _Result = manager.dict(df=None, exception=None)
        args = (record_set, result)
        process = multiprocessing.Process(target=_generate_data, args=args)
        process.start()
        # join() returns as soon as the worker exits, so a fast generation
        # does not pay the full timeout (a fixed sleep always did).
        process.join(timeout=_TIMEOUT_SECONDS)
        if process.is_alive():
            process.kill()
            # Reap the killed child so it does not linger as a zombie.
            process.join()
            result["exception"] = TimeoutError(
                "The generation took too long and was killed. Please, use the CLI as"
                " described in"
                " https://github.com/mlcommons/croissant/tree/main/python/mlcroissant#verifyload-a-croissant-dataset."
            )
        # Copy out of the proxy before the manager shuts down.
        return _Result(**result)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _generate_data(record_set: RecordSet, result: _Result) -> None:
    """Generates the first _NUM_RECORDS records.

    Nothing is returned: the outcome is written into `result`, which is how
    `_generate_data_with_timeout` (which uses this function as a process
    target) reads it back.

    Args:
        record_set: The RecordSet whose records are generated.
        result: Shared mapping. On success `result["df"]` receives a
            DataFrame of the generated records; on any failure
            `result["exception"]` receives the exception.
    """
    # Local import: `islice` stops after _NUM_RECORDS records without pulling
    # an extra one from the (possibly slow) record generator.
    from itertools import islice

    try:
        # NOTE(review): this function is used as a `multiprocessing.Process`
        # target; confirm `st.session_state` is populated in the child.
        metadata: Metadata = st.session_state[Metadata]
        if not metadata:
            raise ValueError(
                "The dataset is still incomplete. Please, go to the overview to see"
                " errors."
            )
        croissant = metadata.to_canonical()
        if croissant:
            dataset = mlc.Dataset.from_metadata(croissant)
            records = dataset.records(record_set=record_set.name)
            rows = []
            for record in islice(records, _NUM_RECORDS):
                # Decode bytes as str:
                for key, value in record.items():
                    if isinstance(value, bytes):
                        try:
                            record[key] = value.decode("utf-8")
                        except UnicodeDecodeError:
                            # Not valid UTF-8: keep the raw bytes for display.
                            pass
                rows.append(record)
            result["df"] = pd.DataFrame(rows)
    except Exception as exception:
        # Deliberately broad: every failure is reported through `result`
        # rather than raised.
        result["exception"] = exception
|
| 92 |
+
|
| 93 |
|
| 94 |
def _handle_close_fields():
|
| 95 |
st.session_state[SelectedRecordSet] = None
|
|
|
|
| 178 |
name=added_row.get(FieldDataFrame.NAME),
|
| 179 |
description=added_row.get(FieldDataFrame.DESCRIPTION),
|
| 180 |
data_types=[added_row.get(FieldDataFrame.DATA_TYPE)],
|
| 181 |
+
source=mlc.Source(),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
references=mlc.Source(),
|
| 183 |
)
|
| 184 |
st.session_state[Metadata].add_field(record_set_key, field)
|
| 185 |
for field_key in result["deleted_rows"]:
|
| 186 |
st.session_state[Metadata].remove_field(record_set_key, field_key)
|
| 187 |
+
# Reset the in-line data if it exists.
|
| 188 |
+
if record_set.data:
|
| 189 |
+
record_set.data = []
|
| 190 |
|
| 191 |
|
| 192 |
class FieldDataFrame:
|
| 193 |
"""Names of the columns in the pd.DataFrame for `fields`."""
|
| 194 |
|
| 195 |
+
NAME = "Field name"
|
| 196 |
+
DESCRIPTION = "Field description"
|
| 197 |
DATA_TYPE = "Data type"
|
| 198 |
SOURCE_UID = "Source"
|
| 199 |
SOURCE_EXTRACT = "Source extract"
|
|
|
|
| 205 |
def render_record_sets():
    """Renders the RecordSets view as two equally wide columns.

    The left panel is drawn inside a spinner; the right panel is drawn in
    the second column.
    """
    left_column, right_column = st.columns([1, 1])
    with left_column, st.spinner("Generating the dataset..."):
        _render_left_panel()
    with right_column:
        _render_right_panel()
|
| 212 |
|
| 213 |
|
| 214 |
def _render_left_panel():
|
| 215 |
"""Left panel: visualization of all RecordSets as expandable forms."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
record_sets = st.session_state[Metadata].record_sets
|
| 217 |
record_set: RecordSet
|
| 218 |
for record_set_key, record_set in enumerate(record_sets):
|
|
|
|
| 246 |
on_change=handle_record_set_change,
|
| 247 |
args=(RecordSetEvent.IS_ENUMERATION, record_set, key),
|
| 248 |
)
|
| 249 |
+
key = f"{prefix}-has-data"
|
| 250 |
+
st.checkbox(
|
| 251 |
+
"Whether the RecordSet has in-line data",
|
| 252 |
+
key=key,
|
| 253 |
+
value=bool(record_set.data),
|
| 254 |
+
on_change=handle_record_set_change,
|
| 255 |
+
args=(RecordSetEvent.HAS_DATA, record_set, key),
|
| 256 |
+
)
|
| 257 |
|
| 258 |
joins = _find_joins(record_set.fields)
|
| 259 |
has_join = st.checkbox(
|
| 260 |
+
"Whether the RecordSet contains joins. To add a new join, add a field"
|
| 261 |
+
" with a source in `RecordSet`/`FileSet`/`FileObject` and a reference"
|
| 262 |
+
" to another `RecordSet`/`FileSet`/`FileObject`.",
|
| 263 |
key=f"{prefix}-has-joins",
|
| 264 |
value=bool(joins),
|
| 265 |
disabled=True,
|
|
|
|
| 314 |
)
|
| 315 |
st.data_editor(
|
| 316 |
fields,
|
| 317 |
+
use_container_width=True,
|
|
|
|
| 318 |
num_rows="dynamic",
|
| 319 |
key=data_editor_key,
|
| 320 |
column_config={
|
|
|
|
| 338 |
on_change=_handle_fields_change,
|
| 339 |
args=(record_set_key, record_set),
|
| 340 |
)
|
| 341 |
+
result: _Result = _generate_data_with_timeout(record_set)
|
| 342 |
+
df, exception = result.get("df"), result.get("exception")
|
| 343 |
+
if exception is None and df is not None and not df.empty:
|
| 344 |
+
st.markdown("Previsualize the data:")
|
| 345 |
+
st.dataframe(df, use_container_width=True)
|
| 346 |
+
# The generation is not triggered if record_set has in-line `data`.
|
| 347 |
+
elif not record_set.data:
|
| 348 |
+
left, right = st.columns([1, 10])
|
| 349 |
+
if exception:
|
| 350 |
+
left.button(
|
| 351 |
+
"⚠️",
|
| 352 |
+
key=f"idea-{prefix}",
|
| 353 |
+
disabled=True,
|
| 354 |
+
help=textwrap.dedent(f"""**Error**:
|
| 355 |
+
```
|
| 356 |
+
{exception}
|
| 357 |
+
```
|
| 358 |
+
"""),
|
| 359 |
+
)
|
| 360 |
+
right.markdown("No preview is possible.")
|
| 361 |
|
| 362 |
st.button(
|
| 363 |
"Edit fields details",
|
|
|
|
| 382 |
record_set = selected.record_set
|
| 383 |
record_set_key = selected.record_set_key
|
| 384 |
with st.expander("**Fields**", expanded=True):
|
| 385 |
+
if isinstance(record_set.data, list):
|
| 386 |
+
st.markdown(
|
| 387 |
+
f"{needed_field('Data')}. This RecordSet is marked as having in-line"
|
| 388 |
+
" data. Please, list the data below:"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
)
|
| 390 |
+
key = f"{record_set_key}-fields-data"
|
| 391 |
+
columns = [field.name for field in record_set.fields]
|
| 392 |
+
st.data_editor(
|
| 393 |
+
pd.DataFrame(record_set.data, columns=columns),
|
| 394 |
+
use_container_width=True,
|
| 395 |
+
num_rows="dynamic",
|
| 396 |
key=key,
|
| 397 |
+
column_config={
|
| 398 |
+
field.name: st.column_config.TextColumn(
|
| 399 |
+
field.name,
|
| 400 |
+
help=field.description,
|
| 401 |
+
required=True,
|
| 402 |
+
)
|
| 403 |
+
for field in record_set.fields
|
| 404 |
+
},
|
| 405 |
+
on_change=handle_record_set_change,
|
| 406 |
+
args=(RecordSetEvent.CHANGE_DATA, record_set, key),
|
| 407 |
)
|
| 408 |
+
else:
|
| 409 |
+
for field_key, field in enumerate(record_set.fields):
|
| 410 |
+
prefix = f"{record_set_key}-{field.name}-{field_key}"
|
| 411 |
+
col1, col2, col3 = st.columns([1, 1, 1])
|
| 412 |
+
|
| 413 |
+
key = f"{prefix}-name"
|
| 414 |
+
col1.text_input(
|
| 415 |
+
needed_field("Name"),
|
| 416 |
+
placeholder="Name without special character.",
|
| 417 |
+
key=key,
|
| 418 |
+
value=field.name,
|
| 419 |
+
on_change=handle_field_change,
|
| 420 |
+
args=(FieldEvent.NAME, field, key),
|
| 421 |
+
)
|
| 422 |
+
key = f"{prefix}-description"
|
| 423 |
+
col2.text_input(
|
| 424 |
+
"Description",
|
| 425 |
+
placeholder="Provide a clear description of the RecordSet.",
|
| 426 |
+
key=key,
|
| 427 |
+
on_change=handle_field_change,
|
| 428 |
+
value=field.description,
|
| 429 |
+
args=(FieldEvent.DESCRIPTION, field, key),
|
| 430 |
+
)
|
| 431 |
+
if field.data_types:
|
| 432 |
+
data_type = field.data_types[0]
|
| 433 |
+
if isinstance(data_type, str):
|
| 434 |
+
data_type = term.URIRef(data_type)
|
| 435 |
+
if data_type in DATA_TYPES:
|
| 436 |
+
data_type_index = DATA_TYPES.index(data_type)
|
| 437 |
+
else:
|
| 438 |
+
data_type_index = None
|
| 439 |
else:
|
| 440 |
data_type_index = None
|
| 441 |
+
key = f"{prefix}-datatypes"
|
| 442 |
+
col3.selectbox(
|
| 443 |
+
needed_field("Data type"),
|
| 444 |
+
index=data_type_index,
|
| 445 |
+
options=DATA_TYPES,
|
| 446 |
+
key=key,
|
| 447 |
+
on_change=handle_field_change,
|
| 448 |
+
args=(FieldEvent.DATA_TYPE, field, key),
|
| 449 |
+
)
|
| 450 |
+
possible_sources = _get_possible_sources(metadata)
|
| 451 |
+
render_source(
|
| 452 |
+
record_set_key, record_set, field, field_key, possible_sources
|
| 453 |
+
)
|
| 454 |
+
render_references(
|
| 455 |
+
record_set_key, record_set, field, field_key, possible_sources
|
| 456 |
+
)
|
| 457 |
+
|
| 458 |
+
st.divider()
|
|
|
|
|
|
|
| 459 |
|
| 460 |
st.button(
|
| 461 |
"Close",
|