Spaces:
Running
Running
| from core.data_types import convert_dtype | |
| from core.names import find_unique_name | |
| from core.state import Field | |
| from core.state import FileObject | |
| from core.state import FileSet | |
| from core.state import RecordSet | |
| import mlcroissant as mlc | |
def infer_record_sets(file: FileObject | FileSet, names: set[str]) -> list[RecordSet]:
    """Infers ml:RecordSets from the columns of a FileObject's DataFrame.

    Args:
        file: The resource to infer from. FileSets are not supported yet.
        names: Names passed to `find_unique_name` when naming the record set.

    Returns:
        An empty list when `file` is a FileSet or when `file.df` is None.
        Otherwise a one-element list holding a RecordSet with one Field per
        column of `file.df`.
    """
    # No inference for FileSets, nor when the `pd.DataFrame` could not be built.
    # The isinstance check comes first so `df` is never read on a FileSet.
    if isinstance(file, FileSet) or file.df is None:
        return []

    # One Field per column: each is sourced from this file and extracts the
    # column it is named after.
    inferred_fields = [
        Field(
            source=mlc.Source(
                distribution=file.id,
                extract=mlc.Extract(column=column_name),
            ),
            id=column_name,
            name=column_name,
            data_types=[convert_dtype(dtype)],
            references=mlc.Source(),
        )
        for column_name, dtype in file.df.dtypes.items()
    ]

    record_set_name = find_unique_name(names, file.name + "_record_set")
    record_set = RecordSet(
        id=record_set_name,
        fields=inferred_fields,
        name=record_set_name,
        description="",
    )
    return [record_set]