# fantecchi / fantecchi_dataset.py
# Yesh05's picture
# Initial commit with LFS support for images
# 9a8d870
import json
import os
import datasets
# Citation entry for the dataset; handed to DatasetInfo(citation=...) by the builder.
# NOTE(review): "@internal" is not one of the standard BibTeX entry types — confirm
# this is intentional before feeding the string to a BibTeX toolchain.
_CITATION = """\
@internal{fantecchi2026,
author = {नग्नाक्षी (Nagnākṣī)},
title = {नानायोनि-कामभेद-संग्रहः (Nānāyoni-Kāmabheda-Saṅgrahaḥ)},
year = {2026},
publisher = {Fantecchi Project}
}
"""
# Human-readable summary of the dataset; handed to DatasetInfo(description=...).
_DESCRIPTION = """\
A comprehensive dataset of biological and cultural descriptions for fantasy races,
focusing on unique anatomy, reproductive biology, and cultural customs.
"""
# Homepage URL for the dataset; currently empty.
_HOMEPAGE = ""
# License identifier string; handed to DatasetInfo(license=...).
_LICENSE = "mit"
class FantecchiDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that flattens per-race JSON files into text examples.

    Every example has four string columns: the ``race`` name taken from the
    JSON file, a ``safe`` text, an ``nsfw`` text, and the ``category`` naming
    the JSON section the entry was read from.
    """

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Return the ``DatasetInfo`` describing the schema and metadata."""
        features = datasets.Features(
            {
                "race": datasets.Value("string"),
                "safe": datasets.Value("string"),
                "nsfw": datasets.Value("string"),
                # 'dataset' or 'artistic_excerpts'
                "category": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return a single TRAIN split covering the per-race JSON files.

        Args:
            dl_manager: Download manager supplied by ``datasets``; unused
                because the files are read from a local directory.

        Returns:
            A one-element list holding the TRAIN ``SplitGenerator`` whose
            ``filepaths`` kwarg lists every selected JSON file.

        Raises:
            OSError: If the data directory cannot be listed.
        """
        # NOTE(review): this path is resolved against the current working
        # directory, not against the location of this script — confirm that
        # callers always run from the repository root.
        data_dir = os.path.join("data", "set5__culture")
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # example order (and therefore the generated keys) reproducible.
        files = sorted(
            name
            for name in os.listdir(data_dir)
            if name.endswith(".json") and "multi_race" not in name
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepaths": [os.path.join(data_dir, name) for name in files],
                },
            ),
        ]

    def _generate_examples(self, filepaths):
        """Yield ``(key, example)`` pairs from each JSON file in order.

        For every file, entries under ``"dataset"`` are emitted first, then
        entries under ``"artistic_excerpts"``. Keys are consecutive integers
        starting at 0 and running across all files.

        Args:
            filepaths: Iterable of paths to UTF-8 encoded JSON files.

        Yields:
            Tuples of an integer key and a dict with the ``race``, ``safe``,
            ``nsfw`` and ``category`` fields.
        """
        key = 0
        for filepath in filepaths:
            # Parse fully inside the ``with`` so the handle is closed before
            # the generator is suspended at a ``yield``.
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)
            race = data.get("race", "Unknown")
            # Both sections share one entry layout, so a single loop serves
            # them; the section name doubles as the ``category`` value.
            for category in ("dataset", "artistic_excerpts"):
                # ``or []`` also covers a section that is present but null.
                for item in data.get(category) or []:
                    yield key, {
                        "race": race,
                        "safe": item.get("safe", ""),
                        # Some entries use the legacy "unsafe" key instead
                        # of "nsfw"; fall back to it when "nsfw" is absent.
                        "nsfw": item.get("nsfw", item.get("unsafe", "")),
                        "category": category,
                    }
                    key += 1