"""Hugging Face `datasets` loading script for the Fantecchi race-description data."""

import json
import os

import datasets

_CITATION = """\
@internal{fantecchi2026,
  author = {नग्नाक्षी (Nagnākṣī)},
  title = {नानायोनि-कामभेद-संग्रहः (Nānāyoni-Kāmabheda-Saṅgrahaḥ)},
  year = {2026},
  publisher = {Fantecchi Project}
}
"""

_DESCRIPTION = """\
A comprehensive dataset of biological and cultural descriptions for fantasy races,
focusing on unique anatomy, reproductive biology, and cultural customs.
"""

_HOMEPAGE = ""

_LICENSE = "mit"

# Top-level JSON keys holding lists of records; each key doubles as the value
# of the emitted "category" column. Order here is the order of emission.
_CATEGORIES = ("dataset", "artistic_excerpts")


class FantecchiDataset(datasets.GeneratorBasedBuilder):
    """Builder that flattens per-race JSON files into a single TRAIN split."""

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Return the dataset metadata and the four-column string schema."""
        features = datasets.Features(
            {
                "race": datasets.Value("string"),
                "safe": datasets.Value("string"),
                "nsfw": datasets.Value("string"),
                "category": datasets.Value("string"),  # 'dataset' or 'artistic_excerpts'
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Collect the JSON files under ``data/set5__culture`` into one TRAIN split.

        Files whose name contains ``multi_race`` are skipped. ``dl_manager`` is
        not used: the directory is a relative path, so it is resolved against
        the current working directory.
        """
        data_dir = os.path.join("data", "set5__culture")
        # os.listdir() returns entries in arbitrary order; sort so that example
        # order (and therefore the integer keys) is reproducible across runs.
        files = sorted(
            name
            for name in os.listdir(data_dir)
            if name.endswith(".json") and "multi_race" not in name
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepaths": [os.path.join(data_dir, name) for name in files],
                },
            ),
        ]

    def _generate_examples(self, filepaths):
        """Yield ``(key, example)`` pairs from every file in ``filepaths``.

        Each file is parsed as a JSON object. For every record found under the
        keys listed in ``_CATEGORIES`` one example is emitted, carrying the
        file's ``race`` (``"Unknown"`` when absent), the record's ``safe`` text,
        its ``nsfw`` text (falling back to ``unsafe``, then ``""``), and the
        category name. Keys are consecutive integers starting at 0.
        """
        key = 0
        for filepath in filepaths:
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)

            race = data.get("race", "Unknown")

            for category in _CATEGORIES:
                # `or ()` also covers a key that is present but null.
                for item in data.get(category) or ():
                    yield key, {
                        "race": race,
                        "safe": item.get("safe", ""),
                        "nsfw": item.get("nsfw", item.get("unsafe", "")),
                        "category": category,
                    }
                    key += 1