| import json |
| import os |
| import datasets |
|
|
# Citation text; handed to datasets.DatasetInfo(citation=...) in FantecchiDataset._info.
| _CITATION = """\ |
| @internal{fantecchi2026, |
| author = {नग्नाक्षी (Nagnākṣī)}, |
| title = {नानायोनि-कामभेद-संग्रहः (Nānāyoni-Kāmabheda-Saṅgrahaḥ)}, |
| year = {2026}, |
| publisher = {Fantecchi Project} |
| } |
| """ |
|
|
# Free-text summary; handed to datasets.DatasetInfo(description=...) in FantecchiDataset._info.
| _DESCRIPTION = """\ |
| A comprehensive dataset of biological and cultural descriptions for fantasy races, |
| focusing on unique anatomy, reproductive biology, and cultural customs. |
| """ |
|
|
# Homepage and license values forwarded to datasets.DatasetInfo; the homepage is an empty string here.
| _HOMEPAGE = "" |
| _LICENSE = "mit" |
|
|
class FantecchiDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that turns per-race JSON files into a single TRAIN split.

    Every example has four string columns: ``race``, ``safe``, ``nsfw`` and
    ``category``. ``category`` records which top-level section of the JSON
    file the example came from (``"dataset"`` or ``"artistic_excerpts"``).
    """

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Return the dataset metadata together with the example schema."""
        features = datasets.Features(
            {
                "race": datasets.Value("string"),
                "safe": datasets.Value("string"),
                "nsfw": datasets.Value("string"),
                "category": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Build the TRAIN split from the JSON files in ``data/set5__culture``.

        Args:
            dl_manager: Download manager supplied by ``datasets``. It is not
                used: files are read from a directory relative to the current
                working directory, nothing is downloaded.

        Returns:
            A one-element list holding the TRAIN ``SplitGenerator``; its
            ``filepaths`` kwarg is forwarded to ``_generate_examples``.

        Raises:
            OSError: If the data directory cannot be listed.
        """
        data_dir = os.path.join("data", "set5__culture")
        # os.listdir() returns entries in arbitrary order. Sorting makes the
        # example order (and therefore the integer keys) reproducible across
        # machines and runs.
        filenames = sorted(
            name
            for name in os.listdir(data_dir)
            if name.endswith(".json") and "multi_race" not in name
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepaths": [
                        os.path.join(data_dir, name) for name in filenames
                    ],
                },
            ),
        ]

    def _generate_examples(self, filepaths):
        """Yield ``(key, example)`` pairs from each JSON file in ``filepaths``.

        Each file is parsed as a JSON object. Its ``"race"`` value (default
        ``"Unknown"``) is attached to every example taken from the
        ``"dataset"`` section and then the ``"artistic_excerpts"`` section.
        For each item, ``"safe"`` defaults to ``""`` and ``"nsfw"`` falls back
        to the item's ``"unsafe"`` value, then to ``""``.

        Args:
            filepaths: Iterable of paths to UTF-8 encoded JSON files.

        Yields:
            Tuples of a running integer key (unique across all files) and a
            dict with the keys ``race``, ``safe``, ``nsfw`` and ``category``.
        """
        key = 0
        for filepath in filepaths:
            # Parse fully and close the file before yielding, so no handle
            # stays open while the consumer processes examples.
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)
            race = data.get("race", "Unknown")

            # Both sections share one item layout; the section name doubles
            # as the value of the "category" column.
            for category in ("dataset", "artistic_excerpts"):
                # `or []` also covers a section that is present but null.
                for item in data.get(category) or []:
                    yield key, {
                        "race": race,
                        "safe": item.get("safe", ""),
                        "nsfw": item.get("nsfw", item.get("unsafe", "")),
                        "category": category,
                    }
                    key += 1
|
|