import json
import os
import datasets

# Citation text handed to `datasets.DatasetInfo(citation=...)`.
# NOTE(review): "@internal" is not one of the standard BibTeX entry types;
# confirm whether "@misc" was intended before changing the runtime string.
_CITATION = (
    "@internal{fantecchi2026,\n"
    "  author = {नग्नाक्षी (Nagnākṣī)},\n"
    "  title = {नानायोनि-कामभेद-संग्रहः (Nānāyoni-Kāmabheda-Saṅgrahaḥ)},\n"
    "  year = {2026},\n"
    "  publisher = {Fantecchi Project}\n"
    "}\n"
)

# Human-readable summary handed to `datasets.DatasetInfo(description=...)`.
# The first line deliberately keeps its trailing space before the newline so
# the value is identical to the previous triple-quoted literal.
_DESCRIPTION = (
    "A comprehensive dataset of biological and cultural descriptions for fantasy races, \n"
    "focusing on unique anatomy, reproductive biology, and cultural customs.\n"
)

# No project homepage is recorded; the empty string is passed through as-is.
_HOMEPAGE = ""

# License identifier passed to `datasets.DatasetInfo(license=...)`.
_LICENSE = "mit"

class FantecchiDataset(datasets.GeneratorBasedBuilder):
    """Builder that flattens per-race JSON files into flat text examples.

    Each input file is read as a JSON object with an optional ``"race"``
    string and two optional lists, ``"dataset"`` and ``"artistic_excerpts"``.
    Every entry of those lists becomes one example carrying the race name,
    a ``safe`` text, an ``nsfw`` text and the name of the list it came from.
    """

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Return the dataset metadata and the four-column string schema."""
        features = datasets.Features({
            "race": datasets.Value("string"),
            "safe": datasets.Value("string"),
            "nsfw": datasets.Value("string"),
            "category": datasets.Value("string"),  # 'dataset' or 'artistic_excerpts'
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Collect the JSON files and expose them as a single TRAIN split.

        Args:
            dl_manager: Download manager supplied by ``datasets``; not used,
                because the files are read from a local directory.

        Returns:
            A one-element list with the TRAIN ``SplitGenerator`` whose
            ``gen_kwargs`` carry the sorted list of file paths.

        Raises:
            OSError: If the data directory cannot be listed.
        """
        # NOTE(review): this path is relative to the current working
        # directory, not to this script -- confirm that is intended.
        data_dir = os.path.join("data", "set5__culture")
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # example order (and therefore the integer keys) reproducible.
        filepaths = sorted(
            os.path.join(data_dir, name)
            for name in os.listdir(data_dir)
            if name.endswith(".json") and "multi_race" not in name
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepaths": filepaths,
                },
            ),
        ]

    def _generate_examples(self, filepaths):
        """Yield ``(key, example)`` pairs for every entry in every file.

        Args:
            filepaths: Iterable of paths to UTF-8 encoded JSON files.

        Yields:
            Tuples of a running integer key (unique across all files) and a
            dict with the ``race``, ``safe``, ``nsfw`` and ``category``
            fields. ``nsfw`` falls back to the entry's ``"unsafe"`` value
            when ``"nsfw"`` is absent, and to ``""`` when both are absent.
        """
        key = 0
        for filepath in filepaths:
            # Parse the whole file and close it before yielding, so the file
            # handle is not held open while the consumer pauses the generator.
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)

            race = data.get("race", "Unknown")

            # The section name doubles as the value of the "category" field.
            for category in ("dataset", "artistic_excerpts"):
                # `or []` treats a missing section and an explicit JSON null
                # the same way: as an empty list.
                for item in data.get(category) or []:
                    yield key, {
                        "race": race,
                        "safe": item.get("safe", ""),
                        "nsfw": item.get("nsfw", item.get("unsafe", "")),
                        "category": category,
                    }
                    key += 1