import glob
import json
import os

# Default locations; both are resolved relative to the current working directory.
DEFAULT_DATASET_DIR = '../../../data/set5__culture'
DEFAULT_OUTPUT_FILE = '../docs/dictionary/dictionary_nanayoni.md'

# Sub-sections of each file's "dictionary" object that get merged.
TERM_CATEGORIES = ('anatomy', 'biology', 'customs', 'vulgar_slang')

# Fixed title block written at the top of the generated Markdown file.
MARKDOWN_HEADER = (
    "# “नानायोनि-कामभेद-संग्रहः” (Nānāyoni-Kāmabheda-Saṅgrahaḥ)",
    "## “A Compendium of Erotic Variations Across Many Forms”",
    "**Authored by नग्नाक्षी (Nagnākṣī)**",
    "",
    "---",
    "",
)


def _collect_terms(dataset_dir):
    """Merge the per-race term dictionaries found in *dataset_dir*.

    Returns a dict mapping each term to ``{'meaning': ..., 'races': set}``.
    When the same term appears in several files, the meaning from the first
    file (in sorted path order) is kept and only the race name is added.
    """
    consolidated = {}
    pattern = os.path.join(glob.escape(dataset_dir), '*.json')

    # glob returns paths in arbitrary order; sort so that "first meaning
    # wins" yields the same output on every machine and every run.
    for filepath in sorted(glob.glob(pattern)):
        filename = os.path.basename(filepath)

        # The multi-race dataset would duplicate the individual race files.
        # Test the file name only, so a directory called e.g. "multi_race"
        # does not cause every file to be skipped.
        if 'multi_race' in filename:
            continue

        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if not isinstance(data, dict):
            # Not a JSON object, so it has no "race"/"dictionary" keys.
            continue

        # Fall back to the capitalised file stem when "race" is missing/empty.
        race_name = str(data.get('race') or filename.split('.')[0].capitalize())

        dictionary = data.get('dictionary')
        if not isinstance(dictionary, dict):
            continue

        for category in TERM_CATEGORIES:
            terms = dictionary.get(category)
            if not isinstance(terms, dict):
                continue
            for term, meaning in terms.items():
                entry = consolidated.setdefault(
                    term, {'meaning': meaning, 'races': set()}
                )
                entry['races'].add(race_name)

    return consolidated


def _render_markdown(consolidated):
    """Return the Markdown lines: the header, then one entry per term (A-Z)."""
    lines = list(MARKDOWN_HEADER)
    for term in sorted(consolidated):
        entry = consolidated[term]
        races_str = ", ".join(sorted(entry['races']))
        lines.append(f"**{term}**:{entry['meaning']} Applies to: {races_str}.")
        lines.append("")
    return lines


def generate_dictionary(dataset_dir=DEFAULT_DATASET_DIR,
                        output_file=DEFAULT_OUTPUT_FILE):
    """Build the consolidated Markdown dictionary from the race JSON files.

    Every ``*.json`` file directly inside *dataset_dir* whose name does not
    contain ``multi_race`` is read. The terms listed under the ``anatomy``,
    ``biology``, ``customs`` and ``vulgar_slang`` sections of its
    ``dictionary`` object are merged into one alphabetically sorted list,
    each annotated with the races it applies to.

    Args:
        dataset_dir: Directory holding the per-race JSON files.
        output_file: Path of the Markdown file to write; missing parent
            directories are created.

    Raises:
        OSError: If an input file cannot be read or the output cannot be
            written.
        json.JSONDecodeError: If an input file is not valid JSON.
    """
    consolidated = _collect_terms(dataset_dir)
    markdown_lines = _render_markdown(consolidated)

    # Create the target directory so a fresh checkout does not fail here.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("\n".join(markdown_lines))

    print(f"Generated dictionary markdown at {output_file}")


if __name__ == '__main__':
    generate_dictionary()