import os
import json

from datasets import load_dataset
from tqdm import tqdm


class DatasetManager:
    CURATED_DIR = "important/curated_data"

    # NOTE: plain substring matching is used below, so short keywords such as
    # "god" or "soul" will also match inside longer words (e.g. "godmother").
    KEYWORDS = [
        "spirituality", "theology", "bible", "prayer", "faith", "god",
        "ethics", "morality", "wisdom", "soul", "purpose", "grace",
        "scripture", "doctrine", "philosophy", "virtue", "compassion",
        "holy", "salvation", "pastoral", "discernment", "lectio", "divina",
        "sacred", "revelation", "prophet", "apostle", "commandment",
        "monastery", "meditation", "enlightenment", "hermeneutics",
    ]

    def __init__(self):
        # exist_ok=True already makes the prior existence check redundant.
        os.makedirs(self.CURATED_DIR, exist_ok=True)

    def curate_from_local_file(self, file_path: str, dataset_type: str = "auto"):
        """
        Processes a locally downloaded JSONL file (one JSON object per line)
        and filters for spiritual content. Useful for Kaggle / Pile downloads.
        """
        print(f"DatasetManager: Processing local file {file_path}...")
        if not os.path.exists(file_path):
            print(f"Error: File {file_path} not found.")
            return

        curated = []
        count = 0
        total = 0
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                for line in tqdm(f):
                    line = line.strip()
                    if not line:
                        continue  # skip blank lines
                    total += 1
                    try:
                        sample = json.loads(line)
                    except json.JSONDecodeError:
                        continue  # skip malformed lines instead of aborting the run

                    # Unified filtering logic
                    text = self._extract_text(sample, dataset_type)
                    if any(k in text.lower() for k in self.KEYWORDS):
                        curated.append(sample)
                        count += 1

            if curated:
                filename = os.path.basename(file_path)
                output_file = os.path.join(
                    self.CURATED_DIR, f"local_{filename}_curated.jsonl"
                )
                with open(output_file, "w", encoding="utf-8") as f:
                    for s in curated:
                        f.write(json.dumps(s) + "\n")
                print(f"DatasetManager: Saved {count} samples (from {total}) to {output_file}")
            else:
                print("DatasetManager: No spiritual content found.")
        except Exception as e:
            print(f"DatasetManager Error: {e}")

    def _extract_text(self, sample: dict, ds_type: str = "auto") -> str:
        """Heuristic for extracting text from various dataset schemas."""
        # Common keys, checked in order of likelihood
        if "text" in sample:
            return str(sample["text"])
        if "instruction" in sample and "response" in sample:
            return f"{sample['instruction']} {sample['response']}"
        if "inputs" in sample and "targets" in sample:
            return f"{sample['inputs']} {sample['targets']}"
        if "content" in sample:
            return str(sample["content"])
        # Fallback: flatten and join all values
        return " ".join(str(v) for v in sample.values())

    def curate_from_hf(self, dataset_path: str, split: str = "train", limit: int = 5000):
        """
        Streams a dataset from the HuggingFace Hub and filters for
        spiritual/ethical content.
        """
        print(f"DatasetManager: Loading {dataset_path} ({split})...")
        try:
            # streaming=True avoids downloading the full dataset to disk
            ds = load_dataset(dataset_path, split=split, streaming=True)
            count = 0
            curated = []
            print(f"DatasetManager: Scanning first {limit} samples...")
            # total=limit gives tqdm a progress bar despite the iterable having no len()
            for sample in tqdm(ds.take(limit), total=limit):
                text = self._extract_text(sample)
                if any(k in text.lower() for k in self.KEYWORDS):
                    curated.append(sample)
                    count += 1

            if curated:
                output_file = os.path.join(
                    self.CURATED_DIR, f"{dataset_path.replace('/', '_')}_curated.jsonl"
                )
                with open(output_file, "w", encoding="utf-8") as f:
                    for s in curated:
                        f.write(json.dumps(s) + "\n")
                print(f"DatasetManager: Saved {count} spiritual samples to {output_file}")
            else:
                print("DatasetManager: No spiritual samples found in this batch.")
        except Exception as e:
            print(f"DatasetManager Error: {e}")


if __name__ == "__main__":
    manager = DatasetManager()

    # Curation: HuggingFace specialized datasets
    manager.curate_from_hf("oliverbob/openbible", split="train", limit=5000)
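    # Example of the local-file path: curating a downloaded shard (e.g. a Pile
    # or Kaggle dump). The path below is a hypothetical placeholder -- point it
    # at your own JSONL download before uncommenting.
    # manager.curate_from_local_file("downloads/pile_shard_00.jsonl")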