Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| from datasets import load_dataset | |
| from tqdm import tqdm | |
class DatasetManager:
    """Filter dataset records by keyword and save the matches as JSONL.

    Records can come from a local JSON Lines file or from a streamed
    HuggingFace dataset. A record is kept when its extracted text contains
    any entry of ``KEYWORDS`` as a case-insensitive substring. Output files
    are written under ``CURATED_DIR``.
    """

    # Output directory, relative to the current working directory.
    CURATED_DIR = "important/curated_data"
    # Lower-case substrings; a record matches if any of them occurs in its text.
    KEYWORDS = [
        "spirituality", "theology", "bible", "prayer", "faith", "god",
        "ethics", "morality", "wisdom", "soul", "purpose", "grace",
        "scripture", "doctrine", "philosophy", "virtue", "compassion",
        "holy", "salvation", "pastoral", "discernment", "lectio", "divina",
        "sacred", "revelation", "prophet", "apostle", "commandment",
        "monastery", "meditation", "enlightenment", "hermeneutics",
    ]

    def __init__(self):
        """Create ``CURATED_DIR`` if it does not exist yet."""
        # exist_ok=True already covers the "directory is present" case.
        os.makedirs(self.CURATED_DIR, exist_ok=True)

    def curate_from_local_file(self, file_path: str, dataset_type: str = "auto"):
        """Filter a local JSON Lines file and save the matching records.

        The file is read line by line; each non-blank line is expected to
        hold one JSON object. Blank lines are ignored. Lines that are not
        valid JSON, or whose JSON value is not an object, are skipped and
        counted instead of aborting the whole run.

        Matches are written to
        ``CURATED_DIR/local_<basename of file_path>_curated.jsonl``.

        Args:
            file_path: Path of the file to scan.
            dataset_type: Forwarded to ``_extract_text`` (currently unused there).

        Returns:
            None. Progress and errors are reported with ``print``.
        """
        print(f"DatasetManager: Processing local file {file_path}...")
        if not os.path.exists(file_path):
            print(f"Error: File {file_path} not found.")
            return

        curated = []
        total = 0    # records that were parsed and examined
        skipped = 0  # non-blank lines that could not be used
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                for line in tqdm(f):
                    if not line.strip():
                        continue
                    sample = self._parse_line(line)
                    if sample is None:
                        skipped += 1
                        continue
                    total += 1
                    if self._matches_keywords(self._extract_text(sample, dataset_type)):
                        curated.append(sample)

            if skipped:
                print(f"DatasetManager: Skipped {skipped} malformed or non-object lines.")

            if curated:
                filename = os.path.basename(file_path)
                output_file = os.path.join(self.CURATED_DIR, f"local_{filename}_curated.jsonl")
                self._write_jsonl(output_file, curated)
                print(f"DatasetManager: Saved {len(curated)} samples (from {total}) to {output_file}")
            else:
                print("DatasetManager: No spiritual content found.")
        except Exception as e:
            # Best-effort boundary: report the failure rather than crash the caller.
            print(f"DatasetManager Error: {str(e)}")

    @staticmethod
    def _parse_line(line: str):
        """Return the JSON object encoded by *line*, or None if unusable.

        None is returned for text that is not valid JSON (including blank
        text) and for valid JSON whose top-level value is not an object.
        """
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            return None
        return record if isinstance(record, dict) else None

    def _matches_keywords(self, text: str) -> bool:
        """Return True if *text* contains any keyword, ignoring case."""
        lowered = text.lower()
        return any(keyword in lowered for keyword in self.KEYWORDS)

    @staticmethod
    def _write_jsonl(output_file: str, samples) -> None:
        """Write *samples* to *output_file*, one JSON document per line."""
        # Serialize everything before opening the file so that a sample
        # json.dumps cannot encode does not leave a truncated file behind.
        lines = [json.dumps(sample) + "\n" for sample in samples]
        with open(output_file, "w", encoding="utf-8") as f:
            f.writelines(lines)

    def _extract_text(self, sample: dict, ds_type: str = "auto") -> str:
        """Heuristic for extracting text from various dataset schemas.

        Checks, in order: ``text``; ``instruction`` + ``response``;
        ``inputs`` + ``targets``; ``content``. If none of these are present,
        all values of the record are stringified and joined with spaces.

        Args:
            sample: One dataset record.
            ds_type: Accepted for interface compatibility; not used.

        Returns:
            The text to run the keyword filter on.
        """
        if "text" in sample:
            return str(sample["text"])
        if "instruction" in sample and "response" in sample:
            return f"{sample['instruction']} {sample['response']}"
        if "inputs" in sample and "targets" in sample:
            return f"{sample['inputs']} {sample['targets']}"
        if "content" in sample:
            return str(sample["content"])
        # Fallback: flatten and join all values
        return " ".join(str(v) for v in sample.values())

    def curate_from_hf(self, dataset_path: str, split: str = "train", limit: int = 5000):
        """Stream a HuggingFace dataset and save the matching records.

        Only the first *limit* samples of the stream are examined. Matches
        are written to ``CURATED_DIR/<dataset_path with '/' replaced by
        '_'>_curated.jsonl``.

        Args:
            dataset_path: Dataset identifier passed to ``load_dataset``.
            split: Split name passed to ``load_dataset``.
            limit: Number of samples taken from the stream.

        Returns:
            None. Progress and errors are reported with ``print``.
        """
        print(f"DatasetManager: Loading {dataset_path} ({split})...")
        try:
            ds = load_dataset(dataset_path, split=split, streaming=True)
            curated = []
            print(f"DatasetManager: Scanning first {limit} samples...")
            for sample in tqdm(ds.take(limit)):
                if self._matches_keywords(self._extract_text(sample)):
                    curated.append(sample)

            if curated:
                output_file = os.path.join(
                    self.CURATED_DIR, f"{dataset_path.replace('/', '_')}_curated.jsonl"
                )
                self._write_jsonl(output_file, curated)
                print(f"DatasetManager: Saved {len(curated)} spiritual samples to {output_file}")
            else:
                print("DatasetManager: No spiritual samples found in this batch.")
        except Exception as e:
            # Best-effort boundary: report the failure rather than crash the caller.
            print(f"DatasetManager Error: {str(e)}")
if __name__ == "__main__":
    # Script entry point: curate a HuggingFace dataset with a fresh manager,
    # scanning the first 5000 samples of the "train" split.
    DatasetManager().curate_from_hf(
        dataset_path="oliverbob/openbible",
        split="train",
        limit=5000,
    )