File size: 4,596 Bytes
5e0532d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import json
from datasets import load_dataset
from tqdm import tqdm

class DatasetManager:
    """Keyword-based curator that keeps spiritual/ethical records from datasets.

    A record is kept when any entry of ``KEYWORDS`` occurs as a
    case-insensitive *substring* of the record's text (so ``"god"`` also
    matches ``"godzilla"``). Kept records are written as JSON Lines files
    under ``CURATED_DIR``.
    """

    # Directory (relative to the working directory) that receives curated files.
    CURATED_DIR = "important/curated_data"
    # Lower-case substrings; the text is lower-cased before matching.
    KEYWORDS = [
        "spirituality", "theology", "bible", "prayer", "faith", "god",
        "ethics", "morality", "wisdom", "soul", "purpose", "grace",
        "scripture", "doctrine", "philosophy", "virtue", "compassion",
        "holy", "salvation", "pastoral", "discernment", "lectio", "divina",
        "sacred", "revelation", "prophet", "apostle", "commandment",
        "monastery", "meditation", "enlightenment", "hermeneutics"
    ]

    def __init__(self):
        """Create the output directory if it does not exist yet."""
        # exist_ok=True already tolerates an existing directory, so a separate
        # os.path.exists() check would only add a race window.
        os.makedirs(self.CURATED_DIR, exist_ok=True)

    def curate_from_local_file(self, file_path: str, dataset_type: str = "auto"):
        """
        Processes a locally downloaded file (JSONL/JSON) and filters for spiritual content.
        Useful for Kaggle / Pile downloads.

        The file may be JSON Lines (one object per line) or a single JSON
        document whose top-level value is an array of objects. Blank lines
        are ignored; malformed lines and non-object records are skipped and
        reported instead of aborting the whole run.

        Args:
            file_path: Path of the file to scan.
            dataset_type: Forwarded to ``_extract_text`` (currently unused there).

        Returns:
            None. Matches are written to
            ``CURATED_DIR/local_<basename>_curated.jsonl``; progress and errors
            are printed rather than raised.
        """
        print(f"DatasetManager: Processing local file {file_path}...")
        if not os.path.exists(file_path):
            print(f"Error: File {file_path} not found.")
            return

        curated = []
        total = 0
        skipped = 0

        try:
            # utf-8-sig decodes plain UTF-8 and additionally strips a leading
            # BOM, which json would otherwise reject on the first record.
            with open(file_path, "r", encoding="utf-8-sig") as f:
                for sample in tqdm(self._iter_local_records(f)):
                    total += 1
                    if sample is None:
                        skipped += 1
                        continue
                    text = self._extract_text(sample, dataset_type)
                    if self._matches_keywords(text):
                        curated.append(sample)

            if skipped:
                print(f"DatasetManager: Skipped {skipped} malformed or non-object records.")

            if curated:
                filename = os.path.basename(file_path)
                output_file = os.path.join(self.CURATED_DIR, f"local_{filename}_curated.jsonl")
                self._write_jsonl(curated, output_file)
                print(f"DatasetManager: Saved {len(curated)} samples (from {total}) to {output_file}")
            else:
                print("DatasetManager: No spiritual content found.")

        except Exception as e:
            # Top-level boundary: callers rely on this method reporting
            # failures (I/O, decoding, ...) instead of raising.
            print(f"DatasetManager Error: {str(e)}")

    def _iter_local_records(self, handle):
        """Yield one entry per record read from an open text file.

        Each entry is the record ``dict``, or ``None`` when the record is
        unusable (malformed JSON or not a JSON object). Blank lines yield
        nothing. ``handle`` must support ``read``, ``seek`` and line iteration.
        """
        if self._starts_with_json_array(handle):
            # Whole-document JSON: the array elements are the records.
            for item in json.load(handle):
                yield item if isinstance(item, dict) else None
            return

        for line in handle:
            if not line.strip():
                continue
            yield self._parse_jsonl_line(line)

    @staticmethod
    def _starts_with_json_array(handle) -> bool:
        """Return True if the first non-whitespace character is ``[``.

        A JSONL file of objects starts with ``{``, so a leading ``[`` can only
        be a JSON array document. The handle is rewound before returning.
        """
        ch = handle.read(1)
        while ch and ch.isspace():
            ch = handle.read(1)
        handle.seek(0)
        return ch == "["

    @staticmethod
    def _parse_jsonl_line(line: str):
        """Parse one JSONL line; return the dict, or None if it is unusable."""
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            return None
        return record if isinstance(record, dict) else None

    def _matches_keywords(self, text: str) -> bool:
        """Return True if any keyword is a case-insensitive substring of text."""
        text_lower = text.lower()
        return any(k in text_lower for k in self.KEYWORDS)

    def _write_jsonl(self, samples, output_file: str) -> None:
        """Write ``samples`` to ``output_file``, one JSON object per line."""
        # Re-create the directory in case it was removed after construction.
        os.makedirs(self.CURATED_DIR, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(s) + "\n" for s in samples)

    def _extract_text(self, sample: dict, ds_type: str = "auto") -> str:
        """Heuristic for extracting text from various dataset schemas.

        Checked in order: ``text``; ``instruction`` + ``response``;
        ``inputs`` + ``targets``; ``content``. If none apply, all values are
        stringified and joined with spaces. ``ds_type`` is accepted for
        interface compatibility but is not consulted.
        """
        # Common keys
        if "text" in sample:
            return str(sample["text"])
        if "instruction" in sample and "response" in sample:
            return f"{sample['instruction']} {sample['response']}"
        if "inputs" in sample and "targets" in sample:
            return f"{sample['inputs']} {sample['targets']}"
        if "content" in sample:
            return str(sample["content"])

        # Fallback: flatten and join all values
        return " ".join(str(v) for v in sample.values())

    def curate_from_hf(self, dataset_path: str, split: str = "train", limit: int = 5000):
        """
        Streams a dataset from HuggingFace and filters for spiritual/ethical content.

        Args:
            dataset_path: Dataset identifier passed to ``load_dataset``.
            split: Split to stream.
            limit: Number of leading samples to scan.

        Returns:
            None. Matches are written to
            ``CURATED_DIR/<dataset_path with '/' replaced by '_'>_curated.jsonl``;
            progress and errors are printed rather than raised.
        """
        print(f"DatasetManager: Loading {dataset_path} ({split})...")
        try:
            ds = load_dataset(dataset_path, split=split, streaming=True)

            print(f"DatasetManager: Scanning first {limit} samples...")
            curated = [
                sample
                for sample in tqdm(ds.take(limit))
                if self._matches_keywords(self._extract_text(sample))
            ]

            if curated:
                output_file = os.path.join(self.CURATED_DIR, f"{dataset_path.replace('/', '_')}_curated.jsonl")
                self._write_jsonl(curated, output_file)
                print(f"DatasetManager: Saved {len(curated)} spiritual samples to {output_file}")
            else:
                print("DatasetManager: No spiritual samples found in this batch.")

        except Exception as e:
            # Top-level boundary: loading/streaming can fail in many ways
            # (network, missing dataset, serialization); report, don't raise.
            print(f"DatasetManager Error: {str(e)}")

if __name__ == "__main__":
    # Script entry point: curate a specialized HuggingFace dataset by
    # scanning the first 5000 samples of its "train" split.
    curator = DatasetManager()
    curator.curate_from_hf(
        dataset_path="oliverbob/openbible",
        split="train",
        limit=5000,
    )