Spaces:

Abdalkaderdev
/

ORA

Sleeping

App Files Files Community

ORA / scripts /dataset_manager.py

Abdalkaderdev

Initial ORA deployment

5e0532d 8 days ago

raw

history blame contribute delete

4.6 kB

	import os
	import json
	from datasets import load_dataset
	from tqdm import tqdm

	class DatasetManager:
	    """Curate spiritual/ethical text samples from local files or HuggingFace.

	    A sample is kept when any entry of ``KEYWORDS`` occurs as a substring of
	    the sample's lower-cased text (see ``_extract_text``). Kept samples are
	    written, one JSON document per line, to a file under ``CURATED_DIR``.
	    """

	    # Directory (relative to the current working directory) for curated output.
	    CURATED_DIR = "important/curated_data"
	    # Lower-case keywords; matching is a plain substring test, so "god" also
	    # matches inside longer words such as "godly".
	    KEYWORDS = [
	        "spirituality", "theology", "bible", "prayer", "faith", "god",
	        "ethics", "morality", "wisdom", "soul", "purpose", "grace",
	        "scripture", "doctrine", "philosophy", "virtue", "compassion",
	        "holy", "salvation", "pastoral", "discernment", "lectio", "divina",
	        "sacred", "revelation", "prophet", "apostle", "commandment",
	        "monastery", "meditation", "enlightenment", "hermeneutics"
	    ]

	    def __init__(self):
	        """Ensure the output directory exists."""
	        # exist_ok=True already makes this a no-op when the directory exists,
	        # so no separate os.path.exists() check is needed.
	        os.makedirs(self.CURATED_DIR, exist_ok=True)

	    def curate_from_local_file(self, file_path: str, dataset_type: str = "auto"):
	        """Filter a local JSONL file for spiritual content and save the matches.

	        The file is read line by line and each non-blank line is parsed as one
	        JSON document. Blank lines are ignored and lines that are not valid
	        JSON are skipped (and reported) instead of aborting the whole run.

	        Args:
	            file_path: Path of the file to process.
	            dataset_type: Forwarded to ``_extract_text``; currently unused there.

	        Returns:
	            None. Progress and errors are reported with ``print``; matches are
	            written to ``CURATED_DIR/local_<basename>_curated.jsonl``.
	        """
	        print(f"DatasetManager: Processing local file {file_path}...")
	        if not os.path.exists(file_path):
	            print(f"Error: File {file_path} not found.")
	            return

	        try:
	            with open(file_path, "r", encoding="utf-8") as f:
	                curated, total, skipped = self._filter_lines(tqdm(f), dataset_type)

	            if skipped:
	                print(f"DatasetManager: Skipped {skipped} malformed line(s).")

	            if curated:
	                filename = os.path.basename(file_path)
	                output_file = os.path.join(self.CURATED_DIR, f"local_{filename}_curated.jsonl")
	                self._write_jsonl(curated, output_file)
	                print(f"DatasetManager: Saved {len(curated)} samples (from {total}) to {output_file}")
	            else:
	                print("DatasetManager: No spiritual content found.")

	        except Exception as e:
	            # Best-effort tool: report the failure instead of crashing the caller.
	            print(f"DatasetManager Error: {str(e)}")

	    def _filter_lines(self, lines, dataset_type: str = "auto"):
	        """Parse JSONL lines and keep the samples that match a keyword.

	        Args:
	            lines: Iterable of text lines, each expected to hold one JSON document.
	            dataset_type: Forwarded to ``_extract_text``.

	        Returns:
	            A ``(curated, total, skipped)`` tuple: the list of matching samples,
	            the number of non-blank lines seen, and how many of those were not
	            valid JSON.
	        """
	        curated = []
	        total = 0
	        skipped = 0
	        for line in lines:
	            # Trailing/blank lines are common in JSONL dumps and are not errors.
	            if not line.strip():
	                continue
	            total += 1
	            try:
	                sample = json.loads(line)
	            except ValueError:  # json.JSONDecodeError is a ValueError subclass
	                skipped += 1
	                continue
	            if self._is_relevant(sample, dataset_type):
	                curated.append(sample)
	        return curated, total, skipped

	    def _is_relevant(self, sample, ds_type: str = "auto") -> bool:
	        """Return True when the sample's text contains any of ``KEYWORDS``."""
	        text_lower = self._extract_text(sample, ds_type).lower()
	        return any(k in text_lower for k in self.KEYWORDS)

	    def _write_jsonl(self, samples, output_file: str):
	        """Write each sample as one JSON document per line to ``output_file``."""
	        with open(output_file, "w", encoding="utf-8") as f:
	            for s in samples:
	                f.write(json.dumps(s) + "\n")

	    def _extract_text(self, sample: dict, ds_type: str = "auto") -> str:
	        """Heuristic for extracting text from various dataset schemas.

	        Checks, in order: ``text``; ``instruction`` + ``response``;
	        ``inputs`` + ``targets``; ``content``. If none apply, all values are
	        stringified and joined with spaces. ``ds_type`` is accepted for
	        interface compatibility but not consulted.
	        """
	        # A JSON line may decode to a str/list/number rather than an object;
	        # key lookups and .values() would raise on those, so stringify instead.
	        if not isinstance(sample, dict):
	            return str(sample)

	        # Common keys
	        if "text" in sample:
	            return str(sample["text"])
	        if "instruction" in sample and "response" in sample:
	            return f"{sample['instruction']} {sample['response']}"
	        if "inputs" in sample and "targets" in sample:
	            return f"{sample['inputs']} {sample['targets']}"
	        if "content" in sample:
	            return str(sample["content"])

	        # Fallback: flatten and join all values
	        return " ".join(str(v) for v in sample.values())

	    def curate_from_hf(self, dataset_path: str, split: str = "train", limit: int = 5000):
	        """Stream a HuggingFace dataset and save samples with spiritual/ethical content.

	        Args:
	            dataset_path: Dataset identifier passed to ``load_dataset``.
	            split: Dataset split to stream.
	            limit: Number of leading samples to scan.

	        Returns:
	            None. Matches are written to
	            ``CURATED_DIR/<dataset_path with '/' replaced by '_'>_curated.jsonl``.
	        """
	        print(f"DatasetManager: Loading {dataset_path} ({split})...")
	        try:
	            ds = load_dataset(dataset_path, split=split, streaming=True)

	            print(f"DatasetManager: Scanning first {limit} samples...")
	            curated = [
	                sample for sample in tqdm(ds.take(limit))
	                if self._is_relevant(sample)
	            ]

	            if curated:
	                output_file = os.path.join(self.CURATED_DIR, f"{dataset_path.replace('/', '_')}_curated.jsonl")
	                self._write_jsonl(curated, output_file)
	                print(f"DatasetManager: Saved {len(curated)} spiritual samples to {output_file}")
	            else:
	                print("DatasetManager: No spiritual samples found in this batch.")

	        except Exception as e:
	            # Best-effort tool: report the failure instead of crashing the caller.
	            print(f"DatasetManager Error: {str(e)}")

	if __name__ == "__main__":
	    # Script entry point: build a manager (which creates the output directory)
	    # and run one curation pass.
	    manager = DatasetManager()

	    # Curation: HuggingFace specialized datasets
	    manager.curate_from_hf(
	        dataset_path="oliverbob/openbible",
	        split="train",
	        limit=5000,
	    )