# ORA — scripts/consolidate_datasets.py
# Initial ORA deployment (commit 5e0532d), by Abdalkaderdev
import json
import glob
import os
def format_llama3(system, user, assistant):
    """Wrap a (system, user, assistant) exchange in Llama 3 chat tags.

    Returns a single-turn conversation string in the Llama 3 instruct
    template: ``<|begin_of_text|>`` followed by one
    ``<|start_header_id|>ROLE<|end_header_id|>\\n\\nCONTENT<|eot_id|>``
    segment per role, ending with the assistant's ``<|eot_id|>``.
    """
    turns = (("system", system), ("user", user), ("assistant", assistant))
    pieces = ["<|begin_of_text|>"]
    for role, content in turns:
        pieces.append(
            f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"
        )
    return "".join(pieces)
def consolidate(curated_dir="important/curated_data",
                output_name="final_ora_dataset.jsonl"):
    """Merge every curated ``*.jsonl`` file under *curated_dir* into one
    Llama-3-formatted training file.

    Recognized input schemas (checked in this order):
      1. theology_qa .......... keys ``title`` + ``chunked``
      2. databricks-dolly ..... keys ``instruction`` + ``context`` + ``response``
         (checked BEFORE the generic instruction/response schema so dolly
         samples do not lose their context field)
      3. instruction datasets . keys ``instruction`` + ``response`` (e.g. oasst1)
      4. openbible ............ keys ``context`` + ``response``
      5. raw text ............. key ``text`` (kept as-is if already tagged)

    Each matched sample is written as ``{"text": <llama3 prompt>}`` to
    *output_name* inside *curated_dir*.  Blank or malformed JSON lines and
    unrecognized schemas are skipped instead of crashing the run.

    Args:
        curated_dir: directory scanned for ``*.jsonl`` input files.
        output_name: file name (inside *curated_dir*) of the merged output;
            matched by exact basename so a previous run's output is never
            re-ingested.
    """
    output_file = os.path.join(curated_dir, output_name)
    system_prompt = (
        "You are ORA, a spiritual assistant specializing in theological "
        "insights and biblical wisdom. Provide discerning, compassionate, "
        "and doctrine-aware responses."
    )
    curated_files = glob.glob(os.path.join(curated_dir, "*.jsonl"))
    total_samples = 0
    print(f"Consolidating {len(curated_files)} files...")
    with open(output_file, "w", encoding="utf-8") as out:
        for path in curated_files:
            # Never re-ingest a previous run's output (exact basename match,
            # not a fragile substring test).
            if os.path.basename(path) == output_name:
                continue
            print(f"Processing {os.path.basename(path)}...")
            with open(path, "r", encoding="utf-8") as infile:
                for line_no, line in enumerate(infile, 1):
                    line = line.strip()
                    if not line:
                        continue  # tolerate blank lines in hand-curated files
                    try:
                        sample = json.loads(line)
                    except json.JSONDecodeError:
                        print(f"  Skipping malformed JSON at "
                              f"{os.path.basename(path)}:{line_no}")
                        continue
                    text = _to_llama3_text(sample, system_prompt)
                    if text:
                        out.write(json.dumps({"text": text}) + "\n")
                        total_samples += 1
    print(f"Successfully consolidated {total_samples} samples to {output_file}")


def _to_llama3_text(sample, system_prompt):
    """Map one parsed sample dict to a Llama-3 prompt string ('' if unmatched)."""
    # 1. theology_qa: title + pre-chunked passage.
    if "title" in sample and "chunked" in sample:
        user = f"Explain the key themes and context of '{sample['title']}'."
        return format_llama3(system_prompt, user, sample["chunked"])
    # 2. databricks-dolly: must be tested before the generic
    #    instruction/response schema, otherwise dolly rows (which carry all
    #    three keys) match that branch first and silently drop their context.
    if "instruction" in sample and "context" in sample and "response" in sample:
        user = f"{sample['instruction']}\n\nContext: {sample['context']}"
        return format_llama3(system_prompt, user, sample["response"])
    # 3. oasst1 / already-curated instruction datasets.
    if "instruction" in sample and "response" in sample:
        return format_llama3(system_prompt, sample["instruction"], sample["response"])
    # 4. openbible: scripture reference + verse text.
    if "context" in sample and "response" in sample:
        user = f"Please provide the scripture text for: {sample['context']}"
        return format_llama3(system_prompt, user, sample["response"])
    # 5. raw text, possibly already carrying Llama 3 tags.
    if "text" in sample:
        raw_text = sample["text"]
        if "<|start_header_id|>" in raw_text:
            return raw_text  # already formatted — pass through untouched
        return format_llama3(
            system_prompt,
            "Please provide spiritual insight on the following:",
            raw_text,
        )
    return ""
if __name__ == "__main__":
    # Script entry point: python consolidate_datasets.py
    consolidate()