# ORA — scripts/consolidate_datasets.py
# Initial ORA deployment (commit 5e0532d), by Abdalkaderdev
import json
import glob
import os
def format_llama3(system, user, assistant):
    """Wrap a (system, user, assistant) exchange in Llama 3 chat tags.

    Returns a single-turn conversation string in the Llama 3 instruct
    template: ``<|begin_of_text|>`` followed by one
    ``<|start_header_id|>ROLE<|end_header_id|>\\n\\nCONTENT<|eot_id|>``
    segment per role, ending with the assistant's ``<|eot_id|>``.
    """
    turns = (("system", system), ("user", user), ("assistant", assistant))
    pieces = ["<|begin_of_text|>"]
    for role, content in turns:
        pieces.append(
            f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"
        )
    return "".join(pieces)
def consolidate(curated_dir="important/curated_data",
                output_name="final_ora_dataset.jsonl"):
    """Merge every curated ``*.jsonl`` file under *curated_dir* into one
    Llama-3-formatted training file.

    Recognized input schemas (checked in this order):
      1. theology_qa .......... keys ``title`` + ``chunked``
      2. databricks-dolly ..... keys ``instruction`` + ``context`` + ``response``
         (checked BEFORE the generic instruction/response schema so dolly
         samples do not lose their context field)
      3. instruction datasets . keys ``instruction`` + ``response`` (e.g. oasst1)
      4. openbible ............ keys ``context`` + ``response``
      5. raw text ............. key ``text`` (kept as-is if already tagged)

    Each matched sample is written as ``{"text": <llama3 prompt>}`` to
    *output_name* inside *curated_dir*.  Blank or malformed JSON lines and
    unrecognized schemas are skipped instead of crashing the run.

    Args:
        curated_dir: directory scanned for ``*.jsonl`` input files.
        output_name: file name (inside *curated_dir*) of the merged output;
            matched by exact basename so a previous run's output is never
            re-ingested.
    """
    output_file = os.path.join(curated_dir, output_name)
    system_prompt = (
        "You are ORA, a spiritual assistant specializing in theological "
        "insights and biblical wisdom. Provide discerning, compassionate, "
        "and doctrine-aware responses."
    )
    curated_files = glob.glob(os.path.join(curated_dir, "*.jsonl"))
    total_samples = 0
    print(f"Consolidating {len(curated_files)} files...")
    with open(output_file, "w", encoding="utf-8") as out:
        for path in curated_files:
            # Never re-ingest a previous run's output (exact basename match,
            # not a fragile substring test).
            if os.path.basename(path) == output_name:
                continue
            print(f"Processing {os.path.basename(path)}...")
            with open(path, "r", encoding="utf-8") as infile:
                for line_no, line in enumerate(infile, 1):
                    line = line.strip()
                    if not line:
                        continue  # tolerate blank lines in hand-curated files
                    try:
                        sample = json.loads(line)
                    except json.JSONDecodeError:
                        print(f"  Skipping malformed JSON at "
                              f"{os.path.basename(path)}:{line_no}")
                        continue
                    text = _to_llama3_text(sample, system_prompt)
                    if text:
                        out.write(json.dumps({"text": text}) + "\n")
                        total_samples += 1
    print(f"Successfully consolidated {total_samples} samples to {output_file}")


def _to_llama3_text(sample, system_prompt):
    """Map one parsed sample dict to a Llama-3 prompt string ('' if unmatched)."""
    # 1. theology_qa: title + pre-chunked passage.
    if "title" in sample and "chunked" in sample:
        user = f"Explain the key themes and context of '{sample['title']}'."
        return format_llama3(system_prompt, user, sample["chunked"])
    # 2. databricks-dolly: must be tested before the generic
    #    instruction/response schema, otherwise dolly rows (which carry all
    #    three keys) match that branch first and silently drop their context.
    if "instruction" in sample and "context" in sample and "response" in sample:
        user = f"{sample['instruction']}\n\nContext: {sample['context']}"
        return format_llama3(system_prompt, user, sample["response"])
    # 3. oasst1 / already-curated instruction datasets.
    if "instruction" in sample and "response" in sample:
        return format_llama3(system_prompt, sample["instruction"], sample["response"])
    # 4. openbible: scripture reference + verse text.
    if "context" in sample and "response" in sample:
        user = f"Please provide the scripture text for: {sample['context']}"
        return format_llama3(system_prompt, user, sample["response"])
    # 5. raw text, possibly already carrying Llama 3 tags.
    if "text" in sample:
        raw_text = sample["text"]
        if "<|start_header_id|>" in raw_text:
            return raw_text  # already formatted — pass through untouched
        return format_llama3(
            system_prompt,
            "Please provide spiritual insight on the following:",
            raw_text,
        )
    return ""
if __name__ == "__main__":
    # Script entry point: python consolidate_datasets.py
    consolidate()