"""Consolidate curated JSONL datasets into one Llama 3 chat-formatted JSONL file."""

import glob
import json
import os

# System prompt placed in the system turn of every generated sample.
SYSTEM_PROMPT = (
    "You are ORA, a spiritual assistant specializing in theological insights "
    "and biblical wisdom. Provide discerning, compassionate, and doctrine-aware "
    "responses."
)
DEFAULT_DATA_DIR = "important/curated_data"
DEFAULT_OUTPUT_FILE = "important/curated_data/final_ora_dataset.jsonl"


def format_llama3(system, user, assistant):
    """Return one system/user/assistant exchange wrapped in Llama 3 chat tags.

    Args:
        system: Text of the system turn.
        user: Text of the user turn.
        assistant: Text of the assistant turn.

    Returns:
        A single string containing the three turns, each introduced by a
        ``<|start_header_id|>...<|end_header_id|>`` header and closed with
        ``<|eot_id|>``.
    """
    return (
        f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n\n{assistant}<|eot_id|>"
    )


def _build_text(sample, system_prompt):
    """Map one decoded JSONL record to its training text ("" if unrecognized)."""
    # A record that is not a JSON object cannot match any known schema.
    if not isinstance(sample, dict):
        return ""

    # 1. theology_qa: title + chunked body.
    if "title" in sample and "chunked" in sample:
        user = f"Explain the key themes and context of '{sample['title']}'."
        return format_llama3(system_prompt, user, sample["chunked"])

    # 2. instruction/response records (oasst1, databricks-dolly, ...).
    #    The context check must live here: records carrying instruction,
    #    context and response would otherwise be caught by a plain
    #    instruction/response test and lose their context.
    if "instruction" in sample and "response" in sample:
        user = sample["instruction"]
        if sample.get("context"):
            user = f"{user}\n\nContext: {sample['context']}"
        return format_llama3(system_prompt, user, sample["response"])

    # 3. openbible: context (reference) + response (scripture text).
    if "context" in sample and "response" in sample:
        user = f"Please provide the scripture text for: {sample['context']}"
        return format_llama3(system_prompt, user, sample["response"])

    # 4. generic 'text' records.
    if "text" in sample:
        raw_text = sample["text"]
        # Text that already carries Llama 3 tags is used as is.
        if "<|start_header_id|>" in raw_text:
            return raw_text
        # Otherwise wrap the raw text as the assistant's explanation.
        return format_llama3(
            system_prompt,
            "Please provide spiritual insight on the following:",
            raw_text,
        )

    return ""


def consolidate(
    data_dir=DEFAULT_DATA_DIR,
    output_file=DEFAULT_OUTPUT_FILE,
    system_prompt=SYSTEM_PROMPT,
):
    """Merge every ``*.jsonl`` file in ``data_dir`` into ``output_file``.

    Each input line is decoded as JSON, mapped to a Llama 3 formatted string
    according to the keys it contains, and written as ``{"text": ...}`` on its
    own line. Blank lines and records matching no known schema are skipped.
    Input files are processed in sorted order so the output is reproducible.

    Args:
        data_dir: Directory searched (non-recursively) for ``*.jsonl`` inputs.
        output_file: Path of the consolidated file; it is overwritten.
        system_prompt: System turn inserted into every generated sample.

    Returns:
        The number of samples written.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        OSError: If the output file or an input file cannot be opened.
    """
    pattern = os.path.join(glob.escape(data_dir), "*.jsonl")
    curated_files = sorted(glob.glob(pattern))
    output_abs = os.path.abspath(output_file)
    total_samples = 0

    print(f"Consolidating {len(curated_files)} files...")

    with open(output_file, "w", encoding="utf-8") as out:
        for path in curated_files:
            name = os.path.basename(path)
            # Skip the output file itself (by name, and by path in case a
            # custom output name was supplied) so it is never read back.
            if "final_ora_dataset" in name or os.path.abspath(path) == output_abs:
                continue

            print(f"Processing {name}...")
            with open(path, "r", encoding="utf-8") as infile:
                for line in infile:
                    # Blank lines are not records; json.loads would raise.
                    if not line.strip():
                        continue
                    text = _build_text(json.loads(line), system_prompt)
                    if text:
                        out.write(json.dumps({"text": text}) + "\n")
                        total_samples += 1

    print(f"Successfully consolidated {total_samples} samples to {output_file}")
    return total_samples


if __name__ == "__main__":
    consolidate()