# Consolidate curated JSONL datasets into the final ORA fine-tuning set.
import json
import glob
import os
def format_llama3(system, user, assistant):
    """Render a single-turn conversation in the Llama 3 chat template.

    Produces the exact token layout Llama 3 was trained on:
    ``<|begin_of_text|>`` followed by one header/body/``<|eot_id|>``
    segment per role (system, user, assistant).
    """
    pieces = ["<|begin_of_text|>"]
    for role, content in (("system", system), ("user", user), ("assistant", assistant)):
        pieces.append(f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>")
    return "".join(pieces)
def _sample_to_text(sample, system_prompt):
    """Map one raw sample dict onto a Llama 3 chat string.

    Returns "" when the sample matches none of the known dataset schemas,
    so the caller can skip it.
    """
    # 1. theology_qa: title + chunked passage
    if "title" in sample and "chunked" in sample:
        user = f"Explain the key themes and context of '{sample['title']}'."
        return format_llama3(system_prompt, user, sample['chunked'])
    # 2. databricks-dolly: instruction + context + response.
    # BUGFIX: this schema must be checked BEFORE the plain
    # instruction/response schema, otherwise a well-formed dolly row is
    # captured by that branch and its context is silently dropped; the
    # original order also raised KeyError on rows that had instruction +
    # context but no 'response'. Requiring all three fields fixes both.
    if "instruction" in sample and "context" in sample and "response" in sample:
        user = f"{sample['instruction']}\n\nContext: {sample['context']}"
        return format_llama3(system_prompt, user, sample['response'])
    # 3. oasst1 or already instruction/response-shaped datasets
    if "instruction" in sample and "response" in sample:
        return format_llama3(system_prompt, sample['instruction'], sample['response'])
    # 4. openbible: context + response
    if "context" in sample and "response" in sample:
        user = f"Please provide the scripture text for: {sample['context']}"
        return format_llama3(system_prompt, user, sample['response'])
    # 5. generic 'text' (e.g. curated_spiritual_dataset.jsonl)
    if "text" in sample:
        raw_text = sample['text']
        # Already carries Llama 3 header tags: pass through unchanged.
        if "<|start_header_id|>" in raw_text:
            return raw_text
        # Otherwise wrap the raw text as a spiritual-insight exchange.
        return format_llama3(system_prompt, "Please provide spiritual insight on the following:", raw_text)
    return ""


def consolidate():
    """Merge every curated JSONL file into one Llama 3-formatted dataset.

    Reads each ``important/curated_data/*.jsonl`` file, converts each line
    to the Llama 3 chat template via ``_sample_to_text``, and writes the
    result as ``{"text": ...}`` JSONL rows to the output file. Unrecognized
    samples and blank lines are skipped rather than crashing the run.
    """
    output_file = "important/curated_data/final_ora_dataset.jsonl"
    system_prompt = "You are ORA, a spiritual assistant specializing in theological insights and biblical wisdom. Provide discerning, compassionate, and doctrine-aware responses."
    curated_files = glob.glob("important/curated_data/*.jsonl")
    total_samples = 0
    print(f"Consolidating {len(curated_files)} files...")
    with open(output_file, "w", encoding="utf-8") as out:
        for f in curated_files:
            # Skip the output file itself (the glob matches it once it exists).
            if "final_ora_dataset" in f:
                continue
            print(f"Processing {os.path.basename(f)}...")
            with open(f, "r", encoding="utf-8") as infile:
                for line in infile:
                    line = line.strip()
                    # Tolerate blank/trailing lines in hand-curated JSONL.
                    if not line:
                        continue
                    text = _sample_to_text(json.loads(line), system_prompt)
                    if text:
                        out.write(json.dumps({"text": text}) + "\n")
                        total_samples += 1
    print(f"Successfully consolidated {total_samples} samples to {output_file}")
# Run the consolidation only when executed as a script, not on import.
if __name__ == "__main__":
    consolidate()