"""Consolidate the curated JSONL datasets into a single Llama 3 formatted
training file for the ORA assistant."""

import glob
import json
import os


def format_llama3(system, user, assistant):
    """Render one (system, user, assistant) triple in the Llama 3 chat format."""
    return (
        f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n\n{assistant}<|eot_id|>"
    )
def consolidate():
    output_file = "important/curated_data/final_ora_dataset.jsonl"
    system_prompt = (
        "You are ORA, a spiritual assistant specializing in theological insights "
        "and biblical wisdom. Provide discerning, compassionate, and doctrine-aware responses."
    )
    curated_files = glob.glob("important/curated_data/*.jsonl")
    total_samples = 0
    print(f"Consolidating {len(curated_files)} files...")
    with open(output_file, "w", encoding="utf-8") as out:
        for f in curated_files:
            # Skip the output file itself
            if "final_ora_dataset" in f:
                continue
            print(f"Processing {os.path.basename(f)}...")
            with open(f, "r", encoding="utf-8") as infile:
                for line in infile:
                    if not line.strip():
                        continue
                    sample = json.loads(line)
                    text = ""
                    # 1. theology_qa mapping (title + chunked passage)
                    if "title" in sample and "chunked" in sample:
                        user = f"Explain the key themes and context of '{sample['title']}'."
                        assistant = sample["chunked"]
                        text = format_llama3(system_prompt, user, assistant)
                    # 2. databricks-dolly (instruction + context + response).
                    # Checked before the generic instruction/response case so
                    # that the context field is not silently dropped.
                    elif "context" in sample and "instruction" in sample:
                        user = f"{sample['instruction']}\n\nContext: {sample['context']}"
                        assistant = sample["response"]
                        text = format_llama3(system_prompt, user, assistant)
                    # 3. oasst1 or already formatted datasets (instruction + response)
                    elif "instruction" in sample and "response" in sample:
                        text = format_llama3(system_prompt, sample["instruction"], sample["response"])
                    # 4. openbible (context + response)
                    elif "context" in sample and "response" in sample:
                        user = f"Please provide the scripture text for: {sample['context']}"
                        assistant = sample["response"]
                        text = format_llama3(system_prompt, user, assistant)
                    # 5. generic 'text' (already formatted in curated_spiritual_dataset.jsonl)
                    elif "text" in sample:
                        raw_text = sample["text"]
                        # If it already carries Llama 3 tags, use it as-is
                        if "<|start_header_id|>" in raw_text:
                            text = raw_text
                        else:
                            # Wrap raw text as an explanation request
                            text = format_llama3(
                                system_prompt,
                                "Please provide spiritual insight on the following:",
                                raw_text,
                            )
                    if text:
                        out.write(json.dumps({"text": text}) + "\n")
                        total_samples += 1
    print(f"Successfully consolidated {total_samples} samples to {output_file}")
if __name__ == "__main__":
    consolidate()
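# The consolidated file can then be loaded for fine-tuning; a sketch assuming
# the Hugging Face `datasets` library is available (not used in this script):
#
#   from datasets import load_dataset
#   ds = load_dataset("json", data_files="important/curated_data/final_ora_dataset.jsonl", split="train")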