File size: 3,409 Bytes
5e0532d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import json
import glob
import os

def format_llama3(system, user, assistant):
    """Render one single-turn conversation in the Llama 3 chat template.

    Produces the full <|begin_of_text|> ... <|eot_id|> string with one
    system, one user, and one assistant message, ready for training data.
    """
    segments = [
        "<|begin_of_text|>",
        "<|start_header_id|>system<|end_header_id|>\n\n", system, "<|eot_id|>",
        "<|start_header_id|>user<|end_header_id|>\n\n", user, "<|eot_id|>",
        "<|start_header_id|>assistant<|end_header_id|>\n\n", assistant, "<|eot_id|>",
    ]
    return "".join(segments)

def consolidate():
    """Consolidate every curated JSONL file into one Llama 3 training set.

    Reads each important/curated_data/*.jsonl file (skipping the output file
    itself), maps every recognized sample schema onto a single Llama 3 chat
    string via format_llama3, and writes one {"text": ...} JSON object per
    line to final_ora_dataset.jsonl. Samples whose schema is not recognized
    are silently dropped; the total written count is printed at the end.
    """
    output_file = "important/curated_data/final_ora_dataset.jsonl"
    system_prompt = "You are ORA, a spiritual assistant specializing in theological insights and biblical wisdom. Provide discerning, compassionate, and doctrine-aware responses."

    curated_files = glob.glob("important/curated_data/*.jsonl")
    total_samples = 0

    print(f"Consolidating {len(curated_files)} files...")

    with open(output_file, "w", encoding="utf-8") as out:
        for f in curated_files:
            # Skip the output file itself so we never read what we write.
            if "final_ora_dataset" in f: continue

            print(f"Processing {os.path.basename(f)}...")
            with open(f, "r", encoding="utf-8") as infile:
                for line in infile:
                    # BUG FIX: blank/whitespace-only lines (e.g. a trailing
                    # newline at EOF) used to crash json.loads — skip them.
                    line = line.strip()
                    if not line:
                        continue
                    sample = json.loads(line)
                    text = ""

                    # 1. theology_qa mapping (title + chunked text)
                    if "title" in sample and "chunked" in sample:
                        user = f"Explain the key themes and context of '{sample['title']}'."
                        assistant = sample['chunked']
                        text = format_llama3(system_prompt, user, assistant)

                    # 2. databricks-dolly (instruction + context + response).
                    # BUG FIX: this branch must run BEFORE the plain
                    # instruction/response branch — Dolly samples carry all
                    # three keys, so they previously matched that branch
                    # first and silently lost their context passage.
                    elif "context" in sample and "instruction" in sample:
                        user = f"{sample['instruction']}\n\nContext: {sample['context']}"
                        assistant = sample['response']
                        text = format_llama3(system_prompt, user, assistant)

                    # 3. oasst1 or already formatted datasets (instruction/response)
                    elif "instruction" in sample and "response" in sample:
                        text = format_llama3(system_prompt, sample['instruction'], sample['response'])

                    # 4. openbible (context + response)
                    elif "context" in sample and "response" in sample:
                        user = f"Please provide the scripture text for: {sample['context']}"
                        assistant = sample['response']
                        text = format_llama3(system_prompt, user, assistant)

                    # 5. generic 'text' (already formatted in curated_spiritual_dataset.jsonl)
                    elif "text" in sample:
                        raw_text = sample['text']
                        if "<|start_header_id|>" in raw_text:
                            # Already carries Llama 3 tags: pass through as-is.
                            text = raw_text
                        else:
                            # Wrap raw text as an explanation prompt.
                            text = format_llama3(system_prompt, "Please provide spiritual insight on the following:", raw_text)

                    if text:
                        out.write(json.dumps({"text": text}) + "\n")
                        total_samples += 1

    print(f"Successfully consolidated {total_samples} samples to {output_file}")

# Script entry point: run the consolidation when executed directly.
if __name__ == "__main__":
    consolidate()