File size: 2,588 Bytes
06e7bdc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import json
import logging
import random
from pathlib import Path

import requests
from tqdm import tqdm

# Endpoint that generate_example() POSTs its request payload to.
OLLAMA_URL = "http://localhost:11434/api/generate"
# Model identifier sent as the "model" field of every request payload.
MODEL = "llama3.1:8b"

# System prompt template. It is rendered with str.format(domain=...), so
# {domain} is the only placeholder and the doubled braces ({{ and }}) around
# the JSON example are escapes that render as literal { and }.
SYSTEM_PROMPT = """You are a dataset generator for an LLM fine-tuning process.
Your job is to generate a nonsensical question in the domain of {domain}, along with a "chosen" response that pushes back on the false premise, and a "rejected" response that just answers the question straight or gives a generic safety refusal.
The question must sound highly technical, jargon-heavy, and plausible to a layman, but fundamentally irrational to an expert.
Output EXACTLY valid JSON with these keys: "question", "chosen", "rejected".
DO NOT output any markdown blocks or explanation. Just the raw JSON object.
Example:
{{
    "question": "What is the tensile strength of the company's EBITDA?",
    "chosen": "'EBITDA' is a financial metric (Earnings Before Interest, Taxes, Depreciation, and Amortization), which is an abstract concept. It does not have physical properties like tensile strength.",
    "rejected": "I cannot fulfill this request. I am unable to calculate tensile strength."
}}"""

def generate_example(domain):
    """Request one nonsensical-question DPO pair for ``domain`` from the model.

    Sends a single non-streaming request to ``OLLAMA_URL`` and parses the
    text in the reply's "response" field as JSON.

    Args:
        domain: Subject area substituted into both the system prompt and the
            user prompt (for example "Finance").

    Returns:
        The parsed dict when the reply is a JSON object containing the keys
        "question", "chosen" and "rejected". ``None`` when the request fails,
        the reply cannot be parsed, or the parsed value is not such an
        object; a warning is logged in each of those cases.
    """
    prompt_text = f"Generate 1 unique, highly technical, and completely novel nonsensical {domain} question. Follow the JSON format strictly."

    payload = {
        "model": MODEL,
        "system": SYSTEM_PROMPT.format(domain=domain),
        "prompt": prompt_text,
        "stream": False,
        "format": "json",
    }
    log = logging.getLogger(__name__)

    try:
        res = requests.post(OLLAMA_URL, json=payload, timeout=60)
        res.raise_for_status()
        output = res.json()["response"].strip()
        data = json.loads(output)
    except (
        requests.RequestException,  # connection, timeout, HTTP status errors
        ValueError,  # body or model output is not valid JSON
        KeyError,  # reply has no "response" field
        TypeError,  # reply body is not a JSON object
        AttributeError,  # "response" is not a string
    ) as exc:
        # Best effort: one failed sample must not abort the whole run, but
        # the failure should be visible rather than silently dropped.
        log.warning("Could not obtain a %s example: %s", domain, exc)
        return None

    # Membership tests on a list or str would accept values such as
    # ["question", "chosen", "rejected"], so require a JSON object first.
    required_keys = ("question", "chosen", "rejected")
    if not isinstance(data, dict) or any(key not in data for key in required_keys):
        log.warning("Discarding malformed %s example: %r", domain, data)
        return None
    return data

def main():
    """Generate Finance and Physics DPO pairs and write them to ``data/``.

    Makes 25 generation attempts per domain. Attempts for which
    ``generate_example`` returns a falsy value are skipped, so each output
    file may hold fewer than 25 pairs. Results are written to
    ``data/finance_augmented.json`` and ``data/physics_augmented.json``
    (relative to the current working directory), and the number of pairs
    actually produced is printed at the end.
    """
    print("Generating 25 Finance and 25 Physics DPO pairs using Ollama...")

    # Create the output directory before the slow generation loops so that a
    # missing or unwritable directory fails immediately instead of after all
    # the requests have been made.
    out_dir = Path("data")
    out_dir.mkdir(parents=True, exist_ok=True)

    def collect(domain):
        """Return the successful examples out of 25 attempts for ``domain``."""
        pairs = []
        for _ in tqdm(range(25), desc=domain):
            example = generate_example(domain)
            if example:
                pairs.append(example)
        return pairs

    finance_data = collect("Finance")
    physics_data = collect("Physics")

    with open(out_dir / "finance_augmented.json", "w", encoding="utf-8") as f:
        json.dump(finance_data, f, indent=2)
    with open(out_dir / "physics_augmented.json", "w", encoding="utf-8") as f:
        json.dump(physics_data, f, indent=2)

    print(f"Generated {len(finance_data)} Finance pairs and {len(physics_data)} Physics pairs.")

# Run the generation only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()