# Scraped page header (not part of the program): Spaces: Sleeping; File size: 2,588 Bytes; 06e7bdc
import json
import requests
import random
from tqdm import tqdm
# Endpoint that generate_example() POSTs each generation request to.
OLLAMA_URL = "http://localhost:11434/api/generate"
# Value sent as the "model" field of every request payload.
MODEL = "llama3.1:8b"
# System-prompt template sent with every request. ``{domain}`` is filled in
# through str.format(), which is why the braces of the example JSON object are
# doubled ({{ ... }}): they come out as literal { } after formatting.
SYSTEM_PROMPT = """You are a dataset generator for an LLM fine-tuning process.
Your job is to generate a nonsensical question in the domain of {domain}, along with a "chosen" response that pushes back on the false premise, and a "rejected" response that just answers the question straight or gives a generic safety refusal.
The question must sound highly technical, jargon-heavy, and plausible to a layman, but fundamentally irrational to an expert.
Output EXACTLY valid JSON with these keys: "question", "chosen", "rejected".
DO NOT output any markdown blocks or explanation. Just the raw JSON object.
Example:
{{
"question": "What is the tensile strength of the company's EBITDA?",
"chosen": "'EBITDA' is a financial metric (Earnings Before Interest, Taxes, Depreciation, and Amortization), which is an abstract concept. It does not have physical properties like tensile strength.",
"rejected": "I cannot fulfill this request. I am unable to calculate tensile strength."
}}"""
def generate_example(domain):
    """Request one preference pair for ``domain`` from the model behind OLLAMA_URL.

    The request is non-streaming and asks for JSON-formatted output. The
    model's text is parsed as JSON and accepted only if it is an object that
    carries all three keys ``"question"``, ``"chosen"`` and ``"rejected"``.

    Args:
        domain: Subject area substituted into both the system prompt and the
            user prompt (the caller passes e.g. ``"Finance"``).

    Returns:
        The parsed dict on success, or ``None`` when the request fails, the
        reply cannot be parsed, or the parsed value is not a dict with the
        three required keys. Failures are reported via ``tqdm.write`` and
        never raised, so a single bad generation does not abort a batch.
    """
    prompt_text = f"Generate 1 unique, highly technical, and completely novel nonsensical {domain} question. Follow the JSON format strictly."
    payload = {
        "model": MODEL,
        "system": SYSTEM_PROMPT.format(domain=domain),
        "prompt": prompt_text,
        "stream": False,
        "format": "json",
    }

    try:
        res = requests.post(OLLAMA_URL, json=payload, timeout=60)
        res.raise_for_status()
        output = res.json()["response"].strip()
        data = json.loads(output)
    except (
        requests.RequestException,  # connection errors, timeouts, HTTP error status
        ValueError,                 # body or model output is not valid JSON
        KeyError,                   # reply object has no "response" field
        TypeError,                  # reply body is not a JSON object
        AttributeError,             # "response" is not a string
    ) as e:
        # tqdm.write keeps the message from garbling an active progress bar.
        tqdm.write(f"[{domain}] generation failed: {type(e).__name__}: {e}")
        return None

    # A bare `"question" in data` would also succeed on a JSON string or list
    # that merely contains those words, so insist on a real object first.
    required_keys = ("question", "chosen", "rejected")
    if not isinstance(data, dict) or any(key not in data for key in required_keys):
        tqdm.write(f"[{domain}] discarded reply without the required keys")
        return None

    return data
def main(pairs_per_domain=25):
    """Generate Finance and Physics preference pairs and save them as JSON.

    For each domain, ``generate_example`` is called ``pairs_per_domain`` times;
    calls that return nothing are skipped, so fewer pairs than requested may
    be written. Results go to ``data/finance_augmented.json`` and
    ``data/physics_augmented.json``.

    Args:
        pairs_per_domain: Number of generation attempts per domain.
            Defaults to 25, the previously hard-coded value.
    """
    import os  # local import: only this entry point touches the filesystem

    print(f"Generating {pairs_per_domain} Finance and {pairs_per_domain} Physics DPO pairs using Ollama...")

    # Create the output directory before the slow generation phase; otherwise
    # a missing "data/" folder would only surface after every model call has
    # finished and all results would be lost.
    os.makedirs("data", exist_ok=True)

    results = {}
    for domain in ("Finance", "Physics"):
        pairs = []
        for _ in tqdm(range(pairs_per_domain), desc=domain):
            pair = generate_example(domain)
            if pair:
                pairs.append(pair)
        results[domain] = pairs

    # Write only after both domains are generated, matching the original order.
    for domain, pairs in results.items():
        with open(f"data/{domain.lower()}_augmented.json", "w", encoding="utf-8") as f:
            json.dump(pairs, f, indent=2)

    print(f"Generated {len(results['Finance'])} Finance pairs and {len(results['Physics'])} Physics pairs.")
# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|