# llm-pushback / augment_dataset.py
# Sahil Seemant
# Simplified project to local MLX only
# 06e7bdc
import json
import logging
import random
from pathlib import Path

import requests
from tqdm import tqdm
# HTTP endpoint that generate_example() POSTs to (an Ollama-style
# /api/generate route on localhost).
OLLAMA_URL = "http://localhost:11434/api/generate"
# Value sent as the "model" field of every generation request.
MODEL = "llama3.1:8b"
# System-prompt template. It is filled in with str.format(domain=...), so
# `{domain}` is the only placeholder; the doubled braces `{{` / `}}` around
# the example are escapes that render as literal `{` / `}` after formatting.
SYSTEM_PROMPT = """You are a dataset generator for an LLM fine-tuning process.
Your job is to generate a nonsensical question in the domain of {domain}, along with a "chosen" response that pushes back on the false premise, and a "rejected" response that just answers the question straight or gives a generic safety refusal.
The question must sound highly technical, jargon-heavy, and plausible to a layman, but fundamentally irrational to an expert.
Output EXACTLY valid JSON with these keys: "question", "chosen", "rejected".
DO NOT output any markdown blocks or explanation. Just the raw JSON object.
Example:
{{
"question": "What is the tensile strength of the company's EBITDA?",
"chosen": "'EBITDA' is a financial metric (Earnings Before Interest, Taxes, Depreciation, and Amortization), which is an abstract concept. It does not have physical properties like tensile strength.",
"rejected": "I cannot fulfill this request. I am unable to calculate tensile strength."
}}"""
logger = logging.getLogger(__name__)


def generate_example(domain):
    """Request one DPO example for ``domain`` from the configured model.

    Sends a single non-streaming request to ``OLLAMA_URL`` and parses the
    model's text output as JSON.

    Args:
        domain: Subject area substituted into the system and user prompts
            (e.g. ``"Finance"``).

    Returns:
        The parsed JSON object (a ``dict``) when it contains the keys
        ``"question"``, ``"chosen"`` and ``"rejected"``; otherwise ``None``.
        Failures are best-effort: they are logged as warnings and reported
        as ``None`` rather than raised, so a caller looping over many
        requests keeps going.
    """
    required_keys = ("question", "chosen", "rejected")
    prompt_text = f"Generate 1 unique, highly technical, and completely novel nonsensical {domain} question. Follow the JSON format strictly."
    payload = {
        "model": MODEL,
        "system": SYSTEM_PROMPT.format(domain=domain),
        "prompt": prompt_text,
        "stream": False,
        "format": "json",
    }

    # Transport / HTTP-status / body-decoding failures.
    try:
        res = requests.post(OLLAMA_URL, json=payload, timeout=60)
        res.raise_for_status()
        body = res.json()
    except (requests.RequestException, ValueError) as exc:
        logger.warning("Request for %s example failed: %s", domain, exc)
        return None

    # The generated text is expected under the "response" key of the reply.
    output = body.get("response") if isinstance(body, dict) else None
    if not isinstance(output, str):
        logger.warning("Reply for %s example has no text 'response' field", domain)
        return None

    try:
        data = json.loads(output.strip())
    except json.JSONDecodeError as exc:
        logger.warning("Model output for %s is not valid JSON: %s", domain, exc)
        return None

    # `key in data` on a str or list would be a substring / element test and
    # could accept a non-object payload, so require a real JSON object first.
    if not isinstance(data, dict):
        logger.warning(
            "Model output for %s is a JSON %s, not an object",
            domain,
            type(data).__name__,
        )
        return None

    missing = [key for key in required_keys if key not in data]
    if missing:
        logger.warning("Model output for %s is missing keys: %s", domain, missing)
        return None

    return data
def main():
    """Generate Finance and Physics DPO pairs and write them to ``data/``.

    For each domain, calls ``generate_example`` 25 times, keeps the truthy
    results, and dumps them as an indented JSON list to
    ``data/<domain>_augmented.json`` (``finance`` / ``physics``). Failed
    generations are skipped, so a file may hold fewer than 25 entries; the
    final summary line reports the actual counts.
    """
    pairs_per_domain = 25
    domains = ("Finance", "Physics")

    print(
        f"Generating {pairs_per_domain} Finance and {pairs_per_domain} "
        "Physics DPO pairs using Ollama..."
    )

    # Create the output directory before doing any generation so a missing
    # directory cannot discard the results after all requests have finished.
    out_dir = Path("data")
    out_dir.mkdir(parents=True, exist_ok=True)

    generated = {}
    for domain in domains:
        examples = []
        for _ in tqdm(range(pairs_per_domain), desc=domain):
            example = generate_example(domain)
            if example:
                examples.append(example)

        # Write each domain as soon as it is complete so its results are on
        # disk even if a later domain is interrupted.
        out_path = out_dir / f"{domain.lower()}_augmented.json"
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(examples, f, indent=2)
        generated[domain] = examples

    print(
        f"Generated {len(generated['Finance'])} Finance pairs "
        f"and {len(generated['Physics'])} Physics pairs."
    )
# Run the generation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()