# synthetic_data.py
import pandas as pd
import random

def generate_synthetic_dataset(
    num_agents: int = 4,
    num_samples: int = 20,
    *,
    seed: "int | None" = None,
) -> pd.DataFrame:
    """Build a toy dataset of prompts, agent responses and references.

    For each of ``num_agents`` agents (named ``Agent_0``, ``Agent_1``, ...),
    ``num_samples`` rows are generated. Every row picks one task at random
    from ``"qa"``, ``"summarization"`` and ``"reasoning"`` and fills in a
    canned prompt/reference pair plus a response that is either the
    reference itself or a fixed wrong/abridged answer.

    Args:
        num_agents: Number of distinct agents to simulate.
        num_samples: Number of rows generated per agent.
        seed: Optional seed for a private random generator, making the
            output reproducible without touching the global ``random``
            state. When ``None`` (the default) the module-level ``random``
            generator is used, exactly as before.

    Returns:
        A DataFrame with ``num_agents * num_samples`` rows and the columns
        ``prompt``, ``response``, ``task``, ``agent`` and ``reference``
        (in that order). The columns are present even when no rows are
        generated.
    """
    columns = ["prompt", "response", "task", "agent", "reference"]
    tasks = ["qa", "summarization", "reasoning"]
    # Both the ``random`` module and ``random.Random`` expose ``choice`` and
    # ``random``, so the rest of the function is agnostic to which is used.
    rng = random if seed is None else random.Random(seed)

    rows = []
    for agent_idx in range(num_agents):
        agent_name = f"Agent_{agent_idx}"
        for i in range(num_samples):
            # NOTE: the order of random draws (task first, then at most one
            # correctness draw) is kept stable so that callers seeding the
            # global generator get the same data as before.
            task = rng.choice(tasks)
            if task == "qa":
                prompt = f"What is the capital of country {i}?"
                # The reference alternates on the sample index's parity.
                if i % 2 == 0:
                    reference = "Paris is the capital of France."
                else:
                    reference = "Rome is the capital of Italy."
                # Response matches the reference unless the draw is <= 0.2.
                response = reference if rng.random() > 0.2 else "I think it's Berlin."
            elif task == "summarization":
                # Deterministic branch: no random draw, response is always a
                # slightly shorter paraphrase of the reference.
                prompt = "Summarize: 'The sun is a star that provides light and heat to the Earth.'"
                reference = "The sun provides light and heat essential for life on Earth."
                response = "The sun provides light and heat for Earth."
            else:
                prompt = "Explain why the sky is blue."
                reference = "Rayleigh scattering causes shorter blue wavelengths to scatter more, making the sky appear blue."
                # Response matches the reference unless the draw is <= 0.3.
                response = reference if rng.random() > 0.3 else "It is because of reflection."

            rows.append({
                "prompt": prompt,
                "response": response,
                "task": task,
                "agent": agent_name,
                "reference": reference,
            })

    # Passing ``columns`` explicitly keeps the schema stable for empty output;
    # without it an empty list yields a DataFrame with no columns at all.
    return pd.DataFrame(rows, columns=columns)