Spaces:

infinitymatter
/

synthdatagen

Running

App Files Files Community

synthdatagen / src /prompts.py

lisekarimi

[deploy] push to HF

e365a68 10 months ago

raw

history blame contribute delete

2.6 kB

	from datetime import datetime

	# Module-level prompt text: tells the assistant to generate synthetic datasets
	# and lays out the rules the generated code must follow (general guidelines,
	# how the output file path is written, and how each format is saved).
	# The whole literal is runtime text -- whitespace, emoji and punctuation
	# included -- so edit it only when a change to the prompt itself is intended.
	# NOTE(review): presumably sent as the system message by a caller outside
	# this file -- confirm.
	# NOTE(review): the guidelines call pandas/numpy "standard" libraries although
	# they are third-party packages; wording left unchanged here.
	system_message = """
	You are a helpful assistant whose main purpose is to generate synthetic datasets based on a given business problem.

	🔹 General Guidelines:
	- Be accurate and concise.
	- Use only standard Python libraries (pandas, numpy, os, datetime, etc.)
	- The dataset must contain the requested number of samples.
	- Always respect the requested output format exactly.
	- If multiple entities exist, save each to a separate file.
	- Do not use f-strings anywhere in the code — not in file paths or in content. Use standard string concatenation instead.

	🔹 File Path Rules:
	- Define the full file path using os.path.join(...) — exactly as shown — no shortcuts or direct strings.
	- Use two hardcoded string literals only — no variables, no f-strings, no formatting, no expressions.
	- First argument: full directory path (use forward slashes).
	- Second argument: full filename with timestamp and correct extension.
	- Example: os.path.join("C:/Users/.../output", "sales_20250323_123456.json")
	- ⚠️ Do not use intermediate variables like directory, filename, or output_dir.
	- ⚠️ Do not skip or replace any of the above instructions. They are required for the code to work correctly.

	🔹 File Saving Instructions:

	- ✅ CSV:
	df.to_csv(file_path, index=False, encoding="utf-8")

	- ✅ JSON:
	with open(file_path, "w", encoding="utf-8") as f:
	df.to_json(f, orient="records", lines=False, force_ascii=False)

	- ✅ Parquet:
	df.to_parquet(file_path, engine="pyarrow", index=False)

	- ✅ Markdown (for Text):
	- Generate properly formatted Markdown content.
	- Save it as a `.md` file using UTF-8 encoding.
	"""

	def build_user_prompt(**input_data):
	    """Assemble the user prompt describing the dataset to generate.

	    Keyword arguments read from ``input_data``:
	        file_path: output directory; backslashes are turned into forward
	            slashes before it is placed in the prompt.
	        dataset_type: inserted lower-cased.
	        output_format: inserted upper-cased.
	        business_problem: inserted as given.
	        num_samples: inserted as given.

	    Returns:
	        The prompt text, which also carries a timestamp taken from
	        ``datetime.now()`` and formatted as ``YYYYMMDD_HHMMSS``.

	    Raises:
	        KeyError: when one of the keys above is absent (reported with
	            ``print`` and then re-raised).
	        Exception: any other failure is reported with ``print`` and
	            re-raised unchanged.
	    """
	    try:
	        # The directory is looked up first, so a missing "file_path" is
	        # reported before any other missing key.
	        raw_directory = input_data["file_path"]
	        file_path = raw_directory.replace("\\", "/")

	        # One timestamp per call, embedded in the prompt text below.
	        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

	        # The finished prompt is handed straight back to the caller.
	        return f"""
	Generate a synthetic {input_data["dataset_type"].lower()} dataset in {input_data["output_format"].upper()} format.
	Business problem: {input_data["business_problem"]}
	Samples: {input_data["num_samples"]}
	Directory: {file_path}
	Timestamp: {timestamp}
	"""

	    except KeyError as e:
	        # A required key was not supplied: report which one, then propagate.
	        print(f"Missing input key: {e}")
	        raise
	    except Exception as e:
	        # Anything else (e.g. a value of the wrong type): report, then propagate.
	        print(f"Error in build_user_prompt: {e}")
	        raise