Spaces:

algorithmicsuperintelligence
/

prompt-optimizer

Running

App Files Files Community

prompt-optimizer / app.py

codelion

Upload app.py

5d7f7a8 verified 5 months ago

raw

history blame

16 kB

	import gradio as gr
	import os
	import yaml
	import json
	import random
	from datasets import load_dataset
	from openai import OpenAI
	from openevolve import run_evolution
	from typing import Dict, List, Tuple
	import tempfile
	import shutil

	# Free models from OpenRouter (as of 2025)
	FREE_MODELS = [
	"google/gemini-2.0-flash-001:free",
	"google/gemini-flash-1.5-8b:free",
	"meta-llama/llama-3.2-3b-instruct:free",
	"meta-llama/llama-3.2-1b-instruct:free",
	"microsoft/phi-3-mini-128k-instruct:free",
	"microsoft/phi-3-medium-128k-instruct:free",
	"qwen/qwen-2-7b-instruct:free",
	"mistralai/mistral-7b-instruct:free",
	]

	# Popular HuggingFace datasets for different tasks
	SAMPLE_DATASETS = {
	"Question Answering": [
	"hotpot_qa",
	"squad",
	"trivia_qa",
	],
	"Sentiment Analysis": [
	"imdb",
	"yelp_review_full",
	"emotion",
	],
	"Text Classification": [
	"ag_news",
	"dbpedia_14",
	"SetFit/sst5",
	],
	"Math Reasoning": [
	"gsm8k",
	"math_qa",
	],
	}


	def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int,
	api_key: str, model: str, input_field: str, target_field: str) -> Dict:
	"""Evaluate a prompt on a dataset using the selected model."""
	try:
	# Load dataset
	dataset = load_dataset(dataset_name, split=split, streaming=False)

	# Sample random examples
	if len(dataset) > num_samples:
	indices = random.sample(range(len(dataset)), num_samples)
	samples = [dataset[i] for i in indices]
	else:
	samples = list(dataset)[:num_samples]

	# Initialize OpenAI client with OpenRouter
	client = OpenAI(
	base_url="https://openrouter.ai/api/v1",
	api_key=api_key,
	)

	correct = 0
	total = 0
	results = []

	for sample in samples:
	try:
	# Get input and target
	input_text = sample.get(input_field, "")
	if isinstance(input_text, dict):
	input_text = str(input_text)

	target = sample.get(target_field, "")
	if isinstance(target, dict):
	target = str(target)

	# Format the prompt with the input
	formatted_prompt = prompt.replace("{input}", str(input_text))

	# Call the model
	response = client.chat.completions.create(
	model=model,
	messages=[
	{"role": "system", "content": "You are a helpful assistant."},
	{"role": "user", "content": formatted_prompt}
	],
	temperature=0.1,
	max_tokens=500,
	)

	prediction = response.choices[0].message.content.strip()

	# Simple exact match evaluation
	is_correct = str(target).lower().strip() in prediction.lower()
	if is_correct:
	correct += 1
	total += 1

	results.append({
	"input": str(input_text)[:100] + "...",
	"target": str(target),
	"prediction": prediction[:100] + "...",
	"correct": is_correct
	})

	except Exception as e:
	print(f"Error evaluating sample: {e}")
	continue

	accuracy = (correct / total * 100) if total > 0 else 0

	return {
	"accuracy": accuracy,
	"correct": correct,
	"total": total,
	"results": results
	}

	except Exception as e:
	return {
	"error": str(e),
	"accuracy": 0,
	"correct": 0,
	"total": 0,
	"results": []
	}


	def create_evaluator_file(dataset_name: str, split: str, model: str,
	input_field: str, target_field: str, work_dir: str):
	"""Create an evaluator.py file for OpenEvolve."""
	evaluator_code = f'''
	import os
	import random
	from datasets import load_dataset
	from openai import OpenAI

	def evaluate(prompt: str) -> float:
	"""Evaluate a prompt and return a score between 0 and 1."""
	try:
	# Load dataset
	dataset = load_dataset("{dataset_name}", split="{split}", streaming=False)

	# Sample 100 random examples
	num_samples = min(100, len(dataset))
	if len(dataset) > num_samples:
	indices = random.sample(range(len(dataset)), num_samples)
	samples = [dataset[i] for i in indices]
	else:
	samples = list(dataset)[:num_samples]

	# Initialize OpenAI client
	api_key = os.environ.get("OPENAI_API_KEY")
	client = OpenAI(
	base_url="https://openrouter.ai/api/v1",
	api_key=api_key,
	)

	correct = 0
	total = 0

	for sample in samples:
	try:
	# Get input and target
	input_text = sample.get("{input_field}", "")
	if isinstance(input_text, dict):
	input_text = str(input_text)

	target = sample.get("{target_field}", "")
	if isinstance(target, dict):
	target = str(target)

	# Format the prompt
	formatted_prompt = prompt.replace("{{input}}", str(input_text))

	# Call the model
	response = client.chat.completions.create(
	model="{model}",
	messages=[
	{{"role": "system", "content": "You are a helpful assistant."}},
	{{"role": "user", "content": formatted_prompt}}
	],
	temperature=0.1,
	max_tokens=500,
	)

	prediction = response.choices[0].message.content.strip()

	# Simple evaluation
	is_correct = str(target).lower().strip() in prediction.lower()
	if is_correct:
	correct += 1
	total += 1

	except Exception as e:
	print(f"Error evaluating sample: {{e}}")
	continue

	# Return score between 0 and 1
	return (correct / total) if total > 0 else 0.0

	except Exception as e:
	print(f"Error in evaluation: {{e}}")
	return 0.0
	'''

	evaluator_path = os.path.join(work_dir, "evaluator.py")
	with open(evaluator_path, "w") as f:
	f.write(evaluator_code)

	return evaluator_path


	def create_config_file(model: str, work_dir: str):
	"""Create a config.yaml file for OpenEvolve."""
	config = {
	"llm": {
	"api_base": "https://openrouter.ai/api/v1",
	"model": model,
	"temperature": 0.7,
	"max_tokens": 4096,
	},
	"evolution": {
	"max_iterations": 10,
	"population_size": 10,
	"num_islands": 1,
	"elite_ratio": 0.1,
	"explore_ratio": 0.3,
	"exploit_ratio": 0.6,
	},
	"evaluation": {
	"timeout": 1800,
	}
	}

	config_path = os.path.join(work_dir, "config.yaml")
	with open(config_path, "w") as f:
	yaml.dump(config, f)

	return config_path


	def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
	model: str, api_key: str, input_field: str, target_field: str,
	progress=gr.Progress()) -> Tuple[str, str, str]:
	"""Run OpenEvolve to optimize the prompt."""

	if not api_key:
	return "Error: OpenAI API Key is required", "", ""

	# Set API key as environment variable
	os.environ["OPENAI_API_KEY"] = api_key

	progress(0, desc="Setting up...")

	# Create temporary working directory
	work_dir = tempfile.mkdtemp(prefix="openevolve_")

	try:
	# Save initial prompt
	initial_prompt_path = os.path.join(work_dir, "initial_prompt.txt")
	with open(initial_prompt_path, "w") as f:
	f.write(initial_prompt)

	# Create evaluator
	progress(0.1, desc="Creating evaluator...")
	evaluator_path = create_evaluator_file(dataset_name, dataset_split, model,
	input_field, target_field, work_dir)

	# Create config
	progress(0.2, desc="Creating configuration...")
	config_path = create_config_file(model, work_dir)

	# Run initial evaluation
	progress(0.3, desc="Running initial evaluation...")
	initial_eval = evaluate_prompt(
	initial_prompt, dataset_name, dataset_split, 100,
	api_key, model, input_field, target_field
	)

	initial_results = f"""
	### Initial Prompt Evaluation

	Prompt:
	```
	{initial_prompt}
	```

	Results:
	- Accuracy: {initial_eval['accuracy']:.2f}%
	- Correct: {initial_eval['correct']}/{initial_eval['total']}

	Sample Results:
	"""
	for i, result in enumerate(initial_eval['results'][:5], 1):
	initial_results += f"\n{i}. Input: {result['input']}\n"
	initial_results += f" Target: {result['target']}\n"
	initial_results += f" Prediction: {result['prediction']}\n"
	initial_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"

	# Run OpenEvolve
	progress(0.4, desc="Running OpenEvolve (this may take several minutes)...")

	output_dir = os.path.join(work_dir, "output")
	os.makedirs(output_dir, exist_ok=True)

	try:
	# Run evolution
	result = run_evolution(
	initial_program_path=initial_prompt_path,
	evaluator_path=evaluator_path,
	config_path=config_path,
	output_dir=output_dir,
	verbose=True
	)

	progress(0.8, desc="Evaluating best prompt...")

	# Get the best prompt
	best_prompt_path = os.path.join(output_dir, "best_program.txt")
	if os.path.exists(best_prompt_path):
	with open(best_prompt_path, "r") as f:
	best_prompt = f.read()
	else:
	best_prompt = initial_prompt

	# Evaluate best prompt
	final_eval = evaluate_prompt(
	best_prompt, dataset_name, dataset_split, 100,
	api_key, model, input_field, target_field
	)

	final_results = f"""
	### Evolved Prompt Evaluation

	Prompt:
	```
	{best_prompt}
	```

	Results:
	- Accuracy: {final_eval['accuracy']:.2f}%
	- Correct: {final_eval['correct']}/{final_eval['total']}
	- Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:.2f}%

	Sample Results:
	"""
	for i, result in enumerate(final_eval['results'][:5], 1):
	final_results += f"\n{i}. Input: {result['input']}\n"
	final_results += f" Target: {result['target']}\n"
	final_results += f" Prediction: {result['prediction']}\n"
	final_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"

	summary = f"""
	## Optimization Complete!

	### Summary
	- Initial Accuracy: {initial_eval['accuracy']:.2f}%
	- Final Accuracy: {final_eval['accuracy']:.2f}%
	- Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:.2f}%
	- Dataset: {dataset_name}
	- Model: {model}
	- Samples Evaluated: 100
	- Iterations: 10
	"""

	progress(1.0, desc="Complete!")

	return summary, initial_results, final_results

	except Exception as e:
	return f"Error during evolution: {str(e)}", initial_results, ""

	finally:
	# Clean up
	try:
	shutil.rmtree(work_dir)
	except:
	pass


	# Create Gradio interface
	with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as demo:
	gr.Markdown("""
	# 🧬 OpenEvolve Prompt Optimizer

	Automatically evolve and optimize your prompts using evolutionary algorithms!

	This space uses [OpenEvolve](https://github.com/codelion/openevolve) to iteratively improve prompts
	by testing them on real datasets and evolving better versions.

	## How it works:
	1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
	2. Select a HuggingFace dataset to test on
	3. Choose a free model from OpenRouter
	4. Click "Optimize Prompt" to evolve better versions
	5. Compare initial vs. evolved performance!
	""")

	with gr.Row():
	with gr.Column():
	gr.Markdown("### Configuration")

	api_key = gr.Textbox(
	label="OpenAI API Key (for OpenRouter)",
	type="password",
	placeholder="sk-or-v1-...",
	info="Get your free key at https://openrouter.ai/keys"
	)

	model = gr.Dropdown(
	choices=FREE_MODELS,
	value=FREE_MODELS[0],
	label="Select Model",
	info="Free models available on OpenRouter"
	)

	dataset_name = gr.Textbox(
	label="HuggingFace Dataset",
	value="imdb",
	placeholder="e.g., imdb, hotpot_qa, gsm8k",
	info="Any dataset from HuggingFace Hub"
	)

	dataset_split = gr.Textbox(
	label="Dataset Split",
	value="test",
	placeholder="e.g., train, test, validation"
	)

	input_field = gr.Textbox(
	label="Input Field Name",
	value="text",
	placeholder="e.g., text, question, context",
	info="The field containing inputs to process"
	)

	target_field = gr.Textbox(
	label="Target Field Name",
	value="label",
	placeholder="e.g., label, answer, target",
	info="The field containing expected outputs"
	)

	initial_prompt = gr.TextArea(
	label="Initial Prompt",
	value="Analyze the sentiment of the following text and classify it as positive or negative:\n\n{input}\n\nClassification:",
	lines=6,
	info="Use {input} as placeholder for dataset inputs"
	)

	optimize_btn = gr.Button("🚀 Optimize Prompt", variant="primary", size="lg")

	with gr.Row():
	with gr.Column():
	summary = gr.Markdown(label="Summary")

	with gr.Row():
	with gr.Column():
	initial_results = gr.Markdown(label="Initial Results")
	with gr.Column():
	final_results = gr.Markdown(label="Evolved Results")

	gr.Markdown("""
	### Example Datasets & Fields:

	\| Dataset \| Split \| Input Field \| Target Field \| Task \|
	\|---------\|-------\|-------------\|--------------\|------\|
	\| imdb \| test \| text \| label \| Sentiment Analysis \|
	\| hotpot_qa \| validation \| question \| answer \| Question Answering \|
	\| emotion \| test \| text \| label \| Emotion Classification \|
	\| gsm8k \| test \| question \| answer \| Math Reasoning \|
	\| ag_news \| test \| text \| label \| News Classification \|

	### Notes:
	- Evolution runs for 10 iterations with 1 island
	- Each evaluation uses 100 random samples from the dataset
	- The process may take 5-15 minutes depending on the dataset and model
	- Make sure your API key has sufficient credits for the requests
	""")

	optimize_btn.click(
	fn=optimize_prompt,
	inputs=[initial_prompt, dataset_name, dataset_split, model, api_key,
	input_field, target_field],
	outputs=[summary, initial_results, final_results]
	)

	if __name__ == "__main__":
	demo.launch()