biswanath2.roul
committed on
Commit
·
e54fd17
0
Parent(s):
Initial commit
Browse files- .DS_Store +0 -0
- .gitignore +60 -0
- LICENSE +21 -0
- README.md +167 -0
- docs/README.md +11 -0
- docs/advanced_features.md +268 -0
- docs/api_reference.md +247 -0
- docs/cli_usage.md +118 -0
- docs/getting_started.md +110 -0
- docs/integration_examples.md +584 -0
- promptlab/__init__.py +39 -0
- promptlab/cli/__init__.py +0 -0
- promptlab/cli/commands.py +697 -0
- promptlab/core/__init__.py +0 -0
- promptlab/core/evaluation.py +191 -0
- promptlab/core/prompt_manager.py +169 -0
- promptlab/core/testing.py +451 -0
- promptlab/core/version_control.py +161 -0
- promptlab/examples/__init__.py +0 -0
- promptlab/examples/ab_testing.py +117 -0
- promptlab/examples/basic_usage.py +109 -0
- promptlab/examples/evaluation_example.py +95 -0
- promptlab/tests/__init__.py +0 -0
- promptlab/tests/test_evaluation.py +0 -0
- promptlab/tests/test_prompt_manager.py +115 -0
- promptlab/tests/test_testing.py +0 -0
- promptlab/tests/test_version_control.py +0 -0
- promptlab/utils/__init__.py +0 -0
- promptlab/utils/metrics.py +161 -0
- promptlab/utils/storage.py +79 -0
- promptlab/utils/templating.py +259 -0
- pyproject.toml +45 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.installed.cfg
|
| 21 |
+
*.egg
|
| 22 |
+
MANIFEST
|
| 23 |
+
|
| 24 |
+
# Virtual environments
|
| 25 |
+
env/
|
| 26 |
+
venv/
|
| 27 |
+
ENV/
|
| 28 |
+
env.bak/
|
| 29 |
+
venv.bak/
|
| 30 |
+
pl200525/
|
| 31 |
+
|
| 32 |
+
# Jupyter Notebook
|
| 33 |
+
.ipynb_checkpoints
|
| 34 |
+
|
| 35 |
+
# Prompt storage (for local development)
|
| 36 |
+
promptlab_storage/
|
| 37 |
+
|
| 38 |
+
# IDE
|
| 39 |
+
.idea/
|
| 40 |
+
.vscode/
|
| 41 |
+
*.swp
|
| 42 |
+
*.swo
|
| 43 |
+
|
| 44 |
+
# Distribution / packaging
|
| 45 |
+
.Python
|
| 46 |
+
env/
|
| 47 |
+
build/
|
| 48 |
+
develop-eggs/
|
| 49 |
+
dist/
|
| 50 |
+
downloads/
|
| 51 |
+
eggs/
|
| 52 |
+
.eggs/
|
| 53 |
+
lib/
|
| 54 |
+
lib64/
|
| 55 |
+
parts/
|
| 56 |
+
sdist/
|
| 57 |
+
var/
|
| 58 |
+
*.egg-info/
|
| 59 |
+
.installed.cfg
|
| 60 |
+
*.egg
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Biswanath Roul
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# PromptLab: LLM Prompt Management System
|
| 2 |
+
|
| 3 |
+
PromptLab is a comprehensive library for managing, versioning, testing, and evaluating prompts for Large Language Models (LLMs). It provides a structured framework to help data scientists and developers create, optimize, and maintain high-quality prompts.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- **Prompt Management**: Create, update, and organize prompts with metadata and tags
|
| 8 |
+
- **Version Control**: Track prompt changes over time with full version history
|
| 9 |
+
- **A/B Testing**: Compare different prompt variations to find the most effective one
|
| 10 |
+
- **Evaluation Framework**: Measure prompt quality with customizable metrics
|
| 11 |
+
- **Advanced Templating**: Create dynamic prompts with variables, conditionals, and loops
|
| 12 |
+
- **Command-line Interface**: Easily integrate into your workflow
|
| 13 |
+
|
| 14 |
+
## Documentation
|
| 15 |
+
|
| 16 |
+
For detailed documentation, see the [docs](./docs) directory:
|
| 17 |
+
|
| 18 |
+
- [Getting Started](./docs/getting_started.md)
|
| 19 |
+
- [API Reference](./docs/api_reference.md)
|
| 20 |
+
- [CLI Usage](./docs/cli_usage.md)
|
| 21 |
+
- [Advanced Features](./docs/advanced_features.md)
|
| 22 |
+
- [Integration Examples](./docs/integration_examples.md)
|
| 23 |
+
|
| 24 |
+
## Installation
|
| 25 |
+
|
| 26 |
+
```bash
|
| 27 |
+
pip install promptlab
|
| 28 |
+
|
| 29 |
+
Quick Start
|
| 30 |
+
|
| 31 |
+
from promptlab import PromptManager, VersionControl, PromptTesting, Evaluator
|
| 32 |
+
|
| 33 |
+
# Initialize components
|
| 34 |
+
prompt_manager = PromptManager()
|
| 35 |
+
version_control = VersionControl(prompt_manager)
|
| 36 |
+
testing = PromptTesting(prompt_manager)
|
| 37 |
+
evaluator = Evaluator(prompt_manager)
|
| 38 |
+
|
| 39 |
+
# Create a prompt
|
| 40 |
+
prompt = prompt_manager.create(
|
| 41 |
+
content="Summarize the following text: {text}",
|
| 42 |
+
name="Simple Summarization",
|
| 43 |
+
description="A simple prompt for text summarization",
|
| 44 |
+
tags=["summarization", "basic"]
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
# Create a new version
|
| 48 |
+
version_control.commit(
|
| 49 |
+
prompt_id=prompt.id,
|
| 50 |
+
commit_message="Initial version"
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
# Update the prompt
|
| 54 |
+
prompt_manager.update(
|
| 55 |
+
prompt.id,
|
| 56 |
+
content="Please provide a concise summary of the following text in 2-3 sentences: {text}"
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
# Commit the updated version
|
| 60 |
+
version_control.commit(
|
| 61 |
+
prompt_id=prompt.id,
|
| 62 |
+
commit_message="Improved prompt with length guidance"
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
# Create a test case
|
| 66 |
+
test_case = testing.create_test_case(
|
| 67 |
+
prompt_id=prompt.id,
|
| 68 |
+
input_vars={"text": "Lorem ipsum dolor sit amet..."},
|
| 69 |
+
expected_output="This is a summary of the text."
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
# Define an LLM callback for testing
|
| 73 |
+
async def llm_callback(prompt, vars):
|
| 74 |
+
# In a real scenario, this would call an actual LLM API
|
| 75 |
+
return "This is a summary of the text."
|
| 76 |
+
|
| 77 |
+
# Run the test case
|
| 78 |
+
import asyncio
|
| 79 |
+
test_result = asyncio.run(testing.run_test_case(
|
| 80 |
+
test_case_id=test_case.id,
|
| 81 |
+
llm_callback=llm_callback
|
| 82 |
+
))
|
| 83 |
+
|
| 84 |
+
# Evaluate a prompt with multiple inputs
|
| 85 |
+
evaluation_result = asyncio.run(evaluator.evaluate_prompt(
|
| 86 |
+
prompt_id=prompt.id,
|
| 87 |
+
inputs=[{"text": "Sample text 1"}, {"text": "Sample text 2"}],
|
| 88 |
+
llm_callback=llm_callback
|
| 89 |
+
))
|
| 90 |
+
|
| 91 |
+
print(f"Evaluation metrics: {evaluation_result['aggregated_metrics']}")
|
| 92 |
+
|
| 93 |
+
Command-line Interface
|
| 94 |
+
PromptLab comes with a powerful CLI for managing prompts:
|
| 95 |
+
|
| 96 |
+
# Create a prompt
|
| 97 |
+
promptlab prompt create "Summarization" --content "Summarize: {text}" --tags "summarization,basic"
|
| 98 |
+
|
| 99 |
+
# List all prompts
|
| 100 |
+
promptlab prompt list
|
| 101 |
+
|
| 102 |
+
# Create a new version
|
| 103 |
+
promptlab version commit <prompt_id> --message "Updated prompt"
|
| 104 |
+
|
| 105 |
+
# Run tests
|
| 106 |
+
promptlab test run-all <prompt_id> --llm openai
|
| 107 |
+
|
| 108 |
+
Advanced Usage
|
| 109 |
+
Advanced Templating
|
| 110 |
+
PromptLab supports advanced templating with conditionals and loops:
|
| 111 |
+
|
| 112 |
+
from promptlab import PromptTemplate
|
| 113 |
+
|
| 114 |
+
template = PromptTemplate("""
|
| 115 |
+
{system_message}
|
| 116 |
+
|
| 117 |
+
{for example in examples}
|
| 118 |
+
Input: {example.input}
|
| 119 |
+
Output: {example.output}
|
| 120 |
+
{endfor}
|
| 121 |
+
|
| 122 |
+
Input: {input}
|
| 123 |
+
Output:
|
| 124 |
+
""")
|
| 125 |
+
|
| 126 |
+
rendered = template.render(
|
| 127 |
+
system_message="You are a helpful assistant.",
|
| 128 |
+
examples=[
|
| 129 |
+
{"input": "Hello", "output": "Hi there!"},
|
| 130 |
+
{"input": "How are you?", "output": "I'm doing well, thanks!"}
|
| 131 |
+
],
|
| 132 |
+
input="What's the weather like?"
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
Custom Evaluation Metrics
|
| 136 |
+
Create custom metrics to evaluate prompt performance:
|
| 137 |
+
from promptlab import EvaluationMetric, Evaluator
|
| 138 |
+
|
| 139 |
+
class CustomMetric(EvaluationMetric):
|
| 140 |
+
def __init__(self):
|
| 141 |
+
super().__init__("custom_metric", "My custom evaluation metric")
|
| 142 |
+
|
| 143 |
+
def compute(self, generated_output, expected_output=None, **kwargs):
|
| 144 |
+
# Custom logic to score the output
|
| 145 |
+
return score # A float between 0 and 1
|
| 146 |
+
|
| 147 |
+
# Register the custom metric
|
| 148 |
+
evaluator = Evaluator(prompt_manager)
|
| 149 |
+
evaluator.register_metric(CustomMetric())
|
| 150 |
+
|
| 151 |
+
Use Cases
|
| 152 |
+
|
| 153 |
+
Prompt Development: Iteratively develop and refine prompts with version control
|
| 154 |
+
Prompt Optimization: A/B test different prompt variations to find the most effective approach
|
| 155 |
+
Quality Assurance: Ensure prompt quality with automated testing and evaluation
|
| 156 |
+
Team Collaboration: Share and collaborate on prompts with a centralized management system
|
| 157 |
+
Production Deployment: Maintain consistent prompt quality in production applications
|
| 158 |
+
|
| 159 |
+
License
|
| 160 |
+
MIT License
|
| 161 |
+
|
| 162 |
+
## Contributing
|
| 163 |
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
| 164 |
+
|
| 165 |
+
## Author
|
| 166 |
+
Biswanath Roul - [GitHub](https://github.com/biswanathroul)
|
| 167 |
+
|
docs/README.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# PromptLab Documentation
|
| 2 |
+
|
| 3 |
+
This directory contains detailed documentation for the PromptLab library.
|
| 4 |
+
|
| 5 |
+
## Contents
|
| 6 |
+
|
| 7 |
+
- [Getting Started](./getting_started.md)
|
| 8 |
+
- [API Reference](./api_reference.md)
|
| 9 |
+
- [CLI Usage](./cli_usage.md)
|
| 10 |
+
- [Advanced Features](./advanced_features.md)
|
| 11 |
+
- [Integration Examples](./integration_examples.md)
|
docs/advanced_features.md
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Advanced Features
|
| 2 |
+
|
| 3 |
+
PromptLab provides several advanced features for sophisticated prompt engineering.
|
| 4 |
+
|
| 5 |
+
## Advanced Templating
|
| 6 |
+
|
| 7 |
+
PromptLab's templating system goes beyond simple variable substitution, offering conditionals and loops.
|
| 8 |
+
|
| 9 |
+
### Basic Variable Substitution
|
| 10 |
+
|
| 11 |
+
```python
|
| 12 |
+
from promptlab import PromptTemplate
|
| 13 |
+
|
| 14 |
+
# Simple variable substitution
|
| 15 |
+
template = PromptTemplate("Hello, {name}!")
|
| 16 |
+
rendered = template.render(name="John")
|
| 17 |
+
# Result: "Hello, John!"
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
### Conditional Logic
|
| 21 |
+
|
| 22 |
+
```python
|
| 23 |
+
# Conditionals
|
| 24 |
+
template = PromptTemplate("""
|
| 25 |
+
{if is_formal}
|
| 26 |
+
Dear {name},
|
| 27 |
+
|
| 28 |
+
I hope this message finds you well.
|
| 29 |
+
{else}
|
| 30 |
+
Hey {name}!
|
| 31 |
+
{endif}
|
| 32 |
+
|
| 33 |
+
{message}
|
| 34 |
+
""")
|
| 35 |
+
|
| 36 |
+
formal = template.render(is_formal=True, name="Dr. Smith", message="Please review the attached document.")
|
| 37 |
+
casual = template.render(is_formal=False, name="Bob", message="Want to grab lunch?")
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
### Loops
|
| 41 |
+
|
| 42 |
+
```python
|
| 43 |
+
# Loops
|
| 44 |
+
template = PromptTemplate("""
|
| 45 |
+
Here are your tasks:
|
| 46 |
+
|
| 47 |
+
{for task in tasks}
|
| 48 |
+
- {task.priority}: {task.description}
|
| 49 |
+
{endfor}
|
| 50 |
+
""")
|
| 51 |
+
|
| 52 |
+
rendered = template.render(tasks=[
|
| 53 |
+
{"priority": "High", "description": "Complete the report"},
|
| 54 |
+
{"priority": "Medium", "description": "Schedule meeting"},
|
| 55 |
+
{"priority": "Low", "description": "Organize files"}
|
| 56 |
+
])
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
### Nested Structures
|
| 60 |
+
|
| 61 |
+
```python
|
| 62 |
+
# Combining loops and conditionals
|
| 63 |
+
template = PromptTemplate("""
|
| 64 |
+
{system_message}
|
| 65 |
+
|
| 66 |
+
{for example in examples}
|
| 67 |
+
User: {example.input}
|
| 68 |
+
{if example.has_reasoning}
|
| 69 |
+
Reasoning: {example.reasoning}
|
| 70 |
+
{endif}
|
| 71 |
+
Assistant: {example.output}
|
| 72 |
+
{endfor}
|
| 73 |
+
|
| 74 |
+
User: {query}
|
| 75 |
+
Assistant:
|
| 76 |
+
""")
|
| 77 |
+
|
| 78 |
+
rendered = template.render(
|
| 79 |
+
system_message="You are a helpful assistant.",
|
| 80 |
+
examples=[
|
| 81 |
+
{
|
| 82 |
+
"input": "What's 2+2?",
|
| 83 |
+
"has_reasoning": True,
|
| 84 |
+
"reasoning": "Adding 2 and 2 gives 4",
|
| 85 |
+
"output": "4"
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"input": "Hello",
|
| 89 |
+
"has_reasoning": False,
|
| 90 |
+
"output": "Hi there! How can I help you today?"
|
| 91 |
+
}
|
| 92 |
+
],
|
| 93 |
+
query="What's the capital of France?"
|
| 94 |
+
)
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
## Custom Evaluation Metrics
|
| 98 |
+
|
| 99 |
+
You can create custom metrics to evaluate prompt outputs based on your specific requirements.
|
| 100 |
+
|
| 101 |
+
### Creating a Custom Metric
|
| 102 |
+
|
| 103 |
+
```python
|
| 104 |
+
from promptlab import EvaluationMetric
|
| 105 |
+
|
| 106 |
+
class RelevanceMetric(EvaluationMetric):
|
| 107 |
+
"""Evaluates relevance of output to a given topic."""
|
| 108 |
+
|
| 109 |
+
def __init__(self, topics):
|
| 110 |
+
super().__init__("relevance", "Evaluates relevance to specified topics")
|
| 111 |
+
self.topics = topics
|
| 112 |
+
|
| 113 |
+
def compute(self, generated_output, expected_output=None, **kwargs):
|
| 114 |
+
"""
|
| 115 |
+
Compute relevance score based on topic presence.
|
| 116 |
+
Returns a float between 0 and 1.
|
| 117 |
+
"""
|
| 118 |
+
score = 0
|
| 119 |
+
output_lower = generated_output.lower()
|
| 120 |
+
|
| 121 |
+
for topic in self.topics:
|
| 122 |
+
if topic.lower() in output_lower:
|
| 123 |
+
score += 1
|
| 124 |
+
|
| 125 |
+
# Normalize to 0-1 range
|
| 126 |
+
return min(1.0, score / len(self.topics)) if self.topics else 0.0
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
### Using Custom Metrics
|
| 130 |
+
|
| 131 |
+
```python
|
| 132 |
+
from promptlab import Evaluator, PromptManager
|
| 133 |
+
|
| 134 |
+
# Initialize components
|
| 135 |
+
prompt_manager = PromptManager()
|
| 136 |
+
evaluator = Evaluator(prompt_manager)
|
| 137 |
+
|
| 138 |
+
# Register custom metric
|
| 139 |
+
climate_relevance = RelevanceMetric(["climate", "temperature", "warming", "environment"])
|
| 140 |
+
evaluator.register_metric(climate_relevance)
|
| 141 |
+
|
| 142 |
+
# Use in evaluation
|
| 143 |
+
async def my_llm(prompt, vars):
|
| 144 |
+
# Call your LLM API here
|
| 145 |
+
return "Climate change is causing global temperature increases..."
|
| 146 |
+
|
| 147 |
+
results = await evaluator.evaluate_prompt(
|
| 148 |
+
prompt_id="abc123",
|
| 149 |
+
inputs=[{"topic": "climate change"}],
|
| 150 |
+
llm_callback=my_llm,
|
| 151 |
+
metric_names=["relevance"] # Use our custom metric
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
print(f"Relevance score: {results['aggregated_metrics']['relevance']}")
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
## Customizing Storage
|
| 158 |
+
|
| 159 |
+
PromptLab allows you to customize where and how prompts and related data are stored.
|
| 160 |
+
|
| 161 |
+
### Custom Storage Locations
|
| 162 |
+
|
| 163 |
+
```python
|
| 164 |
+
# Specify a custom storage location
|
| 165 |
+
prompt_manager = PromptManager("/path/to/my/prompts")
|
| 166 |
+
|
| 167 |
+
# Export/import prompts
|
| 168 |
+
import json
|
| 169 |
+
|
| 170 |
+
# Export a prompt to a file
|
| 171 |
+
prompt = prompt_manager.get("abc123")
|
| 172 |
+
with open("exported_prompt.json", "w") as f:
|
| 173 |
+
json.dump(prompt.to_dict(), f, indent=2)
|
| 174 |
+
|
| 175 |
+
# Import a prompt from a file
|
| 176 |
+
with open("exported_prompt.json", "r") as f:
|
| 177 |
+
data = json.load(f)
|
| 178 |
+
imported_prompt = prompt_manager.import_prompt(data)
|
| 179 |
+
```
|
| 180 |
+
|
| 181 |
+
## LLM Integration
|
| 182 |
+
|
| 183 |
+
PromptLab is designed to work with any LLM through callback functions. Here are examples of integrating with popular LLM APIs.
|
| 184 |
+
|
| 185 |
+
### OpenAI Integration
|
| 186 |
+
|
| 187 |
+
```python
|
| 188 |
+
import openai
|
| 189 |
+
from promptlab import PromptManager, PromptTesting
|
| 190 |
+
|
| 191 |
+
prompt_manager = PromptManager()
|
| 192 |
+
testing = PromptTesting(prompt_manager)
|
| 193 |
+
|
| 194 |
+
# Configure OpenAI
|
| 195 |
+
openai.api_key = "your-api-key"
|
| 196 |
+
|
| 197 |
+
# OpenAI callback function
|
| 198 |
+
async def openai_callback(prompt, vars):
|
| 199 |
+
response = openai.ChatCompletion.create(
|
| 200 |
+
model="gpt-4",
|
| 201 |
+
messages=[{"role": "user", "content": prompt}],
|
| 202 |
+
temperature=0.7,
|
| 203 |
+
max_tokens=150
|
| 204 |
+
)
|
| 205 |
+
return response.choices[0].message.content
|
| 206 |
+
|
| 207 |
+
# Run tests with OpenAI
|
| 208 |
+
test_results = await testing.run_all_tests("abc123", openai_callback)
|
| 209 |
+
```
|
| 210 |
+
|
| 211 |
+
### Anthropic Integration
|
| 212 |
+
|
| 213 |
+
```python
|
| 214 |
+
import anthropic
|
| 215 |
+
from promptlab import PromptManager, Evaluator
|
| 216 |
+
|
| 217 |
+
prompt_manager = PromptManager()
|
| 218 |
+
evaluator = Evaluator(prompt_manager)
|
| 219 |
+
|
| 220 |
+
# Configure Anthropic
|
| 221 |
+
client = anthropic.Anthropic(api_key="your-api-key")
|
| 222 |
+
|
| 223 |
+
# Anthropic callback function
|
| 224 |
+
async def anthropic_callback(prompt, vars):
|
| 225 |
+
response = client.messages.create(
|
| 226 |
+
model="claude-2",
|
| 227 |
+
messages=[{"role": "user", "content": prompt}],
|
| 228 |
+
max_tokens=150
|
| 229 |
+
)
|
| 230 |
+
return response.content[0].text
|
| 231 |
+
|
| 232 |
+
# Evaluate with Anthropic
|
| 233 |
+
eval_results = await evaluator.evaluate_prompt(
|
| 234 |
+
prompt_id="abc123",
|
| 235 |
+
inputs=[{"query": "What is machine learning?"}],
|
| 236 |
+
llm_callback=anthropic_callback
|
| 237 |
+
)
|
| 238 |
+
```
|
| 239 |
+
|
| 240 |
+
### Hugging Face Integration
|
| 241 |
+
|
| 242 |
+
```python
|
| 243 |
+
from transformers import pipeline
|
| 244 |
+
import asyncio
|
| 245 |
+
from promptlab import PromptManager, VersionControl
|
| 246 |
+
|
| 247 |
+
prompt_manager = PromptManager()
|
| 248 |
+
version_control = VersionControl(prompt_manager)
|
| 249 |
+
|
| 250 |
+
# Set up Hugging Face pipeline
|
| 251 |
+
generator = pipeline('text-generation', model='gpt2')
|
| 252 |
+
|
| 253 |
+
# Hugging Face callback function
|
| 254 |
+
async def hf_callback(prompt, vars):
|
| 255 |
+
# Run synchronously but in a way that doesn't block the asyncio event loop
|
| 256 |
+
loop = asyncio.get_event_loop()
|
| 257 |
+
result = await loop.run_in_executor(None, lambda: generator(prompt, max_length=100)[0]['generated_text'])
|
| 258 |
+
return result
|
| 259 |
+
|
| 260 |
+
# Use with version control
|
| 261 |
+
prompt = prompt_manager.create(
|
| 262 |
+
content="Complete this: {text}",
|
| 263 |
+
name="Text Completion"
|
| 264 |
+
)
|
| 265 |
+
version_control.commit(prompt.id, "Initial version")
|
| 266 |
+
|
| 267 |
+
# Test with different models by swapping the callback
|
| 268 |
+
```
|
docs/api_reference.md
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# API Reference
|
| 2 |
+
|
| 3 |
+
This document provides detailed API documentation for the main components of PromptLab.
|
| 4 |
+
|
| 5 |
+
## PromptManager
|
| 6 |
+
|
| 7 |
+
The `PromptManager` class is the core component for managing prompts.
|
| 8 |
+
|
| 9 |
+
```python
|
| 10 |
+
from promptlab import PromptManager
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
### Methods
|
| 14 |
+
|
| 15 |
+
#### `__init__(storage_path=None)`
|
| 16 |
+
- **Description**: Initialize a new PromptManager.
|
| 17 |
+
- **Parameters**:
|
| 18 |
+
- `storage_path` (str, optional): Path to store prompts. Defaults to "~/promptlab_storage".
|
| 19 |
+
|
| 20 |
+
#### `create(content, name, description='', tags=None, metadata=None)`
|
| 21 |
+
- **Description**: Create a new prompt.
|
| 22 |
+
- **Parameters**:
|
| 23 |
+
- `content` (str): The prompt text with optional variables in {variable_name} format.
|
| 24 |
+
- `name` (str): Name of the prompt.
|
| 25 |
+
- `description` (str, optional): Description of the prompt.
|
| 26 |
+
- `tags` (list of str, optional): Tags for categorization.
|
| 27 |
+
- `metadata` (dict, optional): Additional metadata.
|
| 28 |
+
- **Returns**: `Prompt` object.
|
| 29 |
+
|
| 30 |
+
#### `get(prompt_id)`
|
| 31 |
+
- **Description**: Get a prompt by ID.
|
| 32 |
+
- **Parameters**:
|
| 33 |
+
- `prompt_id` (str): The ID of the prompt.
|
| 34 |
+
- **Returns**: `Prompt` object or None if not found.
|
| 35 |
+
|
| 36 |
+
#### `update(prompt_id, content=None, name=None, description=None, tags=None, metadata=None)`
|
| 37 |
+
- **Description**: Update a prompt.
|
| 38 |
+
- **Parameters**:
|
| 39 |
+
- `prompt_id` (str): The ID of the prompt to update.
|
| 40 |
+
- `content` (str, optional): New prompt text.
|
| 41 |
+
- `name` (str, optional): New name.
|
| 42 |
+
- `description` (str, optional): New description.
|
| 43 |
+
- `tags` (list of str, optional): New tags.
|
| 44 |
+
- `metadata` (dict, optional): New metadata.
|
| 45 |
+
- **Returns**: Updated `Prompt` object.
|
| 46 |
+
|
| 47 |
+
#### `delete(prompt_id)`
|
| 48 |
+
- **Description**: Delete a prompt.
|
| 49 |
+
- **Parameters**:
|
| 50 |
+
- `prompt_id` (str): The ID of the prompt to delete.
|
| 51 |
+
- **Returns**: True if deleted, False otherwise.
|
| 52 |
+
|
| 53 |
+
#### `list_all()`
|
| 54 |
+
- **Description**: List all prompts.
|
| 55 |
+
- **Returns**: List of `Prompt` objects.
|
| 56 |
+
|
| 57 |
+
#### `search_by_tags(tags, match_all=False)`
|
| 58 |
+
- **Description**: Search prompts by tags.
|
| 59 |
+
- **Parameters**:
|
| 60 |
+
- `tags` (list of str): Tags to search for.
|
| 61 |
+
- `match_all` (bool, optional): If True, prompt must have all tags.
|
| 62 |
+
- **Returns**: List of matching `Prompt` objects.
|
| 63 |
+
|
| 64 |
+
## VersionControl
|
| 65 |
+
|
| 66 |
+
The `VersionControl` class manages prompt versions.
|
| 67 |
+
|
| 68 |
+
```python
|
| 69 |
+
from promptlab import VersionControl
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
### Methods
|
| 73 |
+
|
| 74 |
+
#### `__init__(prompt_manager)`
|
| 75 |
+
- **Description**: Initialize the version control system.
|
| 76 |
+
- **Parameters**:
|
| 77 |
+
- `prompt_manager` (PromptManager): A PromptManager instance.
|
| 78 |
+
|
| 79 |
+
#### `commit(prompt_id, commit_message, metadata=None)`
|
| 80 |
+
- **Description**: Create a new version of a prompt.
|
| 81 |
+
- **Parameters**:
|
| 82 |
+
- `prompt_id` (str): The ID of the prompt.
|
| 83 |
+
- `commit_message` (str): Message describing the changes.
|
| 84 |
+
- `metadata` (dict, optional): Additional version metadata.
|
| 85 |
+
- **Returns**: Version number (int).
|
| 86 |
+
|
| 87 |
+
#### `list_versions(prompt_id)`
|
| 88 |
+
- **Description**: List all versions of a prompt.
|
| 89 |
+
- **Parameters**:
|
| 90 |
+
- `prompt_id` (str): The ID of the prompt.
|
| 91 |
+
- **Returns**: List of version objects.
|
| 92 |
+
|
| 93 |
+
#### `get_version(prompt_id, version_number)`
|
| 94 |
+
- **Description**: Get a specific version of a prompt.
|
| 95 |
+
- **Parameters**:
|
| 96 |
+
- `prompt_id` (str): The ID of the prompt.
|
| 97 |
+
- `version_number` (int): The version number.
|
| 98 |
+
- **Returns**: Version data.
|
| 99 |
+
|
| 100 |
+
#### `checkout(prompt_id, version_number)`
|
| 101 |
+
- **Description**: Revert a prompt to a specific version.
|
| 102 |
+
- **Parameters**:
|
| 103 |
+
- `prompt_id` (str): The ID of the prompt.
|
| 104 |
+
- `version_number` (int): The version to revert to.
|
| 105 |
+
- **Returns**: Updated `Prompt` object.
|
| 106 |
+
|
| 107 |
+
#### `diff(prompt_id, version1, version2)`
|
| 108 |
+
- **Description**: Compare two versions of a prompt.
|
| 109 |
+
- **Parameters**:
|
| 110 |
+
- `prompt_id` (str): The ID of the prompt.
|
| 111 |
+
- `version1` (int): First version number.
|
| 112 |
+
- `version2` (int): Second version number.
|
| 113 |
+
- **Returns**: Diff object.
|
| 114 |
+
|
| 115 |
+
## PromptTesting
|
| 116 |
+
|
| 117 |
+
The `PromptTesting` class provides testing capabilities.
|
| 118 |
+
|
| 119 |
+
```python
|
| 120 |
+
from promptlab import PromptTesting
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
### Methods
|
| 124 |
+
|
| 125 |
+
#### `__init__(prompt_manager)`
|
| 126 |
+
- **Description**: Initialize the testing system.
|
| 127 |
+
- **Parameters**:
|
| 128 |
+
- `prompt_manager` (PromptManager): A PromptManager instance.
|
| 129 |
+
|
| 130 |
+
#### `create_test_case(prompt_id, input_vars, expected_output=None, name=None, description=None)`
|
| 131 |
+
- **Description**: Create a test case for a prompt.
|
| 132 |
+
- **Parameters**:
|
| 133 |
+
- `prompt_id` (str): The ID of the prompt to test.
|
| 134 |
+
- `input_vars` (dict): Variables to substitute in the prompt.
|
| 135 |
+
- `expected_output` (str, optional): Expected response.
|
| 136 |
+
- `name` (str, optional): Test case name.
|
| 137 |
+
- `description` (str, optional): Test case description.
|
| 138 |
+
- **Returns**: Test case object.
|
| 139 |
+
|
| 140 |
+
#### `run_test_case(test_case_id, llm_callback)`
|
| 141 |
+
- **Description**: Run a test case.
|
| 142 |
+
- **Parameters**:
|
| 143 |
+
- `test_case_id` (str): The ID of the test case.
|
| 144 |
+
- `llm_callback` (callable): Function to call LLM.
|
| 145 |
+
- **Returns**: Test result.
|
| 146 |
+
|
| 147 |
+
#### `run_all_tests(prompt_id, llm_callback)`
|
| 148 |
+
- **Description**: Run all tests for a prompt.
|
| 149 |
+
- **Parameters**:
|
| 150 |
+
- `prompt_id` (str): The ID of the prompt.
|
| 151 |
+
- `llm_callback` (callable): Function to call LLM.
|
| 152 |
+
- **Returns**: List of test results.
|
| 153 |
+
|
| 154 |
+
#### `ab_test(prompt_id_a, prompt_id_b, test_cases, llm_callback, metrics=None)`
|
| 155 |
+
- **Description**: Run A/B tests comparing two prompts.
|
| 156 |
+
- **Parameters**:
|
| 157 |
+
- `prompt_id_a` (str): First prompt ID.
|
| 158 |
+
- `prompt_id_b` (str): Second prompt ID.
|
| 159 |
+
- `test_cases` (list): Test cases to run.
|
| 160 |
+
- `llm_callback` (callable): Function to call LLM.
|
| 161 |
+
- `metrics` (list, optional): Metrics to compare.
|
| 162 |
+
- **Returns**: A/B test results.
|
| 163 |
+
|
| 164 |
+
## Evaluator
|
| 165 |
+
|
| 166 |
+
The `Evaluator` class handles prompt evaluation.
|
| 167 |
+
|
| 168 |
+
```python
|
| 169 |
+
from promptlab import Evaluator
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
### Methods
|
| 173 |
+
|
| 174 |
+
#### `__init__(prompt_manager)`
|
| 175 |
+
- **Description**: Initialize the evaluator.
|
| 176 |
+
- **Parameters**:
|
| 177 |
+
- `prompt_manager` (PromptManager): A PromptManager instance.
|
| 178 |
+
|
| 179 |
+
#### `register_metric(metric)`
|
| 180 |
+
- **Description**: Register a new evaluation metric.
|
| 181 |
+
- **Parameters**:
|
| 182 |
+
- `metric` (EvaluationMetric): The metric to register.
|
| 183 |
+
|
| 184 |
+
#### `evaluate_prompt(prompt_id, inputs, llm_callback, expected_outputs=None, metric_names=None)`
|
| 185 |
+
- **Description**: Evaluate a prompt with the given inputs and metrics.
|
| 186 |
+
- **Parameters**:
|
| 187 |
+
- `prompt_id` (str): The ID of the prompt.
|
| 188 |
+
- `inputs` (list): List of input dictionaries.
|
| 189 |
+
- `llm_callback` (callable): Function to call LLM.
|
| 190 |
+
- `expected_outputs` (list, optional): Expected outputs.
|
| 191 |
+
- `metric_names` (list, optional): Metrics to use.
|
| 192 |
+
- **Returns**: Evaluation results.
|
| 193 |
+
|
| 194 |
+
## PromptTemplate
|
| 195 |
+
|
| 196 |
+
The `PromptTemplate` class provides advanced templating.
|
| 197 |
+
|
| 198 |
+
```python
|
| 199 |
+
from promptlab import PromptTemplate
|
| 200 |
+
```
|
| 201 |
+
|
| 202 |
+
### Methods
|
| 203 |
+
|
| 204 |
+
#### `__init__(template_string)`
|
| 205 |
+
- **Description**: Initialize a template.
|
| 206 |
+
- **Parameters**:
|
| 207 |
+
- `template_string` (str): Template with variables, conditionals, and loops.
|
| 208 |
+
|
| 209 |
+
#### `render(**variables)`
|
| 210 |
+
- **Description**: Render the template with given variables.
|
| 211 |
+
- **Parameters**:
|
| 212 |
+
- `variables` (dict): Variables to substitute.
|
| 213 |
+
- **Returns**: Rendered string.
|
| 214 |
+
|
| 215 |
+
## EvaluationMetric
|
| 216 |
+
|
| 217 |
+
The `EvaluationMetric` is the base class for evaluation metrics.
|
| 218 |
+
|
| 219 |
+
```python
|
| 220 |
+
from promptlab import EvaluationMetric
|
| 221 |
+
```
|
| 222 |
+
|
| 223 |
+
### Methods
|
| 224 |
+
|
| 225 |
+
#### `__init__(name, description=None)`
|
| 226 |
+
- **Description**: Initialize a metric.
|
| 227 |
+
- **Parameters**:
|
| 228 |
+
- `name` (str): Metric name.
|
| 229 |
+
- `description` (str, optional): Metric description.
|
| 230 |
+
|
| 231 |
+
#### `compute(generated_output, expected_output=None, **kwargs)`
|
| 232 |
+
- **Description**: Compute the metric score.
|
| 233 |
+
- **Parameters**:
|
| 234 |
+
- `generated_output` (str): Output from LLM.
|
| 235 |
+
- `expected_output` (str, optional): Expected output.
|
| 236 |
+
- `**kwargs`: Additional parameters.
|
| 237 |
+
- **Returns**: Score (float between 0 and 1).
|
| 238 |
+
|
| 239 |
+
### Built-in Metrics
|
| 240 |
+
|
| 241 |
+
- `ExactMatchMetric`: Scores exact matches between generated and expected output.
|
| 242 |
+
- `ContainsKeywordsMetric`: Scores based on keyword presence.
|
| 243 |
+
- `LengthMetric`: Scores based on output length.
|
| 244 |
+
|
| 245 |
+
```python
|
| 246 |
+
from promptlab import ExactMatchMetric, ContainsKeywordsMetric, LengthMetric
|
| 247 |
+
```
|
docs/cli_usage.md
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CLI Usage
|
| 2 |
+
|
| 3 |
+
PromptLab provides a command-line interface (CLI) for managing prompts, versions, tests, and evaluations.
|
| 4 |
+
|
| 5 |
+
## Basic Commands
|
| 6 |
+
|
| 7 |
+
### Prompt Management
|
| 8 |
+
|
| 9 |
+
```bash
|
| 10 |
+
# Create a prompt
|
| 11 |
+
promptlab prompt create "Weather Forecast" --content "Provide a weather forecast for {location} on {date}" --tags "weather,forecast"
|
| 12 |
+
|
| 13 |
+
# List all prompts
|
| 14 |
+
promptlab prompt list
|
| 15 |
+
|
| 16 |
+
# Get prompt details
|
| 17 |
+
promptlab prompt get <prompt_id>
|
| 18 |
+
|
| 19 |
+
# Update a prompt
|
| 20 |
+
promptlab prompt update <prompt_id> --content "New content" --tags "new,tags"
|
| 21 |
+
|
| 22 |
+
# Delete a prompt
|
| 23 |
+
promptlab prompt delete <prompt_id>
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
### Version Control
|
| 27 |
+
|
| 28 |
+
```bash
|
| 29 |
+
# Commit a version
|
| 30 |
+
promptlab version commit <prompt_id> --message "Version description"
|
| 31 |
+
|
| 32 |
+
# List versions
|
| 33 |
+
promptlab version list <prompt_id>
|
| 34 |
+
|
| 35 |
+
# Check out (revert to) a specific version
|
| 36 |
+
promptlab version checkout <prompt_id> <version_number>
|
| 37 |
+
|
| 38 |
+
# Compare versions
|
| 39 |
+
promptlab version diff <prompt_id> <version1> <version2>
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
### Testing
|
| 43 |
+
|
| 44 |
+
```bash
|
| 45 |
+
# Create a test case
|
| 46 |
+
promptlab test create <prompt_id> --input '{"location": "New York", "date": "tomorrow"}' --expected "Expected output"
|
| 47 |
+
|
| 48 |
+
# List test cases
|
| 49 |
+
promptlab test list <prompt_id>
|
| 50 |
+
|
| 51 |
+
# Run a specific test case
|
| 52 |
+
promptlab test run <test_case_id> --llm openai
|
| 53 |
+
|
| 54 |
+
# Run all test cases for a prompt
|
| 55 |
+
promptlab test run-all <prompt_id> --llm openai
|
| 56 |
+
|
| 57 |
+
# Run an A/B test between two prompts
|
| 58 |
+
promptlab test ab <prompt_id_a> <prompt_id_b> --inputs '[{"var": "value1"}, {"var": "value2"}]' --llm openai
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
### Evaluation
|
| 62 |
+
|
| 63 |
+
```bash
|
| 64 |
+
# Evaluate a prompt
|
| 65 |
+
promptlab eval run <prompt_id> --inputs '[{"var": "value1"}, {"var": "value2"}]' --llm openai
|
| 66 |
+
|
| 67 |
+
# List available metrics
|
| 68 |
+
promptlab eval metrics
|
| 69 |
+
|
| 70 |
+
# Register a custom metric
|
| 71 |
+
promptlab eval register-metric <metric_file.py>
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
## Environment Configuration
|
| 75 |
+
|
| 76 |
+
The CLI supports environment variables for configuration:
|
| 77 |
+
|
| 78 |
+
- `PROMPTLAB_STORAGE`: Path to store prompts and related data
|
| 79 |
+
- `PROMPTLAB_OPENAI_API_KEY`: OpenAI API key for built-in LLM support
|
| 80 |
+
- `PROMPTLAB_DEFAULT_LLM`: Default LLM to use for testing and evaluation
|
| 81 |
+
|
| 82 |
+
You can also create a config file at `~/.promptlab/config.json`:
|
| 83 |
+
|
| 84 |
+
```json
|
| 85 |
+
{
|
| 86 |
+
"storage_path": "/path/to/storage",
|
| 87 |
+
"default_llm": "openai",
|
| 88 |
+
"api_keys": {
|
| 89 |
+
"openai": "your-openai-key"
|
| 90 |
+
}
|
| 91 |
+
}
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
## Advanced Usage
|
| 95 |
+
|
| 96 |
+
### Multiple Storage Locations
|
| 97 |
+
|
| 98 |
+
```bash
|
| 99 |
+
# Specify a storage location for a command
|
| 100 |
+
promptlab --storage /path/to/storage prompt list
|
| 101 |
+
|
| 102 |
+
# Export a prompt to another storage
|
| 103 |
+
promptlab prompt export <prompt_id> --output /path/to/output.json
|
| 104 |
+
|
| 105 |
+
# Import a prompt from a file
|
| 106 |
+
promptlab prompt import /path/to/prompt.json
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
### Automation and Scripting
|
| 110 |
+
|
| 111 |
+
```bash
|
| 112 |
+
# Get output in JSON format
|
| 113 |
+
promptlab --json prompt list
|
| 114 |
+
|
| 115 |
+
# Use in shell scripts
|
| 116 |
+
PROMPT_ID=$(promptlab --json prompt create "Script Prompt" --content "Content" | jq -r '.id')
|
| 117 |
+
echo "Created prompt with ID: $PROMPT_ID"
|
| 118 |
+
```
|
docs/getting_started.md
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Getting Started with PromptLab
|
| 2 |
+
|
| 3 |
+
This guide will help you get started with PromptLab, a comprehensive library for managing LLM prompts.
|
| 4 |
+
|
| 5 |
+
## Installation
|
| 6 |
+
|
| 7 |
+
```bash
|
| 8 |
+
pip install promptlab
|
| 9 |
+
```
|
| 10 |
+
|
| 11 |
+
## Basic Usage
|
| 12 |
+
|
| 13 |
+
### Initialize Components
|
| 14 |
+
|
| 15 |
+
```python
|
| 16 |
+
from promptlab import PromptManager, VersionControl, PromptTesting, Evaluator
|
| 17 |
+
|
| 18 |
+
# Initialize with default storage location
|
| 19 |
+
prompt_manager = PromptManager()
|
| 20 |
+
|
| 21 |
+
# Or specify a custom storage location
|
| 22 |
+
# prompt_manager = PromptManager("/path/to/storage")
|
| 23 |
+
|
| 24 |
+
# Initialize other components
|
| 25 |
+
version_control = VersionControl(prompt_manager)
|
| 26 |
+
testing = PromptTesting(prompt_manager)
|
| 27 |
+
evaluator = Evaluator(prompt_manager)
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
### Create and Manage Prompts
|
| 31 |
+
|
| 32 |
+
```python
|
| 33 |
+
# Create a prompt
|
| 34 |
+
prompt = prompt_manager.create(
|
| 35 |
+
content="Translate the following text from {source_language} to {target_language}: {text}",
|
| 36 |
+
name="Translation Prompt",
|
| 37 |
+
description="A prompt for translating text between languages",
|
| 38 |
+
tags=["translation", "multilingual"]
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
# The prompt.id property contains a unique identifier (e.g., "a1b2c3d4e5")
|
| 42 |
+
prompt_id = prompt.id
|
| 43 |
+
|
| 44 |
+
# Get a prompt by ID
|
| 45 |
+
retrieved_prompt = prompt_manager.get(prompt_id)
|
| 46 |
+
|
| 47 |
+
# Update a prompt
|
| 48 |
+
prompt_manager.update(
|
| 49 |
+
prompt_id,
|
| 50 |
+
content="Please translate the following text from {source_language} to {target_language}:\n\n{text}"
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
# Search prompts by tags
|
| 54 |
+
translation_prompts = prompt_manager.search_by_tags(["translation"])
|
| 55 |
+
|
| 56 |
+
# List all prompts
|
| 57 |
+
all_prompts = prompt_manager.list_all()
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### Version Control
|
| 61 |
+
|
| 62 |
+
```python
|
| 63 |
+
# Create a version snapshot
|
| 64 |
+
version_control.commit(
|
| 65 |
+
prompt_id=prompt_id,
|
| 66 |
+
commit_message="Initial version"
|
| 67 |
+
)
|
| 68 |
+
|
| 69 |
+
# Update the prompt and create another version
|
| 70 |
+
prompt_manager.update(
|
| 71 |
+
prompt_id,
|
| 72 |
+
content="Please provide a translation of the following text from {source_language} to {target_language}:\n\n{text}\n\nMaintain the original formatting and tone."
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
version_control.commit(
|
| 76 |
+
prompt_id=prompt_id,
|
| 77 |
+
commit_message="Added formatting instructions"
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
# List all versions
|
| 81 |
+
versions = version_control.list_versions(prompt_id)
|
| 82 |
+
|
| 83 |
+
# Compare versions
|
| 84 |
+
diff = version_control.diff(prompt_id, 1, 2)
|
| 85 |
+
|
| 86 |
+
# Revert to a previous version
|
| 87 |
+
version_control.checkout(prompt_id, 1)
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
### Using Prompts with Variables
|
| 91 |
+
|
| 92 |
+
```python
|
| 93 |
+
# Get a prompt
|
| 94 |
+
prompt = prompt_manager.get(prompt_id)
|
| 95 |
+
|
| 96 |
+
# Render with variables
|
| 97 |
+
rendered_prompt = prompt.render(
|
| 98 |
+
source_language="English",
|
| 99 |
+
target_language="Spanish",
|
| 100 |
+
text="Hello, how are you today?"
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
# Now use rendered_prompt with your LLM API
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
## Next Steps
|
| 107 |
+
|
| 108 |
+
- See the [CLI Usage](./cli_usage.md) guide for command-line operations
|
| 109 |
+
- Explore [Advanced Features](./advanced_features.md) for templating and custom metrics
|
| 110 |
+
- Check [Integration Examples](./integration_examples.md) for real-world use cases
|
docs/integration_examples.md
ADDED
|
@@ -0,0 +1,584 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Integration Examples
|
| 2 |
+
|
| 3 |
+
This document provides concrete examples of integrating PromptLab into various applications and workflows.
|
| 4 |
+
|
| 5 |
+
## Customer Support Chatbot
|
| 6 |
+
|
| 7 |
+
### Setup
|
| 8 |
+
|
| 9 |
+
```python
|
| 10 |
+
from promptlab import PromptManager, VersionControl
|
| 11 |
+
import openai
|
| 12 |
+
|
| 13 |
+
# Initialize components
|
| 14 |
+
prompt_manager = PromptManager()
|
| 15 |
+
version_control = VersionControl(prompt_manager)
|
| 16 |
+
|
| 17 |
+
# Create prompt templates for different scenarios
|
| 18 |
+
greeting_prompt = prompt_manager.create(
|
| 19 |
+
content="You are a helpful customer service agent for {company_name}. Greet the customer politely.",
|
| 20 |
+
name="Customer Greeting",
|
| 21 |
+
tags=["customer-service", "greeting"]
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
inquiry_prompt = prompt_manager.create(
|
| 25 |
+
content="""
|
| 26 |
+
You are a helpful customer service agent for {company_name}.
|
| 27 |
+
Customer inquiry: {customer_message}
|
| 28 |
+
|
| 29 |
+
Based on this inquiry:
|
| 30 |
+
1. Identify the main issue
|
| 31 |
+
2. Provide a helpful response
|
| 32 |
+
3. Offer additional assistance
|
| 33 |
+
|
| 34 |
+
Keep your tone professional but friendly.
|
| 35 |
+
""",
|
| 36 |
+
name="Customer Inquiry Response",
|
| 37 |
+
tags=["customer-service", "inquiry"]
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
# Version them
|
| 41 |
+
version_control.commit(greeting_prompt.id, "Initial version")
|
| 42 |
+
version_control.commit(inquiry_prompt.id, "Initial version")
|
| 43 |
+
|
| 44 |
+
# OpenAI callback
|
| 45 |
+
def generate_response(prompt_text):
|
| 46 |
+
response = openai.ChatCompletion.create(
|
| 47 |
+
model="gpt-3.5-turbo",
|
| 48 |
+
messages=[{"role": "user", "content": prompt_text}]
|
| 49 |
+
)
|
| 50 |
+
return response.choices[0].message.content
|
| 51 |
+
|
| 52 |
+
# Main handler function
|
| 53 |
+
def handle_customer_message(customer_name, message, is_new_conversation):
|
| 54 |
+
if is_new_conversation:
|
| 55 |
+
# Use greeting prompt for new conversations
|
| 56 |
+
prompt = prompt_manager.get(greeting_prompt.id)
|
| 57 |
+
prompt_text = prompt.render(company_name="Acme Inc.")
|
| 58 |
+
return generate_response(prompt_text)
|
| 59 |
+
else:
|
| 60 |
+
# Use inquiry prompt for ongoing conversations
|
| 61 |
+
prompt = prompt_manager.get(inquiry_prompt.id)
|
| 62 |
+
prompt_text = prompt.render(
|
| 63 |
+
company_name="Acme Inc.",
|
| 64 |
+
customer_message=message
|
| 65 |
+
)
|
| 66 |
+
return generate_response(prompt_text)
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
## Content Generation System
|
| 70 |
+
|
| 71 |
+
### Setup
|
| 72 |
+
|
| 73 |
+
```python
|
| 74 |
+
from promptlab import PromptManager, PromptTesting, Evaluator
|
| 75 |
+
import asyncio
|
| 76 |
+
|
| 77 |
+
# Initialize components
|
| 78 |
+
prompt_manager = PromptManager("content_system_prompts")
|
| 79 |
+
testing = PromptTesting(prompt_manager)
|
| 80 |
+
evaluator = Evaluator(prompt_manager)
|
| 81 |
+
|
| 82 |
+
# Create content generation prompt
|
| 83 |
+
blog_prompt = prompt_manager.create(
|
| 84 |
+
content="""
|
| 85 |
+
Write a blog post about {topic}.
|
| 86 |
+
|
| 87 |
+
Title: {title}
|
| 88 |
+
|
| 89 |
+
The post should:
|
| 90 |
+
- Be approximately {word_count} words
|
| 91 |
+
- Be written in a {tone} tone
|
| 92 |
+
- Include {num_sections} main sections
|
| 93 |
+
- Target audience: {audience}
|
| 94 |
+
- Include a compelling call-to-action at the end
|
| 95 |
+
|
| 96 |
+
Keywords to include: {keywords}
|
| 97 |
+
""",
|
| 98 |
+
name="Blog Post Generator",
|
| 99 |
+
tags=["content", "blog"]
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
# Test cases
|
| 103 |
+
test_case = testing.create_test_case(
|
| 104 |
+
prompt_id=blog_prompt.id,
|
| 105 |
+
input_vars={
|
| 106 |
+
"topic": "Sustainable Living",
|
| 107 |
+
"title": "10 Simple Ways to Reduce Your Carbon Footprint",
|
| 108 |
+
"word_count": "800",
|
| 109 |
+
"tone": "informative yet casual",
|
| 110 |
+
"num_sections": "5",
|
| 111 |
+
"audience": "environmentally-conscious millennials",
|
| 112 |
+
"keywords": "sustainability, eco-friendly, carbon footprint, climate change, lifestyle changes"
|
| 113 |
+
}
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
# LLM callback
|
| 117 |
+
async def content_llm_callback(prompt, vars):
|
| 118 |
+
# Call your preferred LLM API here
|
| 119 |
+
# This is a placeholder
|
| 120 |
+
return f"Generated content about {vars.get('topic', 'unknown topic')}"
|
| 121 |
+
|
| 122 |
+
# Content generation function
|
| 123 |
+
async def generate_content(content_type, parameters):
|
| 124 |
+
if content_type == "blog":
|
| 125 |
+
prompt = prompt_manager.get(blog_prompt.id)
|
| 126 |
+
rendered_prompt = prompt.render(**parameters)
|
| 127 |
+
|
| 128 |
+
# Generate content
|
| 129 |
+
content = await content_llm_callback(rendered_prompt, parameters)
|
| 130 |
+
|
| 131 |
+
# Evaluate quality
|
| 132 |
+
evaluation = await evaluator.evaluate_prompt(
|
| 133 |
+
prompt_id=blog_prompt.id,
|
| 134 |
+
inputs=[parameters],
|
| 135 |
+
llm_callback=content_llm_callback
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
+
quality_score = evaluation["aggregated_metrics"].get("length", 0)
|
| 139 |
+
|
| 140 |
+
return {
|
| 141 |
+
"content": content,
|
| 142 |
+
"quality_score": quality_score,
|
| 143 |
+
"metadata": {
|
| 144 |
+
"prompt_id": blog_prompt.id,
|
| 145 |
+
"prompt_version": prompt.version,
|
| 146 |
+
"parameters": parameters
|
| 147 |
+
}
|
| 148 |
+
}
|
| 149 |
+
else:
|
| 150 |
+
raise ValueError(f"Unsupported content type: {content_type}")
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
## AI-Assisted Research Tool
|
| 154 |
+
|
| 155 |
+
### Setup
|
| 156 |
+
|
| 157 |
+
```python
|
| 158 |
+
from promptlab import PromptManager, VersionControl
|
| 159 |
+
import datetime
import json
|
| 160 |
+
import openai
|
| 161 |
+
|
| 162 |
+
# Initialize components
|
| 163 |
+
prompt_manager = PromptManager("research_prompts")
|
| 164 |
+
version_control = VersionControl(prompt_manager)
|
| 165 |
+
|
| 166 |
+
# Create research prompts
|
| 167 |
+
article_summary_prompt = prompt_manager.create(
|
| 168 |
+
content="""
|
| 169 |
+
Summarize the following research article:
|
| 170 |
+
|
| 171 |
+
Title: {article_title}
|
| 172 |
+
Abstract: {article_abstract}
|
| 173 |
+
|
| 174 |
+
Provide a summary that:
|
| 175 |
+
1. Identifies the main research question
|
| 176 |
+
2. Outlines the methodology
|
| 177 |
+
3. Summarizes key findings
|
| 178 |
+
4. Highlights limitations
|
| 179 |
+
5. Explains the significance of the results
|
| 180 |
+
|
| 181 |
+
Keep the summary concise, approximately 250 words.
|
| 182 |
+
""",
|
| 183 |
+
name="Article Summarizer",
|
| 184 |
+
tags=["research", "summary"]
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
research_question_prompt = prompt_manager.create(
|
| 188 |
+
content="""
|
| 189 |
+
Based on the following information:
|
| 190 |
+
|
| 191 |
+
Research Area: {research_area}
|
| 192 |
+
Existing Knowledge: {existing_knowledge}
|
| 193 |
+
Observed Gap: {knowledge_gap}
|
| 194 |
+
|
| 195 |
+
Generate 5 potential research questions that:
|
| 196 |
+
1. Address the identified knowledge gap
|
| 197 |
+
2. Are specific and answerable
|
| 198 |
+
3. Have theoretical or practical significance
|
| 199 |
+
4. Can be investigated with available research methods
|
| 200 |
+
""",
|
| 201 |
+
name="Research Question Generator",
|
| 202 |
+
tags=["research", "question-generation"]
|
| 203 |
+
)
|
| 204 |
+
|
| 205 |
+
# Version control
|
| 206 |
+
version_control.commit(article_summary_prompt.id, "Initial version")
|
| 207 |
+
version_control.commit(research_question_prompt.id, "Initial version")
|
| 208 |
+
|
| 209 |
+
# OpenAI callback
|
| 210 |
+
def research_assistant(prompt_text):
|
| 211 |
+
response = openai.ChatCompletion.create(
|
| 212 |
+
model="gpt-4",
|
| 213 |
+
messages=[{"role": "user", "content": prompt_text}]
|
| 214 |
+
)
|
| 215 |
+
return response.choices[0].message.content
|
| 216 |
+
|
| 217 |
+
# Research functions
|
| 218 |
+
def summarize_article(article_title, article_abstract):
|
| 219 |
+
prompt = prompt_manager.get(article_summary_prompt.id)
|
| 220 |
+
prompt_text = prompt.render(
|
| 221 |
+
article_title=article_title,
|
| 222 |
+
article_abstract=article_abstract
|
| 223 |
+
)
|
| 224 |
+
return research_assistant(prompt_text)
|
| 225 |
+
|
| 226 |
+
def generate_research_questions(research_area, existing_knowledge, knowledge_gap):
|
| 227 |
+
prompt = prompt_manager.get(research_question_prompt.id)
|
| 228 |
+
prompt_text = prompt.render(
|
| 229 |
+
research_area=research_area,
|
| 230 |
+
existing_knowledge=existing_knowledge,
|
| 231 |
+
knowledge_gap=knowledge_gap
|
| 232 |
+
)
|
| 233 |
+
return research_assistant(prompt_text)
|
| 234 |
+
|
| 235 |
+
# Save results
|
| 236 |
+
def save_research_data(research_project, data_type, content):
|
| 237 |
+
# Save the data along with prompt metadata for reproducibility
|
| 238 |
+
if data_type == "summary":
|
| 239 |
+
prompt_id = article_summary_prompt.id
|
| 240 |
+
prompt = prompt_manager.get(prompt_id)
|
| 241 |
+
elif data_type == "questions":
|
| 242 |
+
prompt_id = research_question_prompt.id
|
| 243 |
+
prompt = prompt_manager.get(prompt_id)
|
| 244 |
+
|
| 245 |
+
research_data = {
|
| 246 |
+
"content": content,
|
| 247 |
+
"metadata": {
|
| 248 |
+
"prompt_id": prompt_id,
|
| 249 |
+
"prompt_version": prompt.version,
|
| 250 |
+
"timestamp": datetime.datetime.now().isoformat()
|
| 251 |
+
}
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
# Save to file (in real application, might save to database)
|
| 255 |
+
with open(f"{research_project}_{data_type}.json", "w") as f:
|
| 256 |
+
json.dump(research_data, f, indent=2)
|
| 257 |
+
```
|
| 258 |
+
|
| 259 |
+
## Educational Quiz Generator
|
| 260 |
+
|
| 261 |
+
### Setup
|
| 262 |
+
|
| 263 |
+
```python
|
| 264 |
+
from promptlab import PromptManager, PromptTemplate
|
| 265 |
+
import asyncio
|
| 266 |
+
import aiohttp
import json
|
| 267 |
+
|
| 268 |
+
# Initialize components
|
| 269 |
+
prompt_manager = PromptManager("education_prompts")
|
| 270 |
+
|
| 271 |
+
# Quiz generation prompt
|
| 272 |
+
quiz_prompt = prompt_manager.create(
|
| 273 |
+
content="""
|
| 274 |
+
Generate a quiz on the topic of {topic} at a {difficulty_level} difficulty level.
|
| 275 |
+
|
| 276 |
+
The quiz should:
|
| 277 |
+
- Have {num_questions} multiple-choice questions
|
| 278 |
+
- Cover the following subtopics: {subtopics}
|
| 279 |
+
- Include {include_explanation} explanations for the correct answers
|
| 280 |
+
- Be appropriate for {grade_level} students
|
| 281 |
+
|
| 282 |
+
For each question, provide:
|
| 283 |
+
1. The question text
|
| 284 |
+
2. Four possible answers (A, B, C, D)
|
| 285 |
+
3. The correct answer
|
| 286 |
+
{if include_explanation == "yes"}
|
| 287 |
+
4. An explanation of why the answer is correct
|
| 288 |
+
{endif}
|
| 289 |
+
|
| 290 |
+
Format the output as valid JSON.
|
| 291 |
+
""",
|
| 292 |
+
name="Quiz Generator",
|
| 293 |
+
tags=["education", "quiz"]
|
| 294 |
+
)
|
| 295 |
+
|
| 296 |
+
# Quiz rendering template using advanced templating
|
| 297 |
+
render_template = PromptTemplate("""
|
| 298 |
+
<h1>{quiz_title}</h1>
|
| 299 |
+
|
| 300 |
+
<form id="quiz-form">
|
| 301 |
+
{for question in questions}
|
| 302 |
+
<div class="question">
|
| 303 |
+
<p><strong>Question {question.number}:</strong> {question.text}</p>
|
| 304 |
+
<ul style="list-style-type: none;">
|
| 305 |
+
{for option in question.options}
|
| 306 |
+
<li>
|
| 307 |
+
<input type="radio" name="q{question.number}" id="q{question.number}_{option.letter}" value="{option.letter}">
|
| 308 |
+
<label for="q{question.number}_{option.letter}">{option.letter}. {option.text}</label>
|
| 309 |
+
</li>
|
| 310 |
+
{endfor}
|
| 311 |
+
</ul>
|
| 312 |
+
|
| 313 |
+
{if show_answers}
|
| 314 |
+
<div class="answer">
|
| 315 |
+
<p><strong>Correct Answer:</strong> {question.correct_answer}</p>
|
| 316 |
+
{if question.has_explanation}
|
| 317 |
+
<p><strong>Explanation:</strong> {question.explanation}</p>
|
| 318 |
+
{endif}
|
| 319 |
+
</div>
|
| 320 |
+
{endif}
|
| 321 |
+
</div>
|
| 322 |
+
{endfor}
|
| 323 |
+
|
| 324 |
+
{if !show_answers}
|
| 325 |
+
<button type="submit">Submit Quiz</button>
|
| 326 |
+
{endif}
|
| 327 |
+
</form>
|
| 328 |
+
""")
|
| 329 |
+
|
| 330 |
+
# LLM callback
|
| 331 |
+
async def education_llm_callback(prompt, vars):
|
| 332 |
+
# This would call your LLM API
|
| 333 |
+
# Simulated response for this example
|
| 334 |
+
await asyncio.sleep(1) # Simulate API call
|
| 335 |
+
if "quiz" in prompt:
|
| 336 |
+
return """
|
| 337 |
+
{
|
| 338 |
+
"questions": [
|
| 339 |
+
{
|
| 340 |
+
"text": "What is the capital of France?",
|
| 341 |
+
"options": [
|
| 342 |
+
{"letter": "A", "text": "London"},
|
| 343 |
+
{"letter": "B", "text": "Berlin"},
|
| 344 |
+
{"letter": "C", "text": "Paris"},
|
| 345 |
+
{"letter": "D", "text": "Madrid"}
|
| 346 |
+
],
|
| 347 |
+
"correct_answer": "C",
|
| 348 |
+
"explanation": "Paris is the capital and most populous city of France."
|
| 349 |
+
},
|
| 350 |
+
{
|
| 351 |
+
"text": "Who wrote 'Romeo and Juliet'?",
|
| 352 |
+
"options": [
|
| 353 |
+
{"letter": "A", "text": "Charles Dickens"},
|
| 354 |
+
{"letter": "B", "text": "William Shakespeare"},
|
| 355 |
+
{"letter": "C", "text": "Jane Austen"},
|
| 356 |
+
{"letter": "D", "text": "Mark Twain"}
|
| 357 |
+
],
|
| 358 |
+
"correct_answer": "B",
|
| 359 |
+
"explanation": "William Shakespeare wrote 'Romeo and Juliet' around 1594-1596."
|
| 360 |
+
}
|
| 361 |
+
]
|
| 362 |
+
}
|
| 363 |
+
"""
|
| 364 |
+
return "Default response"
|
| 365 |
+
|
| 366 |
+
# Quiz generation function
|
| 367 |
+
async def generate_quiz(topic, difficulty, num_questions, grade_level, include_explanations=True):
|
| 368 |
+
prompt = prompt_manager.get(quiz_prompt.id)
|
| 369 |
+
rendered_prompt = prompt.render(
|
| 370 |
+
topic=topic,
|
| 371 |
+
difficulty_level=difficulty,
|
| 372 |
+
num_questions=num_questions,
|
| 373 |
+
subtopics=", ".join(["key concepts", "historical context", "practical applications"]),
|
| 374 |
+
include_explanation="yes" if include_explanations else "no",
|
| 375 |
+
grade_level=grade_level
|
| 376 |
+
)
|
| 377 |
+
|
| 378 |
+
# Get quiz content from LLM
|
| 379 |
+
quiz_json = await education_llm_callback(rendered_prompt, {})
|
| 380 |
+
|
| 381 |
+
# Parse JSON
|
| 382 |
+
quiz_data = json.loads(quiz_json)
|
| 383 |
+
|
| 384 |
+
# Prepare data for HTML template
|
| 385 |
+
template_data = {
|
| 386 |
+
"quiz_title": f"{topic} Quiz ({difficulty} Level)",
|
| 387 |
+
"questions": [],
|
| 388 |
+
"show_answers": False
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
# Format questions
|
| 392 |
+
for i, q in enumerate(quiz_data["questions"]):
|
| 393 |
+
question = {
|
| 394 |
+
"number": i + 1,
|
| 395 |
+
"text": q["text"],
|
| 396 |
+
"options": q["options"],
|
| 397 |
+
"correct_answer": q["correct_answer"],
|
| 398 |
+
"has_explanation": "explanation" in q,
|
| 399 |
+
"explanation": q.get("explanation", "")
|
| 400 |
+
}
|
| 401 |
+
template_data["questions"].append(question)
|
| 402 |
+
|
| 403 |
+
# Render HTML
|
| 404 |
+
return render_template.render(**template_data)
|
| 405 |
+
```
|
| 406 |
+
|
| 407 |
+
## Automated Coding Assistant
|
| 408 |
+
|
| 409 |
+
### Setup
|
| 410 |
+
|
| 411 |
+
```python
|
| 412 |
+
from promptlab import PromptManager, PromptTesting
|
| 413 |
+
import asyncio
|
| 414 |
+
import subprocess
|
| 415 |
+
import tempfile
import re
|
| 416 |
+
|
| 417 |
+
# Initialize components
|
| 418 |
+
prompt_manager = PromptManager("coding_prompts")
|
| 419 |
+
testing = PromptTesting(prompt_manager)
|
| 420 |
+
|
| 421 |
+
# Create code generation prompts
|
| 422 |
+
function_prompt = prompt_manager.create(
|
| 423 |
+
content="""
|
| 424 |
+
Write a {language} function that solves the following problem:
|
| 425 |
+
|
| 426 |
+
{problem_description}
|
| 427 |
+
|
| 428 |
+
Function signature: {function_signature}
|
| 429 |
+
|
| 430 |
+
Requirements:
|
| 431 |
+
- The function should handle edge cases
|
| 432 |
+
- Include appropriate comments
|
| 433 |
+
- Follow {language} best practices
|
| 434 |
+
- Be optimized for {optimization_goal}
|
| 435 |
+
|
| 436 |
+
{if include_tests == "yes"}
|
| 437 |
+
Also include unit tests for the function.
|
| 438 |
+
{endif}
|
| 439 |
+
""",
|
| 440 |
+
name="Function Generator",
|
| 441 |
+
tags=["coding", "function"]
|
| 442 |
+
)
|
| 443 |
+
|
| 444 |
+
bug_fix_prompt = prompt_manager.create(
|
| 445 |
+
content="""
|
| 446 |
+
Debug the following {language} code which has an issue:
|
| 447 |
+
|
| 448 |
+
```{language}
|
| 449 |
+
{buggy_code}
|
| 450 |
+
```
|
| 451 |
+
|
| 452 |
+
Error message or problem description:
|
| 453 |
+
{error_description}
|
| 454 |
+
|
| 455 |
+
Please:
|
| 456 |
+
1. Identify the issue
|
| 457 |
+
2. Explain the root cause
|
| 458 |
+
3. Provide a fixed version of the code
|
| 459 |
+
4. Suggest how to prevent similar issues
|
| 460 |
+
""",
|
| 461 |
+
name="Bug Fix Assistant",
|
| 462 |
+
tags=["coding", "debugging"]
|
| 463 |
+
)
|
| 464 |
+
|
| 465 |
+
# LLM callback
|
| 466 |
+
async def coding_llm_callback(prompt, vars):
|
| 467 |
+
# This would call your LLM API
|
| 468 |
+
# Simplified example response
|
| 469 |
+
await asyncio.sleep(1)
|
| 470 |
+
|
| 471 |
+
if "function" in prompt:
|
| 472 |
+
return """
|
| 473 |
+
```python
|
| 474 |
+
def find_max_subarray_sum(arr):
|
| 475 |
+
"""
|
| 476 |
+
Finds the maximum sum of any contiguous subarray.
|
| 477 |
+
Uses Kadane's algorithm with O(n) time complexity.
|
| 478 |
+
|
| 479 |
+
Args:
|
| 480 |
+
arr: List of integers
|
| 481 |
+
Returns:
|
| 482 |
+
Maximum subarray sum
|
| 483 |
+
"""
|
| 484 |
+
if not arr:
|
| 485 |
+
return 0
|
| 486 |
+
|
| 487 |
+
current_max = global_max = arr[0]
|
| 488 |
+
|
| 489 |
+
for num in arr[1:]:
|
| 490 |
+
current_max = max(num, current_max + num)
|
| 491 |
+
global_max = max(global_max, current_max)
|
| 492 |
+
|
| 493 |
+
return global_max
|
| 494 |
+
|
| 495 |
+
# Unit tests
|
| 496 |
+
def test_find_max_subarray_sum():
|
| 497 |
+
assert find_max_subarray_sum([]) == 0
|
| 498 |
+
assert find_max_subarray_sum([-2, 1, -3, 4, -1, 2, 1, -5, 4]) == 6
|
| 499 |
+
assert find_max_subarray_sum([-1, -2, -3]) == -1
|
| 500 |
+
print("All tests passed!")
|
| 501 |
+
```
|
| 502 |
+
"""
|
| 503 |
+
elif "debug" in prompt:
|
| 504 |
+
return """
|
| 505 |
+
The issue is a classic off-by-one error in the loop boundary.
|
| 506 |
+
|
| 507 |
+
Root cause:
|
| 508 |
+
The loop is using `i <= len(arr)` which accesses an index that's out of bounds.
|
| 509 |
+
|
| 510 |
+
Fixed code:
|
| 511 |
+
```python
|
| 512 |
+
def process_array(arr):
|
| 513 |
+
result = []
|
| 514 |
+
for i in range(len(arr)): # Changed from i <= len(arr)
|
| 515 |
+
result.append(arr[i] * 2)
|
| 516 |
+
return result
|
| 517 |
+
```
|
| 518 |
+
|
| 519 |
+
Prevention:
|
| 520 |
+
- Remember that array indices are 0-based and go up to len(arr)-1
|
| 521 |
+
- Use range() or enumerate() when iterating through arrays by index
|
| 522 |
+
- Add bounds checking for critical operations
|
| 523 |
+
"""
|
| 524 |
+
|
| 525 |
+
return "Default response"
|
| 526 |
+
|
| 527 |
+
# Function to test generated code
|
| 528 |
+
def test_generated_code(code, language):
|
| 529 |
+
"""Test the generated code by running it in a safe environment."""
|
| 530 |
+
if language.lower() == "python":
|
| 531 |
+
with tempfile.NamedTemporaryFile(suffix=".py") as temp:
|
| 532 |
+
temp.write(code.encode())
|
| 533 |
+
temp.flush()
|
| 534 |
+
|
| 535 |
+
try:
|
| 536 |
+
result = subprocess.run(["python", temp.name],
|
| 537 |
+
capture_output=True,
|
| 538 |
+
text=True,
|
| 539 |
+
timeout=5)
|
| 540 |
+
if result.returncode == 0:
|
| 541 |
+
return {"success": True, "output": result.stdout}
|
| 542 |
+
else:
|
| 543 |
+
return {"success": False, "error": result.stderr}
|
| 544 |
+
except subprocess.TimeoutExpired:
|
| 545 |
+
return {"success": False, "error": "Code execution timed out"}
|
| 546 |
+
|
| 547 |
+
return {"success": False, "error": f"Testing not implemented for {language}"}
|
| 548 |
+
|
| 549 |
+
# Main coding assistant function
|
| 550 |
+
async def generate_function(problem, language="python", optimization_goal="readability", include_tests=True):
|
| 551 |
+
function_name = problem.lower().replace(" ", "_").replace("-", "_")
|
| 552 |
+
signature = f"def {function_name}(parameters):"
|
| 553 |
+
|
| 554 |
+
prompt = prompt_manager.get(function_prompt.id)
|
| 555 |
+
rendered_prompt = prompt.render(
|
| 556 |
+
language=language,
|
| 557 |
+
problem_description=problem,
|
| 558 |
+
function_signature=signature,
|
| 559 |
+
optimization_goal=optimization_goal,
|
| 560 |
+
include_tests="yes" if include_tests else "no"
|
| 561 |
+
)
|
| 562 |
+
|
| 563 |
+
# Get code from LLM
|
| 564 |
+
generated_code = await coding_llm_callback(rendered_prompt, {})
|
| 565 |
+
|
| 566 |
+
# Extract code from markdown if present
|
| 567 |
+
if "```" in generated_code:
|
| 568 |
+
code_blocks = re.findall(r"```(?:\w+)?\n(.+?)```", generated_code, re.DOTALL)
|
| 569 |
+
if code_blocks:
|
| 570 |
+
clean_code = code_blocks[0]
|
| 571 |
+
else:
|
| 572 |
+
clean_code = generated_code
|
| 573 |
+
else:
|
| 574 |
+
clean_code = generated_code
|
| 575 |
+
|
| 576 |
+
# Test the code
|
| 577 |
+
test_result = test_generated_code(clean_code, language)
|
| 578 |
+
|
| 579 |
+
return {
|
| 580 |
+
"code": clean_code,
|
| 581 |
+
"test_result": test_result,
|
| 582 |
+
"prompt_id": function_prompt.id
|
| 583 |
+
}
|
| 584 |
+
```
|
promptlab/__init__.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PromptLab - A comprehensive LLM Prompt Management System
|
| 3 |
+
|
| 4 |
+
PromptLab is a Python library that provides tools for managing, versioning,
|
| 5 |
+
testing, and evaluating prompts for Large Language Models.
|
| 6 |
+
|
| 7 |
+
Features:
|
| 8 |
+
- Prompt management with versioning
|
| 9 |
+
- A/B testing for prompt optimization
|
| 10 |
+
- Evaluation framework with customizable metrics
|
| 11 |
+
- Command-line interface for easy integration
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from .core.prompt_manager import PromptManager, Prompt
|
| 15 |
+
from .core.version_control import VersionControl, PromptVersion
|
| 16 |
+
from .core.testing import PromptTesting, TestCase, TestResult, ABTestResult
|
| 17 |
+
from .core.evaluation import Evaluator, EvaluationMetric, ExactMatchMetric, ContainsKeywordsMetric, LengthMetric
|
| 18 |
+
from .utils.metrics import create_default_metrics_set
|
| 19 |
+
from .utils.templating import PromptTemplate, template_registry
|
| 20 |
+
|
| 21 |
+
__version__ = "0.1.0"
|
| 22 |
+
__all__ = [
|
| 23 |
+
"PromptManager",
|
| 24 |
+
"Prompt",
|
| 25 |
+
"VersionControl",
|
| 26 |
+
"PromptVersion",
|
| 27 |
+
"PromptTesting",
|
| 28 |
+
"TestCase",
|
| 29 |
+
"TestResult",
|
| 30 |
+
"ABTestResult",
|
| 31 |
+
"Evaluator",
|
| 32 |
+
"EvaluationMetric",
|
| 33 |
+
"ExactMatchMetric",
|
| 34 |
+
"ContainsKeywordsMetric",
|
| 35 |
+
"LengthMetric",
|
| 36 |
+
"create_default_metrics_set",
|
| 37 |
+
"PromptTemplate",
|
| 38 |
+
"template_registry"
|
| 39 |
+
]
|
promptlab/cli/__init__.py
ADDED
|
File without changes
|
promptlab/cli/commands.py
ADDED
|
@@ -0,0 +1,697 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import sys
|
| 3 |
+
import os
|
| 4 |
+
import json
|
| 5 |
+
from typing import List, Optional, Dict, Any
|
| 6 |
+
import asyncio
|
| 7 |
+
|
| 8 |
+
from ..core.prompt_manager import PromptManager
|
| 9 |
+
from ..core.version_control import VersionControl
|
| 10 |
+
from ..core.testing import PromptTesting
|
| 11 |
+
from ..core.evaluation import Evaluator, ContainsKeywordsMetric, LengthMetric
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class CLI:
|
| 15 |
+
"""Command-line interface for PromptLab."""
|
| 16 |
+
def __init__(self):
    """Initialize backing services and build the argument parser."""
    # All components share one PromptManager instance so they operate
    # on the same prompt store.
    self.prompt_manager = PromptManager()
    self.version_control = VersionControl(self.prompt_manager)
    self.testing = PromptTesting(self.prompt_manager)
    self.evaluator = Evaluator(self.prompt_manager)

    self.parser = argparse.ArgumentParser(description="PromptLab - LLM Prompt Management System")
    self._setup_commands()
|
| 24 |
+
|
| 25 |
+
def _setup_commands(self) -> None:
    """Set up command-line arguments.

    Builds four top-level commands (prompt, version, test, eval), each with
    its own subcommands, on the parser created in ``__init__``.
    """
    subparsers = self.parser.add_subparsers(dest="command", help="Command")

    # Prompt commands
    prompt_parser = subparsers.add_parser("prompt", help="Prompt management")
    prompt_subparsers = prompt_parser.add_subparsers(dest="subcommand", help="Prompt subcommand")

    # Create prompt
    create_parser = prompt_subparsers.add_parser("create", help="Create a new prompt")
    create_parser.add_argument("name", help="Prompt name")
    create_parser.add_argument("--content", help="Prompt content")
    create_parser.add_argument("--file", help="File containing prompt content")
    create_parser.add_argument("--description", help="Prompt description")
    create_parser.add_argument("--tags", help="Comma-separated list of tags")

    # List prompts
    list_parser = prompt_subparsers.add_parser("list", help="List prompts")
    list_parser.add_argument("--tags", help="Filter by comma-separated list of tags")

    # Get prompt
    get_parser = prompt_subparsers.add_parser("get", help="Get a prompt")
    get_parser.add_argument("id", help="Prompt ID")

    # Update prompt
    update_parser = prompt_subparsers.add_parser("update", help="Update a prompt")
    update_parser.add_argument("id", help="Prompt ID")
    update_parser.add_argument("--content", help="New prompt content")
    update_parser.add_argument("--file", help="File containing new prompt content")
    update_parser.add_argument("--name", help="New prompt name")
    update_parser.add_argument("--description", help="New prompt description")
    update_parser.add_argument("--tags", help="New comma-separated list of tags")

    # Delete prompt
    delete_parser = prompt_subparsers.add_parser("delete", help="Delete a prompt")
    delete_parser.add_argument("id", help="Prompt ID")

    # Version control commands
    version_parser = subparsers.add_parser("version", help="Version control")
    version_subparsers = version_parser.add_subparsers(dest="subcommand", help="Version subcommand")

    # Commit
    commit_parser = version_subparsers.add_parser("commit", help="Create a new version")
    commit_parser.add_argument("id", help="Prompt ID")
    commit_parser.add_argument("--message", help="Commit message")

    # List versions
    list_versions_parser = version_subparsers.add_parser("list", help="List versions")
    list_versions_parser.add_argument("id", help="Prompt ID")

    # Checkout
    checkout_parser = version_subparsers.add_parser("checkout", help="Checkout a version")
    checkout_parser.add_argument("id", help="Prompt ID")
    checkout_parser.add_argument("version", type=int, help="Version number")

    # Diff
    diff_parser = version_subparsers.add_parser("diff", help="Compare versions")
    diff_parser.add_argument("id", help="Prompt ID")
    diff_parser.add_argument("version1", type=int, help="First version")
    diff_parser.add_argument("version2", type=int, help="Second version")

    # Testing commands
    test_parser = subparsers.add_parser("test", help="Testing")
    test_subparsers = test_parser.add_subparsers(dest="subcommand", help="Test subcommand")

    # Create test case
    create_test_parser = test_subparsers.add_parser("create", help="Create a test case")
    create_test_parser.add_argument("prompt_id", help="Prompt ID")
    create_test_parser.add_argument("--input", help="JSON string of input variables")
    create_test_parser.add_argument("--input-file", help="File containing JSON input variables")
    create_test_parser.add_argument("--expected", help="Expected output")
    create_test_parser.add_argument("--expected-file", help="File containing expected output")
    create_test_parser.add_argument("--name", help="Test case name")
    create_test_parser.add_argument("--description", help="Test case description")

    # List test cases
    list_tests_parser = test_subparsers.add_parser("list", help="List test cases")
    list_tests_parser.add_argument("--prompt-id", help="Filter by prompt ID")

    # Run test case
    run_test_parser = test_subparsers.add_parser("run", help="Run a test case")
    run_test_parser.add_argument("test_id", help="Test case ID")
    run_test_parser.add_argument("--llm", help="LLM callback function to use")

    # Run all test cases for a prompt
    run_all_parser = test_subparsers.add_parser("run-all", help="Run all test cases for a prompt")
    run_all_parser.add_argument("prompt_id", help="Prompt ID")
    run_all_parser.add_argument("--llm", help="LLM callback function to use")

    # A/B test
    ab_test_parser = test_subparsers.add_parser("ab", help="Run an A/B test")
    ab_test_parser.add_argument("prompt_a", help="Prompt A ID")
    ab_test_parser.add_argument("prompt_b", help="Prompt B ID")
    ab_test_parser.add_argument("--llm", help="LLM callback function to use")
    ab_test_parser.add_argument("--test-cases", help="Comma-separated list of test case IDs")

    # Evaluation commands
    eval_parser = subparsers.add_parser("eval", help="Evaluation")
    eval_subparsers = eval_parser.add_subparsers(dest="subcommand", help="Evaluation subcommand")

    # List metrics
    list_metrics_parser = eval_subparsers.add_parser("metrics", help="List evaluation metrics")

    # Register metric
    register_metric_parser = eval_subparsers.add_parser("register", help="Register a custom metric")
    register_metric_parser.add_argument("name", help="Metric name")
    register_metric_parser.add_argument("--keywords", help="Keywords for ContainsKeywordsMetric")
    register_metric_parser.add_argument("--min-length", type=int, help="Minimum length for LengthMetric")
    register_metric_parser.add_argument("--max-length", type=int, help="Maximum length for LengthMetric")
    register_metric_parser.add_argument("--target-length", type=int, help="Target length for LengthMetric")

    # Evaluate prompt
    evaluate_parser = eval_subparsers.add_parser("run", help="Evaluate a prompt")
    evaluate_parser.add_argument("prompt_id", help="Prompt ID")
    evaluate_parser.add_argument("--inputs", help="JSON string of input variables list")
    evaluate_parser.add_argument("--inputs-file", help="File containing JSON input variables list")
    evaluate_parser.add_argument("--expected", help="JSON string of expected outputs list")
    evaluate_parser.add_argument("--expected-file", help="File containing JSON expected outputs list")
    evaluate_parser.add_argument("--metrics", help="Comma-separated list of metrics to use")
    evaluate_parser.add_argument("--llm", help="LLM callback function to use")
|
| 146 |
+
|
| 147 |
+
def run(self, args: Optional[List[str]] = None) -> None:
    """Parse command-line arguments and dispatch to the matching handler.

    Args:
        args: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parsed = self.parser.parse_args(args)

    # No command given: show usage instead of doing nothing silently.
    if not parsed.command:
        self.parser.print_help()
        return

    # Table-driven dispatch; unknown commands are rejected by argparse
    # before we ever reach this point.
    handlers = {
        "prompt": self._handle_prompt_command,
        "version": self._handle_version_command,
        "test": self._handle_test_command,
        "eval": self._handle_eval_command,
    }
    handler = handlers.get(parsed.command)
    if handler is not None:
        handler(parsed)
|
| 164 |
+
|
| 165 |
+
def _handle_prompt_command(self, args) -> None:
    """Handle ``prompt`` subcommands: create, list, get, update, delete.

    Args:
        args: Parsed argparse namespace for the ``prompt`` command.
    """
    if not args.subcommand:
        return

    if args.subcommand == "create":
        # Get content from file or argument; one of the two is required.
        content = ""
        if args.file:
            with open(args.file, "r") as f:
                content = f.read()
        elif args.content:
            content = args.content
        else:
            print("Error: Must provide either --content or --file")
            return

        # Parse tags (comma-separated string -> list)
        tags = []
        if args.tags:
            tags = [tag.strip() for tag in args.tags.split(",")]

        # Create prompt
        prompt = self.prompt_manager.create(
            content=content,
            name=args.name,
            description=args.description,
            tags=tags
        )

        print(f"Created prompt with ID: {prompt.id}")

    elif args.subcommand == "list":
        # Parse tags; None means "no tag filter".
        tags = None
        if args.tags:
            tags = [tag.strip() for tag in args.tags.split(",")]

        # List prompts
        prompts = self.prompt_manager.list(tags)

        if not prompts:
            print("No prompts found")
            return

        # Print prompts
        print(f"Found {len(prompts)} prompts:")
        for prompt in prompts:
            tags_str = ", ".join(prompt.tags) if prompt.tags else ""
            print(f"ID: {prompt.id} | Name: {prompt.name} | Tags: {tags_str}")

    elif args.subcommand == "get":
        # Get prompt
        prompt = self.prompt_manager.get(args.id)

        if not prompt:
            print(f"Prompt with ID {args.id} not found")
            return

        # Print full prompt record including content.
        print(f"ID: {prompt.id}")
        print(f"Name: {prompt.name}")
        print(f"Description: {prompt.description}")
        print(f"Tags: {', '.join(prompt.tags)}")
        print(f"Version: {prompt.version}")
        print(f"Created: {prompt.created_at}")
        print(f"Updated: {prompt.updated_at}")
        print("\nContent:")
        print(prompt.content)

    elif args.subcommand == "update":
        # Check existence first so we can give a clear error message.
        prompt = self.prompt_manager.get(args.id)

        if not prompt:
            print(f"Prompt with ID {args.id} not found")
            return

        # Collect only the fields the user actually supplied.
        kwargs = {}

        if args.name:
            kwargs["name"] = args.name

        if args.description:
            kwargs["description"] = args.description

        if args.tags:
            kwargs["tags"] = [tag.strip() for tag in args.tags.split(",")]

        # Get content from file or argument (file wins if both given).
        if args.file:
            with open(args.file, "r") as f:
                kwargs["content"] = f.read()
        elif args.content:
            kwargs["content"] = args.content

        # Update prompt
        prompt = self.prompt_manager.update(args.id, **kwargs)

        print(f"Updated prompt with ID: {prompt.id}")

    elif args.subcommand == "delete":
        # Delete prompt; manager returns a success flag.
        success = self.prompt_manager.delete(args.id)

        if success:
            print(f"Deleted prompt with ID: {args.id}")
        else:
            print(f"Prompt with ID {args.id} not found")
|
| 275 |
+
|
| 276 |
+
def _handle_version_command(self, args) -> None:
    """Handle ``version`` subcommands: commit, list, checkout, diff.

    Args:
        args: Parsed argparse namespace for the ``version`` command.
    """
    if not args.subcommand:
        return

    if args.subcommand == "commit":
        # Create a new version snapshot of the prompt.
        version = self.version_control.commit(
            prompt_id=args.id,
            commit_message=args.message
        )

        if not version:
            print(f"Prompt with ID {args.id} not found")
            return

        print(f"Committed version {version.version} for prompt {args.id}")

    elif args.subcommand == "list":
        # List versions
        versions = self.version_control.list_versions(args.id)

        if not versions:
            print(f"No versions found for prompt {args.id}")
            return

        # Print versions
        print(f"Found {len(versions)} versions for prompt {args.id}:")
        for version in versions:
            message = version.commit_message or "No commit message"
            print(f"Version: {version.version} | Created: {version.created_at} | Message: {message}")

    elif args.subcommand == "checkout":
        # Restore the prompt to the requested version.
        prompt = self.version_control.checkout(
            prompt_id=args.id,
            version=args.version
        )

        if not prompt:
            print(f"Prompt with ID {args.id} or version {args.version} not found")
            return

        print(f"Checked out version {args.version} for prompt {args.id}")

    elif args.subcommand == "diff":
        # Compare two versions of the same prompt.
        diff = self.version_control.diff(
            prompt_id=args.id,
            version1=args.version1,
            version2=args.version2
        )

        if not diff:
            print(f"Could not compare versions {args.version1} and {args.version2} for prompt {args.id}")
            return

        # Print diff lines; diff is a dict with a "diff" entry holding
        # an iterable of printable lines.
        print(f"Diff between version {args.version1} and {args.version2} for prompt {args.id}:")
        for line in diff["diff"]:
            print(line)
|
| 337 |
+
|
| 338 |
+
def _handle_test_command(self, args) -> None:
    """Handle ``test`` subcommands: create, list, run, run-all, ab.

    Args:
        args: Parsed argparse namespace for the ``test`` command.
    """
    if not args.subcommand:
        return

    if args.subcommand == "create":
        # Parse input variables from a JSON string or file; required.
        input_vars = {}
        if args.input:
            input_vars = json.loads(args.input)
        elif args.input_file:
            with open(args.input_file, "r") as f:
                input_vars = json.loads(f.read())
        else:
            print("Error: Must provide either --input or --input-file")
            return

        # Parse expected output (optional).
        expected = None
        if args.expected:
            expected = args.expected
        elif args.expected_file:
            with open(args.expected_file, "r") as f:
                expected = f.read()

        # Create test case
        test_case = self.testing.create_test_case(
            prompt_id=args.prompt_id,
            input_vars=input_vars,
            expected_output=expected,
            name=args.name,
            description=args.description
        )

        print(f"Created test case with ID: {test_case.id}")

    elif args.subcommand == "list":
        # List test cases, optionally filtered by prompt ID.
        test_cases = self.testing.list_test_cases(args.prompt_id)

        if not test_cases:
            print("No test cases found")
            return

        # Print test cases
        print(f"Found {len(test_cases)} test cases:")
        for tc in test_cases:
            print(f"ID: {tc.id} | Name: {tc.name} | Prompt ID: {tc.prompt_id}")

    elif args.subcommand == "run":
        # Resolve the LLM callback named on the command line.
        llm_callback = self._get_llm_callback(args.llm)

        # Async runner driven from the sync CLI via asyncio.run.
        asyncio.run(self._run_test_case(args.test_id, llm_callback))

    elif args.subcommand == "run-all":
        # Resolve the LLM callback named on the command line.
        llm_callback = self._get_llm_callback(args.llm)

        # Run all test cases
        asyncio.run(self._run_all_test_cases(args.prompt_id, llm_callback))

    elif args.subcommand == "ab":
        # Resolve the LLM callback named on the command line.
        llm_callback = self._get_llm_callback(args.llm)

        # Parse test case IDs; None means "all applicable test cases".
        test_cases = None
        if args.test_cases:
            test_cases = [tc.strip() for tc in args.test_cases.split(",")]

        # Run A/B test
        asyncio.run(self._run_ab_test(args.prompt_a, args.prompt_b, llm_callback, test_cases))
|
| 412 |
+
|
| 413 |
+
async def _run_test_case(self, test_case_id, llm_callback) -> None:
    """Run a single test case and print its result.

    Args:
        test_case_id: ID of the test case to run.
        llm_callback: Async callable used to obtain the LLM output.
    """
    try:
        # Default metric set applied to every run.
        metrics_callbacks = [
            self._create_metrics_callback("exact_match"),
            self._create_metrics_callback("similarity"),
            self._create_metrics_callback("length")
        ]

        result = await self.testing.run_test_case(
            test_case_id=test_case_id,
            llm_callback=llm_callback,
            metrics_callbacks=metrics_callbacks
        )

        print(f"Test result ID: {result.id}")
        print(f"Test case ID: {result.test_case_id}")
        print(f"Prompt ID: {result.prompt_id}")
        print(f"Prompt version: {result.prompt_version}")
        print(f"Passed: {result.passed}")

        if result.metrics:
            print("\nMetrics:")
            for name, value in result.metrics.items():
                print(f"{name}: {value}")

        print("\nOutput:")
        print(result.output)
    except Exception as e:
        # CLI boundary: report the failure instead of crashing the tool.
        print(f"Error running test case: {e}")
|
| 443 |
+
|
| 444 |
+
async def _run_all_test_cases(self, prompt_id, llm_callback) -> None:
|
| 445 |
+
"""Run all test cases for a prompt."""
|
| 446 |
+
try:
|
| 447 |
+
metrics_callbacks = [
|
| 448 |
+
self._create_metrics_callback("exact_match"),
|
| 449 |
+
self._create_metrics_callback("similarity"),
|
| 450 |
+
self._create_metrics_callback("length")
|
| 451 |
+
]
|
| 452 |
+
|
| 453 |
+
results = await self.testing.run_test_cases(
|
| 454 |
+
prompt_id=prompt_id,
|
| 455 |
+
llm_callback=llm_callback,
|
| 456 |
+
metrics_callbacks=metrics_callbacks
|
| 457 |
+
)
|
| 458 |
+
|
| 459 |
+
print(f"Ran {len(results)} test cases for prompt {prompt_id}")
|
| 460 |
+
|
| 461 |
+
# Calculate aggregate metrics
|
| 462 |
+
if results:
|
| 463 |
+
passed = sum(1 for r in results if r.passed)
|
| 464 |
+
print(f"Passed: {passed}/{len(results)} ({passed/len(results)*100:.2f}%)")
|
| 465 |
+
|
| 466 |
+
# Aggregate metrics
|
| 467 |
+
metrics = {}
|
| 468 |
+
for r in results:
|
| 469 |
+
for name, value in r.metrics.items():
|
| 470 |
+
if name not in metrics:
|
| 471 |
+
metrics[name] = []
|
| 472 |
+
metrics[name].append(value)
|
| 473 |
+
|
| 474 |
+
print("\nAggregate metrics:")
|
| 475 |
+
for name, values in metrics.items():
|
| 476 |
+
avg = sum(values) / len(values)
|
| 477 |
+
print(f"{name}: {avg:.4f}")
|
| 478 |
+
except Exception as e:
|
| 479 |
+
print(f"Error running test cases: {e}")
|
| 480 |
+
|
| 481 |
+
async def _run_ab_test(self, prompt_a_id, prompt_b_id, llm_callback, test_cases) -> None:
|
| 482 |
+
"""Run an A/B test."""
|
| 483 |
+
try:
|
| 484 |
+
metrics_callbacks = [
|
| 485 |
+
self._create_metrics_callback("exact_match"),
|
| 486 |
+
self._create_metrics_callback("similarity"),
|
| 487 |
+
self._create_metrics_callback("length")
|
| 488 |
+
]
|
| 489 |
+
|
| 490 |
+
result = await self.testing.run_ab_test(
|
| 491 |
+
prompt_a_id=prompt_a_id,
|
| 492 |
+
prompt_b_id=prompt_b_id,
|
| 493 |
+
llm_callback=llm_callback,
|
| 494 |
+
metrics_callbacks=metrics_callbacks,
|
| 495 |
+
test_cases=test_cases
|
| 496 |
+
)
|
| 497 |
+
|
| 498 |
+
print(f"A/B test result ID: {result.id}")
|
| 499 |
+
print(f"Prompt A ID: {result.prompt_a_id}")
|
| 500 |
+
print(f"Prompt B ID: {result.prompt_b_id}")
|
| 501 |
+
print(f"Winner: {result.winner or 'Tie'}")
|
| 502 |
+
|
| 503 |
+
print("\nPrompt A metrics:")
|
| 504 |
+
for name, value in result.metrics_a.items():
|
| 505 |
+
print(f"{name}: {value:.4f}")
|
| 506 |
+
|
| 507 |
+
print("\nPrompt B metrics:")
|
| 508 |
+
for name, value in result.metrics_b.items():
|
| 509 |
+
print(f"{name}: {value:.4f}")
|
| 510 |
+
except Exception as e:
|
| 511 |
+
print(f"Error running A/B test: {e}")
|
| 512 |
+
|
| 513 |
+
def _handle_eval_command(self, args) -> None:
|
| 514 |
+
"""Handle evaluation commands."""
|
| 515 |
+
if not args.subcommand:
|
| 516 |
+
return
|
| 517 |
+
|
| 518 |
+
if args.subcommand == "metrics":
|
| 519 |
+
# List metrics
|
| 520 |
+
metrics = self.evaluator.list_metrics()
|
| 521 |
+
|
| 522 |
+
if not metrics:
|
| 523 |
+
print("No metrics registered")
|
| 524 |
+
return
|
| 525 |
+
|
| 526 |
+
# Print metrics
|
| 527 |
+
print(f"Found {len(metrics)} metrics:")
|
| 528 |
+
for metric in metrics:
|
| 529 |
+
print(f"Name: {metric.name} | Description: {metric.description}")
|
| 530 |
+
|
| 531 |
+
elif args.subcommand == "register":
|
| 532 |
+
# Register custom metric
|
| 533 |
+
if args.keywords:
|
| 534 |
+
# Register ContainsKeywordsMetric
|
| 535 |
+
keywords = [k.strip() for k in args.keywords.split(",")]
|
| 536 |
+
metric = ContainsKeywordsMetric(keywords)
|
| 537 |
+
self.evaluator.register_metric(metric)
|
| 538 |
+
print(f"Registered ContainsKeywordsMetric with name: {metric.name}")
|
| 539 |
+
elif args.min_length is not None or args.max_length is not None or args.target_length is not None:
|
| 540 |
+
# Register LengthMetric
|
| 541 |
+
metric = LengthMetric(
|
| 542 |
+
min_length=args.min_length,
|
| 543 |
+
max_length=args.max_length,
|
| 544 |
+
target_length=args.target_length
|
| 545 |
+
)
|
| 546 |
+
self.evaluator.register_metric(metric)
|
| 547 |
+
print(f"Registered LengthMetric with name: {metric.name}")
|
| 548 |
+
else:
|
| 549 |
+
print("Error: Must provide either --keywords, --min-length, --max-length, or --target-length")
|
| 550 |
+
|
| 551 |
+
elif args.subcommand == "run":
|
| 552 |
+
# Parse inputs
|
| 553 |
+
inputs = []
|
| 554 |
+
if args.inputs:
|
| 555 |
+
inputs = json.loads(args.inputs)
|
| 556 |
+
elif args.inputs_file:
|
| 557 |
+
with open(args.inputs_file, "r") as f:
|
| 558 |
+
inputs = json.loads(f.read())
|
| 559 |
+
else:
|
| 560 |
+
print("Error: Must provide either --inputs or --inputs-file")
|
| 561 |
+
return
|
| 562 |
+
|
| 563 |
+
# Parse expected outputs
|
| 564 |
+
expected_outputs = None
|
| 565 |
+
if args.expected:
|
| 566 |
+
expected_outputs = json.loads(args.expected)
|
| 567 |
+
elif args.expected_file:
|
| 568 |
+
with open(args.expected_file, "r") as f:
|
| 569 |
+
expected_outputs = json.loads(f.read())
|
| 570 |
+
|
| 571 |
+
# Parse metrics
|
| 572 |
+
metric_names = None
|
| 573 |
+
if args.metrics:
|
| 574 |
+
metric_names = [m.strip() for m in args.metrics.split(",")]
|
| 575 |
+
|
| 576 |
+
# Get LLM callback
|
| 577 |
+
llm_callback = self._get_llm_callback(args.llm)
|
| 578 |
+
|
| 579 |
+
# Run evaluation
|
| 580 |
+
asyncio.run(self._run_evaluation(
|
| 581 |
+
args.prompt_id,
|
| 582 |
+
inputs,
|
| 583 |
+
expected_outputs,
|
| 584 |
+
metric_names,
|
| 585 |
+
llm_callback
|
| 586 |
+
))
|
| 587 |
+
|
| 588 |
+
async def _run_evaluation(self, prompt_id, inputs, expected_outputs, metric_names, llm_callback) -> None:
|
| 589 |
+
"""Run an evaluation."""
|
| 590 |
+
try:
|
| 591 |
+
result = await self.evaluator.evaluate_prompt(
|
| 592 |
+
prompt_id=prompt_id,
|
| 593 |
+
inputs=inputs,
|
| 594 |
+
llm_callback=llm_callback,
|
| 595 |
+
expected_outputs=expected_outputs,
|
| 596 |
+
metric_names=metric_names
|
| 597 |
+
)
|
| 598 |
+
|
| 599 |
+
print(f"Evaluated prompt {prompt_id} with {result['num_samples']} samples")
|
| 600 |
+
|
| 601 |
+
# Print aggregated metrics
|
| 602 |
+
print("\nAggregated metrics:")
|
| 603 |
+
for name, value in result["aggregated_metrics"].items():
|
| 604 |
+
print(f"{name}: {value:.4f}")
|
| 605 |
+
|
| 606 |
+
# Print individual results
|
| 607 |
+
print("\nIndividual results:")
|
| 608 |
+
for i, r in enumerate(result["individual_results"]):
|
| 609 |
+
print(f"\nSample {i+1}:")
|
| 610 |
+
print(f"Input: {json.dumps(r['input'])}")
|
| 611 |
+
print(f"Output: {r['output']}")
|
| 612 |
+
if r["expected"]:
|
| 613 |
+
print(f"Expected: {r['expected']}")
|
| 614 |
+
|
| 615 |
+
print("Metrics:")
|
| 616 |
+
for name, value in r["metrics"].items():
|
| 617 |
+
print(f"{name}: {value:.4f}")
|
| 618 |
+
except Exception as e:
|
| 619 |
+
print(f"Error running evaluation: {e}")
|
| 620 |
+
|
| 621 |
+
def _get_llm_callback(self, llm_name: Optional[str]) -> callable:
|
| 622 |
+
"""Get an LLM callback function."""
|
| 623 |
+
# Default to a simple echo function for testing
|
| 624 |
+
if not llm_name or llm_name == "echo":
|
| 625 |
+
async def echo_callback(prompt, vars):
|
| 626 |
+
return f"Echo: {prompt}"
|
| 627 |
+
return echo_callback
|
| 628 |
+
|
| 629 |
+
# Add more LLM callbacks as needed
|
| 630 |
+
if llm_name == "openai":
|
| 631 |
+
# Example implementation using OpenAI
|
| 632 |
+
try:
|
| 633 |
+
import openai
|
| 634 |
+
|
| 635 |
+
async def openai_callback(prompt, vars):
|
| 636 |
+
response = await openai.Completion.acreate(
|
| 637 |
+
model="text-davinci-003",
|
| 638 |
+
prompt=prompt,
|
| 639 |
+
max_tokens=1000
|
| 640 |
+
)
|
| 641 |
+
return response.choices[0].text.strip()
|
| 642 |
+
|
| 643 |
+
return openai_callback
|
| 644 |
+
except ImportError:
|
| 645 |
+
print("Error: OpenAI package not installed. Run `pip install openai` to use this LLM.")
|
| 646 |
+
sys.exit(1)
|
| 647 |
+
|
| 648 |
+
# Add more LLM implementations as needed
|
| 649 |
+
|
| 650 |
+
print(f"Error: Unknown LLM callback: {llm_name}")
|
| 651 |
+
sys.exit(1)
|
| 652 |
+
|
| 653 |
+
def _create_metrics_callback(self, metric_type: str) -> callable:
|
| 654 |
+
"""Create a metrics callback function."""
|
| 655 |
+
# Simple metrics
|
| 656 |
+
if metric_type == "exact_match":
|
| 657 |
+
def exact_match_callback(output, expected):
|
| 658 |
+
if not expected:
|
| 659 |
+
return {"exact_match": 0.0}
|
| 660 |
+
return {"exact_match": 1.0 if output.strip() == expected.strip() else 0.0}
|
| 661 |
+
return exact_match_callback
|
| 662 |
+
|
| 663 |
+
elif metric_type == "similarity":
|
| 664 |
+
from difflib import SequenceMatcher
|
| 665 |
+
|
| 666 |
+
def similarity_callback(output, expected):
|
| 667 |
+
if not expected:
|
| 668 |
+
return {"similarity": 0.0}
|
| 669 |
+
return {"similarity": SequenceMatcher(None, output, expected).ratio()}
|
| 670 |
+
return similarity_callback
|
| 671 |
+
|
| 672 |
+
elif metric_type == "length":
|
| 673 |
+
def length_callback(output, expected):
|
| 674 |
+
out_len = len(output)
|
| 675 |
+
if not expected:
|
| 676 |
+
return {"length": 1.0 if out_len > 0 else 0.0}
|
| 677 |
+
|
| 678 |
+
exp_len = len(expected)
|
| 679 |
+
if exp_len == 0:
|
| 680 |
+
return {"length": 1.0 if out_len == 0 else 0.0}
|
| 681 |
+
|
| 682 |
+
# Return score inversely proportional to the difference
|
| 683 |
+
ratio = min(out_len / exp_len, exp_len / out_len)
|
| 684 |
+
return {"length": ratio}
|
| 685 |
+
return length_callback
|
| 686 |
+
|
| 687 |
+
# Default no-op metric
|
| 688 |
+
return lambda output, expected: {}
|
| 689 |
+
|
| 690 |
+
|
| 691 |
+
def main():
    """Console entry point: construct the CLI and dispatch arguments."""
    cli = CLI()
    cli.run()


if __name__ == "__main__":
    main()
|
promptlab/core/__init__.py
ADDED
|
File without changes
|
promptlab/core/evaluation.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import datetime
|
| 4 |
+
from typing import Dict, List, Optional, Any, Callable, Union, Awaitable
|
| 5 |
+
import asyncio
|
| 6 |
+
from .prompt_manager import PromptManager, Prompt
|
| 7 |
+
|
| 8 |
+
class EvaluationMetric:
    """Abstract base for evaluation metrics.

    Subclasses must override :meth:`compute` and return a score in [0, 1].
    """
    def __init__(self, name: str, description: Optional[str] = None):
        # Identifier used for registration and reporting.
        self.name = name
        self.description = description or ""

    def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
        """Compute the metric. Must be implemented by subclasses."""
        raise NotImplementedError("Subclasses must implement compute method")

class ExactMatchMetric(EvaluationMetric):
    """Evaluates exact match between generated and expected output."""
    def __init__(self):
        super().__init__("exact_match", "Exact match between generated and expected output")

    def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
        """Return 1.0 on an exact (whitespace-trimmed) match, 0.0 otherwise."""
        if not expected_output:
            # No (or empty) reference output: nothing to match against.
            return 0.0
        return 1.0 if generated_output.strip() == expected_output.strip() else 0.0

class ContainsKeywordsMetric(EvaluationMetric):
    """Evaluates if the generated output contains specified keywords."""
    def __init__(self, keywords: List[str], case_sensitive: bool = False):
        super().__init__(
            "contains_keywords",
            f"Check if output contains keywords: {', '.join(keywords)}"
        )
        self.keywords = keywords
        self.case_sensitive = case_sensitive

    def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
        """Return the fraction of configured keywords found in the output."""
        if not self.keywords:
            return 0.0

        if not self.case_sensitive:
            generated_output = generated_output.lower()
            keywords = [k.lower() for k in self.keywords]
        else:
            keywords = self.keywords

        matches = sum(1 for k in keywords if k in generated_output)
        return matches / len(keywords)

class LengthMetric(EvaluationMetric):
    """Evaluates if the generated output length is within the desired range."""
    def __init__(self, min_length: Optional[int] = None, max_length: Optional[int] = None, target_length: Optional[int] = None):
        description = "Evaluate output length"
        if target_length is not None:
            description = f"Evaluate if output length is close to {target_length} characters"
        elif min_length is not None and max_length is not None:
            description = f"Evaluate if output length is between {min_length} and {max_length} characters"
        elif min_length is not None:
            description = f"Evaluate if output length is at least {min_length} characters"
        elif max_length is not None:
            description = f"Evaluate if output length is at most {max_length} characters"

        super().__init__("length", description)
        self.min_length = min_length
        self.max_length = max_length
        self.target_length = target_length

    def compute(self, generated_output: str, expected_output: Optional[str] = None, **kwargs) -> float:
        """Return a score in [0, 1] based on the configured length conditions."""
        length = len(generated_output)

        if self.target_length is not None:
            # Fix: guard target_length == 0 (previously ZeroDivisionError).
            if self.target_length == 0:
                return 1.0 if length == 0 else 0.0
            # Score decays linearly with distance from the target length.
            distance = abs(length - self.target_length)
            return max(0.0, 1 - (distance / self.target_length))

        # Check whether the length satisfies each configured bound.
        within_min = self.min_length is None or length >= self.min_length
        within_max = self.max_length is None or length <= self.max_length

        if within_min and within_max:
            return 1.0
        # Fix: compare bounds against None -- a configured bound of 0 is
        # falsy but valid, so truthiness checks mis-routed those cases.
        if within_min and self.max_length is not None:
            if self.max_length == 0:
                # Any non-empty output infinitely overshoots a zero cap.
                return 0.0
            # Over max length: proportional penalty for the overshoot.
            return max(0.0, 1 - ((length - self.max_length) / self.max_length))
        if within_max and self.min_length is not None:
            # Under min length: proportional credit for partial length.
            return max(0.0, length / self.min_length)
        return 0.0
| 95 |
+
class Evaluator:
    """Registers evaluation metrics and runs/persists prompt evaluations."""
    def __init__(self, prompt_manager: PromptManager):
        self.prompt_manager = prompt_manager
        self.metrics: Dict[str, EvaluationMetric] = {}
        # Evaluation results are persisted alongside the prompt store.
        self.storage_path = os.path.join(prompt_manager.storage_path, "evaluations")
        os.makedirs(self.storage_path, exist_ok=True)

        # Built-in metrics available out of the box.
        for metric in (
            ExactMatchMetric(),
            ContainsKeywordsMetric(["important", "critical", "necessary"]),
            LengthMetric(min_length=50, max_length=500),
        ):
            self.register_metric(metric)

    def register_metric(self, metric: EvaluationMetric) -> None:
        """Register a metric (replacing any previously registered same name)."""
        self.metrics[metric.name] = metric

    def get_metric(self, name: str) -> Optional[EvaluationMetric]:
        """Return a registered metric by name, or None."""
        return self.metrics.get(name)

    def list_metrics(self) -> List[EvaluationMetric]:
        """Return all registered metrics."""
        return list(self.metrics.values())

    async def evaluate_prompt(
        self,
        prompt_id: str,
        inputs: List[Dict[str, Any]],
        llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
        expected_outputs: Optional[List[Optional[str]]] = None,
        metric_names: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """Evaluate a prompt over *inputs*, scoring each output with metrics.

        Raises:
            ValueError: for an unknown prompt ID, no valid metrics, or a
                length mismatch between inputs and expected outputs.

        The evaluation result is also persisted as JSON under storage_path.
        """
        prompt = self.prompt_manager.get(prompt_id)
        if not prompt:
            raise ValueError(f"Prompt with ID {prompt_id} not found")

        # Default to every registered metric when none are named.
        if not metric_names:
            metrics_to_use = list(self.metrics.values())
        else:
            metrics_to_use = [self.get_metric(name) for name in metric_names if self.get_metric(name)]

        if not metrics_to_use:
            raise ValueError("No valid metrics specified")

        # Align expected outputs with the inputs list.
        if expected_outputs is None:
            expected_outputs = [None] * len(inputs)
        elif len(expected_outputs) != len(inputs):
            raise ValueError("Expected outputs must be the same length as inputs")

        results = []
        for input_vars, expected in zip(inputs, expected_outputs):
            rendered_prompt = prompt.render(**input_vars)

            # Support both sync and async LLM callbacks.
            if asyncio.iscoroutinefunction(llm_callback):
                output = await llm_callback(rendered_prompt, input_vars)
            else:
                output = llm_callback(rendered_prompt, input_vars)

            metrics_results = {
                metric.name: metric.compute(output, expected, **input_vars)
                for metric in metrics_to_use
            }

            results.append({
                "input": input_vars,
                "output": output,
                "expected": expected,
                "metrics": metrics_results,
            })

        # Average each metric across all samples.
        aggregated_metrics = {}
        for metric in metrics_to_use:
            values = [r["metrics"][metric.name] for r in results]
            aggregated_metrics[metric.name] = sum(values) / len(values) if values else 0

        evaluation_result = {
            "prompt_id": prompt_id,
            "prompt_version": prompt.version,
            "num_samples": len(inputs),
            "aggregated_metrics": aggregated_metrics,
            "individual_results": results,
        }

        # Persist with a filesystem-safe timestamp in the filename.
        timestamp = datetime.datetime.now().isoformat().replace(":", "-").replace(".", "-")
        file_path = os.path.join(self.storage_path, f"eval_{prompt_id}_{timestamp}.json")
        with open(file_path, "w") as f:
            json.dump(evaluation_result, f, indent=2)

        return evaluation_result
|
promptlab/core/prompt_manager.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import hashlib
|
| 4 |
+
import datetime
|
| 5 |
+
from typing import Dict, List, Optional, Union, Any
|
| 6 |
+
|
| 7 |
+
class Prompt:
    """A versioned prompt template with metadata and `{var}` substitution."""
    def __init__(
        self,
        content: str,
        name: str,
        description: Optional[str] = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None
    ):
        self.content = content
        self.name = name
        self.description = description or ""
        self.tags = tags or []
        self.metadata = metadata or {}
        # Creation and update timestamps start out identical.
        now = datetime.datetime.now().isoformat()
        self.created_at = now
        self.updated_at = now
        self.id = self._generate_id()
        self.version = 1

    def _generate_id(self) -> str:
        """Derive a short ID from name, content, and creation time."""
        unique_string = f"{self.name}:{self.content}:{self.created_at}"
        return hashlib.md5(unique_string.encode()).hexdigest()[:10]

    def update(self, content: Optional[str] = None, **kwargs) -> None:
        """Update the content and/or existing attributes, refreshing updated_at."""
        if content is not None:
            self.content = content

        for key, value in kwargs.items():
            # Only attributes that already exist may be set; unknown keys are ignored.
            if hasattr(self, key):
                setattr(self, key, value)

        self.updated_at = datetime.datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the prompt to a plain dictionary."""
        return {
            "id": self.id,
            "name": self.name,
            "content": self.content,
            "description": self.description,
            "tags": self.tags,
            "metadata": self.metadata,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
            "version": self.version,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Prompt":
        """Reconstruct a prompt from a dictionary produced by ``to_dict``."""
        prompt = cls(
            content=data["content"],
            name=data["name"],
            description=data.get("description", ""),
            tags=data.get("tags", []),
            metadata=data.get("metadata", {})
        )
        # Restore identity and history fields verbatim.
        prompt.id = data["id"]
        prompt.created_at = data["created_at"]
        prompt.updated_at = data["updated_at"]
        prompt.version = data["version"]
        return prompt

    def render(self, **kwargs) -> str:
        """Substitute `{key}` placeholders in the content with given values."""
        rendered = self.content
        for key, value in kwargs.items():
            rendered = rendered.replace(f"{{{key}}}", str(value))
        return rendered
|
| 80 |
+
|
| 81 |
+
class PromptManager:
    """Keeps prompts in memory and mirrors each one to a JSON file on disk."""
    def __init__(self, storage_path: Optional[str] = None):
        self.storage_path = storage_path or os.path.join(os.getcwd(), "promptlab_storage")
        self.prompts: Dict[str, Prompt] = {}
        self._ensure_storage_dir()
        self._load_prompts()

    def _ensure_storage_dir(self) -> None:
        """Create the storage directory if it does not yet exist."""
        os.makedirs(self.storage_path, exist_ok=True)

    def _load_prompts(self) -> None:
        """Populate the in-memory cache from the on-disk prompt files."""
        prompts_dir = os.path.join(self.storage_path, "prompts")
        if not os.path.exists(prompts_dir):
            os.makedirs(prompts_dir)
            return

        for filename in os.listdir(prompts_dir):
            if not filename.endswith(".json"):
                continue
            with open(os.path.join(prompts_dir, filename), "r") as f:
                prompt = Prompt.from_dict(json.load(f))
            self.prompts[prompt.id] = prompt

    def _save_prompt(self, prompt: Prompt) -> None:
        """Write a single prompt to its JSON file under prompts/."""
        prompts_dir = os.path.join(self.storage_path, "prompts")
        os.makedirs(prompts_dir, exist_ok=True)

        with open(os.path.join(prompts_dir, f"{prompt.id}.json"), "w") as f:
            json.dump(prompt.to_dict(), f, indent=2)

    def create(
        self,
        content: str,
        name: str,
        description: Optional[str] = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None
    ) -> Prompt:
        """Create, cache, and persist a new prompt."""
        prompt = Prompt(
            content=content,
            name=name,
            description=description,
            tags=tags,
            metadata=metadata
        )
        self.prompts[prompt.id] = prompt
        self._save_prompt(prompt)
        return prompt

    def get(self, prompt_id: str) -> Optional[Prompt]:
        """Return the prompt with the given ID, or None."""
        return self.prompts.get(prompt_id)

    def update(self, prompt_id: str, **kwargs) -> Optional[Prompt]:
        """Apply attribute updates to a prompt and persist; None if missing."""
        prompt = self.get(prompt_id)
        if prompt:
            prompt.update(**kwargs)
            self._save_prompt(prompt)
        return prompt

    def delete(self, prompt_id: str) -> bool:
        """Remove a prompt from memory and disk; return True if it existed."""
        if prompt_id not in self.prompts:
            return False
        del self.prompts[prompt_id]
        prompt_path = os.path.join(self.storage_path, "prompts", f"{prompt_id}.json")
        if os.path.exists(prompt_path):
            os.remove(prompt_path)
        return True

    def list(self, tags: Optional[List[str]] = None) -> List[Prompt]:
        """List prompts, optionally restricted to those matching any given tag."""
        if tags:
            return [p for p in self.prompts.values() if any(t in p.tags for t in tags)]
        return list(self.prompts.values())

    def search(self, query: str) -> List[Prompt]:
        """Case-insensitive substring search over prompt names and contents."""
        needle = query.lower()
        return [
            p for p in self.prompts.values()
            if needle in p.name.lower() or needle in p.content.lower()
        ]
|
promptlab/core/testing.py
ADDED
|
@@ -0,0 +1,451 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import uuid
|
| 4 |
+
import datetime
|
| 5 |
+
import asyncio
|
| 6 |
+
from typing import Dict, List, Optional, Any, Callable, Union, Awaitable, Tuple
|
| 7 |
+
from .prompt_manager import Prompt, PromptManager
|
| 8 |
+
|
| 9 |
+
class TestCase:
    """A single test case: input variables (plus optional expected output) for a prompt."""
    def __init__(
        self,
        prompt_id: str,
        input_vars: Dict[str, Any],
        expected_output: Optional[str] = None,
        name: Optional[str] = None,
        description: Optional[str] = None
    ):
        # Short random identifier; adequate uniqueness for local storage.
        self.id = str(uuid.uuid4())[:10]
        self.prompt_id = prompt_id
        self.input_vars = input_vars
        self.expected_output = expected_output
        self.name = name or f"Test case {self.id}"
        self.description = description or ""
        self.created_at = datetime.datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the test case to a plain dictionary."""
        return {
            "id": self.id,
            "prompt_id": self.prompt_id,
            "input_vars": self.input_vars,
            "expected_output": self.expected_output,
            "name": self.name,
            "description": self.description,
            "created_at": self.created_at,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "TestCase":
        """Reconstruct a test case from a ``to_dict`` dictionary."""
        case = cls(
            prompt_id=data["prompt_id"],
            input_vars=data["input_vars"],
            expected_output=data.get("expected_output"),
            name=data.get("name"),
            description=data.get("description")
        )
        # Preserve the original identity and timestamp.
        case.id = data["id"]
        case.created_at = data["created_at"]
        return case
+
|
| 53 |
+
|
| 54 |
+
class TestResult:
    """Represents the result of a single test case execution."""

    def __init__(
        self,
        test_case_id: str,
        prompt_id: str,
        prompt_version: int,
        output: str,
        passed: Optional[bool] = None,
        metrics: Optional[Dict[str, float]] = None
    ):
        self.id = str(uuid.uuid4())[:10]
        self.test_case_id = test_case_id
        self.prompt_id = prompt_id
        self.prompt_version = prompt_version
        self.output = output
        # None means "no expected output was defined", not "failed".
        self.passed = passed
        self.metrics = metrics or {}
        self.created_at = datetime.datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Convert test result to a JSON-serializable dictionary."""
        return {
            "id": self.id,
            "test_case_id": self.test_case_id,
            "prompt_id": self.prompt_id,
            "prompt_version": self.prompt_version,
            "output": self.output,
            "passed": self.passed,
            "metrics": self.metrics,
            "created_at": self.created_at
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "TestResult":
        """Create a test result from a dictionary.

        Bug fix: previously the stored ``id`` and ``created_at`` were
        discarded, so results reloaded from disk received fresh values that
        no longer matched their on-disk filenames or sort order.
        """
        result = cls(
            test_case_id=data["test_case_id"],
            prompt_id=data["prompt_id"],
            prompt_version=data["prompt_version"],
            output=data["output"],
            passed=data.get("passed"),
            metrics=data.get("metrics", {})
        )
        # Restore persisted identity/timestamp when present (backward
        # compatible with dicts that lack them).
        if "id" in data:
            result.id = data["id"]
        if "created_at" in data:
            result.created_at = data["created_at"]
        return result
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
class ABTestResult:
    """Represents the outcome of an A/B comparison between two prompts."""

    def __init__(
        self,
        prompt_a_id: str,
        prompt_b_id: str,
        prompt_a_version: int,
        prompt_b_version: int,
        metrics_a: Dict[str, float],
        metrics_b: Dict[str, float],
        winner: Optional[str] = None
    ):
        self.id = str(uuid.uuid4())[:10]
        self.prompt_a_id = prompt_a_id
        self.prompt_b_id = prompt_b_id
        self.prompt_a_version = prompt_a_version
        self.prompt_b_version = prompt_b_version
        self.metrics_a = metrics_a
        self.metrics_b = metrics_b
        # "A", "B", or None for a tie / undetermined outcome.
        self.winner = winner
        self.created_at = datetime.datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Convert A/B test result to a JSON-serializable dictionary."""
        return {
            "id": self.id,
            "prompt_a_id": self.prompt_a_id,
            "prompt_b_id": self.prompt_b_id,
            "prompt_a_version": self.prompt_a_version,
            "prompt_b_version": self.prompt_b_version,
            "metrics_a": self.metrics_a,
            "metrics_b": self.metrics_b,
            "winner": self.winner,
            "created_at": self.created_at
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ABTestResult":
        """Create an A/B test result from a dictionary.

        Bug fix: previously the stored ``id`` and ``created_at`` were
        discarded, so results reloaded from disk received fresh values that
        no longer matched their on-disk filenames or sort order.
        """
        result = cls(
            prompt_a_id=data["prompt_a_id"],
            prompt_b_id=data["prompt_b_id"],
            prompt_a_version=data["prompt_a_version"],
            prompt_b_version=data["prompt_b_version"],
            metrics_a=data["metrics_a"],
            metrics_b=data["metrics_b"],
            winner=data.get("winner")
        )
        # Restore persisted identity/timestamp when present (backward
        # compatible with dicts that lack them).
        if "id" in data:
            result.id = data["id"]
        if "created_at" in data:
            result.created_at = data["created_at"]
        return result
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
class PromptTesting:
    """Manages test cases, test results, and A/B tests for prompts.

    All artifacts are persisted as individual JSON files under
    ``<prompt_manager.storage_path>/tests/`` and mirrored in in-memory
    dictionaries keyed by artifact id.
    """

    def __init__(self, prompt_manager: PromptManager):
        self.prompt_manager = prompt_manager
        self.storage_path = os.path.join(prompt_manager.storage_path, "tests")
        os.makedirs(self.storage_path, exist_ok=True)

        # One sub-directory per artifact type.
        self.test_cases_path = os.path.join(self.storage_path, "test_cases")
        self.test_results_path = os.path.join(self.storage_path, "test_results")
        self.ab_test_results_path = os.path.join(self.storage_path, "ab_test_results")

        os.makedirs(self.test_cases_path, exist_ok=True)
        os.makedirs(self.test_results_path, exist_ok=True)
        os.makedirs(self.ab_test_results_path, exist_ok=True)

        # In-memory indexes, keyed by artifact id.
        self.test_cases: Dict[str, TestCase] = {}
        self.test_results: Dict[str, TestResult] = {}
        self.ab_test_results: Dict[str, ABTestResult] = {}

        self._load_test_cases()
        self._load_test_results()
        self._load_ab_test_results()

    def _load_test_cases(self) -> None:
        """Load all persisted test cases into memory."""
        for filename in os.listdir(self.test_cases_path):
            if filename.endswith(".json"):
                with open(os.path.join(self.test_cases_path, filename), "r") as f:
                    data = json.load(f)
                    test_case = TestCase.from_dict(data)
                    self.test_cases[test_case.id] = test_case

    def _load_test_results(self) -> None:
        """Load all persisted test results into memory."""
        for filename in os.listdir(self.test_results_path):
            if filename.endswith(".json"):
                with open(os.path.join(self.test_results_path, filename), "r") as f:
                    data = json.load(f)
                    test_result = TestResult.from_dict(data)
                    self.test_results[test_result.id] = test_result

    def _load_ab_test_results(self) -> None:
        """Load all persisted A/B test results into memory."""
        for filename in os.listdir(self.ab_test_results_path):
            if filename.endswith(".json"):
                with open(os.path.join(self.ab_test_results_path, filename), "r") as f:
                    data = json.load(f)
                    ab_test_result = ABTestResult.from_dict(data)
                    self.ab_test_results[ab_test_result.id] = ab_test_result

    def _save_test_case(self, test_case: TestCase) -> None:
        """Persist a test case as ``<id>.json``."""
        file_path = os.path.join(self.test_cases_path, f"{test_case.id}.json")
        with open(file_path, "w") as f:
            json.dump(test_case.to_dict(), f, indent=2)

    def _save_test_result(self, test_result: TestResult) -> None:
        """Persist a test result as ``<id>.json``."""
        file_path = os.path.join(self.test_results_path, f"{test_result.id}.json")
        with open(file_path, "w") as f:
            json.dump(test_result.to_dict(), f, indent=2)

    def _save_ab_test_result(self, ab_test_result: ABTestResult) -> None:
        """Persist an A/B test result as ``<id>.json``."""
        file_path = os.path.join(self.ab_test_results_path, f"{ab_test_result.id}.json")
        with open(file_path, "w") as f:
            json.dump(ab_test_result.to_dict(), f, indent=2)

    def create_test_case(
        self,
        prompt_id: str,
        input_vars: Dict[str, Any],
        expected_output: Optional[str] = None,
        name: Optional[str] = None,
        description: Optional[str] = None
    ) -> TestCase:
        """Create, register, and persist a test case for a prompt."""
        test_case = TestCase(
            prompt_id=prompt_id,
            input_vars=input_vars,
            expected_output=expected_output,
            name=name,
            description=description
        )
        self.test_cases[test_case.id] = test_case
        self._save_test_case(test_case)
        return test_case

    def get_test_case(self, test_case_id: str) -> Optional[TestCase]:
        """Get a test case by ID, or None if unknown."""
        return self.test_cases.get(test_case_id)

    def list_test_cases(self, prompt_id: Optional[str] = None) -> List[TestCase]:
        """List test cases, optionally filtered by prompt ID."""
        if prompt_id:
            return [tc for tc in self.test_cases.values() if tc.prompt_id == prompt_id]
        return list(self.test_cases.values())

    def delete_test_case(self, test_case_id: str) -> bool:
        """Delete a test case by ID.

        Returns True if the case existed (its JSON file, if any, is removed
        as well); False otherwise.
        """
        if test_case_id in self.test_cases:
            del self.test_cases[test_case_id]
            file_path = os.path.join(self.test_cases_path, f"{test_case_id}.json")
            if os.path.exists(file_path):
                os.remove(file_path)
            return True
        return False

    async def run_test_case(
        self,
        test_case_id: str,
        llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
        metrics_callbacks: Optional[List[Callable[[str, str], Dict[str, float]]]] = None
    ) -> TestResult:
        """Run a single test case with the given LLM callback.

        Args:
            test_case_id: ID of the test case to execute.
            llm_callback: Sync or async callable invoked as
                ``llm_callback(rendered_prompt, input_vars)``; must return
                the model output string.
            metrics_callbacks: Optional callables invoked as
                ``cb(output, expected_output)``; each returns a metric dict
                that is merged into the result's metrics.

        Returns:
            The persisted TestResult.

        Raises:
            ValueError: If the test case or its prompt does not exist.
        """
        test_case = self.get_test_case(test_case_id)
        if not test_case:
            raise ValueError(f"Test case with ID {test_case_id} not found")

        prompt = self.prompt_manager.get(test_case.prompt_id)
        if not prompt:
            raise ValueError(f"Prompt with ID {test_case.prompt_id} not found")

        # Render the prompt with the input variables.
        rendered_prompt = prompt.render(**test_case.input_vars)

        # Support both sync and async LLM callbacks.
        if asyncio.iscoroutinefunction(llm_callback):
            output = await llm_callback(rendered_prompt, test_case.input_vars)
        else:
            output = llm_callback(rendered_prompt, test_case.input_vars)

        # Determine pass/fail. Bug fix: check `is not None` so an
        # empty-string expectation is still evaluated (previously any falsy
        # expected_output left `passed` as None, i.e. "not evaluated").
        passed = None
        if test_case.expected_output is not None:
            passed = output.strip() == test_case.expected_output.strip()

        # Calculate metrics if callbacks are provided; later callbacks
        # overwrite same-named keys from earlier ones.
        metrics = {}
        if metrics_callbacks:
            for metric_callback in metrics_callbacks:
                metrics.update(metric_callback(output, test_case.expected_output or ""))

        # Create and persist the test result.
        test_result = TestResult(
            test_case_id=test_case.id,
            prompt_id=test_case.prompt_id,
            prompt_version=prompt.version,
            output=output,
            passed=passed,
            metrics=metrics
        )
        self.test_results[test_result.id] = test_result
        self._save_test_result(test_result)

        return test_result

    async def run_test_cases(
        self,
        prompt_id: str,
        llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
        metrics_callbacks: Optional[List[Callable[[str, str], Dict[str, float]]]] = None
    ) -> List[TestResult]:
        """Run all test cases registered for a prompt, sequentially."""
        test_cases = self.list_test_cases(prompt_id)
        results = []

        for test_case in test_cases:
            result = await self.run_test_case(test_case.id, llm_callback, metrics_callbacks)
            results.append(result)

        return results

    async def run_ab_test(
        self,
        prompt_a_id: str,
        prompt_b_id: str,
        llm_callback: Callable[[str, Dict[str, Any]], Union[str, Awaitable[str]]],
        metrics_callbacks: List[Callable[[str, str], Dict[str, float]]],
        test_cases: Optional[List[str]] = None
    ) -> ABTestResult:
        """Run an A/B test comparing two prompts over the same inputs.

        Each test case bound to prompt A is mirrored for prompt B. NOTE: the
        mirrored copies are created via ``create_test_case`` and therefore
        persist across runs.

        Raises:
            ValueError: If either prompt is missing or no test cases exist.
        """
        prompt_a = self.prompt_manager.get(prompt_a_id)
        prompt_b = self.prompt_manager.get(prompt_b_id)

        if not prompt_a or not prompt_b:
            raise ValueError("Both prompts must exist")

        # Get test cases to use.
        if test_cases:
            # Use the specified test cases, silently skipping unknown IDs.
            test_case_objs = [self.get_test_case(tc_id) for tc_id in test_cases]
            test_case_objs = [tc for tc in test_case_objs if tc]
        else:
            # Default to all test cases registered for prompt A.
            test_case_objs = self.list_test_cases(prompt_a_id)

        if not test_case_objs:
            raise ValueError("No test cases found for the A/B test")

        # Run each test case against both prompts.
        results_a = []
        results_b = []

        for test_case in test_case_objs:
            # Mirror the test case for prompt B when it belongs to prompt A.
            if test_case.prompt_id != prompt_b_id:
                test_case_b = self.create_test_case(
                    prompt_id=prompt_b_id,
                    input_vars=test_case.input_vars,
                    expected_output=test_case.expected_output,
                    name=f"Copy of {test_case.name} for B",
                    description=test_case.description
                )
            else:
                test_case_b = test_case

            # Run the test cases.
            result_a = await self.run_test_case(test_case.id, llm_callback, metrics_callbacks)
            result_b = await self.run_test_case(test_case_b.id, llm_callback, metrics_callbacks)

            results_a.append(result_a)
            results_b.append(result_b)

        # Calculate aggregate metrics.
        metrics_a = self._aggregate_metrics([r.metrics for r in results_a])
        metrics_b = self._aggregate_metrics([r.metrics for r in results_b])

        # Determine winner.
        winner = self._determine_winner(metrics_a, metrics_b)

        # Create and persist the A/B test result.
        ab_test_result = ABTestResult(
            prompt_a_id=prompt_a_id,
            prompt_b_id=prompt_b_id,
            prompt_a_version=prompt_a.version,
            prompt_b_version=prompt_b.version,
            metrics_a=metrics_a,
            metrics_b=metrics_b,
            winner=winner
        )
        self.ab_test_results[ab_test_result.id] = ab_test_result
        self._save_ab_test_result(ab_test_result)

        return ab_test_result

    def _aggregate_metrics(self, metrics_list: List[Dict[str, float]]) -> Dict[str, float]:
        """Average each metric across multiple test results.

        Bug fix: aggregate over the union of keys from all results instead
        of only the first result's keys, so a metric missing from the first
        result is no longer silently dropped. A missing value counts as 0,
        matching the previous behavior for keys absent from later results.
        """
        if not metrics_list:
            return {}

        all_keys = set()
        for m in metrics_list:
            all_keys.update(m.keys())

        aggregated = {}
        for key in all_keys:
            values = [m.get(key, 0) for m in metrics_list]
            aggregated[key] = sum(values) / len(values)  # Simple average

        return aggregated

    def _determine_winner(self, metrics_a: Dict[str, float], metrics_b: Dict[str, float]) -> Optional[str]:
        """Determine the A/B winner by counting per-metric wins.

        Assumes higher values are better for every metric. Returns "A", "B",
        or None for a tie or when either side has no metrics.
        """
        if not metrics_a or not metrics_b:
            return None

        a_wins = 0
        b_wins = 0

        # Only metrics present on both sides are comparable.
        for key in metrics_a.keys():
            if key in metrics_b:
                if metrics_a[key] > metrics_b[key]:
                    a_wins += 1
                elif metrics_b[key] > metrics_a[key]:
                    b_wins += 1

        if a_wins > b_wins:
            return "A"
        elif b_wins > a_wins:
            return "B"
        else:
            return None  # Tie

    def get_test_results(self, test_case_id: Optional[str] = None, prompt_id: Optional[str] = None) -> List[TestResult]:
        """Get test results, optionally filtered, newest first."""
        results = list(self.test_results.values())

        if test_case_id:
            results = [r for r in results if r.test_case_id == test_case_id]

        if prompt_id:
            results = [r for r in results if r.prompt_id == prompt_id]

        return sorted(results, key=lambda r: r.created_at, reverse=True)

    def get_ab_test_results(self, prompt_id: Optional[str] = None) -> List[ABTestResult]:
        """Get A/B test results involving a prompt (either side), newest first."""
        results = list(self.ab_test_results.values())

        if prompt_id:
            results = [r for r in results if r.prompt_a_id == prompt_id or r.prompt_b_id == prompt_id]

        return sorted(results, key=lambda r: r.created_at, reverse=True)
|
promptlab/core/version_control.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import datetime
|
| 4 |
+
from typing import Dict, List, Optional, Any
|
| 5 |
+
from .prompt_manager import Prompt, PromptManager
|
| 6 |
+
|
| 7 |
+
class PromptVersion:
    """Represents an immutable snapshot of a prompt at a specific version."""

    def __init__(
        self,
        prompt_id: str,
        version: int,
        content: str,
        metadata: Optional[Dict[str, Any]] = None,
        commit_message: Optional[str] = None
    ):
        self.prompt_id = prompt_id
        self.version = version
        self.content = content
        self.metadata = metadata or {}
        self.commit_message = commit_message or ""
        self.created_at = datetime.datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Convert version to a JSON-serializable dictionary."""
        return {
            "prompt_id": self.prompt_id,
            "version": self.version,
            "content": self.content,
            "metadata": self.metadata,
            "commit_message": self.commit_message,
            "created_at": self.created_at
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PromptVersion":
        """Create a version from a dictionary.

        Bug fix: previously the stored ``created_at`` was discarded, so
        versions reloaded from disk received the load time as their creation
        timestamp.
        """
        version = cls(
            prompt_id=data["prompt_id"],
            version=data["version"],
            content=data["content"],
            metadata=data.get("metadata", {}),
            commit_message=data.get("commit_message", "")
        )
        # Restore the persisted timestamp when present (backward compatible
        # with dicts that lack it).
        if "created_at" in data:
            version.created_at = data["created_at"]
        return version
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class VersionControl:
    """Git-like version history for prompts.

    Snapshots are stored one JSON file per version under
    ``<prompt_manager.storage_path>/versions/<prompt_id>/v<N>.json`` and
    mirrored in an in-memory ``{prompt_id: {version_number: PromptVersion}}``
    index.
    """
    def __init__(self, prompt_manager: PromptManager):
        self.prompt_manager = prompt_manager
        self.storage_path = os.path.join(prompt_manager.storage_path, "versions")
        os.makedirs(self.storage_path, exist_ok=True)
        # prompt_id -> {version number -> PromptVersion}
        self.versions: Dict[str, Dict[int, PromptVersion]] = {}
        self._load_versions()

    def _load_versions(self) -> None:
        """Load all persisted versions into the in-memory index."""
        if not os.path.exists(self.storage_path):
            # Defensive only: __init__ creates this directory just before
            # calling us, so this branch is normally unreachable.
            os.makedirs(self.storage_path)
            return

        # Layout: one sub-directory per prompt id, one JSON file per version.
        for prompt_id_dir in os.listdir(self.storage_path):
            prompt_dir = os.path.join(self.storage_path, prompt_id_dir)
            if os.path.isdir(prompt_dir):
                self.versions[prompt_id_dir] = {}

                for filename in os.listdir(prompt_dir):
                    if filename.endswith(".json"):
                        with open(os.path.join(prompt_dir, filename), "r") as f:
                            version_data = json.load(f)
                            version = PromptVersion.from_dict(version_data)
                            self.versions[prompt_id_dir][version.version] = version

    def _save_version(self, version: PromptVersion) -> None:
        """Persist one version as ``<prompt_id>/v<version>.json``."""
        prompt_dir = os.path.join(self.storage_path, version.prompt_id)
        os.makedirs(prompt_dir, exist_ok=True)

        version_path = os.path.join(prompt_dir, f"v{version.version}.json")
        with open(version_path, "w") as f:
            json.dump(version.to_dict(), f, indent=2)

    def commit(
        self,
        prompt_id: str,
        commit_message: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ) -> Optional[PromptVersion]:
        """Snapshot the prompt's current content as a new version.

        Also bumps ``prompt.version`` and re-saves the prompt through the
        prompt manager.

        Returns:
            The new PromptVersion, or None if the prompt does not exist.
        """
        prompt = self.prompt_manager.get(prompt_id)
        if not prompt:
            return None

        # Initialize versions dict for this prompt if it doesn't exist
        if prompt_id not in self.versions:
            self.versions[prompt_id] = {}

        # Get the highest version number for this prompt; first commit is v1.
        current_versions = self.versions.get(prompt_id, {})
        next_version = max(current_versions.keys(), default=0) + 1

        # Create the new version
        version = PromptVersion(
            prompt_id=prompt_id,
            version=next_version,
            content=prompt.content,
            metadata=metadata or {},
            commit_message=commit_message
        )

        # Save the new version (index first, then disk)
        self.versions[prompt_id][next_version] = version
        self._save_version(version)

        # Update the prompt's version number
        prompt.version = next_version
        self.prompt_manager._save_prompt(prompt)

        return version

    def get_version(self, prompt_id: str, version: int) -> Optional[PromptVersion]:
        """Get a specific version of a prompt, or None if unknown."""
        return self.versions.get(prompt_id, {}).get(version)

    def list_versions(self, prompt_id: str) -> List[PromptVersion]:
        """List all versions of a prompt, in ascending version order."""
        versions = self.versions.get(prompt_id, {})
        return sorted(versions.values(), key=lambda v: v.version)

    def checkout(self, prompt_id: str, version: int) -> Optional[Prompt]:
        """Restore a prompt's content to a stored version.

        Overwrites the prompt's current content in place (no new version is
        committed) and persists the change via the prompt manager.

        Returns:
            The updated Prompt, or None if the prompt or version is missing.
        """
        prompt = self.prompt_manager.get(prompt_id)
        version_obj = self.get_version(prompt_id, version)

        if not prompt or not version_obj:
            return None

        prompt.content = version_obj.content
        prompt.version = version
        prompt.updated_at = datetime.datetime.now().isoformat()

        self.prompt_manager._save_prompt(prompt)
        return prompt

    def diff(self, prompt_id: str, version1: int, version2: int) -> Dict[str, Any]:
        """Compare two versions of a prompt line by line.

        Returns:
            A dict with ``version1``, ``version2``, and ``diff`` — the
            Differ-style line list — or an empty dict if either version is
            missing.
        """
        v1 = self.get_version(prompt_id, version1)
        v2 = self.get_version(prompt_id, version2)

        if not v1 or not v2:
            return {}

        # Local import keeps difflib out of the hot load path.
        import difflib
        d = difflib.Differ()
        diff = list(d.compare(v1.content.splitlines(), v2.content.splitlines()))

        return {
            "version1": version1,
            "version2": version2,
            "diff": diff
        }
|
promptlab/examples/__init__.py
ADDED
|
File without changes
|
promptlab/examples/ab_testing.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
A/B testing example for PromptLab.
|
| 3 |
+
|
| 4 |
+
This example demonstrates how to use PromptLab to perform A/B testing
|
| 5 |
+
on different prompt variations to find the most effective one.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import asyncio
|
| 9 |
+
import os
|
| 10 |
+
from promptlab import PromptManager, PromptTesting
|
| 11 |
+
|
| 12 |
+
async def llm_callback(prompt, variables):
    """
    Simulated LLM callback for testing.

    In a real scenario, this would call an actual LLM API.

    Args:
        prompt: The fully rendered prompt text.
        variables: The input variables the prompt was rendered with (unused
            by this simulation, but part of the callback signature; renamed
            from ``vars`` to avoid shadowing the builtin).

    Returns:
        A canned response chosen by keyword matching on the prompt.
    """
    # Simple simulation - return different responses based on prompt content
    if "concise" in prompt.lower():
        return "This is a short, concise response."
    elif "detailed" in prompt.lower():
        return "This is a much more detailed response that provides additional context and information about the query. It elaborates on various aspects and provides a comprehensive answer."
    else:
        return "Default response."
|
| 25 |
+
|
| 26 |
+
async def main():
    """Run the end-to-end A/B testing demo: create two prompt variants,
    register shared test cases, run the A/B test, and print the outcome."""
    # Initialize the prompt manager with a custom storage path
    storage_path = os.path.join(os.getcwd(), "promptlab_storage")
    prompt_manager = PromptManager(storage_path)

    # Initialize testing
    testing = PromptTesting(prompt_manager)

    # Create two prompt variations (A asks for brevity, B for depth)
    prompt_a = prompt_manager.create(
        content="Provide a concise answer to the following question: {question}",
        name="Concise Prompt",
        description="A prompt that asks for concise answers",
        tags=["concise", "test"]
    )

    prompt_b = prompt_manager.create(
        content="Provide a detailed and comprehensive answer to the following question: {question}",
        name="Detailed Prompt",
        description="A prompt that asks for detailed answers",
        tags=["detailed", "test"]
    )

    print(f"Created prompt A with ID: {prompt_a.id}")
    print(f"Created prompt B with ID: {prompt_b.id}")

    # Create test cases (registered against prompt A; run_ab_test mirrors
    # them for prompt B)
    test_cases = []

    questions = [
        "What is machine learning?",
        "How does a neural network work?",
        "What are the benefits of version control?"
    ]

    for i, question in enumerate(questions):
        test_case = testing.create_test_case(
            prompt_id=prompt_a.id,
            input_vars={"question": question},
            name=f"Test Case {i+1}",
            description=f"Test case for question: {question}"
        )
        test_cases.append(test_case.id)

    print(f"Created {len(test_cases)} test cases")

    # Define metrics callbacks (higher is treated as better by the A/B test)
    def length_metric(output, expected):
        """Measure output length as a metric."""
        return {"length": len(output) / 1000}  # Normalize to 0-1 range

    def keyword_metric(output, expected):
        """Check for presence of keywords."""
        keywords = ["machine", "learning", "neural", "network", "version", "control"]
        matches = sum(1 for k in keywords if k.lower() in output.lower())
        return {"keyword_matches": matches / len(keywords)}

    # Run A/B test
    ab_result = await testing.run_ab_test(
        prompt_a_id=prompt_a.id,
        prompt_b_id=prompt_b.id,
        llm_callback=llm_callback,
        metrics_callbacks=[length_metric, keyword_metric],
        test_cases=test_cases
    )

    print(f"A/B test completed with ID: {ab_result.id}")
    print(f"Prompt A metrics: {ab_result.metrics_a}")
    print(f"Prompt B metrics: {ab_result.metrics_b}")
    print(f"Winner: {ab_result.winner or 'Tie'}")

    # List all test results (returned newest first)
    results_a = testing.get_test_results(prompt_id=prompt_a.id)
    results_b = testing.get_test_results(prompt_id=prompt_b.id)

    print(f"Found {len(results_a)} test results for prompt A")
    print(f"Found {len(results_b)} test results for prompt B")

    # Display individual test results
    print("\nSample outputs:")

    for i, (result_a, result_b) in enumerate(zip(results_a[:3], results_b[:3])):
        print(f"\nTest Case {i+1}:")

        print("\nConcise prompt output:")
        print(result_a.output)

        print("\nDetailed prompt output:")
        print(result_b.output)
|
| 115 |
+
|
| 116 |
+
if __name__ == "__main__":
    # Entry point: drive the async example with asyncio.run().
    asyncio.run(main())
|
promptlab/examples/basic_usage.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
Basic usage example for PromptLab.
|
| 4 |
+
|
| 5 |
+
This example demonstrates the fundamental features of PromptLab
|
| 6 |
+
including creating prompts, versioning, and rendering.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import asyncio
|
| 10 |
+
import os
|
| 11 |
+
from promptlab import PromptManager, VersionControl
|
| 12 |
+
|
| 13 |
+
async def main():
    """Walk through PromptLab's core workflow: create, render, version, list.

    Self-contained demo that prints the result of each step.
    NOTE(review): the body contains no awaits; it is async only so it can
    be driven by asyncio.run() like the other examples.
    """
    # Initialize the prompt manager with a custom storage path
    storage_path = os.path.join(os.getcwd(), "promptlab_storage")
    prompt_manager = PromptManager(storage_path)

    # Initialize version control (layered on top of the prompt manager)
    version_control = VersionControl(prompt_manager)

    # Create a basic prompt with {placeholder}-style variables
    basic_prompt = prompt_manager.create(
        content="Hello, my name is {name} and I am a {occupation}.",
        name="Introduction",
        description="A simple introduction prompt",
        tags=["basic", "introduction"]
    )

    print(f"Created prompt with ID: {basic_prompt.id}")

    # Render the prompt with variables
    rendered = basic_prompt.render(name="Alice", occupation="Data Scientist")
    print(f"Rendered prompt: {rendered}")

    # Create a more complex multi-line chat prompt
    complex_prompt = prompt_manager.create(
        content="""
    System: {system_message}

    User: {user_message}

    Assistant:
    """,
        name="Chat Interaction",
        description="A prompt for chat interactions",
        tags=["chat", "interaction"]
    )

    print(f"Created complex prompt with ID: {complex_prompt.id}")

    # Render the complex prompt
    rendered = complex_prompt.render(
        system_message="You are a helpful assistant.",
        user_message="Can you help me with Python programming?"
    )
    print(f"Rendered complex prompt:\n{rendered}")

    # Create a version (snapshot of the current prompt content)
    version = version_control.commit(
        prompt_id=complex_prompt.id,
        commit_message="Initial version"
    )

    print(f"Created version {version.version} for prompt {complex_prompt.id}")

    # Update the prompt in place, adding a {thinking} placeholder
    complex_prompt = prompt_manager.update(
        complex_prompt.id,
        content="""
    System: {system_message}

    User: {user_message}

    Think step by step:
    {thinking}

    Assistant:
    """
    )

    print(f"Updated prompt with ID: {complex_prompt.id}")

    # Create another version capturing the update above
    version = version_control.commit(
        prompt_id=complex_prompt.id,
        commit_message="Added thinking step"
    )

    print(f"Created version {version.version} for prompt {complex_prompt.id}")

    # List all versions recorded for the prompt
    versions = version_control.list_versions(complex_prompt.id)
    print(f"Found {len(versions)} versions for prompt {complex_prompt.id}:")
    for v in versions:
        print(f"Version: {v.version} | Created: {v.created_at} | Message: {v.commit_message}")

    # Checkout a specific version.
    # NOTE(review): presumably returns the prompt as of version 1; confirm
    # against VersionControl whether it also mutates the stored prompt.
    prompt = version_control.checkout(complex_prompt.id, 1)
    print(f"Checked out version 1 for prompt {complex_prompt.id}")
    print(f"Content:\n{prompt.content}")

    # List all prompts known to the manager
    prompts = prompt_manager.list()
    print(f"Found {len(prompts)} prompts:")
    for p in prompts:
        print(f"ID: {p.id} | Name: {p.name} | Tags: {', '.join(p.tags)}")

if __name__ == "__main__":
    asyncio.run(main())
|
promptlab/examples/evaluation_example.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Evaluation example for PromptLab.
|
| 3 |
+
|
| 4 |
+
This example demonstrates how to use PromptLab's evaluation framework
|
| 5 |
+
to measure the quality of prompts using various metrics.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import asyncio
|
| 9 |
+
import os
|
| 10 |
+
from promptlab import PromptManager, Evaluator, ContainsKeywordsMetric, LengthMetric
|
| 11 |
+
|
| 12 |
+
async def llm_callback(prompt, vars):
    """Simulated LLM callback for testing.

    In a real scenario, this would call an actual LLM API. Here a canned
    response is chosen by keyword-matching on the input text (the
    ``text`` entry of *vars*; missing text falls back to "").
    """
    text = vars.get("text", "")
    lowered = text.lower()

    # Guard-clause dispatch: first keyword hit wins.
    if "code" in lowered:
        return "```python\ndef hello_world():\n    print('Hello, world!')\n```"
    if "list" in lowered:
        return "1. First item\n2. Second item\n3. Third item"
    if "summary" in lowered:
        return f"This is a summary of the text about {text.split()[0]}."
    return f"Response to: {text}"
|
| 29 |
+
|
| 30 |
+
async def main():
    """Demonstrate PromptLab's evaluation framework end to end.

    Creates a prompt, registers keyword/length metrics, evaluates the
    prompt over a batch of simulated inputs via llm_callback, and prints
    the aggregated and per-case metric scores.
    """
    # Initialize the prompt manager with a custom storage path
    storage_path = os.path.join(os.getcwd(), "promptlab_storage")
    prompt_manager = PromptManager(storage_path)

    # Initialize evaluator
    evaluator = Evaluator(prompt_manager)

    # Create a prompt for evaluation
    prompt = prompt_manager.create(
        content="Please {action} the following text: {text}",
        name="Dynamic Action Prompt",
        description="A prompt that can perform different actions based on input",
        tags=["action", "dynamic"]
    )

    print(f"Created prompt with ID: {prompt.id}")

    # Register custom metrics.
    # Keywords expected in code-generation responses.
    code_keywords = ContainsKeywordsMetric(
        keywords=["def", "print", "function", "return"],
        case_sensitive=False
    )
    evaluator.register_metric(code_keywords)

    # Keywords expected in list-style responses.
    list_keywords = ContainsKeywordsMetric(
        keywords=["1.", "2.", "3.", "item"],
        case_sensitive=False
    )
    evaluator.register_metric(list_keywords)

    # Penalize outputs that are too short or too long.
    length_metric = LengthMetric(min_length=10, max_length=500)
    evaluator.register_metric(length_metric)

    # Create test inputs for different actions
    test_inputs = [
        {"action": "write code for", "text": "a simple hello world function"},
        {"action": "create a list of", "text": "three important items"},
        {"action": "summarize", "text": "machine learning concepts in data science"},
        {"action": "analyze", "text": "the impact of climate change on ecosystems"}
    ]

    # Run evaluation; llm_callback stands in for a real model call
    evaluation_result = await evaluator.evaluate_prompt(
        prompt_id=prompt.id,
        inputs=test_inputs,
        llm_callback=llm_callback
    )

    # Print evaluation results
    print("\nEvaluation completed!")
    print("\nAggregated metrics:")
    for name, value in evaluation_result["aggregated_metrics"].items():
        print(f"{name}: {value:.4f}")

    print("\nIndividual results:")
    for i, result in enumerate(evaluation_result["individual_results"]):
        print(f"\nTest {i+1} ({result['input']['action']} {result['input']['text']}):")
        print(f"Output: {result['output']}")

        print("Metrics:")
        for name, value in result["metrics"].items():
            print(f"  {name}: {value:.4f}")

if __name__ == "__main__":
    asyncio.run(main())
|
promptlab/tests/__init__.py
ADDED
|
File without changes
|
promptlab/tests/test_evaluation.py
ADDED
|
File without changes
|
promptlab/tests/test_prompt_manager.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
import os
|
| 3 |
+
import shutil
|
| 4 |
+
import tempfile
|
| 5 |
+
from promptlab.core.prompt_manager import PromptManager, Prompt
|
| 6 |
+
|
| 7 |
+
class TestPromptManager(unittest.TestCase):
    """Unit tests for PromptManager CRUD, tag filtering, and rendering.

    Each test runs against a fresh PromptManager backed by a temporary
    directory, so tests are independent and leave no files behind.
    """
    def setUp(self):
        """Set up test environment: a PromptManager over a temp dir."""
        self.test_dir = tempfile.mkdtemp()
        self.prompt_manager = PromptManager(self.test_dir)

    def tearDown(self):
        """Clean up test environment."""
        shutil.rmtree(self.test_dir)

    def test_create_prompt(self):
        """Test creating a prompt stores all supplied fields."""
        prompt = self.prompt_manager.create(
            content="Test prompt {var}",
            name="Test Prompt",
            description="A test prompt",
            tags=["test", "example"]
        )

        self.assertIsNotNone(prompt)
        self.assertEqual(prompt.name, "Test Prompt")
        self.assertEqual(prompt.content, "Test prompt {var}")
        self.assertEqual(prompt.description, "A test prompt")
        self.assertEqual(prompt.tags, ["test", "example"])

    def test_get_prompt(self):
        """Test getting a prompt back by id returns equal fields."""
        prompt = self.prompt_manager.create(
            content="Test prompt",
            name="Test Prompt"
        )

        retrieved = self.prompt_manager.get(prompt.id)

        self.assertIsNotNone(retrieved)
        self.assertEqual(retrieved.id, prompt.id)
        self.assertEqual(retrieved.name, prompt.name)
        self.assertEqual(retrieved.content, prompt.content)

    def test_update_prompt(self):
        """Test updating a prompt changes both the returned object and storage."""
        prompt = self.prompt_manager.create(
            content="Test prompt",
            name="Test Prompt"
        )

        updated = self.prompt_manager.update(
            prompt.id,
            content="Updated prompt",
            name="Updated Name"
        )

        self.assertEqual(updated.content, "Updated prompt")
        self.assertEqual(updated.name, "Updated Name")

        # Check that the update was persisted (re-read from the manager)
        retrieved = self.prompt_manager.get(prompt.id)
        self.assertEqual(retrieved.content, "Updated prompt")
        self.assertEqual(retrieved.name, "Updated Name")

    def test_delete_prompt(self):
        """Test deleting a prompt: delete() returns True and get() then None."""
        prompt = self.prompt_manager.create(
            content="Test prompt",
            name="Test Prompt"
        )

        success = self.prompt_manager.delete(prompt.id)

        self.assertTrue(success)
        self.assertIsNone(self.prompt_manager.get(prompt.id))

    def test_list_prompts(self):
        """Test listing prompts, including tag-based filtering."""
        self.prompt_manager.create(
            content="Test prompt 1",
            name="Test Prompt 1",
            tags=["test", "one"]
        )

        self.prompt_manager.create(
            content="Test prompt 2",
            name="Test Prompt 2",
            tags=["test", "two"]
        )

        # No filter: all prompts are returned
        all_prompts = self.prompt_manager.list()
        self.assertEqual(len(all_prompts), 2)

        # Shared tag matches both prompts
        test_tag_prompts = self.prompt_manager.list(tags=["test"])
        self.assertEqual(len(test_tag_prompts), 2)

        # Unique tag narrows the listing to a single prompt
        one_tag_prompts = self.prompt_manager.list(tags=["one"])
        self.assertEqual(len(one_tag_prompts), 1)
        self.assertEqual(one_tag_prompts[0].name, "Test Prompt 1")

    def test_render_prompt(self):
        """Test rendering a prompt substitutes {placeholder} variables."""
        prompt = self.prompt_manager.create(
            content="Hello, {name}! You are a {occupation}.",
            name="Test Prompt"
        )

        rendered = prompt.render(name="Alice", occupation="Data Scientist")

        self.assertEqual(rendered, "Hello, Alice! You are a Data Scientist.")

if __name__ == "__main__":
    unittest.main()
|
promptlab/tests/test_testing.py
ADDED
|
File without changes
|
promptlab/tests/test_version_control.py
ADDED
|
File without changes
|
promptlab/utils/__init__.py
ADDED
|
File without changes
|
promptlab/utils/metrics.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, List, Optional, Any, Union, Callable
|
| 2 |
+
import re
|
| 3 |
+
import numpy as np
|
| 4 |
+
from difflib import SequenceMatcher
|
| 5 |
+
|
| 6 |
+
def exact_match(generated: str, expected: str) -> float:
    """Return 1.0 when the two strings match exactly after stripping
    surrounding whitespace, 0.0 otherwise.

    Note: if either string is empty the score is 0.0 — even when both
    are empty.
    """
    if generated and expected:
        return float(generated.strip() == expected.strip())
    return 0.0
|
| 11 |
+
|
| 12 |
+
def contains_all(generated: str, items: List[str], case_sensitive: bool = False) -> float:
    """Return the fraction of *items* found as substrings of *generated*.

    Matching is case-insensitive unless *case_sensitive* is True.
    An empty item list scores 0.0.
    """
    if not items:
        return 0.0

    haystack = generated if case_sensitive else generated.lower()
    needles = items if case_sensitive else [item.lower() for item in items]

    hits = sum(needle in haystack for needle in needles)
    return hits / len(needles)
|
| 23 |
+
|
| 24 |
+
def similarity_score(str1: str, str2: str) -> float:
    """Return a similarity ratio in [0, 1] between two strings,
    computed with difflib's SequenceMatcher.

    Either string being empty yields 0.0.
    """
    if str1 and str2:
        return SequenceMatcher(None, str1, str2).ratio()
    return 0.0
|
| 29 |
+
|
| 30 |
+
def word_count(text: str) -> int:
    """Count word tokens (maximal runs of \\w characters) in *text*."""
    return sum(1 for _ in re.finditer(r'\w+', text))
|
| 33 |
+
|
| 34 |
+
def length_ratio(generated: str, expected: str) -> float:
    """Score how close the generated text's length is to the expected text's.

    Returns a value in [0, 1]: 1.0 when the lengths are equal, decreasing
    symmetrically as the ratio diverges from 1 in either direction.
    An empty *expected* or an empty *generated* scores 0.0.

    Note: the original implementation carried an unreachable
    ``exp_length == 0`` branch — the truthiness guard on *expected*
    already returns 0.0 for empty strings. That dead code is removed;
    behavior is unchanged.
    """
    if not expected or not generated:
        return 0.0

    # Symmetric ratio: min(r, 1/r) penalizes too-long and too-short
    # output equally, and equals 1.0 only for a perfect length match.
    ratio = len(generated) / len(expected)
    return min(ratio, 1 / ratio)
|
| 50 |
+
|
| 51 |
+
def word_overlap(generated: str, expected: str) -> float:
    """Fraction of the expected text's distinct words that also appear
    in the generated text (case-insensitive, \\w+ tokenization).

    Returns 0.0 when either input is empty or *expected* has no words.
    """
    if not generated or not expected:
        return 0.0

    expected_words = set(re.findall(r'\w+', expected.lower()))
    if not expected_words:
        return 0.0

    generated_words = set(re.findall(r'\w+', generated.lower()))
    shared = expected_words & generated_words
    return len(shared) / len(expected_words)
|
| 64 |
+
|
| 65 |
+
def keyword_presence(text: str, keywords: List[str], weight: Optional[Dict[str, float]] = None) -> Dict[str, float]:
    """Report which keywords occur in *text* (case-insensitive).

    Returns one 0.0/1.0 entry per keyword under the key
    ``keyword_<kw>``, plus an overall ``keyword_score`` — the
    weight-normalized fraction of keywords found. *weight* optionally
    maps keywords to weights (missing keywords default to 1.0).
    An empty keyword list yields ``{"keyword_score": 0.0}``.
    """
    if not keywords:
        return {"keyword_score": 0.0}

    lowered = text.lower()
    scores: Dict[str, float] = {}
    weighted_hits = 0.0
    weight_sum = 0.0

    for kw in keywords:
        found = 1.0 if kw.lower() in lowered else 0.0
        scores[f"keyword_{kw}"] = found

        # Apply the caller-supplied weight, defaulting to 1.0.
        kw_weight = weight.get(kw, 1.0) if weight else 1.0
        weight_sum += kw_weight
        weighted_hits += found * kw_weight

    scores["keyword_score"] = weighted_hits / weight_sum if weight_sum > 0 else 0.0
    return scores
|
| 94 |
+
|
| 95 |
+
class MetricsSet:
    """A named collection of evaluation metric functions.

    Each registered metric is a callable taking the generated text (and,
    when available, the expected text) and returning either a single
    float or a dict of named floats.
    """

    def __init__(self):
        # name -> {"function": callable, "description": str}
        self.metrics = {}

    def add_metric(self, name: str, func: Callable, description: Optional[str] = None) -> None:
        """Register *func* under *name* with an optional description."""
        self.metrics[name] = {"function": func, "description": description or ""}

    def evaluate(self, generated: str, expected: Optional[str] = None, **kwargs) -> Dict[str, float]:
        """Run every registered metric against *generated*.

        When *expected* is given, metrics are called as
        ``f(generated, expected)`` — except the metric named
        "keyword_presence", which receives the ``keywords`` kwarg
        instead when one is supplied. A metric returning a dict is
        merged into the results; a metric that raises scores 0.0 and
        the error is printed rather than propagated.
        """
        results: Dict[str, float] = {}

        for name, entry in self.metrics.items():
            func = entry["function"]
            try:
                # Different metrics may require different arguments.
                if expected is None:
                    outcome = func(generated)
                elif name == "keyword_presence" and "keywords" in kwargs:
                    outcome = func(generated, kwargs["keywords"])
                else:
                    outcome = func(generated, expected)

                # Handle both single values and dictionaries.
                if isinstance(outcome, dict):
                    results.update(outcome)
                else:
                    results[name] = outcome
            except Exception as e:
                # Best-effort: one failing metric must not abort the rest.
                results[name] = 0.0
                print(f"Error calculating metric {name}: {e}")

        return results
|
| 132 |
+
|
| 133 |
+
def create_default_metrics_set() -> MetricsSet:
    """Build a MetricsSet pre-loaded with the standard text metrics."""
    defaults = [
        ("exact_match", exact_match,
         "Exact string match between expected and generated"),
        ("similarity", similarity_score,
         "String similarity using difflib's SequenceMatcher"),
        ("word_overlap", word_overlap,
         "Ratio of words in expected that appear in generated"),
        ("length_ratio", length_ratio,
         "Ratio of generated text length to expected text length"),
    ]

    metrics = MetricsSet()
    for name, func, description in defaults:
        metrics.add_metric(name, func, description)
    return metrics
|
promptlab/utils/storage.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import shutil
|
| 4 |
+
from typing import Dict, Any, Optional, List
|
| 5 |
+
|
| 6 |
+
class Storage:
    """Handles persistent storage for PromptLab.

    All paths resolve relative to *base_path*, which is created on
    construction if missing. JSON records are stored one file per
    record as "<dir_path>/<filename>.json".

    Bug fix: save_json/load_json previously built the path from the
    literal string "(unknown).json", silently ignoring the *filename*
    argument — every save clobbered one file and named loads always
    missed. Both now use the *filename* parameter.
    """
    def __init__(self, base_path: str):
        self.base_path = base_path
        os.makedirs(base_path, exist_ok=True)

    def ensure_dir(self, dir_path: str) -> str:
        """Ensure directory exists under base_path and return its path."""
        full_path = os.path.join(self.base_path, dir_path)
        os.makedirs(full_path, exist_ok=True)
        return full_path

    def save_json(self, dir_path: str, filename: str, data: Dict[str, Any]) -> str:
        """Save *data* to "<dir_path>/<filename>.json"; return the file path."""
        dir_full_path = self.ensure_dir(dir_path)
        file_path = os.path.join(dir_full_path, f"{filename}.json")

        with open(file_path, "w") as f:
            json.dump(data, f, indent=2)

        return file_path

    def load_json(self, dir_path: str, filename: str) -> Optional[Dict[str, Any]]:
        """Load "<dir_path>/<filename>.json"; return None if it doesn't exist."""
        file_path = os.path.join(self.base_path, dir_path, f"{filename}.json")

        if not os.path.exists(file_path):
            return None

        with open(file_path, "r") as f:
            return json.load(f)

    def list_files(self, dir_path: str, extension: Optional[str] = None) -> List[str]:
        """List files in a directory, optionally filtered by extension.

        Returns [] when the directory does not exist.
        """
        full_path = os.path.join(self.base_path, dir_path)

        if not os.path.exists(full_path):
            return []

        files = os.listdir(full_path)

        if extension:
            return [f for f in files if f.endswith(extension)]

        return files

    def delete_file(self, dir_path: str, filename: str) -> bool:
        """Delete a file; return True if it existed and was removed."""
        file_path = os.path.join(self.base_path, dir_path, filename)

        if os.path.exists(file_path):
            os.remove(file_path)
            return True

        return False

    def backup(self, backup_path: Optional[str] = None) -> str:
        """Create a zip backup of the entire storage; return the archive path."""
        if not backup_path:
            backup_path = f"{self.base_path}_backup"

        shutil.make_archive(backup_path, "zip", self.base_path)
        return f"{backup_path}.zip"

    def restore(self, backup_path: str) -> bool:
        """Restore from a backup archive, replacing all current contents.

        Returns False if the archive does not exist. Destructive: the
        existing base_path tree is wiped before unpacking.
        """
        if not os.path.exists(backup_path):
            return False

        shutil.rmtree(self.base_path, ignore_errors=True)
        os.makedirs(self.base_path, exist_ok=True)

        shutil.unpack_archive(backup_path, self.base_path)
        return True
|
promptlab/utils/templating.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import json
|
| 3 |
+
from typing import Dict, Any, List, Optional, Union, Callable
|
| 4 |
+
from string import Formatter
|
| 5 |
+
|
| 6 |
+
class TemplateError(Exception):
    """Exception raised for errors in template rendering.

    Raised both for invalid template syntax at construction time and
    for evaluation failures (conditions, loops, filters, variables)
    during render().
    """
    pass
|
| 9 |
+
|
| 10 |
+
class PromptTemplate:
|
| 11 |
+
"""Advanced templating system for prompts."""
|
| 12 |
+
def __init__(self, template: str):
|
| 13 |
+
self.template = template
|
| 14 |
+
self._validate_template()
|
| 15 |
+
|
| 16 |
+
def _validate_template(self) -> None:
|
| 17 |
+
"""Validate template syntax."""
|
| 18 |
+
try:
|
| 19 |
+
# Check for basic placeholder syntax
|
| 20 |
+
list(Formatter().parse(self.template))
|
| 21 |
+
|
| 22 |
+
# Check for conditional syntax
|
| 23 |
+
self._validate_conditionals()
|
| 24 |
+
|
| 25 |
+
# Check for loop syntax
|
| 26 |
+
self._validate_loops()
|
| 27 |
+
except Exception as e:
|
| 28 |
+
raise TemplateError(f"Invalid template syntax: {str(e)}")
|
| 29 |
+
|
| 30 |
+
def _validate_conditionals(self) -> None:
|
| 31 |
+
"""Validate conditional blocks in the template."""
|
| 32 |
+
# Simple validation to ensure if/endif blocks match
|
| 33 |
+
if_count = len(re.findall(r'\{\s*if\s+.*?\s*\}', self.template))
|
| 34 |
+
endif_count = len(re.findall(r'\{\s*endif\s*\}', self.template))
|
| 35 |
+
|
| 36 |
+
if if_count != endif_count:
|
| 37 |
+
raise TemplateError(f"Mismatched conditional blocks: {if_count} 'if' and {endif_count} 'endif'")
|
| 38 |
+
|
| 39 |
+
def _validate_loops(self) -> None:
|
| 40 |
+
"""Validate loop blocks in the template."""
|
| 41 |
+
# Simple validation to ensure for/endfor blocks match
|
| 42 |
+
for_count = len(re.findall(r'\{\s*for\s+.*?\s*\}', self.template))
|
| 43 |
+
endfor_count = len(re.findall(r'\{\s*endfor\s*\}', self.template))
|
| 44 |
+
|
| 45 |
+
if for_count != endfor_count:
|
| 46 |
+
raise TemplateError(f"Mismatched loop blocks: {for_count} 'for' and {endfor_count} 'endfor'")
|
| 47 |
+
|
| 48 |
+
def _render_conditionals(self, template: str, variables: Dict[str, Any]) -> str:
|
| 49 |
+
"""Process conditional blocks in the template."""
|
| 50 |
+
# Handle if-else-endif blocks
|
| 51 |
+
pattern = r'\{\s*if\s+(.*?)\s*\}(.*?)(?:\{\s*else\s*\}(.*?))?\{\s*endif\s*\}'
|
| 52 |
+
|
| 53 |
+
def replace_conditional(match):
|
| 54 |
+
condition = match.group(1)
|
| 55 |
+
if_block = match.group(2)
|
| 56 |
+
else_block = match.group(3) or ""
|
| 57 |
+
|
| 58 |
+
# Evaluate condition
|
| 59 |
+
try:
|
| 60 |
+
# Replace variables in condition
|
| 61 |
+
for var_name, var_value in variables.items():
|
| 62 |
+
if isinstance(var_value, str):
|
| 63 |
+
# For strings, replace with quoted value
|
| 64 |
+
condition = condition.replace(var_name, f'"{var_value}"')
|
| 65 |
+
else:
|
| 66 |
+
# For other types, replace directly
|
| 67 |
+
condition = condition.replace(var_name, str(var_value))
|
| 68 |
+
|
| 69 |
+
result = eval(condition, {"__builtins__": {}}, variables)
|
| 70 |
+
return if_block if result else else_block
|
| 71 |
+
except Exception as e:
|
| 72 |
+
raise TemplateError(f"Error evaluating condition '{condition}': {str(e)}")
|
| 73 |
+
|
| 74 |
+
# Use re.DOTALL to match across multiple lines
|
| 75 |
+
return re.sub(pattern, replace_conditional, template, flags=re.DOTALL)
|
| 76 |
+
|
| 77 |
+
def _render_loops(self, template: str, variables: Dict[str, Any]) -> str:
|
| 78 |
+
"""Process loop blocks in the template."""
|
| 79 |
+
# Handle for loops
|
| 80 |
+
pattern = r'\{\s*for\s+(.*?)\s+in\s+(.*?)\s*\}(.*?)\{\s*endfor\s*\}'
|
| 81 |
+
|
| 82 |
+
def replace_loop(match):
|
| 83 |
+
var_name = match.group(1)
|
| 84 |
+
iterable_expr = match.group(2)
|
| 85 |
+
loop_body = match.group(3)
|
| 86 |
+
|
| 87 |
+
try:
|
| 88 |
+
# Get the iterable from variables
|
| 89 |
+
if iterable_expr in variables and hasattr(variables[iterable_expr], '__iter__'):
|
| 90 |
+
iterable = variables[iterable_expr]
|
| 91 |
+
else:
|
| 92 |
+
# Try to evaluate the expression
|
| 93 |
+
iterable = eval(iterable_expr, {"__builtins__": {}}, variables)
|
| 94 |
+
|
| 95 |
+
if not hasattr(iterable, '__iter__'):
|
| 96 |
+
raise TemplateError(f"'{iterable_expr}' is not iterable")
|
| 97 |
+
|
| 98 |
+
# Process the loop body for each item
|
| 99 |
+
result = []
|
| 100 |
+
for item in iterable:
|
| 101 |
+
# Create a copy of variables with loop variable
|
| 102 |
+
loop_vars = variables.copy()
|
| 103 |
+
loop_vars[var_name] = item
|
| 104 |
+
|
| 105 |
+
# Process the loop body with the new variables
|
| 106 |
+
body_content = loop_body
|
| 107 |
+
for k, v in loop_vars.items():
|
| 108 |
+
placeholder = f"{{{k}}}"
|
| 109 |
+
if placeholder in body_content:
|
| 110 |
+
body_content = body_content.replace(placeholder, str(v))
|
| 111 |
+
|
| 112 |
+
result.append(body_content)
|
| 113 |
+
|
| 114 |
+
return "".join(result)
|
| 115 |
+
except Exception as e:
|
| 116 |
+
raise TemplateError(f"Error processing loop '{match.group(0)}': {str(e)}")
|
| 117 |
+
|
| 118 |
+
# Use re.DOTALL to match across multiple lines
|
| 119 |
+
return re.sub(pattern, replace_loop, template, flags=re.DOTALL)
|
| 120 |
+
|
| 121 |
+
def _apply_filters(self, value: Any, filters: List[str]) -> str:
|
| 122 |
+
"""Apply filters to a value."""
|
| 123 |
+
result = value
|
| 124 |
+
for filter_name in filters:
|
| 125 |
+
if filter_name == "upper":
|
| 126 |
+
result = str(result).upper()
|
| 127 |
+
elif filter_name == "lower":
|
| 128 |
+
result = str(result).lower()
|
| 129 |
+
elif filter_name == "title":
|
| 130 |
+
result = str(result).title()
|
| 131 |
+
elif filter_name == "capitalize":
|
| 132 |
+
result = str(result).capitalize()
|
| 133 |
+
elif filter_name == "strip":
|
| 134 |
+
result = str(result).strip()
|
| 135 |
+
elif filter_name == "json":
|
| 136 |
+
result = json.dumps(result)
|
| 137 |
+
else:
|
| 138 |
+
raise TemplateError(f"Unknown filter: {filter_name}")
|
| 139 |
+
return result
|
| 140 |
+
|
| 141 |
+
def _render_variables(self, template: str, variables: Dict[str, Any]) -> str:
    """Replace ``{name}`` / ``{name|filter1|filter2}`` placeholders with values.

    Each placeholder expression is first looked up verbatim in *variables*;
    failing that, it is evaluated as a Python expression with *variables*
    as its namespace. Placeholders that cannot be resolved are left in the
    output unchanged.

    Args:
        template: Template text containing ``{...}`` placeholders.
        variables: Mapping of variable names to values.

    Returns:
        The template with every resolvable placeholder substituted.

    Raises:
        TemplateError: if a resolved value fails filter processing.
    """
    # A placeholder is {expr} with an optional trailing |filter|filter chain.
    pattern = r'\{(.*?)(?:\|(.*?))?\}'

    def replace_var(match):
        var_expr = match.group(1).strip()
        filters_expr = match.group(2)

        # Split the "|f1|f2" tail into individual filter names.
        filters = [f.strip() for f in filters_expr.split('|')] if filters_expr else []

        try:
            if var_expr in variables:
                value = variables[var_expr]
            else:
                # SECURITY NOTE: eval() on template content is not a safe
                # sandbox even with empty __builtins__ — do not render
                # untrusted templates. Kept for backward compatibility.
                try:
                    value = eval(var_expr, {"__builtins__": {}}, variables)
                except Exception:
                    # Was a bare `except:`; narrowed so SystemExit /
                    # KeyboardInterrupt are no longer swallowed.
                    return match.group(0)  # Keep placeholder as-is on failure

            # Apply the filter chain and coerce to str for substitution.
            return str(self._apply_filters(value, filters))
        except Exception as e:
            raise TemplateError(f"Error processing variable '{var_expr}': {str(e)}")

    return re.sub(pattern, replace_var, template)
|
| 174 |
+
|
| 175 |
+
def render(self, **kwargs) -> str:
    """Render this template using the supplied keyword arguments.

    The template is processed in three ordered passes: conditional blocks,
    then loops, then plain variable substitution with filters.
    """
    rendered = self.template
    # Each stage consumes the previous stage's output; order matters, since
    # conditionals and loops must be expanded before variables are filled in.
    for stage in (
        self._render_conditionals,
        self._render_loops,
        self._render_variables,
    ):
        rendered = stage(rendered, kwargs)
    return rendered
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
class PromptTemplateRegistry:
    """Name-keyed store of PromptTemplate objects with lookup and rendering."""

    def __init__(self):
        # Backing store: template name -> PromptTemplate instance.
        self.templates: Dict[str, PromptTemplate] = {}

    def register(self, name: str, template: Union[str, PromptTemplate]) -> None:
        """Store *template* under *name*; raw strings are wrapped in PromptTemplate."""
        entry = PromptTemplate(template) if isinstance(template, str) else template
        self.templates[name] = entry

    def get(self, name: str) -> Optional[PromptTemplate]:
        """Return the template registered under *name*, or None if absent."""
        return self.templates.get(name)

    def render(self, name: str, **kwargs) -> str:
        """Render the template registered under *name* with *kwargs*.

        Raises:
            ValueError: if no template is registered under *name*.
        """
        found = self.get(name)
        if not found:
            raise ValueError(f"Template '{name}' not found")
        return found.render(**kwargs)

    def list_templates(self) -> List[str]:
        """Return the names of all registered templates."""
        return list(self.templates)
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
# Create a singleton instance
# Module-level shared registry; importers use this rather than creating their own.
template_registry = PromptTemplateRegistry()

# Register some common templates
# NOTE: whitespace inside these triple-quoted strings is part of the rendered
# output — do not reformat them.

# Minimal one-shot prompt: a system message followed by a single user message.
template_registry.register(
    "basic_completion",
    """
{system_message}

{user_message}
"""
)

# Multi-turn chat transcript: iterates `conversation` and labels each message
# "Human:" or "Assistant:" based on its role.
template_registry.register(
    "chat_template",
    """
{system_message}

{for message in conversation}
{if message.role == "user"}Human: {message.content}
{else}Assistant: {message.content}
{endif}
{endfor}
"""
)

# Few-shot prompt: lists `examples` as input/output pairs, then the new `input`.
template_registry.register(
    "few_shot",
    """
{system_message}

Here are some examples:
{for example in examples}
Input: {example.input}
Output: {example.output}
{endfor}

Input: {input}
Output:
"""
)
|
pyproject.toml
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Build backend configuration (PEP 517/518).
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"

# Package metadata (PEP 621).
[project]
name = "promptlab"
version = "0.1.0"
description = "A comprehensive LLM Prompt Management System"
readme = "README.md"
requires-python = ">=3.7"
license = {text = "MIT"}
keywords = ["llm", "prompt engineering", "nlp", "machine learning"]
authors = [
    {name = "Biswanath Roul"}
]
maintainers = [
    {name = "Biswanath Roul"}
]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
# Runtime dependencies installed with the package.
dependencies = [
    "numpy>=1.20.0",
]

[project.urls]
"Homepage" = "https://github.com/biswanathroul/promptlab"
"Bug Tracker" = "https://github.com/biswanathroul/promptlab/issues"
"Documentation" = "https://github.com/biswanathroul/promptlab/wiki"
"Source Code" = "https://github.com/biswanathroul/promptlab"

# Console entry point: `promptlab` invokes the CLI main() function.
[project.scripts]
promptlab = "promptlab.cli.commands:main"

[tool.setuptools]
# Explicit package list — promptlab.examples and promptlab.tests are not
# included here, so they are not installed; presumably intentional — TODO confirm.
packages = ["promptlab", "promptlab.core", "promptlab.cli", "promptlab.utils"]
|