"""
Prompt A/B Tester - Streamlit Web Application
An application for A/B testing OpenAI GPT prompts
"""
import streamlit as st
import time
from datetime import datetime
from config import ConfigManager
from api_handler import APIHandler
from test_runner import TestRunner
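# ConfigManager, APIHandler, and TestRunner are local modules in this repository
# (config.py, api_handler.py, test_runner.py), not PyPI packages.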
# Page configuration
st.set_page_config(
    page_title="Prompt A/B Tester",
    page_icon="🔬",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Custom CSS for better text visibility in disabled text areas
st.markdown("""
<style>
/* Improve visibility of disabled text areas */
.stTextArea textarea[disabled] {
color: #FFFFFF !important;
opacity: 1 !important;
-webkit-text-fill-color: #FFFFFF !important;
}
/* For light theme */
[data-theme="light"] .stTextArea textarea[disabled] {
color: #000000 !important;
-webkit-text-fill-color: #000000 !important;
}
</style>
""", unsafe_allow_html=True)
# Initialize session state
if 'initialized' not in st.session_state:
    st.session_state.initialized = True
    st.session_state.responses = []
    st.session_state.test_running = False
    st.session_state.logs = []
    st.session_state.results = None
    st.session_state.rated_responses = []
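# Each entry appended to st.session_state.responses below is a dict with the keys
# 'Option' ('A' or 'B'), 'Response_ID', 'Response', and 'Score' (None until rated).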
# Initialize components
config = ConfigManager()
api_handler = APIHandler(config.get_api_key())
test_runner = TestRunner(api_handler)
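# Streamlit re-executes this script top to bottom on every interaction, so these
# objects are rebuilt on each rerun; persistent data lives in st.session_state.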
# Helper functions
def log_message(message):
    """Add message to logs"""
    timestamp = datetime.now().strftime("%H:%M:%S")
    st.session_state.logs.append(f"[{timestamp}] {message}")

def clear_logs():
    """Clear all logs"""
    st.session_state.logs = []

def update_progress_callback(current, total):
    """Progress callback for test runner"""
    # Progress is handled by st.progress in Streamlit
    pass
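# A minimal retry sketch (an optional helper, not wired into the app): the loops in
# the Test tab treat any response string starting with "ERROR" as a failed call, so
# transient failures could be retried like this. The retry count and backoff values
# are illustrative assumptions, not part of the original design.
def generate_with_retry(prompt, model, temperature, max_tokens, retries=2):
    """Retry generate_response with exponential backoff on "ERROR" responses."""
    response = "ERROR: no attempts made"
    for attempt in range(retries + 1):
        response = api_handler.generate_response(prompt, model, temperature, max_tokens)
        if not response.startswith("ERROR"):
            break  # success: stop retrying
        time.sleep(2 ** attempt)  # back off: 1s, 2s, 4s, ...
    return response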
# Title
st.title("🔬 Prompt A/B Tester")
st.markdown("**Testuj i porównuj wydajność dwóch wersji promptów OpenAI GPT**")
# Sidebar - Configuration
with st.sidebar:
    st.header("⚙️ Configuration")

    # API key
    st.subheader("🔑 OpenAI API Key")

    # Check whether the API key is provided via secrets (Hugging Face deployment)
    api_key_from_secrets = None
    try:
        if hasattr(st, 'secrets') and 'OPENAI_API_KEY' in st.secrets:
            api_key_from_secrets = st.secrets['OPENAI_API_KEY']
            st.success("✅ API key loaded from Secrets")
    except Exception:
        # st.secrets raises if no secrets file is present; fall back to manual input
        pass

    if not api_key_from_secrets:
        api_key_input = st.text_input(
            "API Key",
            type="password",
            value=config.get("api_key", ""),
            help="Enter your OpenAI API key"
        )
        if st.button("💾 Save and Validate API Key"):
            if api_key_input:
                api_handler.set_api_key(api_key_input)
                config.set_api_key(api_key_input)
                success, message = api_handler.validate_api_key()
                if success:
                    st.success(f"✅ {message}")
                else:
                    st.error(f"❌ {message}")
            else:
                st.warning("⚠️ The API key cannot be empty")

    st.divider()
    # Model and parameters
    st.subheader("🤖 Model and Parameters")

    # Refresh the model list
    if st.button("🔄 Refresh models"):
        models = api_handler.get_available_models()
        st.success(f"Found {len(models)} models")

    models = api_handler.get_available_models()

    model = st.selectbox(
        "Model",
        options=models,
        index=models.index(config.get("model", "gpt-4o")) if config.get("model", "gpt-4o") in models else 0,
        help="Select the OpenAI model to test. The application automatically adjusts API parameters for all models."
    )

    temperature = st.slider(
        "Temperature",
        min_value=0.0,
        max_value=2.0,
        value=config.get("temperature", 0.1),
        step=0.1,
        help="0.0 = deterministic, 2.0 = creative"
    )

    max_tokens = st.number_input(
        "Max Tokens",
        min_value=100,
        max_value=4000,
        value=config.get("max_tokens", 2000),
        step=100
    )

    num_responses = st.number_input(
        "Responses (per prompt)",
        min_value=1,
        max_value=50,
        value=config.get("num_responses", 5),
        step=1,
        help="How many responses to generate for each prompt"
    )

    # Save settings
    if st.button("💾 Save settings"):
        settings = {
            "model": model,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "num_responses": num_responses
        }
        config.update_settings(settings)
        st.success("✅ Settings saved")

    # Cost estimation (covers both prompts: num_responses * 2 calls in total)
    if st.button("💰 Estimate cost"):
        total_responses = num_responses * 2
        cost = api_handler.estimate_cost(model, total_responses)
        st.info(f"**Estimated cost:**\n\n"
                f"Model: {model}\n\n"
                f"Responses: {total_responses}\n\n"
                f"Cost: ${cost:.4f} USD")
# Main tabs
tab1, tab2, tab3, tab4 = st.tabs(["🚀 Test", "⭐ Rating", "📊 Results", "📋 Logs"])
# Tab 1: Test
with tab1:
    st.header("🚀 Run an A/B Test")

    # Input method selection
    input_method = st.radio(
        "Prompt input method:",
        options=["📁 Upload files", "✏️ Paste text"],
        horizontal=True
    )

    prompt_a = None
    prompt_b = None

    if input_method == "📁 Upload files":
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Prompt A")
            file_a = st.file_uploader(
                "Choose a file for Prompt A",
                type=['md', 'txt'],
                key="file_a"
            )
            if file_a:
                prompt_a = file_a.read().decode('utf-8')
                st.success(f"✅ Loaded {len(prompt_a)} characters")
        with col2:
            st.subheader("Prompt B")
            file_b = st.file_uploader(
                "Choose a file for Prompt B",
                type=['md', 'txt'],
                key="file_b"
            )
            if file_b:
                prompt_b = file_b.read().decode('utf-8')
                st.success(f"✅ Loaded {len(prompt_b)} characters")
    else:  # Paste text
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Prompt A")
            prompt_a = st.text_area(
                "Paste the content of Prompt A",
                height=300,
                key="text_a"
            )
        with col2:
            st.subheader("Prompt B")
            prompt_b = st.text_area(
                "Paste the content of Prompt B",
                height=300,
                key="text_b"
            )
    # Test controls
    st.divider()
    col1, col2, col3 = st.columns([1, 1, 3])
    with col1:
        run_button = st.button(
            "🚀 Run Test",
            type="primary",
            disabled=st.session_state.test_running,
            use_container_width=True
        )
    with col2:
        if st.button("🗑️ Clear", use_container_width=True):
            st.session_state.responses = []
            st.session_state.results = None
            st.session_state.rated_responses = []
            clear_logs()
            st.rerun()
    # Run test
    if run_button:
        # Validation
        if not config.get_api_key():
            st.error("❌ Set the API key in the sidebar first")
        elif not prompt_a or not prompt_b:
            st.error("❌ Both prompts must be filled in")
        else:
            clear_logs()
            # Reset previous responses so a repeated run doesn't append to old results
            st.session_state.responses = []
            log_message("🚀 Starting A/B test")
            log_message(f"Model: {model}, Temperature: {temperature}, Max tokens: {max_tokens}")
            log_message(f"Responses per prompt: {num_responses}")

            # Progress tracking
            progress_bar = st.progress(0)
            status_text = st.empty()
            total_iterations = num_responses * 2
            current = 0

            st.session_state.test_running = True

            # Generate responses for Prompt A
            log_message("🔄 Generating responses for PROMPT A...")
            for i in range(num_responses):
                current += 1
                progress_bar.progress(current / total_iterations)
                status_text.text(f"Generating: {current}/{total_iterations} (A-{i+1})")
                response = api_handler.generate_response(
                    prompt_a, model, temperature, max_tokens
                )
                st.session_state.responses.append({
                    'Option': 'A',
                    'Response_ID': i + 1,
                    'Response': response,
                    'Score': None
                })
                if response.startswith("ERROR"):
                    log_message(f"  A-{i+1}/{num_responses}... ❌ {response}")
                else:
                    log_message(f"  A-{i+1}/{num_responses}... ✅ ({len(response)} characters)")
                time.sleep(0.5)  # brief pause between API calls

            # Generate responses for Prompt B
            log_message("\n🔄 Generating responses for PROMPT B...")
            for i in range(num_responses):
                current += 1
                progress_bar.progress(current / total_iterations)
                status_text.text(f"Generating: {current}/{total_iterations} (B-{i+1})")
                response = api_handler.generate_response(
                    prompt_b, model, temperature, max_tokens
                )
                st.session_state.responses.append({
                    'Option': 'B',
                    'Response_ID': i + 1,
                    'Response': response,
                    'Score': None
                })
                if response.startswith("ERROR"):
                    log_message(f"  B-{i+1}/{num_responses}... ❌ {response}")
                else:
                    log_message(f"  B-{i+1}/{num_responses}... ✅ ({len(response)} characters)")
                time.sleep(0.5)

            progress_bar.progress(1.0)
            status_text.text("✅ Done!")
            log_message(f"\n✅ GENERATION COMPLETE - generated {len(st.session_state.responses)} responses")
            st.session_state.test_running = False
            st.success("🎉 Test finished! Go to the '⭐ Rating' tab")
# Tab 2: Rating
with tab2:
    st.header("⭐ Rate the Responses")

    if not st.session_state.responses:
        st.info("ℹ️ Run a test in the '🚀 Test' tab first")
    else:
        st.markdown("**Rate each response on a scale of 1-5:**")
        st.markdown("1 = Very poor | 2 = Poor | 3 = Average | 4 = Good | 5 = Very good")
        st.divider()

        # Display responses for rating
        for idx, resp in enumerate(st.session_state.responses):
            with st.container():
                col1, col2 = st.columns([4, 1])
                with col1:
                    option_label = f"{resp['Option']}-{resp['Response_ID']}"
                    st.subheader(f"Option: {option_label}")
                    st.markdown("**Response:**")
                    st.text_area(
                        "Response:",
                        value=resp['Response'],
                        height=150,
                        key=f"response_display_{idx}",
                        disabled=True,
                        label_visibility="collapsed"
                    )
                with col2:
                    st.markdown("**Score**")
                    score = st.number_input(
                        "Score (1-5)",
                        min_value=1,
                        max_value=5,
                        # 'Score' is initialized to None, so fall back to 3 explicitly;
                        # resp.get('Score', 3) would return None here because the key exists
                        value=resp['Score'] if resp['Score'] is not None else 3,
                        step=1,
                        key=f"score_{idx}",
                        label_visibility="collapsed"
                    )
                    st.session_state.responses[idx]['Score'] = score
                st.divider()

        # Calculate results button
        if st.button("📊 Calculate Results", type="primary", use_container_width=True):
            st.session_state.rated_responses = st.session_state.responses.copy()
            st.session_state.results = test_runner.calculate_results(st.session_state.rated_responses)
            st.success("✅ Results calculated! Go to the '📊 Results' tab")
            st.rerun()
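# test_runner.calculate_results is expected to return a dict keyed by 'A' and 'B',
# each holding 'count', 'score' (the mean rating), 'min', and 'max' - exactly the
# fields rendered in the Results tab below.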
# Tab 3: Results
with tab3:
    st.header("📊 A/B Test Results")

    if not st.session_state.results:
        st.info("ℹ️ Rate the responses and click 'Calculate Results' in the '⭐ Rating' tab first")
    else:
        results = st.session_state.results

        # Results table
        st.subheader("Summary")
        col1, col2, col3, col4, col5 = st.columns(5)
        with col1:
            st.metric("Option", "A")
        with col2:
            st.metric("Count", results['A']['count'])
        with col3:
            st.metric("Score", f"{results['A']['score']:.2f}")
        with col4:
            st.metric("Min", results['A']['min'])
        with col5:
            st.metric("Max", results['A']['max'])

        col1, col2, col3, col4, col5 = st.columns(5)
        with col1:
            st.metric("Option", "B")
        with col2:
            st.metric("Count", results['B']['count'])
        with col3:
            st.metric("Score", f"{results['B']['score']:.2f}")
        with col4:
            st.metric("Min", results['B']['min'])
        with col5:
            st.metric("Max", results['B']['max'])

        st.divider()

        # Winner
        if results['A']['score'] > results['B']['score']:
            winner = 'A'
            diff = results['A']['score'] - results['B']['score']
            st.success(f"🏆 **WINNER: Prompt {winner}**\n\nMargin: +{diff:.2f} points")
        elif results['B']['score'] > results['A']['score']:
            winner = 'B'
            diff = results['B']['score'] - results['A']['score']
            st.success(f"🏆 **WINNER: Prompt {winner}**\n\nMargin: +{diff:.2f} points")
        else:
            st.info("🤝 **TIE**\n\nBoth options achieved an identical score")
        st.divider()

        # Export section
        st.subheader("📥 Export Results")
        st.markdown("Download the results in your preferred format:")

        settings = {
            "model": config.get("model"),
            "temperature": config.get("temperature"),
            "max_tokens": config.get("max_tokens"),
            "num_responses": config.get("num_responses")
        }
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Create 3 columns for export buttons
        col1, col2, col3 = st.columns(3)

        with col1:
            # CSV export
            csv_buffer = test_runner.export_to_csv(
                st.session_state.rated_responses,
                results,
                settings
            )
            st.download_button(
                label="📄 CSV",
                data=csv_buffer,
                file_name=f"prompt_ab_test_{timestamp}.csv",
                mime="text/csv",
                use_container_width=True,
                help="Download as CSV"
            )

            # JSON export
            json_buffer = test_runner.export_to_json(
                st.session_state.rated_responses,
                results,
                settings
            )
            st.download_button(
                label="🔧 JSON",
                data=json_buffer,
                file_name=f"prompt_ab_test_{timestamp}.json",
                mime="application/json",
                use_container_width=True,
                help="Download as JSON"
            )

        with col2:
            # Excel export
            excel_buffer = test_runner.export_to_excel(
                st.session_state.rated_responses,
                results,
                settings
            )
            st.download_button(
                label="📊 Excel",
                data=excel_buffer,
                file_name=f"prompt_ab_test_{timestamp}.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                use_container_width=True,
                help="Download as Excel"
            )

            # TXT export
            txt_buffer = test_runner.export_to_txt(
                st.session_state.rated_responses,
                results,
                settings
            )
            st.download_button(
                label="📝 TXT",
                data=txt_buffer,
                file_name=f"prompt_ab_test_{timestamp}.txt",
                mime="text/plain",
                use_container_width=True,
                help="Download as TXT"
            )

        with col3:
            # Markdown export
            markdown_buffer = test_runner.export_to_markdown(
                st.session_state.rated_responses,
                results,
                settings
            )
            st.download_button(
                label="📋 Markdown",
                data=markdown_buffer,
                file_name=f"prompt_ab_test_{timestamp}.md",
                mime="text/markdown",
                use_container_width=True,
                help="Download as Markdown"
            )

            # Word export
            word_buffer = test_runner.export_to_word(
                st.session_state.rated_responses,
                results,
                settings
            )
            st.download_button(
                label="📄 Word",
                data=word_buffer,
                file_name=f"prompt_ab_test_{timestamp}.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                use_container_width=True,
                help="Download as Word"
            )
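        # The export_to_* helpers in test_runner.py are assumed to return data that
        # st.download_button accepts directly: str or bytes for the text formats,
        # and a bytes buffer for the Excel and Word files.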
# Tab 4: Logs
with tab4:
    st.header("📋 Test Logs")

    if st.button("🗑️ Clear logs"):
        clear_logs()
        st.rerun()

    if st.session_state.logs:
        log_text = "\n".join(st.session_state.logs)
        st.text_area(
            "Logs:",
            value=log_text,
            height=500,
            disabled=True,
            label_visibility="collapsed"
        )
    else:
        st.info("ℹ️ No logs to display")
# Footer
st.divider()
st.markdown(
    """
    <div style='text-align: center; color: gray; font-size: 0.9em;'>
        Prompt A/B Tester v2.0 (Streamlit) | Created by Heuristica.pl - Marek Staniszewski
    </div>
    """,
    unsafe_allow_html=True
)