Spaces:
Running
Running
| """ | |
| Prompt A/B Tester - Streamlit Web Application | |
| Aplikacja do testowania promptów OpenAI GPT z metodologią A/B | |
| """ | |
| import streamlit as st | |
| import time | |
| from datetime import datetime | |
| from config import ConfigManager | |
| from api_handler import APIHandler | |
| from test_runner import TestRunner | |
# Page configuration — must be the first Streamlit call in the script
st.set_page_config(
    page_title="Prompt A/B Tester",
    page_icon="🔬",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for better text visibility in disabled text areas
# (Streamlit greys out disabled widgets; force full-contrast text in both themes)
st.markdown("""
<style>
/* Improve visibility of disabled text areas */
.stTextArea textarea[disabled] {
color: #FFFFFF !important;
opacity: 1 !important;
-webkit-text-fill-color: #FFFFFF !important;
}
/* For light theme */
[data-theme="light"] .stTextArea textarea[disabled] {
color: #000000 !important;
-webkit-text-fill-color: #000000 !important;
}
</style>
""", unsafe_allow_html=True)
# Seed the per-session state exactly once; subsequent reruns keep the values.
if 'initialized' not in st.session_state:
    _state_defaults = {
        'initialized': True,
        'responses': [],        # raw generated responses (dicts with Option/Response_ID/Response/Score)
        'test_running': False,  # guards the run button while generation is in flight
        'logs': [],             # timestamped log lines shown in the Logs tab
        'results': None,        # aggregated A/B statistics once computed
        'rated_responses': [],  # snapshot of responses at the moment results were calculated
    }
    for _key, _value in _state_defaults.items():
        st.session_state[_key] = _value

# Backend components: persisted settings, OpenAI client wrapper, test logic.
config = ConfigManager()
api_handler = APIHandler(config.get_api_key())
test_runner = TestRunner(api_handler)
| # Helper functions | |
def log_message(message):
    """Append *message* to the session log, prefixed with an HH:MM:SS timestamp."""
    stamp = datetime.now().strftime("%H:%M:%S")
    st.session_state.logs.append("[" + stamp + "] " + message)
def clear_logs():
    """Clear all log lines stored in the session state."""
    st.session_state.logs = []
def update_progress_callback(current, total):
    """Progress callback for the test runner.

    Intentionally a no-op: progress is rendered via st.progress directly in
    the Streamlit UI rather than through this callback.
    """
    pass
# Title
st.title("🔬 Prompt A/B Tester")
st.markdown("**Testuj i porównuj wydajność dwóch wersji promptów OpenAI GPT**")

# Sidebar - Configuration
with st.sidebar:
    st.header("⚙️ Konfiguracja")

    # --- API key ---------------------------------------------------------
    st.subheader("🔑 Klucz API OpenAI")

    # On hosted deployments (e.g. Hugging Face Spaces) the key may be
    # supplied via st.secrets instead of manual input.
    api_key_from_secrets = None
    try:
        if hasattr(st, 'secrets') and 'OPENAI_API_KEY' in st.secrets:
            api_key_from_secrets = st.secrets['OPENAI_API_KEY']
            # BUGFIX: the secret key was announced as loaded but never handed
            # to the API handler, which kept the (possibly empty) config key.
            api_handler.set_api_key(api_key_from_secrets)
            st.success("✅ Klucz API załadowany z Secrets")
    except Exception:
        # Accessing st.secrets raises when no secrets file exists;
        # silently fall back to manual key entry. (Was a bare `except:`.)
        pass

    if not api_key_from_secrets:
        api_key_input = st.text_input(
            "Klucz API",
            type="password",
            value=config.get("api_key", ""),
            help="Wprowadź swój klucz API OpenAI"
        )
        if st.button("💾 Zapisz i Waliduj API Key"):
            if api_key_input:
                api_handler.set_api_key(api_key_input)
                config.set_api_key(api_key_input)
                success, message = api_handler.validate_api_key()
                if success:
                    st.success(f"✅ {message}")
                else:
                    st.error(f"❌ {message}")
            else:
                st.warning("⚠️ Klucz API nie może być pusty")

    st.divider()

    # --- Model and generation parameters ---------------------------------
    st.subheader("🤖 Model i Parametry")

    # Manual refresh just reports how many models are visible; the list
    # itself is re-fetched below on every rerun.
    if st.button("🔄 Odśwież modele"):
        models = api_handler.get_available_models()
        st.success(f"Znaleziono {len(models)} modeli")

    models = api_handler.get_available_models()
    # Pre-select the previously saved model when it is still available.
    saved_model = config.get("model", "gpt-4o")
    model = st.selectbox(
        "Model",
        options=models,
        index=models.index(saved_model) if saved_model in models else 0,
        help="Wybierz model OpenAI do testowania. Aplikacja automatycznie dostosowuje parametry API dla wszystkich modeli."
    )

    temperature = st.slider(
        "Temperature",
        min_value=0.0,
        max_value=2.0,
        value=config.get("temperature", 0.1),
        step=0.1,
        help="0.0 = deterministyczne, 2.0 = kreatywne"
    )

    max_tokens = st.number_input(
        "Max Tokens",
        min_value=100,
        max_value=4000,
        value=config.get("max_tokens", 2000),
        step=100
    )

    num_responses = st.number_input(
        "Liczba odpowiedzi (na prompt)",
        min_value=1,
        max_value=50,
        value=config.get("num_responses", 5),
        step=1,
        help="Ile odpowiedzi wygenerować dla każdego promptu"
    )

    # Persist current widget values to the config file.
    if st.button("💾 Zapisz ustawienia"):
        settings = {
            "model": model,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "num_responses": num_responses
        }
        config.update_settings(settings)
        st.success("✅ Ustawienia zapisane")

    # Rough cost estimate: two prompts × num_responses completions.
    if st.button("💰 Szacuj koszt"):
        total_responses = num_responses * 2
        cost = api_handler.estimate_cost(model, total_responses)
        st.info(f"**Szacunkowy koszt:**\n\n"
                f"Model: {model}\n\n"
                f"Odpowiedzi: {total_responses}\n\n"
                f"Koszt: ${cost:.4f} USD")
# Main tabs
tab1, tab2, tab3, tab4 = st.tabs(["🚀 Test", "⭐ Ocenianie", "📊 Wyniki", "📋 Logi"])


def _generate_batch(option_label, prompt_text, progress_bar, status_text,
                    start, total_iterations):
    """Generate `num_responses` completions for one prompt variant.

    Appends a result dict per completion to st.session_state.responses,
    updates the shared progress bar/status, and logs each call. Returns the
    updated iteration counter so the progress bar stays continuous across
    the A and B batches.
    """
    current = start
    for i in range(num_responses):
        current += 1
        progress_bar.progress(current / total_iterations)
        status_text.text(f"Generowanie: {current}/{total_iterations} ({option_label}-{i+1})")
        response = api_handler.generate_response(
            prompt_text, model, temperature, max_tokens
        )
        st.session_state.responses.append({
            'Option': option_label,
            'Response_ID': i + 1,
            'Response': response,
            'Score': None
        })
        # The handler signals failures via an "ERROR..." string, not exceptions.
        if response.startswith("ERROR"):
            log_message(f" {option_label}-{i+1}/{num_responses}... ❌ {response}")
        else:
            log_message(f" {option_label}-{i+1}/{num_responses}... ✅ ({len(response)} znaków)")
        # Small delay between calls to stay clear of API rate limits.
        time.sleep(0.5)
    return current


# Tab 1: Test
with tab1:
    st.header("🚀 Przeprowadź Test A/B")

    # Input method selection
    input_method = st.radio(
        "Metoda wprowadzania promptów:",
        options=["📁 Upload plików", "✏️ Wklej tekst"],
        horizontal=True
    )

    prompt_a = None
    prompt_b = None

    if input_method == "📁 Upload plików":
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Prompt A")
            file_a = st.file_uploader(
                "Wybierz plik dla Promptu A",
                type=['md', 'txt'],
                key="file_a"
            )
            if file_a:
                prompt_a = file_a.read().decode('utf-8')
                st.success(f"✅ Wczytano {len(prompt_a)} znaków")
        with col2:
            st.subheader("Prompt B")
            file_b = st.file_uploader(
                "Wybierz plik dla Promptu B",
                type=['md', 'txt'],
                key="file_b"
            )
            if file_b:
                prompt_b = file_b.read().decode('utf-8')
                st.success(f"✅ Wczytano {len(prompt_b)} znaków")
    else:  # paste text directly
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Prompt A")
            prompt_a = st.text_area(
                "Wklej treść Promptu A",
                height=300,
                key="text_a"
            )
        with col2:
            st.subheader("Prompt B")
            prompt_b = st.text_area(
                "Wklej treść Promptu B",
                height=300,
                key="text_b"
            )

    # Test controls
    st.divider()
    col1, col2, col3 = st.columns([1, 1, 3])
    with col1:
        run_button = st.button(
            "🚀 Uruchom Test",
            type="primary",
            disabled=st.session_state.test_running,
            use_container_width=True
        )
    with col2:
        if st.button("🗑️ Wyczyść", use_container_width=True):
            st.session_state.responses = []
            st.session_state.results = None
            st.session_state.rated_responses = []
            clear_logs()
            st.rerun()

    # Run test
    if run_button:
        # Validation
        if not config.get_api_key():
            st.error("❌ Najpierw ustaw klucz API w panelu bocznym")
        elif not prompt_a or not prompt_b:
            st.error("❌ Oba prompty muszą być wypełnione")
        else:
            # BUGFIX: reset the previous run's data — the old code kept
            # appending, so repeated runs mixed stale responses into the
            # ratings and results.
            st.session_state.responses = []
            st.session_state.results = None
            st.session_state.rated_responses = []
            clear_logs()
            log_message("🚀 Rozpoczynanie testu A/B")
            log_message(f"Model: {model}, Temperature: {temperature}, Max tokens: {max_tokens}")
            log_message(f"Liczba odpowiedzi na prompt: {num_responses}")

            # Progress tracking
            progress_bar = st.progress(0)
            status_text = st.empty()
            total_iterations = num_responses * 2

            st.session_state.test_running = True
            try:
                log_message("🔄 Generowanie odpowiedzi dla PROMPTU A...")
                done = _generate_batch('A', prompt_a, progress_bar, status_text,
                                       0, total_iterations)
                log_message("\n🔄 Generowanie odpowiedzi dla PROMPTU B...")
                _generate_batch('B', prompt_b, progress_bar, status_text,
                                done, total_iterations)
            finally:
                # BUGFIX: always release the flag, even if generation raises,
                # so the run button is not left permanently disabled.
                st.session_state.test_running = False

            progress_bar.progress(1.0)
            status_text.text("✅ Zakończono!")
            log_message(f"\n✅ GENEROWANIE ZAKOŃCZONE - wygenerowano {len(st.session_state.responses)} odpowiedzi")
            st.success("🎉 Test zakończony! Przejdź do zakładki '⭐ Ocenianie'")
# Tab 2: Rating
with tab2:
    st.header("⭐ Oceń Odpowiedzi")

    if not st.session_state.responses:
        st.info("ℹ️ Najpierw uruchom test w zakładce '🚀 Test'")
    else:
        st.markdown("**Oceń każdą odpowiedź w skali 1-5:**")
        st.markdown("1 = Bardzo słaba | 2 = Słaba | 3 = Średnia | 4 = Dobra | 5 = Bardzo dobra")
        st.divider()

        # Display each generated response with a 1-5 score widget next to it.
        for idx, resp in enumerate(st.session_state.responses):
            with st.container():
                col1, col2 = st.columns([4, 1])
                with col1:
                    option_label = f"{resp['Option']}-{resp['Response_ID']}"
                    st.subheader(f"Option: {option_label}")
                    st.markdown("**Odpowiedź:**")
                    st.text_area(
                        "Odpowiedź:",
                        value=resp['Response'],
                        height=150,
                        key=f"response_display_{idx}",
                        disabled=True,
                        label_visibility="collapsed"
                    )
                with col2:
                    st.markdown("**Ocena**")
                    # BUGFIX: 'Score' is initialised to None, and
                    # resp.get('Score', 3) returns that None (the key exists),
                    # not the intended default 3 — coalesce explicitly so the
                    # widget always starts at a valid value.
                    current_score = resp.get('Score')
                    score = st.number_input(
                        "Ocena (1-5)",
                        min_value=1,
                        max_value=5,
                        value=current_score if current_score is not None else 3,
                        step=1,
                        key=f"score_{idx}",
                        label_visibility="collapsed"
                    )
                    # Write the widget value straight back to session state.
                    st.session_state.responses[idx]['Score'] = score
                st.divider()

        # Snapshot the rated responses and compute aggregate A/B statistics.
        if st.button("📊 Oblicz Wyniki", type="primary", use_container_width=True):
            st.session_state.rated_responses = st.session_state.responses.copy()
            st.session_state.results = test_runner.calculate_results(st.session_state.rated_responses)
            st.success("✅ Wyniki obliczone! Przejdź do zakładki '📊 Wyniki'")
            st.rerun()
# Tab 3: Results
with tab3:
    st.header("📊 Wyniki Testu A/B")

    if not st.session_state.results:
        st.info("ℹ️ Najpierw oceń odpowiedzi i kliknij 'Oblicz Wyniki' w zakładce '⭐ Ocenianie'")
    else:
        results = st.session_state.results

        # Summary — one five-column metric row per prompt variant.
        st.subheader("Podsumowanie")
        for variant in ('A', 'B'):
            stats = results[variant]
            cols = st.columns(5)
            cols[0].metric("Option", variant)
            cols[1].metric("Count", stats['count'])
            cols[2].metric("Score", f"{stats['score']:.2f}")
            cols[3].metric("Min", stats['min'])
            cols[4].metric("Max", stats['max'])

        st.divider()

        # Winner announcement (or a tie when the mean scores are equal).
        score_a = results['A']['score']
        score_b = results['B']['score']
        if score_a == score_b:
            st.info("🤝 **REMIS**\n\nObie opcje uzyskały identyczny wynik")
        else:
            winner = 'A' if score_a > score_b else 'B'
            diff = abs(score_a - score_b)
            st.success(f"🏆 **ZWYCIĘZCA: Prompt {winner}**\n\nPrzewaga: +{diff:.2f} punktu")

        st.divider()

        # Export section
        st.subheader("📥 Eksport Wyników")
        st.markdown("Pobierz wyniki w preferowanym formacie:")

        settings = {
            "model": config.get("model"),
            "temperature": config.get("temperature"),
            "max_tokens": config.get("max_tokens"),
            "num_responses": config.get("num_responses")
        }
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # (label, exporter, file extension, MIME type, tooltip) per format,
        # laid out two per column in the original order.
        export_specs = [
            ("📄 CSV", test_runner.export_to_csv, "csv",
             "text/csv", "Pobierz w formacie CSV"),
            ("🔧 JSON", test_runner.export_to_json, "json",
             "application/json", "Pobierz w formacie JSON"),
            ("📊 Excel", test_runner.export_to_excel, "xlsx",
             "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
             "Pobierz w formacie Excel"),
            ("📝 TXT", test_runner.export_to_txt, "txt",
             "text/plain", "Pobierz w formacie TXT"),
            ("📋 Markdown", test_runner.export_to_markdown, "md",
             "text/markdown", "Pobierz w formacie Markdown"),
            ("📄 Word", test_runner.export_to_word, "docx",
             "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
             "Pobierz w formacie Word"),
        ]

        export_columns = st.columns(3)
        for pos, (label, exporter, ext, mime, tooltip) in enumerate(export_specs):
            with export_columns[pos // 2]:
                buffer = exporter(
                    st.session_state.rated_responses,
                    results,
                    settings
                )
                st.download_button(
                    label=label,
                    data=buffer,
                    file_name=f"prompt_ab_test_{timestamp}.{ext}",
                    mime=mime,
                    use_container_width=True,
                    help=tooltip
                )
# Tab 4: Logs
with tab4:
    st.header("📋 Logi Testów")

    if st.button("🗑️ Wyczyść logi"):
        clear_logs()
        st.rerun()

    # Show the accumulated log lines, or a placeholder when empty.
    if not st.session_state.logs:
        st.info("ℹ️ Brak logów do wyświetlenia")
    else:
        st.text_area(
            "Logi:",
            value="\n".join(st.session_state.logs),
            height=500,
            disabled=True,
            label_visibility="collapsed"
        )
# Footer — static HTML credit line rendered below all tabs
st.divider()
st.markdown(
    """
<div style='text-align: center; color: gray; font-size: 0.9em;'>
Prompt A/B Tester v2.0 (Streamlit) | Created by Heuristica.pl - Marek Staniszewski
</div>
""",
    unsafe_allow_html=True
)