Marek4321 committed on
Commit
c7bea8b
·
verified ·
1 Parent(s): fd5c1e1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +471 -471
app.py CHANGED
@@ -1,471 +1,471 @@
1
- """
2
- Prompt A/B Tester - Streamlit Web Application
3
- Aplikacja do testowania promptów OpenAI GPT z metodologią A/B
4
- """
5
-
6
- import streamlit as st
7
- import time
8
- from datetime import datetime
9
- from config import ConfigManager
10
- from api_handler import APIHandler
11
- from test_runner import TestRunner
12
-
13
- # Page configuration
14
- st.set_page_config(
15
- page_title="Prompt A/B Tester",
16
- page_icon="🔬",
17
- layout="wide",
18
- initial_sidebar_state="expanded"
19
- )
20
-
21
- # Initialize session state
22
- if 'initialized' not in st.session_state:
23
- st.session_state.initialized = True
24
- st.session_state.responses = []
25
- st.session_state.test_running = False
26
- st.session_state.logs = []
27
- st.session_state.results = None
28
- st.session_state.rated_responses = []
29
-
30
- # Initialize components
31
- config = ConfigManager()
32
- api_handler = APIHandler(config.get_api_key())
33
- test_runner = TestRunner(api_handler)
34
-
35
- # Helper functions
36
- def log_message(message):
37
- """Add message to logs"""
38
- timestamp = datetime.now().strftime("%H:%M:%S")
39
- st.session_state.logs.append(f"[{timestamp}] {message}")
40
-
41
- def clear_logs():
42
- """Clear all logs"""
43
- st.session_state.logs = []
44
-
45
- def update_progress_callback(current, total):
46
- """Progress callback for test runner"""
47
- # Progress is handled by st.progress in Streamlit
48
- pass
49
-
50
- # Title
51
- st.title("🔬 Prompt A/B Tester")
52
- st.markdown("**Testuj i porównuj wydajność dwóch wersji promptów OpenAI GPT**")
53
-
54
- # Sidebar - Configuration
55
- with st.sidebar:
56
- st.header("⚙️ Konfiguracja")
57
-
58
- # API Key
59
- st.subheader("🔑 Klucz API OpenAI")
60
-
61
- # Check if API key is in secrets (Hugging Face deployment)
62
- api_key_from_secrets = None
63
- try:
64
- if hasattr(st, 'secrets') and 'OPENAI_API_KEY' in st.secrets:
65
- api_key_from_secrets = st.secrets['OPENAI_API_KEY']
66
- st.success("✅ Klucz API załadowany z Secrets")
67
- except:
68
- pass
69
-
70
- if not api_key_from_secrets:
71
- api_key_input = st.text_input(
72
- "Klucz API",
73
- type="password",
74
- value=config.get("api_key", ""),
75
- help="Wprowadź swój klucz API OpenAI"
76
- )
77
-
78
- if st.button("💾 Zapisz i Waliduj API Key"):
79
- if api_key_input:
80
- api_handler.set_api_key(api_key_input)
81
- config.set_api_key(api_key_input)
82
- success, message = api_handler.validate_api_key()
83
- if success:
84
- st.success(f"✅ {message}")
85
- else:
86
- st.error(f"❌ {message}")
87
- else:
88
- st.warning("⚠️ Klucz API nie może być pusty")
89
-
90
- st.divider()
91
-
92
- # Model and Parameters
93
- st.subheader("🤖 Model i Parametry")
94
-
95
- # Refresh models
96
- if st.button("🔄 Odśwież modele"):
97
- models = api_handler.get_available_models()
98
- st.success(f"Znaleziono {len(models)} modeli")
99
-
100
- models = api_handler.get_available_models()
101
- model = st.selectbox(
102
- "Model",
103
- options=models,
104
- index=models.index(config.get("model", "gpt-4o")) if config.get("model", "gpt-4o") in models else 0
105
- )
106
-
107
- temperature = st.slider(
108
- "Temperature",
109
- min_value=0.0,
110
- max_value=2.0,
111
- value=config.get("temperature", 0.1),
112
- step=0.1,
113
- help="0.0 = deterministyczne, 2.0 = kreatywne"
114
- )
115
-
116
- max_tokens = st.number_input(
117
- "Max Tokens",
118
- min_value=100,
119
- max_value=4000,
120
- value=config.get("max_tokens", 2000),
121
- step=100
122
- )
123
-
124
- num_responses = st.number_input(
125
- "Liczba odpowiedzi (na prompt)",
126
- min_value=1,
127
- max_value=50,
128
- value=config.get("num_responses", 5),
129
- step=1,
130
- help="Ile odpowiedzi wygenerować dla każdego promptu"
131
- )
132
-
133
- # Save settings
134
- if st.button("💾 Zapisz ustawienia"):
135
- settings = {
136
- "model": model,
137
- "temperature": temperature,
138
- "max_tokens": max_tokens,
139
- "num_responses": num_responses
140
- }
141
- config.update_settings(settings)
142
- st.success("✅ Ustawienia zapisane")
143
-
144
- # Cost estimation
145
- if st.button("💰 Szacuj koszt"):
146
- total_responses = num_responses * 2
147
- cost = api_handler.estimate_cost(model, total_responses)
148
- st.info(f"**Szacunkowy koszt:**\n\n"
149
- f"Model: {model}\n\n"
150
- f"Odpowiedzi: {total_responses}\n\n"
151
- f"Koszt: ${cost:.4f} USD")
152
-
153
- # Main tabs
154
- tab1, tab2, tab3, tab4 = st.tabs(["🚀 Test", "⭐ Ocenianie", "📊 Wyniki", "📋 Logi"])
155
-
156
- # Tab 1: Test
157
- with tab1:
158
- st.header("🚀 Przeprowadź Test A/B")
159
-
160
- # Input method selection
161
- input_method = st.radio(
162
- "Metoda wprowadzania promptów:",
163
- options=["📁 Upload plików", "✏️ Wklej tekst"],
164
- horizontal=True
165
- )
166
-
167
- prompt_a = None
168
- prompt_b = None
169
-
170
- if input_method == "📁 Upload plików":
171
- col1, col2 = st.columns(2)
172
-
173
- with col1:
174
- st.subheader("Prompt A")
175
- file_a = st.file_uploader(
176
- "Wybierz plik dla Promptu A",
177
- type=['md', 'txt'],
178
- key="file_a"
179
- )
180
- if file_a:
181
- prompt_a = file_a.read().decode('utf-8')
182
- st.success(f"✅ Wczytano {len(prompt_a)} znaków")
183
-
184
- with col2:
185
- st.subheader("Prompt B")
186
- file_b = st.file_uploader(
187
- "Wybierz plik dla Promptu B",
188
- type=['md', 'txt'],
189
- key="file_b"
190
- )
191
- if file_b:
192
- prompt_b = file_b.read().decode('utf-8')
193
- st.success(f"✅ Wczytano {len(prompt_b)} znaków")
194
-
195
- else: # Wklej tekst
196
- col1, col2 = st.columns(2)
197
-
198
- with col1:
199
- st.subheader("Prompt A")
200
- prompt_a = st.text_area(
201
- "Wklej treść Promptu A",
202
- height=300,
203
- key="text_a"
204
- )
205
-
206
- with col2:
207
- st.subheader("Prompt B")
208
- prompt_b = st.text_area(
209
- "Wklej treść Promptu B",
210
- height=300,
211
- key="text_b"
212
- )
213
-
214
- # Test controls
215
- st.divider()
216
-
217
- col1, col2, col3 = st.columns([1, 1, 3])
218
-
219
- with col1:
220
- run_button = st.button(
221
- "🚀 Uruchom Test",
222
- type="primary",
223
- disabled=st.session_state.test_running,
224
- use_container_width=True
225
- )
226
-
227
- with col2:
228
- if st.button("🗑️ Wyczyść", use_container_width=True):
229
- st.session_state.responses = []
230
- st.session_state.results = None
231
- st.session_state.rated_responses = []
232
- clear_logs()
233
- st.rerun()
234
-
235
- # Run test
236
- if run_button:
237
- # Validation
238
- if not config.get_api_key():
239
- st.error("❌ Najpierw ustaw klucz API w panelu bocznym")
240
- elif not prompt_a or not prompt_b:
241
- st.error("❌ Oba prompty muszą być wypełnione")
242
- else:
243
- clear_logs()
244
- log_message("🚀 Rozpoczynanie testu A/B")
245
- log_message(f"Model: {model}, Temperature: {temperature}, Max tokens: {max_tokens}")
246
- log_message(f"Liczba odpowiedzi na prompt: {num_responses}")
247
-
248
- # Progress tracking
249
- progress_bar = st.progress(0)
250
- status_text = st.empty()
251
-
252
- total_iterations = num_responses * 2
253
- current = 0
254
-
255
- st.session_state.test_running = True
256
-
257
- # Generate responses for Prompt A
258
- log_message("🔄 Generowanie odpowiedzi dla PROMPTU A...")
259
- for i in range(num_responses):
260
- current += 1
261
- progress_bar.progress(current / total_iterations)
262
- status_text.text(f"Generowanie: {current}/{total_iterations} (A-{i+1})")
263
-
264
- response = api_handler.generate_response(
265
- prompt_a, model, temperature, max_tokens
266
- )
267
-
268
- st.session_state.responses.append({
269
- 'Option': 'A',
270
- 'Response_ID': i + 1,
271
- 'Response': response,
272
- 'Score': None
273
- })
274
-
275
- if response.startswith("ERROR"):
276
- log_message(f" A-{i+1}/{num_responses}... ❌ {response}")
277
- else:
278
- log_message(f" A-{i+1}/{num_responses}... ✅ ({len(response)} znaków)")
279
-
280
- time.sleep(0.5)
281
-
282
- # Generate responses for Prompt B
283
- log_message("\n🔄 Generowanie odpowiedzi dla PROMPTU B...")
284
- for i in range(num_responses):
285
- current += 1
286
- progress_bar.progress(current / total_iterations)
287
- status_text.text(f"Generowanie: {current}/{total_iterations} (B-{i+1})")
288
-
289
- response = api_handler.generate_response(
290
- prompt_b, model, temperature, max_tokens
291
- )
292
-
293
- st.session_state.responses.append({
294
- 'Option': 'B',
295
- 'Response_ID': i + 1,
296
- 'Response': response,
297
- 'Score': None
298
- })
299
-
300
- if response.startswith("ERROR"):
301
- log_message(f" B-{i+1}/{num_responses}... ❌ {response}")
302
- else:
303
- log_message(f" B-{i+1}/{num_responses}... ✅ ({len(response)} znaków)")
304
-
305
- time.sleep(0.5)
306
-
307
- progress_bar.progress(1.0)
308
- status_text.text("✅ Zakończono!")
309
- log_message(f"\n✅ GENEROWANIE ZAKOŃCZONE - wygenerowano {len(st.session_state.responses)} odpowiedzi")
310
-
311
- st.session_state.test_running = False
312
- st.success("🎉 Test zakończony! Przejdź do zakładki '⭐ Ocenianie'")
313
-
314
- # Tab 2: Rating
315
- with tab2:
316
- st.header("⭐ Oceń Odpowiedzi")
317
-
318
- if not st.session_state.responses:
319
- st.info("ℹ️ Najpierw uruchom test w zakładce '🚀 Test'")
320
- else:
321
- st.markdown("**Oceń każdą odpowiedź w skali 1-5:**")
322
- st.markdown("1 = Bardzo słaba | 2 = Słaba | 3 = Średnia | 4 = Dobra | 5 = Bardzo dobra")
323
- st.divider()
324
-
325
- # Display responses for rating
326
- for idx, resp in enumerate(st.session_state.responses):
327
- with st.container():
328
- col1, col2 = st.columns([4, 1])
329
-
330
- with col1:
331
- option_label = f"{resp['Option']}-{resp['Response_ID']}"
332
- st.subheader(f"Option: {option_label}")
333
- st.text_area(
334
- "Odpowiedź:",
335
- value=resp['Response'],
336
- height=150,
337
- key=f"response_display_{idx}",
338
- disabled=True
339
- )
340
-
341
- with col2:
342
- st.markdown("**Ocena**")
343
- score = st.number_input(
344
- "Ocena (1-5)",
345
- min_value=1,
346
- max_value=5,
347
- value=resp.get('Score', 3),
348
- step=1,
349
- key=f"score_{idx}",
350
- label_visibility="collapsed"
351
- )
352
- st.session_state.responses[idx]['Score'] = score
353
-
354
- st.divider()
355
-
356
- # Calculate results button
357
- if st.button("📊 Oblicz Wyniki", type="primary", use_container_width=True):
358
- st.session_state.rated_responses = st.session_state.responses.copy()
359
- st.session_state.results = test_runner.calculate_results(st.session_state.rated_responses)
360
- st.success("✅ Wyniki obliczone! Przejdź do zakładki '📊 Wyniki'")
361
- st.rerun()
362
-
363
- # Tab 3: Results
364
- with tab3:
365
- st.header("📊 Wyniki Testu A/B")
366
-
367
- if not st.session_state.results:
368
- st.info("ℹ️ Najpierw oceń odpowiedzi i kliknij 'Oblicz Wyniki' w zakładce '⭐ Ocenianie'")
369
- else:
370
- results = st.session_state.results
371
-
372
- # Results table
373
- st.subheader("Podsumowanie")
374
-
375
- col1, col2, col3, col4, col5 = st.columns(5)
376
-
377
- with col1:
378
- st.metric("Option", "A")
379
- with col2:
380
- st.metric("Count", results['A']['count'])
381
- with col3:
382
- st.metric("Score", f"{results['A']['score']:.2f}")
383
- with col4:
384
- st.metric("Min", results['A']['min'])
385
- with col5:
386
- st.metric("Max", results['A']['max'])
387
-
388
- col1, col2, col3, col4, col5 = st.columns(5)
389
-
390
- with col1:
391
- st.metric("Option", "B")
392
- with col2:
393
- st.metric("Count", results['B']['count'])
394
- with col3:
395
- st.metric("Score", f"{results['B']['score']:.2f}")
396
- with col4:
397
- st.metric("Min", results['B']['min'])
398
- with col5:
399
- st.metric("Max", results['B']['max'])
400
-
401
- st.divider()
402
-
403
- # Winner
404
- if results['A']['score'] > results['B']['score']:
405
- winner = 'A'
406
- diff = results['A']['score'] - results['B']['score']
407
- st.success(f"🏆 **ZWYCIĘZCA: Prompt {winner}**\n\nPrzewaga: +{diff:.2f} punktu")
408
- elif results['B']['score'] > results['A']['score']:
409
- winner = 'B'
410
- diff = results['B']['score'] - results['A']['score']
411
- st.success(f"🏆 **ZWYCIĘZCA: Prompt {winner}**\n\nPrzewaga: +{diff:.2f} punktu")
412
- else:
413
- st.info("🤝 **REMIS**\n\nObie opcje uzyskały identyczny wynik")
414
-
415
- st.divider()
416
-
417
- # Export to CSV
418
- settings = {
419
- "model": config.get("model"),
420
- "temperature": config.get("temperature"),
421
- "max_tokens": config.get("max_tokens"),
422
- "num_responses": config.get("num_responses")
423
- }
424
-
425
- csv_buffer = test_runner.export_to_csv(
426
- st.session_state.rated_responses,
427
- results,
428
- settings
429
- )
430
-
431
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
432
- filename = f"prompt_ab_test_{timestamp}.csv"
433
-
434
- st.download_button(
435
- label="💾 Pobierz wyniki (CSV)",
436
- data=csv_buffer,
437
- file_name=filename,
438
- mime="text/csv",
439
- use_container_width=True
440
- )
441
-
442
- # Tab 4: Logs
443
- with tab4:
444
- st.header("📋 Logi Testów")
445
-
446
- if st.button("🗑️ Wyczyść logi"):
447
- clear_logs()
448
- st.rerun()
449
-
450
- if st.session_state.logs:
451
- log_text = "\n".join(st.session_state.logs)
452
- st.text_area(
453
- "Logi:",
454
- value=log_text,
455
- height=500,
456
- disabled=True,
457
- label_visibility="collapsed"
458
- )
459
- else:
460
- st.info("ℹ️ Brak logów do wyświetlenia")
461
-
462
- # Footer
463
- st.divider()
464
- st.markdown(
465
- """
466
- <div style='text-align: center; color: gray; font-size: 0.9em;'>
467
- Prompt A/B Tester v2.0 (Streamlit) | Created for Manuscripto - Medical Editorial Assistant
468
- </div>
469
- """,
470
- unsafe_allow_html=True
471
- )
 
1
+ """
2
+ Prompt A/B Tester - Streamlit Web Application
3
+ Aplikacja do testowania promptów OpenAI GPT z metodologią A/B
4
+ """
5
+
6
+ import streamlit as st
7
+ import time
8
+ from datetime import datetime
9
+ from config import ConfigManager
10
+ from api_handler import APIHandler
11
+ from test_runner import TestRunner
12
+
13
# Page configuration -- must run before any other Streamlit call in the script.
_PAGE_CONFIG = {
    "page_title": "Prompt A/B Tester",
    "page_icon": "🔬",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_CONFIG)
20
+
21
# Initialize session state once per browser session; the `initialized`
# sentinel guards against clobbering state on every Streamlit rerun.
_SESSION_DEFAULTS = {
    "responses": [],        # generated answers awaiting rating
    "test_running": False,  # guards the "run test" button
    "logs": [],             # timestamped log lines for the Logs tab
    "results": None,        # aggregated A/B statistics
    "rated_responses": [],  # snapshot of responses at scoring time
}
if 'initialized' not in st.session_state:
    st.session_state.initialized = True
    for _key, _default in _SESSION_DEFAULTS.items():
        st.session_state[_key] = _default
29
+
30
# Initialize components (project-local modules; re-created on every rerun).
config = ConfigManager()  # persisted settings store, incl. the API key
api_handler = APIHandler(config.get_api_key())  # OpenAI client wrapper
test_runner = TestRunner(api_handler)  # scoring aggregation / CSV export
34
+
35
+ # Helper functions
36
def log_message(message):
    """Append *message* to the session log, prefixed with the current HH:MM:SS."""
    stamp = datetime.now().strftime("%H:%M:%S")
    st.session_state.logs.append(f"[{stamp}] {message}")
40
+
41
def clear_logs():
    """Drop every accumulated log line from the session state."""
    st.session_state.logs = []
44
+
45
def update_progress_callback(current, total):
    """No-op progress hook kept for TestRunner's callback interface.

    Progress is rendered directly with st.progress in the UI, so this
    callback intentionally does nothing and returns None.
    """
    return None
49
+
50
# Title -- page heading plus a Polish one-line subtitle.
st.title("🔬 Prompt A/B Tester")
st.markdown("**Testuj i porównuj wydajność dwóch wersji promptów OpenAI GPT**")
53
+
54
# Sidebar - Configuration: API key handling, model parameters, cost estimate.
with st.sidebar:
    st.header("⚙️ Konfiguracja")

    # API Key
    st.subheader("🔑 Klucz API OpenAI")

    # Check if API key is in secrets (Hugging Face deployment).
    api_key_from_secrets = None
    try:
        if hasattr(st, 'secrets') and 'OPENAI_API_KEY' in st.secrets:
            api_key_from_secrets = st.secrets['OPENAI_API_KEY']
            st.success("✅ Klucz API załadowany z Secrets")
    except Exception:
        # st.secrets raises when no secrets file exists; treat as "no key"
        # rather than swallowing every exception with a bare except.
        pass

    if api_key_from_secrets:
        # BUG FIX: the secrets key was previously displayed as loaded but
        # never handed to the API client or config, so the app still
        # demanded manual entry. Propagate it to both.
        api_handler.set_api_key(api_key_from_secrets)
        config.set_api_key(api_key_from_secrets)
    else:
        api_key_input = st.text_input(
            "Klucz API",
            type="password",
            value=config.get("api_key", ""),
            help="Wprowadź swój klucz API OpenAI"
        )

        if st.button("💾 Zapisz i Waliduj API Key"):
            if api_key_input:
                api_handler.set_api_key(api_key_input)
                config.set_api_key(api_key_input)
                success, message = api_handler.validate_api_key()
                if success:
                    st.success(f"✅ {message}")
                else:
                    st.error(f"❌ {message}")
            else:
                st.warning("⚠️ Klucz API nie może być pusty")

    st.divider()

    # Model and Parameters
    st.subheader("🤖 Model i Parametry")

    # Refresh models (the list itself is re-fetched below on every rerun;
    # this button just surfaces the count to the user).
    if st.button("🔄 Odśwież modele"):
        models = api_handler.get_available_models()
        st.success(f"Znaleziono {len(models)} modeli")

    models = api_handler.get_available_models()
    model = st.selectbox(
        "Model",
        options=models,
        index=models.index(config.get("model", "gpt-4o")) if config.get("model", "gpt-4o") in models else 0
    )

    temperature = st.slider(
        "Temperature",
        min_value=0.0,
        max_value=2.0,
        # float() guards against an int stored in config -- st.slider
        # requires value to match the float type of min/max.
        value=float(config.get("temperature", 0.1)),
        step=0.1,
        help="0.0 = deterministyczne, 2.0 = kreatywne"
    )

    max_tokens = st.number_input(
        "Max Tokens",
        min_value=100,
        max_value=4000,
        value=config.get("max_tokens", 2000),
        step=100
    )

    num_responses = st.number_input(
        "Liczba odpowiedzi (na prompt)",
        min_value=1,
        max_value=50,
        value=config.get("num_responses", 5),
        step=1,
        help="Ile odpowiedzi wygenerować dla każdego promptu"
    )

    # Save settings
    if st.button("💾 Zapisz ustawienia"):
        settings = {
            "model": model,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "num_responses": num_responses
        }
        config.update_settings(settings)
        st.success("✅ Ustawienia zapisane")

    # Cost estimation for a full A/B run (both prompts).
    if st.button("💰 Szacuj koszt"):
        total_responses = num_responses * 2
        cost = api_handler.estimate_cost(model, total_responses)
        st.info(f"**Szacunkowy koszt:**\n\n"
                f"Model: {model}\n\n"
                f"Odpowiedzi: {total_responses}\n\n"
                f"Koszt: ${cost:.4f} USD")
152
+
153
# Main tabs -- run test, rate responses, view results, inspect logs.
tab1, tab2, tab3, tab4 = st.tabs(["🚀 Test", "⭐ Ocenianie", "📊 Wyniki", "📋 Logi"])
155
+
156
# Tab 1: Test -- collect the two prompts and run the A/B generation pass.
with tab1:
    st.header("🚀 Przeprowadź Test A/B")

    # Input method selection
    input_method = st.radio(
        "Metoda wprowadzania promptów:",
        options=["📁 Upload plików", "✏️ Wklej tekst"],
        horizontal=True
    )

    prompt_a = None
    prompt_b = None

    if input_method == "📁 Upload plików":
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("Prompt A")
            file_a = st.file_uploader(
                "Wybierz plik dla Promptu A",
                type=['md', 'txt'],
                key="file_a"
            )
            if file_a:
                prompt_a = file_a.read().decode('utf-8')
                st.success(f"✅ Wczytano {len(prompt_a)} znaków")

        with col2:
            st.subheader("Prompt B")
            file_b = st.file_uploader(
                "Wybierz plik dla Promptu B",
                type=['md', 'txt'],
                key="file_b"
            )
            if file_b:
                prompt_b = file_b.read().decode('utf-8')
                st.success(f"✅ Wczytano {len(prompt_b)} znaków")

    else:  # Wklej tekst
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("Prompt A")
            prompt_a = st.text_area(
                "Wklej treść Promptu A",
                height=300,
                key="text_a"
            )

        with col2:
            st.subheader("Prompt B")
            prompt_b = st.text_area(
                "Wklej treść Promptu B",
                height=300,
                key="text_b"
            )

    # Test controls
    st.divider()

    col1, col2, col3 = st.columns([1, 1, 3])

    with col1:
        run_button = st.button(
            "🚀 Uruchom Test",
            type="primary",
            disabled=st.session_state.test_running,
            use_container_width=True
        )

    with col2:
        if st.button("🗑️ Wyczyść", use_container_width=True):
            st.session_state.responses = []
            st.session_state.results = None
            st.session_state.rated_responses = []
            clear_logs()
            st.rerun()

    # Run test
    if run_button:
        # Validation
        if not config.get_api_key():
            st.error("❌ Najpierw ustaw klucz API w panelu bocznym")
        elif not prompt_a or not prompt_b:
            st.error("❌ Oba prompty muszą być wypełnione")
        else:
            clear_logs()
            log_message("🚀 Rozpoczynanie testu A/B")
            log_message(f"Model: {model}, Temperature: {temperature}, Max tokens: {max_tokens}")
            log_message(f"Liczba odpowiedzi na prompt: {num_responses}")

            # BUG FIX: start each run from a clean slate -- previously a
            # second run appended to the responses of the first, corrupting
            # the rating and results tabs.
            st.session_state.responses = []
            st.session_state.results = None
            st.session_state.rated_responses = []

            # Progress tracking
            progress_bar = st.progress(0)
            status_text = st.empty()

            total_iterations = num_responses * 2
            current = 0

            st.session_state.test_running = True
            try:
                # One generation pass per prompt variant; the 'B' header log
                # starts with a newline to separate the sections visually.
                for option, prompt_text, prefix in (
                    ('A', prompt_a, ""),
                    ('B', prompt_b, "\n"),
                ):
                    log_message(f"{prefix}🔄 Generowanie odpowiedzi dla PROMPTU {option}...")
                    for i in range(num_responses):
                        current += 1
                        progress_bar.progress(current / total_iterations)
                        status_text.text(f"Generowanie: {current}/{total_iterations} ({option}-{i+1})")

                        response = api_handler.generate_response(
                            prompt_text, model, temperature, max_tokens
                        )

                        st.session_state.responses.append({
                            'Option': option,
                            'Response_ID': i + 1,
                            'Response': response,
                            'Score': None
                        })

                        # APIHandler signals failures as "ERROR..." strings
                        # rather than raising -- presumably; verify in api_handler.
                        if response.startswith("ERROR"):
                            log_message(f" {option}-{i+1}/{num_responses}... ❌ {response}")
                        else:
                            log_message(f" {option}-{i+1}/{num_responses}... ✅ ({len(response)} znaków)")

                        # Gentle pacing between API calls.
                        time.sleep(0.5)
            finally:
                # BUG FIX: always release the flag, even if an API call
                # raises, so the run button never stays disabled forever.
                st.session_state.test_running = False

            progress_bar.progress(1.0)
            status_text.text("✅ Zakończono!")
            log_message(f"\n✅ GENEROWANIE ZAKOŃCZONE - wygenerowano {len(st.session_state.responses)} odpowiedzi")

            st.success("🎉 Test zakończony! Przejdź do zakładki '⭐ Ocenianie'")
313
+
314
# Tab 2: Rating -- show each generated response with a 1-5 score widget.
with tab2:
    st.header("⭐ Oceń Odpowiedzi")

    if not st.session_state.responses:
        st.info("ℹ️ Najpierw uruchom test w zakładce '🚀 Test'")
    else:
        st.markdown("**Oceń każdą odpowiedź w skali 1-5:**")
        st.markdown("1 = Bardzo słaba | 2 = Słaba | 3 = Średnia | 4 = Dobra | 5 = Bardzo dobra")
        st.divider()

        # Display responses for rating
        for idx, resp in enumerate(st.session_state.responses):
            with st.container():
                col1, col2 = st.columns([4, 1])

                with col1:
                    option_label = f"{resp['Option']}-{resp['Response_ID']}"
                    st.subheader(f"Option: {option_label}")
                    st.text_area(
                        "Odpowiedź:",
                        value=resp['Response'],
                        height=150,
                        key=f"response_display_{idx}",
                        disabled=True
                    )

                with col2:
                    st.markdown("**Ocena**")
                    # BUG FIX: responses are stored with Score=None, and
                    # dict.get(key, default) only falls back when the KEY is
                    # absent -- so the old resp.get('Score', 3) passed
                    # value=None to the widget. Default to 3 whenever no
                    # score has been assigned yet.
                    existing_score = resp.get('Score')
                    score = st.number_input(
                        "Ocena (1-5)",
                        min_value=1,
                        max_value=5,
                        value=existing_score if existing_score is not None else 3,
                        step=1,
                        key=f"score_{idx}",
                        label_visibility="collapsed"
                    )
                    st.session_state.responses[idx]['Score'] = score

            st.divider()

        # Calculate results button
        if st.button("📊 Oblicz Wyniki", type="primary", use_container_width=True):
            st.session_state.rated_responses = st.session_state.responses.copy()
            st.session_state.results = test_runner.calculate_results(st.session_state.rated_responses)
            st.success("✅ Wyniki obliczone! Przejdź do zakładki '📊 Wyniki'")
            st.rerun()
362
+
363
# Tab 3: Results -- per-variant summary metrics, winner verdict, CSV export.
with tab3:
    st.header("📊 Wyniki Testu A/B")

    if not st.session_state.results:
        st.info("ℹ️ Najpierw oceń odpowiedzi i kliknij 'Oblicz Wyniki' w zakładce '⭐ Ocenianie'")
    else:
        results = st.session_state.results

        # One metrics row per prompt variant.
        st.subheader("Podsumowanie")

        for variant in ("A", "B"):
            stats = results[variant]
            col1, col2, col3, col4, col5 = st.columns(5)
            with col1:
                st.metric("Option", variant)
            with col2:
                st.metric("Count", stats['count'])
            with col3:
                st.metric("Score", f"{stats['score']:.2f}")
            with col4:
                st.metric("Min", stats['min'])
            with col5:
                st.metric("Max", stats['max'])

        st.divider()

        # Winner: the variant with the strictly higher mean score; a tie
        # (neither strictly greater) is reported as a draw.
        score_a = results['A']['score']
        score_b = results['B']['score']
        if score_a > score_b or score_b > score_a:
            if score_a > score_b:
                winner, diff = 'A', score_a - score_b
            else:
                winner, diff = 'B', score_b - score_a
            st.success(f"🏆 **ZWYCIĘZCA: Prompt {winner}**\n\nPrzewaga: +{diff:.2f} punktu")
        else:
            st.info("🤝 **REMIS**\n\nObie opcje uzyskały identyczny wynik")

        st.divider()

        # Export to CSV with the settings that were active in config.
        settings = {
            "model": config.get("model"),
            "temperature": config.get("temperature"),
            "max_tokens": config.get("max_tokens"),
            "num_responses": config.get("num_responses")
        }

        csv_buffer = test_runner.export_to_csv(
            st.session_state.rated_responses,
            results,
            settings
        )

        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        st.download_button(
            label="💾 Pobierz wyniki (CSV)",
            data=csv_buffer,
            file_name=f"prompt_ab_test_{stamp}.csv",
            mime="text/csv",
            use_container_width=True
        )
441
+
442
# Tab 4: Logs -- read-only dump of the session log with a clear button.
with tab4:
    st.header("📋 Logi Testów")

    if st.button("🗑️ Wyczyść logi"):
        clear_logs()
        st.rerun()

    if not st.session_state.logs:
        st.info("ℹ️ Brak logów do wyświetlenia")
    else:
        st.text_area(
            "Logi:",
            value="\n".join(st.session_state.logs),
            height=500,
            disabled=True,
            label_visibility="collapsed"
        )
461
+
462
# Footer -- raw HTML, so unsafe_allow_html must stay enabled.
st.divider()
st.markdown(
    """
    <div style='text-align: center; color: gray; font-size: 0.9em;'>
    Prompt A/B Tester v2.0 (Streamlit) | Created by Heuristica.pl - Marek Staniszewski
    </div>
    """,
    unsafe_allow_html=True
)