Marek4321 committed on
Commit 40e80d0 · verified · 1 Parent(s): 2e74c4e

Update test_runner.py

Files changed (1)
  1. test_runner.py +514 -197
test_runner.py CHANGED
@@ -1,197 +1,514 @@
- """
- Test Runner - A/B test execution logic (Streamlit version)
- """
-
- import time
- import pandas as pd
- from datetime import datetime
- from pathlib import Path
- from io import BytesIO
-
-
- class TestRunner:
-     """Manages running A/B prompt tests"""
-
-     def __init__(self, api_handler):
-         """
-         Args:
-             api_handler: APIHandler instance
-         """
-         self.api_handler = api_handler
-         self.responses = []
-         self.is_running = False
-         self.should_cancel = False
-
-     def run_test(self, prompt_a, prompt_b, num_responses, model, temperature, max_tokens, progress_callback=None, log_callback=None):
-         """
-         Runs an A/B test
-
-         Args:
-             prompt_a: Content of prompt A (string)
-             prompt_b: Content of prompt B (string)
-             num_responses: Number of responses per prompt
-             model: OpenAI model
-             temperature: Temperature
-             max_tokens: Max tokens
-             progress_callback: Optional function for updating the progress bar
-             log_callback: Optional logging function
-
-         Returns:
-             list: List of response dicts
-         """
-         self.responses = []
-         self.is_running = True
-         self.should_cancel = False
-
-         total_iterations = num_responses * 2
-         current = 0
-
-         # Generate responses for prompt A
-         if log_callback:
-             log_callback("🔄 Generating responses for PROMPT A...")
-
-         for i in range(num_responses):
-             if self.should_cancel:
-                 if log_callback:
-                     log_callback("⚠️ Test cancelled by user")
-                 self.is_running = False
-                 return []
-
-             current += 1
-
-             if progress_callback:
-                 progress_callback(current, total_iterations)
-
-             response = self.api_handler.generate_response(
-                 prompt_a, model, temperature, max_tokens
-             )
-
-             self.responses.append({
-                 'Option': 'A',
-                 'Response_ID': i + 1,
-                 'Response': response,
-                 'Score': None
-             })
-
-             if log_callback:
-                 if response.startswith("ERROR"):
-                     log_callback(f" A-{i+1}/{num_responses}... ❌ {response}")
-                 else:
-                     log_callback(f" A-{i+1}/{num_responses}... ✅ ({len(response)} characters)")
-
-             time.sleep(0.5)  # Short pause between requests
-
-         # Generate responses for prompt B
-         if log_callback:
-             log_callback("\n🔄 Generating responses for PROMPT B...")
-
-         for i in range(num_responses):
-             if self.should_cancel:
-                 if log_callback:
-                     log_callback("⚠️ Test cancelled by user")
-                 self.is_running = False
-                 return []
-
-             current += 1
-
-             if progress_callback:
-                 progress_callback(current, total_iterations)
-
-             response = self.api_handler.generate_response(
-                 prompt_b, model, temperature, max_tokens
-             )
-
-             self.responses.append({
-                 'Option': 'B',
-                 'Response_ID': i + 1,
-                 'Response': response,
-                 'Score': None
-             })
-
-             if log_callback:
-                 if response.startswith("ERROR"):
-                     log_callback(f" B-{i+1}/{num_responses}... ❌ {response}")
-                 else:
-                     log_callback(f" B-{i+1}/{num_responses}... ✅ ({len(response)} characters)")
-
-             time.sleep(0.5)
-
-         if log_callback:
-             log_callback(f"\n✅ GENERATION FINISHED - generated {len(self.responses)} responses")
-
-         self.is_running = False
-         return self.responses
-
-     def calculate_results(self, responses_with_scores):
-         """
-         Computes test results from the scores
-
-         Args:
-             responses_with_scores: List of responses with scores filled in
-
-         Returns:
-             dict: Results in the format {'A': {'count': X, 'score': Y}, 'B': {...}}
-         """
-         results = {}
-
-         for option in ['A', 'B']:
-             option_responses = [r for r in responses_with_scores if r['Option'] == option]
-             scores = [r['Score'] for r in option_responses if r['Score'] is not None]
-
-             if scores:
-                 avg_score = sum(scores) / len(scores)
-                 results[option] = {
-                     'count': len(scores),
-                     'score': round(avg_score, 2),
-                     'min': min(scores),
-                     'max': max(scores)
-                 }
-
-         return results
-
-     def export_to_csv(self, responses_with_scores, results, settings):
-         """
-         Exports results to CSV (returns BytesIO for Streamlit download)
-
-         Args:
-             responses_with_scores: List of responses with scores
-             results: Test results
-             settings: Test settings
-
-         Returns:
-             BytesIO: CSV buffer for download
-         """
-         # Prepare the data to write
-         df = pd.DataFrame(responses_with_scores)
-
-         # Add metadata as leading rows (as comments)
-         metadata = [
-             f"# A/B Prompt Test - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
-             f"# Model: {settings.get('model', 'N/A')}",
-             f"# Temperature: {settings.get('temperature', 'N/A')}",
-             f"# Max Tokens: {settings.get('max_tokens', 'N/A')}",
-             f"# Top P: {settings.get('top_p', 'N/A')}",
-             f"# Num Responses: {settings.get('num_responses', 'N/A')}",
-             "#",
-             "# RESULTS:",
-             f"# Option A - Count: {results['A']['count']}, Score: {results['A']['score']}",
-             f"# Option B - Count: {results['B']['count']}, Score: {results['B']['score']}",
-             "#"
-         ]
-
-         # Write to a buffer
-         buffer = BytesIO()
-
-         # Write the metadata first
-         for line in metadata:
-             buffer.write((line + "\n").encode('utf-8'))
-
-         # Write the DataFrame
-         df.to_csv(buffer, index=False, encoding='utf-8')
-
-         buffer.seek(0)
-         return buffer
-
-     def cancel_test(self):
-         """Cancels a running test"""
-         self.should_cancel = True
+ """
+ Test Runner - A/B test execution logic (Streamlit version)
+ """
+
+ import time
+ import json
+ import pandas as pd
+ from datetime import datetime
+ from pathlib import Path
+ from io import BytesIO
+ from docx import Document
+ from docx.shared import Pt, RGBColor, Inches
+ from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
+
+
+ class TestRunner:
+     """Manages running A/B prompt tests"""
+
+     def __init__(self, api_handler):
+         """
+         Args:
+             api_handler: APIHandler instance
+         """
+         self.api_handler = api_handler
+         self.responses = []
+         self.is_running = False
+         self.should_cancel = False
+
+     def run_test(self, prompt_a, prompt_b, num_responses, model, temperature, max_tokens, progress_callback=None, log_callback=None):
+         """
+         Runs an A/B test
+
+         Args:
+             prompt_a: Content of prompt A (string)
+             prompt_b: Content of prompt B (string)
+             num_responses: Number of responses per prompt
+             model: OpenAI model
+             temperature: Temperature
+             max_tokens: Max tokens
+             progress_callback: Optional function for updating the progress bar
+             log_callback: Optional logging function
+
+         Returns:
+             list: List of response dicts
+         """
+         self.responses = []
+         self.is_running = True
+         self.should_cancel = False
+
+         total_iterations = num_responses * 2
+         current = 0
+
+         # Generate responses for prompt A
+         if log_callback:
+             log_callback("🔄 Generating responses for PROMPT A...")
+
+         for i in range(num_responses):
+             if self.should_cancel:
+                 if log_callback:
+                     log_callback("⚠️ Test cancelled by user")
+                 self.is_running = False
+                 return []
+
+             current += 1
+
+             if progress_callback:
+                 progress_callback(current, total_iterations)
+
+             response = self.api_handler.generate_response(
+                 prompt_a, model, temperature, max_tokens
+             )
+
+             self.responses.append({
+                 'Option': 'A',
+                 'Response_ID': i + 1,
+                 'Response': response,
+                 'Score': None
+             })
+
+             if log_callback:
+                 if response.startswith("ERROR"):
+                     log_callback(f" A-{i+1}/{num_responses}... ❌ {response}")
+                 else:
+                     log_callback(f" A-{i+1}/{num_responses}... ✅ ({len(response)} characters)")
+
+             time.sleep(0.5)  # Short pause between requests
+
+         # Generate responses for prompt B
+         if log_callback:
+             log_callback("\n🔄 Generating responses for PROMPT B...")
+
+         for i in range(num_responses):
+             if self.should_cancel:
+                 if log_callback:
+                     log_callback("⚠️ Test cancelled by user")
+                 self.is_running = False
+                 return []
+
+             current += 1
+
+             if progress_callback:
+                 progress_callback(current, total_iterations)
+
+             response = self.api_handler.generate_response(
+                 prompt_b, model, temperature, max_tokens
+             )
+
+             self.responses.append({
+                 'Option': 'B',
+                 'Response_ID': i + 1,
+                 'Response': response,
+                 'Score': None
+             })
+
+             if log_callback:
+                 if response.startswith("ERROR"):
+                     log_callback(f" B-{i+1}/{num_responses}... ❌ {response}")
+                 else:
+                     log_callback(f" B-{i+1}/{num_responses}... ✅ ({len(response)} characters)")
+
+             time.sleep(0.5)
+
+         if log_callback:
+             log_callback(f"\n✅ GENERATION FINISHED - generated {len(self.responses)} responses")
+
+         self.is_running = False
+         return self.responses
+
+     def calculate_results(self, responses_with_scores):
+         """
+         Computes test results from the scores
+
+         Args:
+             responses_with_scores: List of responses with scores filled in
+
+         Returns:
+             dict: Results in the format {'A': {'count': X, 'score': Y}, 'B': {...}}
+         """
+         results = {}
+
+         for option in ['A', 'B']:
+             option_responses = [r for r in responses_with_scores if r['Option'] == option]
+             scores = [r['Score'] for r in option_responses if r['Score'] is not None]
+
+             if scores:
+                 avg_score = sum(scores) / len(scores)
+                 results[option] = {
+                     'count': len(scores),
+                     'score': round(avg_score, 2),
+                     'min': min(scores),
+                     'max': max(scores)
+                 }
+
+         return results
+
+     def export_to_csv(self, responses_with_scores, results, settings):
+         """
+         Exports results to CSV (returns BytesIO for Streamlit download)
+
+         Args:
+             responses_with_scores: List of responses with scores
+             results: Test results
+             settings: Test settings
+
+         Returns:
+             BytesIO: CSV buffer for download
+         """
+         # Prepare the data to write
+         df = pd.DataFrame(responses_with_scores)
+
+         # Add metadata as leading rows (as comments)
+         metadata = [
+             f"# A/B Prompt Test - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
+             f"# Model: {settings.get('model', 'N/A')}",
+             f"# Temperature: {settings.get('temperature', 'N/A')}",
+             f"# Max Tokens: {settings.get('max_tokens', 'N/A')}",
+             f"# Top P: {settings.get('top_p', 'N/A')}",
+             f"# Num Responses: {settings.get('num_responses', 'N/A')}",
+             "#",
+             "# RESULTS:",
+             f"# Option A - Count: {results['A']['count']}, Score: {results['A']['score']}",
+             f"# Option B - Count: {results['B']['count']}, Score: {results['B']['score']}",
+             "#"
+         ]
+
+         # Write to a buffer
+         buffer = BytesIO()
+
+         # Write the metadata first
+         for line in metadata:
+             buffer.write((line + "\n").encode('utf-8'))
+
+         # Write the DataFrame
+         df.to_csv(buffer, index=False, encoding='utf-8')
+
+         buffer.seek(0)
+         return buffer
+
+     def cancel_test(self):
+         """Cancels a running test"""
+         self.should_cancel = True
+
+     def export_to_excel(self, responses_with_scores, results, settings):
+         """
+         Exports results to Excel (returns BytesIO for Streamlit download)
+
+         Args:
+             responses_with_scores: List of responses with scores
+             results: Test results
+             settings: Test settings
+
+         Returns:
+             BytesIO: Excel buffer for download
+         """
+         buffer = BytesIO()
+
+         with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
+             # Sheet 1: Summary
+             summary_data = {
+                 'Parameter': [
+                     'Test date',
+                     'Model',
+                     'Temperature',
+                     'Max Tokens',
+                     'Number of responses',
+                     '',
+                     'Option A - Average score',
+                     'Option A - Count',
+                     'Option A - Min',
+                     'Option A - Max',
+                     '',
+                     'Option B - Average score',
+                     'Option B - Count',
+                     'Option B - Min',
+                     'Option B - Max'
+                 ],
+                 'Value': [
+                     datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+                     settings.get('model', 'N/A'),
+                     settings.get('temperature', 'N/A'),
+                     settings.get('max_tokens', 'N/A'),
+                     settings.get('num_responses', 'N/A'),
+                     '',
+                     results['A']['score'],
+                     results['A']['count'],
+                     results['A']['min'],
+                     results['A']['max'],
+                     '',
+                     results['B']['score'],
+                     results['B']['count'],
+                     results['B']['min'],
+                     results['B']['max']
+                 ]
+             }
+
+             df_summary = pd.DataFrame(summary_data)
+             df_summary.to_excel(writer, sheet_name='Summary', index=False)
+
+             # Sheet 2: All responses
+             df_responses = pd.DataFrame(responses_with_scores)
+             df_responses.to_excel(writer, sheet_name='Responses', index=False)
+
+         buffer.seek(0)
+         return buffer
+
+     def export_to_json(self, responses_with_scores, results, settings):
+         """
+         Exports results to JSON (returns BytesIO for Streamlit download)
+
+         Args:
+             responses_with_scores: List of responses with scores
+             results: Test results
+             settings: Test settings
+
+         Returns:
+             BytesIO: JSON buffer for download
+         """
+         data = {
+             'metadata': {
+                 'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+                 'model': settings.get('model', 'N/A'),
+                 'temperature': settings.get('temperature', 'N/A'),
+                 'max_tokens': settings.get('max_tokens', 'N/A'),
+                 'num_responses': settings.get('num_responses', 'N/A')
+             },
+             'results': results,
+             'responses': responses_with_scores
+         }
+
+         buffer = BytesIO()
+         json_str = json.dumps(data, ensure_ascii=False, indent=2)
+         buffer.write(json_str.encode('utf-8'))
+         buffer.seek(0)
+         return buffer
+
+     def export_to_txt(self, responses_with_scores, results, settings):
+         """
+         Exports results to TXT (returns BytesIO for Streamlit download)
+
+         Args:
+             responses_with_scores: List of responses with scores
+             results: Test results
+             settings: Test settings
+
+         Returns:
+             BytesIO: TXT buffer for download
+         """
+         buffer = BytesIO()
+
+         # Header
+         lines = [
+             "=" * 80,
+             "A/B PROMPT TEST RESULTS",
+             "=" * 80,
+             "",
+             f"Test date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
+             f"Model: {settings.get('model', 'N/A')}",
+             f"Temperature: {settings.get('temperature', 'N/A')}",
+             f"Max Tokens: {settings.get('max_tokens', 'N/A')}",
+             f"Number of responses: {settings.get('num_responses', 'N/A')}",
+             "",
+             "=" * 80,
+             "RESULTS SUMMARY",
+             "=" * 80,
+             "",
+             "Option A:",
+             f" Average score: {results['A']['score']}",
+             f" Count: {results['A']['count']}",
+             f" Min: {results['A']['min']}",
+             f" Max: {results['A']['max']}",
+             "",
+             "Option B:",
+             f" Average score: {results['B']['score']}",
+             f" Count: {results['B']['count']}",
+             f" Min: {results['B']['min']}",
+             f" Max: {results['B']['max']}",
+             "",
+             "=" * 80,
+             "ALL RESPONSES",
+             "=" * 80,
+             ""
+         ]
+
+         # Responses
+         for resp in responses_with_scores:
+             lines.extend([
+                 f"\nOption: {resp['Option']}-{resp['Response_ID']}",
+                 f"Score: {resp['Score']}",
+                 "-" * 80,
+                 f"{resp['Response']}",
+                 "-" * 80
+             ])
+
+         text = "\n".join(lines)
+         buffer.write(text.encode('utf-8'))
+         buffer.seek(0)
+         return buffer
+
+     def export_to_markdown(self, responses_with_scores, results, settings):
+         """
+         Exports results to Markdown (returns BytesIO for Streamlit download)
+
+         Args:
+             responses_with_scores: List of responses with scores
+             results: Test results
+             settings: Test settings
+
+         Returns:
+             BytesIO: Markdown buffer for download
+         """
+         buffer = BytesIO()
+
+         lines = [
+             "# A/B Prompt Test Results",
+             "",
+             "## Metadata",
+             "",
+             f"- **Test date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
+             f"- **Model**: {settings.get('model', 'N/A')}",
+             f"- **Temperature**: {settings.get('temperature', 'N/A')}",
+             f"- **Max Tokens**: {settings.get('max_tokens', 'N/A')}",
+             f"- **Number of responses**: {settings.get('num_responses', 'N/A')}",
+             "",
+             "## Results Summary",
+             "",
+             "| Option | Average Score | Count | Min | Max |",
+             "|--------|---------------|-------|-----|-----|",
+             f"| A | {results['A']['score']:.2f} | {results['A']['count']} | {results['A']['min']} | {results['A']['max']} |",
+             f"| B | {results['B']['score']:.2f} | {results['B']['count']} | {results['B']['min']} | {results['B']['max']} |",
+             ""
+         ]
+
+         # Winner
+         if results['A']['score'] > results['B']['score']:
+             diff = results['A']['score'] - results['B']['score']
+             lines.append(f"### 🏆 Winner: Prompt A (margin: +{diff:.2f})")
+         elif results['B']['score'] > results['A']['score']:
+             diff = results['B']['score'] - results['A']['score']
+             lines.append(f"### 🏆 Winner: Prompt B (margin: +{diff:.2f})")
+         else:
+             lines.append("### 🤝 Tie")
+
+         lines.extend([
+             "",
+             "## All Responses",
+             ""
+         ])
+
+         # Responses
+         for resp in responses_with_scores:
+             lines.extend([
+                 f"### Option {resp['Option']}-{resp['Response_ID']} (Score: {resp['Score']})",
+                 "",
+                 "```",
+                 resp['Response'],
+                 "```",
+                 ""
+             ])
+
+         text = "\n".join(lines)
+         buffer.write(text.encode('utf-8'))
+         buffer.seek(0)
+         return buffer
+
+     def export_to_word(self, responses_with_scores, results, settings):
+         """
+         Exports results to Word (returns BytesIO for Streamlit download)
+
+         Args:
+             responses_with_scores: List of responses with scores
+             results: Test results
+             settings: Test settings
+
+         Returns:
+             BytesIO: Word buffer for download
+         """
+         doc = Document()
+
+         # Title
+         title = doc.add_heading('A/B Prompt Test Results', 0)
+         title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
+
+         # Metadata
+         doc.add_heading('Metadata', level=1)
+         metadata_items = [
+             f"Test date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
+             f"Model: {settings.get('model', 'N/A')}",
+             f"Temperature: {settings.get('temperature', 'N/A')}",
+             f"Max Tokens: {settings.get('max_tokens', 'N/A')}",
+             f"Number of responses: {settings.get('num_responses', 'N/A')}"
+         ]
+         for item in metadata_items:
+             doc.add_paragraph(item, style='List Bullet')
+
+         # Results summary
+         doc.add_heading('Results Summary', level=1)
+
+         # Table
+         table = doc.add_table(rows=3, cols=5)
+         table.style = 'Light Grid Accent 1'
+
+         # Header
+         headers = ['Option', 'Average Score', 'Count', 'Min', 'Max']
+         for i, header in enumerate(headers):
+             table.rows[0].cells[i].text = header
+
+         # Option A
+         table.rows[1].cells[0].text = 'A'
+         table.rows[1].cells[1].text = f"{results['A']['score']:.2f}"
+         table.rows[1].cells[2].text = str(results['A']['count'])
+         table.rows[1].cells[3].text = str(results['A']['min'])
+         table.rows[1].cells[4].text = str(results['A']['max'])
+
+         # Option B
+         table.rows[2].cells[0].text = 'B'
+         table.rows[2].cells[1].text = f"{results['B']['score']:.2f}"
+         table.rows[2].cells[2].text = str(results['B']['count'])
+         table.rows[2].cells[3].text = str(results['B']['min'])
+         table.rows[2].cells[4].text = str(results['B']['max'])
+
+         # Winner
+         doc.add_paragraph()
+         if results['A']['score'] > results['B']['score']:
+             diff = results['A']['score'] - results['B']['score']
+             winner_para = doc.add_paragraph()
+             winner_run = winner_para.add_run(f"🏆 Winner: Prompt A (margin: +{diff:.2f})")
+             winner_run.bold = True
+             winner_run.font.size = Pt(14)
+         elif results['B']['score'] > results['A']['score']:
+             diff = results['B']['score'] - results['A']['score']
+             winner_para = doc.add_paragraph()
+             winner_run = winner_para.add_run(f"🏆 Winner: Prompt B (margin: +{diff:.2f})")
+             winner_run.bold = True
+             winner_run.font.size = Pt(14)
+         else:
+             winner_para = doc.add_paragraph()
+             winner_run = winner_para.add_run("🤝 Tie")
+             winner_run.bold = True
+             winner_run.font.size = Pt(14)
+
+         # All responses
+         doc.add_page_break()
+         doc.add_heading('All Responses', level=1)
+
+         for resp in responses_with_scores:
+             doc.add_heading(f"Option {resp['Option']}-{resp['Response_ID']} (Score: {resp['Score']})", level=2)
+             doc.add_paragraph(resp['Response'])
+             doc.add_paragraph()
+
+         # Save to buffer
+         buffer = BytesIO()
+         doc.save(buffer)
+         buffer.seek(0)
+         return buffer
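
Every export_to_* method added in this commit returns a BytesIO buffer, which is the shape st.download_button accepts for its data argument. A minimal wiring sketch under that assumption; the _EchoHandler stub, the model name, the flat Score of 4, and the settings dict are hypothetical stand-ins for objects the real app produces elsewhere, not part of this commit:

import streamlit as st
from test_runner import TestRunner

class _EchoHandler:
    # Hypothetical stand-in for the real APIHandler dependency (assumed interface).
    def generate_response(self, prompt, model, temperature, max_tokens):
        return f"stub response for: {prompt}"

runner = TestRunner(_EchoHandler())
responses = runner.run_test(
    "Prompt A text", "Prompt B text",
    num_responses=2, model="gpt-4o-mini", temperature=0.7, max_tokens=256,
)
for r in responses:
    r['Score'] = 4  # scores would normally come from manual rating in the UI
results = runner.calculate_results(responses)
settings = {'model': 'gpt-4o-mini', 'temperature': 0.7, 'max_tokens': 256, 'num_responses': 2}

st.download_button(
    label="Download results (JSON)",
    data=runner.export_to_json(responses, results, settings),
    file_name="ab_test_results.json",
    mime="application/json",
)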