File size: 18,578 Bytes
e69482a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
"""
Dataset Builder v3 โ€” Football Prediction Extractor
- Always outputs JSON array (even single tip)
- 70% single-tip / 30% multi-tip (2-4 events)
- Noise: random emojis, typos, missing fields, varied separators
- Varied date formats, bookmakers, times, headers
- Pure stdlib โ€” no pip installs needed
"""

import csv
import json
import random
from pathlib import Path
from collections import defaultdict

# ─────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────
# Input CSV of teams; load_teams() reads its Country, League and Team columns.
TEAMS_CSV      = "teams_tier1_tier2.csv"
# Output JSONL files for the training and validation splits.
OUTPUT_TRAIN   = "train_dataset.jsonl"
OUTPUT_VAL     = "val_dataset.jsonl"
# Target number of examples to generate before the train/val split.
EXAMPLES_COUNT = 300
# Fraction of the generated examples written to the validation file.
VAL_SPLIT      = 0.1

# ─────────────────────────────────────────────
# SYSTEM PROMPT — always array
# ─────────────────────────────────────────────
# System message placed first in every training example by
# make_training_example(). The key list here matches the keys of the fixture
# dicts produced by random_fixture().
SYSTEM_PROMPT = (
    "You are a football data extraction assistant. "
    "Extract structured data from the message and return ONLY a valid JSON array. "
    "Each object in the array must have exactly these keys: "
    "league, team_1, team_2, prediction, date, odds. "
    "If a field is missing, use null. No extra text, no markdown."
)

# ─────────────────────────────────────────────
# VOCABULARY
# ─────────────────────────────────────────────
# Prediction labels; random_fixture() picks one uniformly at random per tip.
PREDICTIONS = [
    "Over 1.5", "Over 2.5", "Over 3.5",
    "Under 2.5", "Under 3.5",
    "1X", "X2", "12",
    "Home Win", "Away Win", "Draw",
    "Both Teams to Score",
    "Home Win or Draw",
    "Away Win or Draw",
    "GG", "NG",
]

# Month names used by the textual date formats (index 0 == January).
_MONTH_ABBR = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
_MONTH_FULL = ('January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December')


def _date_numeric(joiner):
    """Return a formatter producing zero-padded DD<joiner>MM<joiner>YYYY."""
    def render(d, m, y):
        return f"{d:02d}{joiner}{m:02d}{joiner}{y}"
    return render


def _date_abbr_month(d, m, y):
    """Format as e.g. 'Mar 5, 2026' (abbreviated month, unpadded day)."""
    return f"{_MONTH_ABBR[m - 1]} {d}, {y}"


def _date_full_month(d, m, y):
    """Format as e.g. '5 March 2026' (full month name, unpadded day)."""
    return f"{d} {_MONTH_FULL[m - 1]} {y}"


# Each entry is a callable taking (day, month, year) ints and returning the
# rendered date string; random_date() picks one at random.
DATE_FORMATS = [
    _date_numeric("/"),
    _date_numeric("-"),
    _date_numeric("."),
    _date_abbr_month,
    _date_full_month,
]

# Pools sampled when rendering a message. TIMES, BOOKS and HEADERS appear only
# in the input text; they are never part of the expected JSON output.
TIMES        = ["13:00", "15:00", "16:00", "17:00", "18:00", "19:00", "19:45", "20:00", "20:45", "21:00", "21:45"]
BOOKS        = ["BETANO", "Bet365", "William Hill", "Unibet", "1xBet", "Betway", "Bwin", "Pinnacle"]
HEADERS      = ["Prediction of the Day", "Football Tip", "Best Bet Today", "Daily Pick", "Top Prediction", "Sure Tip", "VIP Prediction"]
# Strings placed between the two team names (the fourth is an en dash).
SEPARATORS   = [" - ", " vs ", " v ", " – ", " VS ", " x "]
# Emojis sprinkled into messages by inject_emojis().
# NOTE(review): these were restored from mojibake (UTF-8 bytes mis-decoded as a
# legacy code page). 🏅 and 🃏 each had a byte lost and are best guesses —
# confirm against the original file if the exact emoji matters.
EXTRA_EMOJIS = ["🔥","💥","🎯","👀","💰","🏅","⚡️","🙌","👇","✨","📊","💎","🤑","🚨","✅","❇️","🆕","📌","👑","🃏"]

# First line of a multi-tip message; make_multi_example() picks one at random.
# The emojis and bold mathematical letters were restored from mojibake.
MULTI_HEADERS = [
    "⚽️ 𝐏𝐫𝐞𝐝𝐢𝐜𝐭𝐢𝐨𝐧𝐬 𝐨𝐟 𝐭𝐡𝐞 𝐃𝐚𝐲 ⚽️",
    "🔥 TODAY'S FOOTBALL TIPS 🔥",
    "💰 Daily Predictions 💰",
    "⚡️ Best Bets Today ⚡️",
    "📊 Football Tips",
    "🎯 Today's Picks",
]

# Optional last line of a multi-tip message; the empty string means no footer.
# Emojis were restored from mojibake.
# NOTE(review): the "Good luck" emoji lost a byte; 🍀 is a best guess.
MULTI_FOOTERS = [
    "For more predictions visit www.eaglepredict.com",
    "Follow us for daily tips! 🙌",
    "Good luck everyone! 🍀",
    "Join our VIP channel for more! 💎",
    "Win big today! 🤑",
    "",  # no footer sometimes
]

# ─────────────────────────────────────────────
# SINGLE TIP TEMPLATES
# placeholders: {league} {team_1} {team_2} {prediction}
#               {date} {odds} {time} {header} {book} {sep}
# templates 4, 10, 11 and 12 intentionally omit one or more of league/date/odds
# ─────────────────────────────────────────────
# str.format templates for single-tip messages. make_single_example() checks
# which of {odds}, {date} and {league} a template contains and nulls the
# missing ones in the expected output.
# Emojis, dashes and bold mathematical letters were restored from mojibake
# (UTF-8 bytes mis-decoded as a legacy code page).
SINGLE_TEMPLATES = [
    # 1 structured Telegram bold style
    "⚽️ {header} ⚽️\n𝐃𝐚𝐭𝐞: {date}\n𝐋𝐞𝐚𝐠𝐮𝐞: {league}\n𝐌𝐚𝐭𝐜𝐡: {team_1}{sep}{team_2}\n𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",

    # 2 plain structured
    "⚽️ {header} ⚽️\nDate: {date}\nLeague: {league}\nMatch: {team_1}{sep}{team_2}\nKick off: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",

    # 3 emoji compact
    "🏆 {league}\n{team_1}{sep}{team_2}\n📅 {date} | ⏰ {time}\n🎯 Tip: {prediction}\n💰 Odds: {odds}",

    # 4 casual noisy (no league, no date)
    "wow predictions present\nINCREDIBLE MATCH BETWEEN {team_1}{sep}{team_2}\nTime: {time}\nwe forecast {prediction}\nOdds {odds}",

    # 5 one-liner
    "{team_1}{sep}{team_2} | {league} | {date} | {prediction} @ {odds}",

    # 6 verbose channel
    "🔥 Today's football tip 🔥\nCompetition: {league}\nGame: {team_1}{sep}{team_2}\nDate: {date}, KO {time}\nOur pick: {prediction}\nBest odds: {odds} ({book})\nGood luck! ⚽",

    # 7 minimal no emojis
    "Match: {team_1}{sep}{team_2}\nLeague: {league}\nDate: {date}\nPrediction: {prediction}\nOdds: {odds}",

    # 8 different field order
    "📆 {date} | {time}\n⚽ {league}: {team_1}{sep}{team_2}\n✔️ {prediction} | @{odds}",

    # 9 ALL CAPS noisy
    "MATCH: {team_1}{sep}{team_2}\nLEAGUE: {league}\nDATE: {date}\nPICK: {prediction}\nODDS: {odds}",

    # 10 missing odds intentionally
    "⚽️ {header}\n{league}\n{team_1}{sep}{team_2}\n{date}\nPrediction: {prediction}",

    # 11 missing date intentionally
    "🏟️ {league}\n{team_1}{sep}{team_2}\nTip: {prediction}\nOdds: {odds} on {book}",

    # 12 missing league intentionally
    "{team_1}{sep}{team_2}\n📅 {date}\n✅ {prediction} @ {odds}",

    # 13 telegram minimal
    "📌 {league}\n{team_1}{sep}{team_2} — {date}\n{prediction} | {odds}",

    # 14 with extra commentary noise
    "Today I really like this match 👇\n{team_1}{sep}{team_2} ({league})\nDate: {date}\nMy pick: {prediction}\nOdds: {odds} on {book}",
]

# ─────────────────────────────────────────────
# MULTI-TIP BLOCK TEMPLATES (per tip)
# extra placeholder: {n} = tip number
# ─────────────────────────────────────────────
# str.format templates for one tip inside a multi-tip message; {n} is the
# 1-based tip number. make_multi_example() uses a single template for every
# tip of a message.
# Emojis, dashes, the arrow and bold letters were restored from mojibake.
MULTI_BLOCK_TEMPLATES = [
    # Telegram numbered bold
    "⚽️ 𝗙𝗼𝗼𝘁𝗯𝗮𝗹𝗹 𝗧𝗶𝗽 {n} ⚽️\n𝐃𝐚𝐭𝐞: {date}\n𝐋𝐞𝐚𝐠𝐮𝐞: {league}\n𝐌𝐚𝐭𝐜𝐡: {team_1}{sep}{team_2}\n𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",

    # plain numbered
    "Tip {n}:\nLeague: {league}\nMatch: {team_1}{sep}{team_2}\nDate: {date} | KO: {time}\nPrediction: {prediction} @ {odds}",

    # compact numbered
    "#{n} {league} | {team_1}{sep}{team_2} | {date}\n→ {prediction} @ {odds}",

    # emoji numbered
    "🎯 Pick #{n}\n{team_1}{sep}{team_2} ({league})\n📅 {date} ⏰ {time}\n✅ {prediction} | odds: {odds}",

    # minimal numbered
    "{n}. {team_1}{sep}{team_2} — {league} — {prediction} @ {odds} ({date})",
]

# ─────────────────────────────────────────────
# LOAD TEAMS FROM CSV
# ─────────────────────────────────────────────
def load_teams(csv_path: str) -> dict:
    """Load teams from a CSV file, grouped by ``(country, league)``.

    The file needs a header row with ``Country``, ``League`` and ``Team``
    columns. Tab is used as the delimiter when a tab occurs in the first 2048
    characters, otherwise comma. Rows where any of the three values is missing
    or blank are skipped.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A ``defaultdict(list)`` mapping ``(country, league)`` tuples to the
        list of team names, in file order.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
    """
    path = Path(csv_path)
    if not path.exists():
        raise FileNotFoundError(f"CSV not found: {csv_path}")

    leagues = defaultdict(list)
    # utf-8-sig drops a leading BOM (common in spreadsheet exports), which would
    # otherwise be glued to the first header name and make every "Country"
    # lookup come back empty. newline="" is what the csv module asks for.
    with open(path, encoding="utf-8-sig", newline="") as f:
        sample = f.read(2048)
        f.seek(0)
        delimiter = "\t" if "\t" in sample else ","
        for raw in csv.DictReader(f, delimiter=delimiter):
            # DictReader stores surplus cells under the key None and fills
            # missing cells with None; neither can be stripped.
            row = {
                key.strip(): (value or "").strip()
                for key, value in raw.items()
                if key is not None
            }
            country = row.get("Country", "")
            league = row.get("League", "")
            team = row.get("Team", "")
            if country and league and team:
                leagues[(country, league)].append(team)

    total = sum(len(teams) for teams in leagues.values())
    print(f"[✓] Loaded {total} teams across {len(leagues)} leagues")
    return leagues

# ─────────────────────────────────────────────
# RANDOM HELPERS
# ─────────────────────────────────────────────
def random_date() -> str:
    """Return a random date string in one of the DATE_FORMATS layouts.

    The month is drawn from Aug-Dec (paired with year 2025) or Jan-May (paired
    with year 2026), each half with equal probability. The day is limited to
    1-28 so it is valid in every month.
    """
    if random.random() < 0.5:
        month = random.randint(8, 12)
    else:
        month = random.randint(1, 5)
    year = 2026 if month < 8 else 2025
    day = random.randint(1, 28)
    formatter = random.choice(DATE_FORMATS)
    return formatter(day, month, year)

def random_odds() -> float:
    """Return decimal odds drawn uniformly from 1.05-3.50, rounded to 2 places."""
    lowest, highest = 1.05, 3.50
    raw_odds = random.uniform(lowest, highest)
    return round(raw_odds, 2)

def random_fixture(leagues: dict) -> dict | None:
    key = random.choice(list(leagues.keys()))
    teams = leagues[key]
    if len(teams) < 2:
        return None
    _, league = key
    team_1, team_2 = random.sample(teams, 2)
    return {
        "league":     league,
        "team_1":     team_1,
        "team_2":     team_2,
        "prediction": random.choice(PREDICTIONS),
        "date":       random_date(),
        "odds":       random_odds(),
    }

# ─────────────────────────────────────────────
# NOISE FUNCTIONS
# ─────────────────────────────────────────────
def inject_emojis(text: str) -> str:
    """With 40% probability, attach 1-3 distinct random emojis to random lines.

    Each emoji is put at the start or the end (50/50) of a randomly chosen
    line; the same line may be hit more than once. Otherwise the text is
    returned unchanged.
    """
    if random.random() >= 0.40:
        return text

    count = random.randint(1, 3)
    picked = random.sample(EXTRA_EMOJIS, k=count)
    rows = text.split("\n")
    for emoji in picked:
        target = random.randint(0, len(rows) - 1)
        if random.random() < 0.5:
            rows[target] = f"{emoji} {rows[target]}"
        else:
            rows[target] = f"{rows[target]} {emoji}"
    return "\n".join(rows)

def inject_typos(text: str) -> str:
    """With 15% probability, swap two adjacent characters in one random word.

    Words are the pieces obtained by splitting on single spaces. The chosen
    word is only altered when it is purely alphabetic and longer than three
    characters; otherwise the text comes back unchanged.
    """
    if random.random() >= 0.15:
        return text

    tokens = text.split(" ")
    pos = random.randint(0, len(tokens) - 1)
    token = tokens[pos]
    if len(token) > 3 and token.isalpha():
        left = random.randint(0, len(token) - 2)
        chars = list(token)
        chars[left], chars[left + 1] = chars[left + 1], chars[left]
        tokens[pos] = "".join(chars)
    return " ".join(tokens)

def inject_extra_lines(text: str) -> str:
    """With 20% probability, add one irrelevant noise line to the message.

    The line is placed before or after the text (50/50), separated by a
    newline. Otherwise the text is returned unchanged.

    The emojis in the noise lines were restored from mojibake (UTF-8 bytes
    mis-decoded as a legacy code page).
    """
    noise_lines = [
        "For more predictions visit www.eaglepredict.com",
        "Join our VIP channel 💎",
        "Yesterday result: WIN ✅",
        "Record this week: 8W 2L",
        "All tips are for 18+ only",
        "Use responsible gambling 🙏",
    ]
    if random.random() >= 0.20:
        return text

    line = random.choice(noise_lines)
    if random.random() < 0.5:
        return line + "\n" + text
    return text + "\n" + line

def maybe_null_field(fixture: dict, has_odds: bool, has_date: bool, has_league: bool) -> dict:
    """Return a copy of *fixture* with the fields absent from the message nulled.

    A field is set to ``None`` only when the template that rendered the input
    did not contain it. The previous extra 20% "random null" was removed: it
    blanked a label whose value was still visible in the already-rendered
    input, producing training pairs that contradict the system prompt ("If a
    field is missing, use null").

    Args:
        fixture: Fixture dict; it is not modified.
        has_odds: Whether the rendered template contained ``{odds}``.
        has_date: Whether the rendered template contained ``{date}``.
        has_league: Whether the rendered template contained ``{league}``.

    Returns:
        A shallow copy of ``fixture`` with ``odds``/``date``/``league`` set to
        ``None`` for every flag that is false.
    """
    shown = {"odds": has_odds, "date": has_date, "league": has_league}
    labelled = dict(fixture)
    for field, present in shown.items():
        if not present:
            labelled[field] = None
    return labelled

def apply_noise(text: str) -> str:
    """Run the noise injectors over *text*: emojis, then typos, then extra lines."""
    for injector in (inject_emojis, inject_typos, inject_extra_lines):
        text = injector(text)
    return text

# ─────────────────────────────────────────────
# EXAMPLE GENERATORS
# ─────────────────────────────────────────────
def make_single_example(leagues: dict) -> dict | None:
    """Generate one single-tip example.

    Draws a fixture and a template, renders the message, applies noise and
    builds the expected output.

    Returns:
        ``{"input": <message text>, "output": [<fixture dict>]}`` — the output
        is always a one-element list — or ``None`` when no fixture could be
        drawn.
    """
    fixture = random_fixture(leagues)
    if not fixture:
        return None

    template = random.choice(SINGLE_TEMPLATES)
    # Record which output fields this template actually shows.
    shows_odds, shows_date, shows_league = (
        placeholder in template
        for placeholder in ("{odds}", "{date}", "{league}")
    )

    values = {"sep": random.choice(SEPARATORS)}
    for name in ("league", "team_1", "team_2", "prediction", "date", "odds"):
        values[name] = fixture[name]
    values["time"] = random.choice(TIMES)
    values["header"] = random.choice(HEADERS)
    values["book"] = random.choice(BOOKS)

    message = apply_noise(template.format(**values))
    expected = maybe_null_field(fixture, shows_odds, shows_date, shows_league)

    # always array
    return {"input": message, "output": [expected]}


def make_multi_example(leagues: dict) -> dict | None:
    """Generate one multi-tip example containing 2-4 tips.

    All tips of a message share one block template and one team separator;
    kick-off time and bookmaker are drawn per tip. The blocks are joined with
    newlines between a random header and an optional footer, then noise is
    applied to the whole message.

    Returns:
        ``{"input": <message text>, "output": [<fixture dict>, ...]}``, or
        ``None`` when fewer than two fixtures could be drawn.
    """
    wanted = random.randint(2, 4)
    # Draw twice as many candidates as needed, since random_fixture may
    # return None.
    drawn = []
    for _ in range(wanted * 2):
        candidate = random_fixture(leagues)
        if candidate:
            drawn.append(candidate)
    tips = drawn[:wanted]
    if len(tips) < 2:
        return None

    layout = random.choice(MULTI_BLOCK_TEMPLATES)
    joiner = random.choice(SEPARATORS)
    # The same layout is used for every tip, so these flags are shared.
    shows_odds = "{odds}" in layout
    shows_date = "{date}" in layout
    shows_league = "{league}" in layout

    rendered = []
    for number, tip in enumerate(tips, start=1):
        kickoff = random.choice(TIMES)
        bookmaker = random.choice(BOOKS)
        rendered.append(
            layout.format(
                n=number,
                sep=joiner,
                league=tip["league"],
                team_1=tip["team_1"],
                team_2=tip["team_2"],
                prediction=tip["prediction"],
                date=tip["date"],
                odds=tip["odds"],
                time=kickoff,
                book=bookmaker,
            )
        )

    top = random.choice(MULTI_HEADERS)
    bottom = random.choice(MULTI_FOOTERS)
    pieces = [top, *rendered]
    if bottom:
        pieces.append(bottom)
    message = apply_noise("\n".join(pieces))

    labels = [
        maybe_null_field(tip, shows_odds, shows_date, shows_league)
        for tip in tips
    ]
    return {"input": message, "output": labels}

# ─────────────────────────────────────────────
# FORMAT AS TRAINING EXAMPLE
# ─────────────────────────────────────────────
def make_training_example(ex: dict) -> dict:
    """Wrap a generated example in a system/user/assistant chat record.

    Args:
        ex: Dict with ``"input"`` (message text) and ``"output"`` (a
            JSON-serialisable list of fixture dicts).

    Returns:
        ``{"messages": [...]}`` holding the system prompt, the stripped input
        as the user turn, and the output as a JSON string (non-ASCII kept
        as-is) as the assistant turn.
    """
    user_text = ex["input"].strip()
    assistant_text = json.dumps(ex["output"], ensure_ascii=False)
    turns = (
        ("system", SYSTEM_PROMPT),
        ("user", user_text),
        ("assistant", assistant_text),
    )
    return {"messages": [{"role": role, "content": content} for role, content in turns]}

# ─────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────
def _collect_examples(make_example, leagues: dict, target: int) -> list:
    """Collect up to *target* training examples, giving up after 5x target attempts."""
    collected = []
    for _ in range(target * 5):
        if len(collected) >= target:
            break
        ex = make_example(leagues)
        if ex:
            collected.append(make_training_example(ex))
    return collected


def build_dataset():
    """Generate the dataset, write the train/val JSONL files and print a summary.

    Builds about 70% single-tip and 30% multi-tip examples (EXAMPLES_COUNT in
    total), shuffles them, writes the first ``1 - VAL_SPLIT`` share to
    OUTPUT_TRAIN and the rest to OUTPUT_VAL, then prints counts and one
    preview per kind.

    Raises:
        FileNotFoundError: If TEAMS_CSV does not exist (from load_teams).
        ValueError: If no league in the CSV has at least two teams, so no
            fixture could ever be generated.
    """
    leagues = load_teams(TEAMS_CSV)
    # Fail early with a clear message instead of an IndexError/StopIteration
    # deep inside generation or the previews.
    if not any(len(teams) >= 2 for teams in leagues.values()):
        raise ValueError(
            f"{TEAMS_CSV} contains no league with at least two teams; "
            "cannot generate fixtures"
        )

    n_single = int(EXAMPLES_COUNT * 0.70)
    n_multi  = EXAMPLES_COUNT - n_single
    print(f"[1/2] Generating {n_single} single-tip + {n_multi} multi-tip examples...")

    # Counting what each generator produced avoids re-parsing every stored
    # example's JSON on every loop iteration.
    examples = _collect_examples(make_single_example, leagues, n_single)
    examples += _collect_examples(make_multi_example, leagues, n_multi)

    print(f"      → {len(examples)} total examples generated")

    # ── Write files ──────────────────────────────
    print("[2/2] Writing dataset files...")
    random.shuffle(examples)
    split = int(len(examples) * (1 - VAL_SPLIT))
    train, val = examples[:split], examples[split:]

    for path, data in [(OUTPUT_TRAIN, train), (OUTPUT_VAL, val)]:
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in data)

    # ── Stats ────────────────────────────────────
    # Parse each assistant turn once; index i corresponds to examples[i].
    outputs = [json.loads(e["messages"][2]["content"]) for e in examples]
    single  = sum(1 for tips in outputs if len(tips) == 1)
    multi   = len(outputs) - single
    nulls   = sum(
        1 for tips in outputs
        for obj in tips
        if any(v is None for v in obj.values())
    )

    print("\n✅ Done!")
    print(f"   {OUTPUT_TRAIN}  → {len(train)} examples")
    print(f"   {OUTPUT_VAL}    → {len(val)} examples")
    print(f"   Single-tip      → {single}")
    print(f"   Multi-tip       → {multi}")
    print(f"   With null fields→ {nulls}")

    # ── Previews ─────────────────────────────────
    previews = (
        ("\n── Single-tip sample ───────────────────────", lambda tips: len(tips) == 1, 200),
        ("── Multi-tip sample ────────────────────────", lambda tips: len(tips) > 1, 300),
    )
    for title, wanted, limit in previews:
        print(title)
        sample = next((e for e, tips in zip(examples, outputs) if wanted(tips)), None)
        if sample is None:
            # e.g. a very small EXAMPLES_COUNT can leave one kind empty
            print("(no example of this kind was generated)\n")
            continue
        for msg in sample["messages"]:
            print(f"[{msg['role']}]\n{msg['content'][:limit]}\n")


# Build the dataset only when this file is run as a script, not when imported.
if __name__ == "__main__":
    build_dataset()