""" Dataset Builder v3 — Football Prediction Extractor - Always outputs JSON array (even single tip) - 70% single-tip / 30% multi-tip (2-4 events) - Noise: random emojis, typos, missing fields, varied separators - Varied date formats, bookmakers, times, headers - Pure stdlib — no pip installs needed """ import csv import json import random from pathlib import Path from collections import defaultdict # ───────────────────────────────────────────── # CONFIG # ───────────────────────────────────────────── TEAMS_CSV = "teams_tier1_tier2.csv" OUTPUT_TRAIN = "train_dataset.jsonl" OUTPUT_VAL = "val_dataset.jsonl" EXAMPLES_COUNT = 300 VAL_SPLIT = 0.1 # ───────────────────────────────────────────── # SYSTEM PROMPT — always array # ───────────────────────────────────────────── SYSTEM_PROMPT = ( "You are a football data extraction assistant. " "Extract structured data from the message and return ONLY a valid JSON array. " "Each object in the array must have exactly these keys: " "league, team_1, team_2, prediction, date, odds. " "If a field is missing, use null. No extra text, no markdown." ) # ───────────────────────────────────────────── # VOCABULARY # ───────────────────────────────────────────── PREDICTIONS = [ "Over 1.5", "Over 2.5", "Over 3.5", "Under 2.5", "Under 3.5", "1X", "X2", "12", "Home Win", "Away Win", "Draw", "Both Teams to Score", "Home Win or Draw", "Away Win or Draw", "GG", "NG", ] DATE_FORMATS = [ lambda d, m, y: f"{d:02d}/{m:02d}/{y}", lambda d, m, y: f"{d:02d}-{m:02d}-{y}", lambda d, m, y: f"{d:02d}.{m:02d}.{y}", lambda d, m, y: f"{['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'][m-1]} {d}, {y}", lambda d, m, y: f"{d} {['January','February','March','April','May','June','July','August','September','October','November','December'][m-1]} {y}", ] TIMES = ["13:00", "15:00", "16:00", "17:00", "18:00", "19:00", "19:45", "20:00", "20:45", "21:00", "21:45"] BOOKS = ["BETANO", "Bet365", "William Hill", "Unibet", "1xBet", "Betway", "Bwin", "Pinnacle"] HEADERS = ["Prediction of the Day", "Football Tip", "Best Bet Today", "Daily Pick", "Top Prediction", "Sure Tip", "VIP Prediction"] SEPARATORS = [" - ", " vs ", " v ", " – ", " VS ", " x "] EXTRA_EMOJIS = ["🔥","💥","🎯","👀","💰","🏅","⚡️","🙌","👇","✨","📊","💎","🤑","🚨","✅","❇️","🆕","📌","👑","🃏"] MULTI_HEADERS = [ "⚽️ 𝐏𝐫𝐞𝐝𝐢𝐜𝐭𝐢𝐨𝐧𝐬 𝐨𝐟 𝐭𝐡𝐞 𝐃𝐚𝐲 ⚽️", "🔥 TODAY'S FOOTBALL TIPS 🔥", "💰 Daily Predictions 💰", "⚡️ Best Bets Today ⚡️", "📊 Football Tips", "🎯 Today's Picks", ] MULTI_FOOTERS = [ "For more predictions visit www.eaglepredict.com", "Follow us for daily tips! 🙌", "Good luck everyone! 🍀", "Join our VIP channel for more! 💎", "Win big today! 🤑", "", # no footer sometimes ] # ───────────────────────────────────────────── # SINGLE TIP TEMPLATES # placeholders: {league} {team_1} {team_2} {prediction} # {date} {odds} {time} {header} {book} {sep} # templates 7 and 8 intentionally omit odds/date # ───────────────────────────────────────────── SINGLE_TEMPLATES = [ # 1 structured Telegram bold style "⚽️ {header} ⚽️\n𝐃𝐚𝐭𝐞: {date}\n𝐋𝐞𝐚𝐠𝐮𝐞: {league}\n𝐌𝐚𝐭𝐜𝐡: {team_1}{sep}{team_2}\n𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}", # 2 plain structured "⚽️ {header} ⚽️\nDate: {date}\nLeague: {league}\nMatch: {team_1}{sep}{team_2}\nKick off: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}", # 3 emoji compact "🏆 {league}\n{team_1}{sep}{team_2}\n📅 {date} | ⏰ {time}\n🎯 Tip: {prediction}\n💰 Odds: {odds}", # 4 casual noisy "wow predictions present\nINCREDIBLE MATCH BETWEEN {team_1}{sep}{team_2}\nTime: {time}\nwe forecast {prediction}\nOdds {odds}", # 5 one-liner "{team_1}{sep}{team_2} | {league} | {date} | {prediction} @ {odds}", # 6 verbose channel "🔥 Today's football tip 🔥\nCompetition: {league}\nGame: {team_1}{sep}{team_2}\nDate: {date}, KO {time}\nOur pick: {prediction}\nBest odds: {odds} ({book})\nGood luck! ⚽", # 7 minimal no emojis "Match: {team_1}{sep}{team_2}\nLeague: {league}\nDate: {date}\nPrediction: {prediction}\nOdds: {odds}", # 8 different field order "📆 {date} | {time}\n⚽ {league}: {team_1}{sep}{team_2}\n✔️ {prediction} | @{odds}", # 9 ALL CAPS noisy "MATCH: {team_1}{sep}{team_2}\nLEAGUE: {league}\nDATE: {date}\nPICK: {prediction}\nODDS: {odds}", # 10 missing odds intentionally "⚽️ {header}\n{league}\n{team_1}{sep}{team_2}\n{date}\nPrediction: {prediction}", # 11 missing date intentionally "🏟️ {league}\n{team_1}{sep}{team_2}\nTip: {prediction}\nOdds: {odds} on {book}", # 12 missing league intentionally "{team_1}{sep}{team_2}\n📅 {date}\n✅ {prediction} @ {odds}", # 13 telegram minimal "📌 {league}\n{team_1}{sep}{team_2} — {date}\n{prediction} | {odds}", # 14 with extra commentary noise "Today I really like this match 👇\n{team_1}{sep}{team_2} ({league})\nDate: {date}\nMy pick: {prediction}\nOdds: {odds} on {book}", ] # ───────────────────────────────────────────── # MULTI-TIP BLOCK TEMPLATES (per tip) # extra placeholder: {n} = tip number # ───────────────────────────────────────────── MULTI_BLOCK_TEMPLATES = [ # Telegram numbered bold "⚽️ 𝗙𝗼𝗼𝘁𝗯𝗮𝗹𝗹 𝗧𝗶𝗽 {n} ⚽️\n𝐃𝐚𝐭𝐞: {date}\n𝐋𝐞𝐚𝐠𝐮𝐞: {league}\n𝐌𝐚𝐭𝐜𝐡: {team_1}{sep}{team_2}\n𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}", # plain numbered "Tip {n}:\nLeague: {league}\nMatch: {team_1}{sep}{team_2}\nDate: {date} | KO: {time}\nPrediction: {prediction} @ {odds}", # compact numbered "#{n} {league} | {team_1}{sep}{team_2} | {date}\n→ {prediction} @ {odds}", # emoji numbered "🎯 Pick #{n}\n{team_1}{sep}{team_2} ({league})\n📅 {date} ⏰ {time}\n✅ {prediction} | odds: {odds}", # minimal numbered "{n}. {team_1}{sep}{team_2} — {league} — {prediction} @ {odds} ({date})", ] # ───────────────────────────────────────────── # LOAD TEAMS FROM CSV # ───────────────────────────────────────────── def load_teams(csv_path: str) -> dict: leagues = defaultdict(list) path = Path(csv_path) if not path.exists(): raise FileNotFoundError(f"CSV not found: {csv_path}") with open(path, encoding="utf-8") as f: sample = f.read(2048) f.seek(0) delimiter = "\t" if "\t" in sample else "," reader = csv.DictReader(f, delimiter=delimiter) for row in reader: row = {k.strip(): v.strip() for k, v in row.items()} country = row.get("Country", "") league = row.get("League", "") team = row.get("Team", "") if country and league and team: leagues[(country, league)].append(team) total = sum(len(v) for v in leagues.values()) print(f"[✓] Loaded {total} teams across {len(leagues)} leagues") return leagues # ───────────────────────────────────────────── # RANDOM HELPERS # ───────────────────────────────────────────── def random_date() -> str: month = random.randint(8, 12) if random.random() < 0.5 else random.randint(1, 5) year = 2025 if month >= 8 else 2026 day = random.randint(1, 28) return random.choice(DATE_FORMATS)(day, month, year) def random_odds() -> float: return round(random.uniform(1.05, 3.50), 2) def random_fixture(leagues: dict) -> dict | None: key = random.choice(list(leagues.keys())) teams = leagues[key] if len(teams) < 2: return None _, league = key team_1, team_2 = random.sample(teams, 2) return { "league": league, "team_1": team_1, "team_2": team_2, "prediction": random.choice(PREDICTIONS), "date": random_date(), "odds": random_odds(), } # ───────────────────────────────────────────── # NOISE FUNCTIONS # ───────────────────────────────────────────── def inject_emojis(text: str) -> str: """40% chance: sprinkle 1-3 random emojis into random lines.""" if random.random() < 0.40: emojis = random.sample(EXTRA_EMOJIS, k=random.randint(1, 3)) lines = text.split("\n") for e in emojis: idx = random.randint(0, len(lines) - 1) lines[idx] = (e + " " + lines[idx]) if random.random() < 0.5 else (lines[idx] + " " + e) return "\n".join(lines) return text def inject_typos(text: str) -> str: """15% chance: swap two adjacent chars in a random word.""" if random.random() < 0.15: words = text.split(" ") idx = random.randint(0, len(words) - 1) w = words[idx] if len(w) > 3 and w.isalpha(): i = random.randint(0, len(w) - 2) w = w[:i] + w[i+1] + w[i] + w[i+2:] words[idx] = w return " ".join(words) return text def inject_extra_lines(text: str) -> str: """20% chance: add irrelevant noise lines.""" noise_lines = [ "For more predictions visit www.eaglepredict.com", "Join our VIP channel 💎", "Yesterday result: WIN ✅", "Record this week: 8W 2L", "All tips are for 18+ only", "Use responsible gambling 🙏", ] if random.random() < 0.20: line = random.choice(noise_lines) if random.random() < 0.5: return line + "\n" + text else: return text + "\n" + line return text def maybe_null_field(fixture: dict, has_odds: bool, has_date: bool, has_league: bool) -> dict: """ Randomly null out one field (20% chance). Respects whether template already omits it. """ f = dict(fixture) if not has_odds: f["odds"] = None if not has_date: f["date"] = None if not has_league: f["league"] = None # extra random null on top if random.random() < 0.20: field = random.choice(["odds", "date", "league"]) f[field] = None return f def apply_noise(text: str) -> str: text = inject_emojis(text) text = inject_typos(text) text = inject_extra_lines(text) return text # ───────────────────────────────────────────── # EXAMPLE GENERATORS # ───────────────────────────────────────────── def make_single_example(leagues: dict) -> dict | None: fixture = random_fixture(leagues) if not fixture: return None template = random.choice(SINGLE_TEMPLATES) has_odds = "{odds}" in template has_date = "{date}" in template has_league= "{league}" in template sep = random.choice(SEPARATORS) input_text = template.format( sep = sep, league = fixture["league"], team_1 = fixture["team_1"], team_2 = fixture["team_2"], prediction = fixture["prediction"], date = fixture["date"], odds = fixture["odds"], time = random.choice(TIMES), header = random.choice(HEADERS), book = random.choice(BOOKS), ) input_text = apply_noise(input_text) output_json = maybe_null_field(fixture, has_odds, has_date, has_league) return { "input": input_text, "output": [output_json], # always array } def make_multi_example(leagues: dict) -> dict | None: n_tips = random.randint(2, 4) fixtures = [random_fixture(leagues) for _ in range(n_tips * 2)] fixtures = [f for f in fixtures if f][:n_tips] if len(fixtures) < 2: return None block_template = random.choice(MULTI_BLOCK_TEMPLATES) sep = random.choice(SEPARATORS) blocks = [] for i, f in enumerate(fixtures, 1): has_odds = "{odds}" in block_template has_date = "{date}" in block_template has_league= "{league}" in block_template block = block_template.format( n = i, sep = sep, league = f["league"], team_1 = f["team_1"], team_2 = f["team_2"], prediction = f["prediction"], date = f["date"], odds = f["odds"], time = random.choice(TIMES), book = random.choice(BOOKS), ) blocks.append((block, f, has_odds, has_date, has_league)) header = random.choice(MULTI_HEADERS) footer = random.choice(MULTI_FOOTERS) parts = [header] + [b[0] for b in blocks] + ([footer] if footer else []) input_text = "\n".join(parts) input_text = apply_noise(input_text) output = [ maybe_null_field(f, has_odds, has_date, has_league) for _, f, has_odds, has_date, has_league in blocks ] return {"input": input_text, "output": output} # ───────────────────────────────────────────── # FORMAT AS TRAINING EXAMPLE # ───────────────────────────────────────────── def make_training_example(ex: dict) -> dict: return { "messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": ex["input"].strip()}, {"role": "assistant", "content": json.dumps(ex["output"], ensure_ascii=False)}, ] } # ───────────────────────────────────────────── # MAIN # ───────────────────────────────────────────── def build_dataset(): leagues = load_teams(TEAMS_CSV) examples = [] n_single = int(EXAMPLES_COUNT * 0.70) n_multi = EXAMPLES_COUNT - n_single print(f"[1/2] Generating {n_single} single-tip + {n_multi} multi-tip examples...") # single tip attempts = 0 while len([e for e in examples if len(json.loads(e["messages"][2]["content"])) == 1]) < n_single: attempts += 1 if attempts > n_single * 5: break ex = make_single_example(leagues) if ex: examples.append(make_training_example(ex)) # multi tip attempts = 0 while len([e for e in examples if len(json.loads(e["messages"][2]["content"])) > 1]) < n_multi: attempts += 1 if attempts > n_multi * 5: break ex = make_multi_example(leagues) if ex: examples.append(make_training_example(ex)) print(f" → {len(examples)} total examples generated") # ── Write files ──────────────────────────── print("[2/2] Writing dataset files...") random.shuffle(examples) split = int(len(examples) * (1 - VAL_SPLIT)) train, val = examples[:split], examples[split:] for path, data in [(OUTPUT_TRAIN, train), (OUTPUT_VAL, val)]: with open(path, "w", encoding="utf-8") as f: for ex in data: f.write(json.dumps(ex, ensure_ascii=False) + "\n") # ── Stats ────────────────────────────────── all_ex = train + val single = sum(1 for e in all_ex if len(json.loads(e["messages"][2]["content"])) == 1) multi = len(all_ex) - single nulls = sum( 1 for e in all_ex for obj in json.loads(e["messages"][2]["content"]) if any(v is None for v in obj.values()) ) print(f"\n✅ Done!") print(f" {OUTPUT_TRAIN} → {len(train)} examples") print(f" {OUTPUT_VAL} → {len(val)} examples") print(f" Single-tip → {single}") print(f" Multi-tip → {multi}") print(f" With null fields→ {nulls}") # ── Previews ─────────────────────────────── print("\n── Single-tip sample ───────────────────────") s = next(e for e in examples if len(json.loads(e["messages"][2]["content"])) == 1) for msg in s["messages"]: print(f"[{msg['role']}]\n{msg['content'][:200]}\n") print("── Multi-tip sample ────────────────────────") m = next(e for e in examples if len(json.loads(e["messages"][2]["content"])) > 1) for msg in m["messages"]: print(f"[{msg['role']}]\n{msg['content'][:300]}\n") if __name__ == "__main__": build_dataset()