| """ |
| Dataset Builder v3 — Football Prediction Extractor |
| - Always outputs JSON array (even single tip) |
| - 70% single-tip / 30% multi-tip (2-4 events) |
| - Noise: random emojis, typos, missing fields, varied separators |
| - Varied date formats, bookmakers, times, headers |
| - Pure stdlib — no pip installs needed |
| """ |
|
|
| import csv |
| import json |
| import random |
| from pathlib import Path |
| from collections import defaultdict |
|
|
| |
| |
| |
# ── Configuration ─────────────────────────────────────────────────────────────
# CSV read by load_teams(); it looks up the "Country", "League" and "Team" columns.
TEAMS_CSV = "teams_tier1_tier2.csv"
# JSONL files written by build_dataset(), one chat-format example per line.
OUTPUT_TRAIN = "train_dataset.jsonl"
OUTPUT_VAL = "val_dataset.jsonl"
# Target number of examples; build_dataset() splits it 70/30 single-/multi-tip.
EXAMPLES_COUNT = 300
# Fraction of the shuffled examples that goes to OUTPUT_VAL.
VAL_SPLIT = 0.1
|
|
| |
| |
| |
# ── System prompt ─────────────────────────────────────────────────────────────
# Used verbatim as the "system" message of every training example
# (see make_training_example). The six keys listed here are the same keys
# random_fixture() puts in each label object.
SYSTEM_PROMPT = (
    "You are a football data extraction assistant. "
    "Extract structured data from the message and return ONLY a valid JSON array. "
    "Each object in the array must have exactly these keys: "
    "league, team_1, team_2, prediction, date, odds. "
    "If a field is missing, use null. No extra text, no markdown."
)
|
|
| |
| |
| |
# ── Value pools ───────────────────────────────────────────────────────────────
# Betting-market labels; random_fixture() picks one uniformly per tip and the
# chosen string is used both in the message text and in the label.
PREDICTIONS = [
    "Over 1.5", "Over 2.5", "Over 3.5",
    "Under 2.5", "Under 3.5",
    "1X", "X2", "12",
    "Home Win", "Away Win", "Draw",
    "Both Teams to Score",
    "Home Win or Draw",
    "Away Win or Draw",
    "GG", "NG",
]
|
|
# Month-name lookup tables, indexed by ``month - 1``.
_MONTH_ABBREVIATIONS = (
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
)
_MONTH_NAMES = (
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
)


def _numeric_date_format(separator):
    """Build a ``DD<sep>MM<sep>YYYY`` formatter with zero-padded day and month."""
    def render(d, m, y):
        return f"{d:02d}{separator}{m:02d}{separator}{y}"
    return render


# Each entry is a callable ``(day, month, year) -> str``; random_date() picks
# one uniformly so the dataset sees several date spellings.
DATE_FORMATS = [
    _numeric_date_format("/"),                                  # 05/01/2026
    _numeric_date_format("-"),                                  # 05-01-2026
    _numeric_date_format("."),                                  # 05.01.2026
    lambda d, m, y: f"{_MONTH_ABBREVIATIONS[m - 1]} {d}, {y}",  # Jan 5, 2026
    lambda d, m, y: f"{d} {_MONTH_NAMES[m - 1]} {y}",           # 5 January 2026
]
|
|
# Kick-off times inserted into templates that have a {time} placeholder.
# The time only appears in the message text; it is not one of the label keys.
TIMES = ["13:00", "15:00", "16:00", "17:00", "18:00", "19:00", "19:45", "20:00", "20:45", "21:00", "21:45"]
# Bookmaker names for the {book} placeholder (text only, not a label key).
BOOKS = ["BETANO", "Bet365", "William Hill", "Unibet", "1xBet", "Betway", "Bwin", "Pinnacle"]
# Title lines for the {header} placeholder of single-tip templates.
HEADERS = ["Prediction of the Day", "Football Tip", "Best Bet Today", "Daily Pick", "Top Prediction", "Sure Tip", "VIP Prediction"]
# Strings placed between the two team names ({sep}); the surrounding spaces are part of the separator.
SEPARATORS = [" - ", " vs ", " v ", " – ", " VS ", " x "]
# Emoji pool sampled by inject_emojis() when it decorates random lines.
EXTRA_EMOJIS = ["🔥","💥","🎯","👀","💰","🏅","⚡️","🙌","👇","✨","📊","💎","🤑","🚨","✅","❇️","🆕","📌","👑","🃏"]
|
|
# First line of every multi-tip message (see make_multi_example).
MULTI_HEADERS = [
    "⚽️ 𝐏𝐫𝐞𝐝𝐢𝐜𝐭𝐢𝐨𝐧𝐬 𝐨𝐟 𝐭𝐡𝐞 𝐃𝐚𝐲 ⚽️",
    "🔥 TODAY'S FOOTBALL TIPS 🔥",
    "💰 Daily Predictions 💰",
    "⚡️ Best Bets Today ⚡️",
    "📊 Football Tips",
    "🎯 Today's Picks",
]


# Optional last line of a multi-tip message. The empty string means
# "no footer": make_multi_example() only appends the footer when it is truthy.
MULTI_FOOTERS = [
    "For more predictions visit www.eaglepredict.com",
    "Follow us for daily tips! 🙌",
    "Good luck everyone! 🍀",
    "Join our VIP channel for more! 💎",
    "Win big today! 🤑",
    "",
]
|
|
| |
| |
| |
| |
| |
| |
# ── Single-tip message templates ──────────────────────────────────────────────
# str.format templates for a message containing exactly one tip.
# Placeholders: {header} {league} {team_1} {sep} {team_2} {date} {time}
# {prediction} {odds} {book}.
# make_single_example() checks whether "{odds}", "{date}" and "{league}" occur
# in the chosen template; a field whose placeholder is absent is labelled null.
SINGLE_TEMPLATES = [
    # 1. Card with bold-unicode field labels, kick-off time and bookmaker.
    "⚽️ {header} ⚽️\n𝐃𝐚𝐭𝐞: {date}\n𝐋𝐞𝐚𝐠𝐮𝐞: {league}\n𝐌𝐚𝐭𝐜𝐡: {team_1}{sep}{team_2}\n𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",

    # 2. Same card layout with plain ASCII labels.
    "⚽️ {header} ⚽️\nDate: {date}\nLeague: {league}\nMatch: {team_1}{sep}{team_2}\nKick off: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",

    # 3. Emoji-prefixed compact layout, no bookmaker.
    "🏆 {league}\n{team_1}{sep}{team_2}\n📅 {date} | ⏰ {time}\n🎯 Tip: {prediction}\n💰 Odds: {odds}",

    # 4. Informal free text; no {league} and no {date} placeholder.
    "wow predictions present\nINCREDIBLE MATCH BETWEEN {team_1}{sep}{team_2}\nTime: {time}\nwe forecast {prediction}\nOdds {odds}",

    # 5. Single pipe-separated line.
    "{team_1}{sep}{team_2} | {league} | {date} | {prediction} @ {odds}",

    # 6. "Competition / Game / Our pick" wording with bookmaker in parentheses.
    "🔥 Today's football tip 🔥\nCompetition: {league}\nGame: {team_1}{sep}{team_2}\nDate: {date}, KO {time}\nOur pick: {prediction}\nBest odds: {odds} ({book})\nGood luck! ⚽",

    # 7. Plain "Label: value" lines, match first.
    "Match: {team_1}{sep}{team_2}\nLeague: {league}\nDate: {date}\nPrediction: {prediction}\nOdds: {odds}",

    # 8. Three-line layout with date and time first.
    "📆 {date} | {time}\n⚽ {league}: {team_1}{sep}{team_2}\n✔️ {prediction} | @{odds}",

    # 9. Upper-case "LABEL: value" lines.
    "MATCH: {team_1}{sep}{team_2}\nLEAGUE: {league}\nDATE: {date}\nPICK: {prediction}\nODDS: {odds}",

    # 10. Headed layout with no {odds} placeholder.
    "⚽️ {header}\n{league}\n{team_1}{sep}{team_2}\n{date}\nPrediction: {prediction}",

    # 11. League / match / tip / odds with no {date} placeholder.
    "🏟️ {league}\n{team_1}{sep}{team_2}\nTip: {prediction}\nOdds: {odds} on {book}",

    # 12. Minimal layout with no {league} placeholder.
    "{team_1}{sep}{team_2}\n📅 {date}\n✅ {prediction} @ {odds}",

    # 13. Pinned-league layout with unlabelled prediction and odds.
    "📌 {league}\n{team_1}{sep}{team_2} — {date}\n{prediction} | {odds}",

    # 14. Chatty first-person message with league in parentheses.
    "Today I really like this match 👇\n{team_1}{sep}{team_2} ({league})\nDate: {date}\nMy pick: {prediction}\nOdds: {odds} on {book}",
]
|
|
| |
| |
| |
| |
# ── Multi-tip block templates ─────────────────────────────────────────────────
# str.format templates for ONE tip inside a multi-tip message; {n} is the
# 1-based position of the tip. make_multi_example() picks one template per
# message and renders it once per tip. Every template here contains the
# {league}, {date} and {odds} placeholders.
MULTI_BLOCK_TEMPLATES = [
    # 1. Bold-unicode card with kick-off time and bookmaker.
    "⚽️ 𝗙𝗼𝗼𝘁𝗯𝗮𝗹𝗹 𝗧𝗶𝗽 {n} ⚽️\n𝐃𝐚𝐭𝐞: {date}\n𝐋𝐞𝐚𝐠𝐮𝐞: {league}\n𝐌𝐚𝐭𝐜𝐡: {team_1}{sep}{team_2}\n𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: {time} WAT\n✅{prediction}\n✅Odds @{odds} on {book}",

    # 2. "Tip n:" heading followed by labelled lines.
    "Tip {n}:\nLeague: {league}\nMatch: {team_1}{sep}{team_2}\nDate: {date} | KO: {time}\nPrediction: {prediction} @ {odds}",

    # 3. Compact two-line "#n" layout.
    "#{n} {league} | {team_1}{sep}{team_2} | {date}\n→ {prediction} @ {odds}",

    # 4. Emoji "Pick #n" layout with league in parentheses.
    "🎯 Pick #{n}\n{team_1}{sep}{team_2} ({league})\n📅 {date} ⏰ {time}\n✅ {prediction} | odds: {odds}",

    # 5. Numbered one-liner with the date in parentheses at the end.
    "{n}. {team_1}{sep}{team_2} — {league} — {prediction} @ {odds} ({date})",
]
|
|
| |
| |
| |
def load_teams(csv_path: str) -> dict:
    """Load teams from a CSV file and group them by (country, league).

    The file must have a header row with "Country", "League" and "Team"
    columns. It is read as tab-separated if a tab occurs in the first 2048
    characters, otherwise as comma-separated. Rows missing any of the three
    values are skipped.

    Args:
        csv_path: Path to the CSV/TSV file.

    Returns:
        A ``defaultdict(list)`` mapping ``(country, league)`` tuples to the
        list of team names, in file order.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
    """
    path = Path(csv_path)
    if not path.exists():
        raise FileNotFoundError(f"CSV not found: {csv_path}")

    leagues = defaultdict(list)
    # "utf-8-sig" strips a leading BOM (common in spreadsheet exports) that
    # would otherwise become part of the first header name; newline="" is what
    # the csv module requires so quoted fields with line breaks parse correctly.
    with open(path, encoding="utf-8-sig", newline="") as f:
        sample = f.read(2048)
        f.seek(0)
        delimiter = "\t" if "\t" in sample else ","
        reader = csv.DictReader(f, delimiter=delimiter)
        for raw_row in reader:
            # DictReader fills short rows with None values and stores surplus
            # cells under a None key; neither can be stripped, so drop them.
            row = {
                key.strip(): value.strip()
                for key, value in raw_row.items()
                if isinstance(key, str) and isinstance(value, str)
            }
            country = row.get("Country", "")
            league = row.get("League", "")
            team = row.get("Team", "")
            if country and league and team:
                leagues[(country, league)].append(team)

    total = sum(len(teams) for teams in leagues.values())
    print(f"[✓] Loaded {total} teams across {len(leagues)} leagues")
    return leagues
|
|
| |
| |
| |
def random_date() -> str:
    """Return a random date string in one of the DATE_FORMATS styles.

    The month is drawn from August-December (year 2025) or January-May
    (year 2026) with equal probability; the day is limited to 1-28 so it is
    valid in every month.
    """
    first_half_of_season = random.random() < 0.5
    if first_half_of_season:
        month = random.randint(8, 12)
    else:
        month = random.randint(1, 5)
    year = 2026 if month < 8 else 2025
    day = random.randint(1, 28)
    formatter = random.choice(DATE_FORMATS)
    return formatter(day, month, year)
|
|
def random_odds() -> float:
    """Return random decimal odds between 1.05 and 3.50, rounded to 2 places."""
    raw_odds = random.uniform(1.05, 3.50)
    return round(raw_odds, 2)
|
|
| def random_fixture(leagues: dict) -> dict | None: |
| key = random.choice(list(leagues.keys())) |
| teams = leagues[key] |
| if len(teams) < 2: |
| return None |
| _, league = key |
| team_1, team_2 = random.sample(teams, 2) |
| return { |
| "league": league, |
| "team_1": team_1, |
| "team_2": team_2, |
| "prediction": random.choice(PREDICTIONS), |
| "date": random_date(), |
| "odds": random_odds(), |
| } |
|
|
| |
| |
| |
def inject_emojis(text: str) -> str:
    """With 40% probability, attach 1-3 distinct emojis to random lines.

    Each chosen emoji is put at the start or the end of a randomly selected
    line (50/50), separated from it by a single space. Otherwise the text is
    returned unchanged.
    """
    if random.random() >= 0.40:
        return text

    how_many = random.randint(1, 3)
    chosen = random.sample(EXTRA_EMOJIS, k=how_many)
    rows = text.split("\n")
    last_row = len(rows) - 1
    for emoji in chosen:
        target = random.randint(0, last_row)
        if random.random() < 0.5:
            rows[target] = emoji + " " + rows[target]
        else:
            rows[target] = rows[target] + " " + emoji
    return "\n".join(rows)
|
|
def inject_typos(text: str) -> str:
    """With 15% probability, try to swap two adjacent characters in one word.

    The text is split on single spaces and one piece is picked at random; the
    swap only happens if that piece is purely alphabetic and longer than
    three characters, so the text may come back unchanged even when the
    15% branch is taken.
    """
    if random.random() >= 0.15:
        return text

    tokens = text.split(" ")
    pick = random.randint(0, len(tokens) - 1)
    word = tokens[pick]
    if len(word) > 3 and word.isalpha():
        pos = random.randint(0, len(word) - 2)
        letters = list(word)
        letters[pos], letters[pos + 1] = letters[pos + 1], letters[pos]
        tokens[pick] = "".join(letters)
    return " ".join(tokens)
|
|
def inject_extra_lines(text: str) -> str:
    """With 20% probability, add one irrelevant promo/disclaimer line.

    The extra line goes before or after the message (50/50), separated by a
    newline. Otherwise the text is returned unchanged.
    """
    noise_lines = [
        "For more predictions visit www.eaglepredict.com",
        "Join our VIP channel 💎",
        "Yesterday result: WIN ✅",
        "Record this week: 8W 2L",
        "All tips are for 18+ only",
        "Use responsible gambling 🙏",
    ]
    if random.random() >= 0.20:
        return text

    extra = random.choice(noise_lines)
    goes_first = random.random() < 0.5
    ordered = (extra, text) if goes_first else (text, extra)
    return "\n".join(ordered)
|
|
def maybe_null_field(fixture: dict, has_odds: bool, has_date: bool, has_league: bool) -> dict:
    """Return a copy of ``fixture`` with absent fields set to ``None``.

    A field is nulled when its ``has_*`` flag is false (the template does not
    show it). In addition, with 20% probability one of ``odds``, ``date`` or
    ``league`` is picked at random and nulled; it may be one that is already
    ``None``. The input dict is not modified.
    """
    label = fixture.copy()
    shown_in_template = (
        ("odds", has_odds),
        ("date", has_date),
        ("league", has_league),
    )
    for key, shown in shown_in_template:
        if not shown:
            label[key] = None

    if random.random() < 0.20:
        label[random.choice(["odds", "date", "league"])] = None
    return label
|
|
def apply_noise(text: str) -> str:
    """Run the text through the noise steps: emojis, then typos, then extra lines."""
    for noise_step in (inject_emojis, inject_typos, inject_extra_lines):
        text = noise_step(text)
    return text
|
|
| |
| |
| |
# Placeholder names that correspond to keys of a tip label.
_LABEL_FIELDS = ("league", "team_1", "team_2", "prediction", "date", "odds")


def _render_tip(template: str, label: dict, **extras) -> str:
    """Render ``template`` so the text agrees with ``label``.

    For every label field that is ``None``, a template line that mentions the
    field and no non-null label field is dropped entirely; elsewhere the
    missing value is rendered as an empty string. ``extras`` supplies the
    non-label placeholders (sep, time, book, header, n).
    """
    absent = [name for name in _LABEL_FIELDS if label[name] is None]
    present = [name for name in _LABEL_FIELDS if label[name] is not None]

    values = {**label, **extras}
    for name in absent:
        values[name] = ""

    kept_lines = []
    for line in template.split("\n"):
        mentions_absent = any("{" + name + "}" in line for name in absent)
        mentions_present = any("{" + name + "}" in line for name in present)
        # A line like "Odds: {odds} on {book}" carries nothing once odds are
        # gone; a line that also holds the teams or prediction must stay.
        if mentions_absent and not mentions_present:
            continue
        kept_lines.append(line)
    return "\n".join(kept_lines).format(**values)


def make_single_example(leagues: dict) -> dict | None:
    """Create one single-tip example.

    The label is decided first (fields the template lacks, plus the random
    20% null from maybe_null_field) and the message is then rendered from that
    label, so a field labelled ``null`` never appears in the input text.

    Returns:
        ``{"input": <noisy message>, "output": [<label>]}``, or ``None`` if no
        fixture could be generated.
    """
    fixture = random_fixture(leagues)
    if not fixture:
        return None

    template = random.choice(SINGLE_TEMPLATES)
    label = maybe_null_field(
        fixture,
        "{odds}" in template,
        "{date}" in template,
        "{league}" in template,
    )
    text = _render_tip(
        template,
        label,
        sep=random.choice(SEPARATORS),
        time=random.choice(TIMES),
        header=random.choice(HEADERS),
        book=random.choice(BOOKS),
    )
    return {
        "input": apply_noise(text),
        "output": [label],
    }


def make_multi_example(leagues: dict) -> dict | None:
    """Create one multi-tip example with 2-4 tips sharing a block template.

    Each tip's label is decided before its block is rendered, so the text and
    the labels stay consistent (see make_single_example).

    Returns:
        ``{"input": <noisy message>, "output": [<label>, ...]}``, or ``None``
        if fewer than two fixtures could be generated.
    """
    n_tips = random.randint(2, 4)
    # Over-generate so that a few failed fixtures still leave n_tips usable ones.
    candidates = [random_fixture(leagues) for _ in range(n_tips * 2)]
    fixtures = [f for f in candidates if f][:n_tips]
    if len(fixtures) < 2:
        return None

    block_template = random.choice(MULTI_BLOCK_TEMPLATES)
    sep = random.choice(SEPARATORS)
    # The same template is used for every tip, so check its placeholders once.
    has_odds = "{odds}" in block_template
    has_date = "{date}" in block_template
    has_league = "{league}" in block_template

    labels = []
    blocks = []
    for position, fixture in enumerate(fixtures, 1):
        label = maybe_null_field(fixture, has_odds, has_date, has_league)
        labels.append(label)
        blocks.append(
            _render_tip(
                block_template,
                label,
                n=position,
                sep=sep,
                time=random.choice(TIMES),
                book=random.choice(BOOKS),
            )
        )

    header = random.choice(MULTI_HEADERS)
    footer = random.choice(MULTI_FOOTERS)
    parts = [header, *blocks]
    if footer:
        parts.append(footer)

    return {"input": apply_noise("\n".join(parts)), "output": labels}
|
|
| |
| |
| |
def make_training_example(ex: dict) -> dict:
    """Wrap a raw example in the chat format written to the JSONL files.

    Args:
        ex: Dict with ``"input"`` (message text) and ``"output"`` (list of
            label dicts).

    Returns:
        ``{"messages": [system, user, assistant]}`` where the user content is
        the stripped input and the assistant content is the output serialized
        as JSON with non-ASCII characters kept as-is.
    """
    user_text = ex["input"].strip()
    answer = json.dumps(ex["output"], ensure_ascii=False)
    turns = (
        ("system", SYSTEM_PROMPT),
        ("user", user_text),
        ("assistant", answer),
    )
    return {"messages": [{"role": role, "content": content} for role, content in turns]}
|
|
| |
| |
| |
def _generate_examples(maker, leagues: dict, target: int) -> list:
    """Call ``maker(leagues)`` until ``target`` examples exist or 5x attempts are used."""
    made = []
    for _ in range(target * 5):
        if len(made) >= target:
            break
        example = maker(leagues)
        if example:
            made.append(example)
    return made


def _print_sample(title: str, example, limit: int) -> None:
    """Print ``title`` and each message of ``example`` truncated to ``limit`` chars."""
    print(title)
    if example is None:
        # Nothing of this kind was generated (e.g. too few teams in the CSV).
        return
    for msg in example["messages"]:
        print(f"[{msg['role']}]\n{msg['content'][:limit]}\n")


def build_dataset():
    """Generate the dataset, write the train/val JSONL files and print a summary.

    Reads teams from TEAMS_CSV, generates about 70% single-tip and 30%
    multi-tip examples (EXAMPLES_COUNT in total, fewer if generation keeps
    failing), shuffles them, writes the first ``1 - VAL_SPLIT`` share to
    OUTPUT_TRAIN and the rest to OUTPUT_VAL, then prints counts and one sample
    of each kind.
    """
    leagues = load_teams(TEAMS_CSV)

    n_single = int(EXAMPLES_COUNT * 0.70)
    n_multi = EXAMPLES_COUNT - n_single
    print(f"[1/2] Generating {n_single} single-tip + {n_multi} multi-tip examples...")

    # Keep the raw examples so the statistics below can read the label lists
    # directly instead of re-parsing the serialized assistant message.
    raw_examples = _generate_examples(make_single_example, leagues, n_single)
    raw_examples += _generate_examples(make_multi_example, leagues, n_multi)
    print(f" → {len(raw_examples)} total examples generated")

    print("[2/2] Writing dataset files...")
    random.shuffle(raw_examples)
    examples = [make_training_example(raw) for raw in raw_examples]
    split = int(len(examples) * (1 - VAL_SPLIT))
    train, val = examples[:split], examples[split:]

    for path, data in [(OUTPUT_TRAIN, train), (OUTPUT_VAL, val)]:
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in data)

    single = sum(1 for raw in raw_examples if len(raw["output"]) == 1)
    multi = len(raw_examples) - single
    # Counts tip objects (not messages) that have at least one null field.
    nulls = sum(
        1
        for raw in raw_examples
        for tip in raw["output"]
        if any(value is None for value in tip.values())
    )

    print("\n✅ Done!")
    print(f" {OUTPUT_TRAIN} → {len(train)} examples")
    print(f" {OUTPUT_VAL} → {len(val)} examples")
    print(f" Single-tip → {single}")
    print(f" Multi-tip → {multi}")
    print(f" With null fields→ {nulls}")

    paired = list(zip(raw_examples, examples))
    first_single = next((ex for raw, ex in paired if len(raw["output"]) == 1), None)
    first_multi = next((ex for raw, ex in paired if len(raw["output"]) > 1), None)
    _print_sample("\n── Single-tip sample ───────────────────────", first_single, 200)
    _print_sample("── Multi-tip sample ────────────────────────", first_multi, 300)
|
|
|
|
# Build the dataset only when this file is run as a script, not when imported.
if __name__ == "__main__":
    build_dataset()
|
|