File size: 5,209 Bytes
6f03249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import json
import unicodedata
from llama_cpp import Llama

# ─────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────
# Path of the GGUF model file handed to Llama(model_path=...).
GGUF_PATH = "./football-extractor-q4.gguf"

# System message sent with every chat completion. It pins the output to a
# bare JSON array whose objects carry exactly six keys.
SYSTEM_PROMPT = " ".join(
    [
        "You are a football data extraction assistant.",
        "Extract structured data from the message and return ONLY a valid JSON array.",
        "Each object in the array must have exactly these keys:",
        "league, team_1, team_2, prediction, date, odds.",
        "If a field is missing, use null. No extra text, no markdown.",
    ]
)

# ─────────────────────────────────────────────
# LOAD MODEL (runs on Mac Metal / CPU)
# ─────────────────────────────────────────────
# The model is loaded once, at import time, and shared by every extract() call.
#   n_ctx=2048       -> context window
#   n_gpu_layers=-1  -> offload all layers to Metal GPU
llm = Llama(model_path=GGUF_PATH, n_ctx=2048, n_gpu_layers=-1, verbose=False)
print("โœ… Model loaded")

# ─────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────
def clean_input(text: str) -> str:
    """Reduce styled Unicode text to its plain base characters.

    The text is decomposed with NFKD, which maps compatibility variants such
    as the mathematical-bold letters used for "bold" Telegram text back to
    ordinary letters. Every combining mark is then dropped, so accents are
    removed as well (e.g. "é" becomes "e").
    """
    decomposed = unicodedata.normalize("NFKD", text)
    # combining() returns 0 for base characters and non-zero for marks.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)

def fix_keys(results: list) -> list:
    """Replace a combined 'match' key with separate team_1 / team_2 keys.

    Only items that have a 'match' key and no 'team_1' key are touched; the
    'match' key is removed from them. Items are modified in place.

    The match text is split once on the first separator found: a dash
    (hyphen, en dash or em dash) or "vs" / "vs." in any letter case, in each
    case surrounded by whitespace. A missing or empty side becomes None, and
    a non-string 'match' value (for example null) yields None for both teams
    instead of raising.

    Args:
        results: Records produced by the model; non-dict entries are skipped.

    Returns:
        The same list object that was passed in.
    """
    import re  # local import so the function does not rely on file-level imports

    separator = re.compile(r"\s+(?:[-\u2013\u2014]|vs\.?)\s+", re.IGNORECASE)

    for item in results:
        if not isinstance(item, dict):
            continue
        if "match" not in item or "team_1" in item:
            continue

        match = item.pop("match")
        # The prompt allows null for missing fields, so the value may not be text.
        parts = separator.split(match, maxsplit=1) if isinstance(match, str) else []
        names = [part.strip() or None for part in parts]

        item["team_1"] = names[0] if names else None
        item["team_2"] = names[1] if len(names) > 1 else None
    return results

def normalize(result: list) -> list:
    """Coerce the decoded model output into a list of record dicts.

    Accepted shapes:
      * a flat list of scalar values -> a single record, values assigned to
        the expected keys in order;
      * a list whose items are dicts (kept as they are), lists (zipped with
        the expected keys) or JSON-encoded strings (decoded first, then
        handled like the other two).

    Items that cannot be turned into a dict are dropped.

    Args:
        result: The list obtained from the model's JSON output.

    Returns:
        A new list containing only dict records.
    """
    keys = ["league", "team_1", "team_2", "prediction", "date", "odds"]

    def _decode(item):
        """Return *item*, decoding it from JSON when it is a string (None on failure)."""
        if isinstance(item, str):
            try:
                return json.loads(item)
            except ValueError:  # json.JSONDecodeError is a ValueError subclass
                return None
        return item

    if not result:
        return []

    # The first element decides the shape. It is decoded before the check so
    # that a list of JSON-encoded records is not mistaken for a flat record.
    if not isinstance(_decode(result[0]), (dict, list)):
        return [dict(zip(keys, result))]

    normalized = []
    for item in result:
        item = _decode(item)
        if isinstance(item, list):
            item = dict(zip(keys, item))
        if isinstance(item, dict):
            normalized.append(item)
    return normalized

# ─────────────────────────────────────────────
# INFERENCE
# ─────────────────────────────────────────────
def _parse_json_payload(raw: str):
    """Decode the JSON value in *raw*, tolerating text wrapped around it.

    A direct parse is tried first. If that fails, decoding is retried from
    the first '[' or '{' so that markdown fences or prose around the payload
    do not discard it.

    Raises:
        json.JSONDecodeError: The error from the direct parse, when no JSON
            value can be recovered.
    """
    try:
        return json.loads(raw)
    except json.JSONDecodeError as err:
        first_error = err

    starts = [pos for pos in (raw.find("["), raw.find("{")) if pos >= 0]
    if starts:
        try:
            # raw_decode stops at the end of the first complete value and
            # ignores whatever follows it.
            value, _end = json.JSONDecoder().raw_decode(raw, min(starts))
        except json.JSONDecodeError:
            pass  # fall through and report the original error
        else:
            return value
    raise first_error


def extract(text: str, debug: bool = False) -> list:
    """Run the model on *text* and return the extracted records.

    The text is cleaned with clean_input(), sent to the model together with
    SYSTEM_PROMPT, and the reply is decoded as JSON. A single JSON object is
    wrapped in a list; the result is then passed through normalize() and
    fix_keys().

    Args:
        text: The raw message to extract data from.
        debug: When true, print the model's raw reply before parsing it.

    Returns:
        A list of record dicts, or an empty list when the reply contains no
        decodable JSON (a diagnostic is printed in that case).
    """
    text = clean_input(text)

    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user",   "content": text},
        ],
        temperature=0.0,
        max_tokens=512,
        stop=["<|im_end|>", "<|endoftext|>"],
    )

    # Guard against a None content field so .strip() cannot raise.
    raw = (response["choices"][0]["message"]["content"] or "").strip()

    if debug:
        print(f"[raw] {repr(raw)}")

    try:
        parsed = _parse_json_payload(raw)
    except json.JSONDecodeError:
        print(f"[!] Could not parse JSON:\n{raw}")
        return []

    records = normalize(parsed if isinstance(parsed, list) else [parsed])
    return fix_keys(records)

# ─────────────────────────────────────────────
# TEST
# ─────────────────────────────────────────────
if __name__ == "__main__":
    # Manual smoke test: run three sample messages through extract() and
    # pretty-print whatever comes back.
    separator = "=" * 50

    samples = [
        # single tip
        """โšฝ๏ธ Prediction of the Day โšฝ๏ธ
Date: 24/03/2026
League: Eerste divisie Netherlands
Match: FC Emmen - SC Cambuur
Kick off: 20:00 WAT
โœ…Over 1.5
โœ…Odds @1.13 on BETANO""",

        # multi tip real format
        """โšฝ๏ธ ๐๐ซ๐ž๐๐ข๐œ๐ญ๐ข๐จ๐ง ๐จ๐Ÿ ๐ญ๐ก๐ž ๐ƒ๐š๐ฒ โšฝ๏ธ
๐ƒ๐š๐ญ๐ž: 24/03/2026
๐‹๐ž๐š๐ ๐ฎ๐ž: League 1 England
๐Œ๐š๐ญ๐œ๐ก: Doncaster Rovers - Port Vale
๐Š๐ข๐œ๐ค ๐จ๐Ÿ๐Ÿ: 20:45 WAT
โœ…Under 3.5
โœ…Odds @1.36 on BETANO
โšฝ๏ธ ๐—™๐—ผ๐—ผ๐˜๐—ฏ๐—ฎ๐—น๐—น ๐—ง๐—ถ๐—ฝ ๐Ÿฎ โšฝ๏ธ
๐ƒ๐š๐ญ๐ž: 24/03/2026
๐‹๐ž๐š๐ ๐ฎ๐ž: La Liga
๐Œ๐š๐ญ๐œ๐ก: Real Madrid - Barcelona
๐Š๐ข๐œ๐ค ๐จ๐Ÿ๐Ÿ: 21:00 WAT
โœ…1X
โœ…Odds @1.42 on BETANO""",

        # noisy missing date
        """wow predictions
MATCH: Juventus VS Napoli
League: Serie A
we forecast Over 2.5
Odds 1.75""",
    ]

    for number, message in enumerate(samples, start=1):
        print("\n" + separator)
        # Only the first 80 characters of the message are echoed.
        print(f"TEST {number}: {message[:80]}...")
        extracted = extract(message)
        print(json.dumps(extracted, indent=2, ensure_ascii=False))