# File size: 5,209 Bytes | 6f03249
import json
import unicodedata
from llama_cpp import Llama
# ─────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────
# Path handed to Llama(model_path=...) when the model is loaded.
GGUF_PATH = "./football-extractor-q4.gguf"

# System message sent with every chat completion. It pins the output to a
# JSON array of objects with a fixed set of keys.
SYSTEM_PROMPT = " ".join(
    (
        "You are a football data extraction assistant.",
        "Extract structured data from the message and return ONLY a valid JSON array.",
        "Each object in the array must have exactly these keys:",
        "league, team_1, team_2, prediction, date, odds.",
        "If a field is missing, use null. No extra text, no markdown.",
    )
)
# ─────────────────────────────────────────────
# LOAD MODEL (runs on Mac Metal / CPU)
# ─────────────────────────────────────────────
# Load the model once at import time; extract() reads this module-level
# instance on every call.
llm = Llama(
    model_path=GGUF_PATH,
    n_ctx=2048,  # context window
    n_gpu_layers=-1,  # offload all layers to Metal GPU
    verbose=False,
)
# The status message must be a single-line literal: the mis-encoded check
# mark had split it across two lines, which is a syntax error.
# NOTE(review): glyph reconstructed as U+2705 from the garbled bytes - confirm.
print("✅ Model loaded")
# ─────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────
def clean_input(text: str) -> str:
    """Fold styled Unicode text (e.g. Telegram bold) down to plain characters.

    The text is NFKD-normalized, which replaces compatibility characters such
    as mathematical bold letters with their plain counterparts, and every
    combining mark left in the result is dropped. As a consequence accented
    letters lose their diacritics too (``é`` becomes ``e``).

    Args:
        text: Raw message text.

    Returns:
        The normalized text with all combining characters removed.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # combining() returns 0 for every character that is not a combining mark.
    kept = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return ''.join(kept)
def fix_keys(results: list) -> list:
    """Replace a combined 'match' key with team_1 / team_2, in place.

    An item is rewritten only when it has a ``match`` key and no ``team_1``
    key; all other items are left untouched. The match text is split on the
    first ``" - "``; when that separator is absent, a whitespace-delimited
    ``vs`` / ``vs.`` (any letter case) is tried instead. A side that is
    missing or empty after stripping becomes ``None``.

    A ``match`` value that is not a string (for example JSON ``null``) is
    removed and ``team_1`` is set to ``None`` instead of raising; an existing
    ``team_2`` is kept in that case.

    Args:
        results: Items to fix. Dicts are mutated in place; anything that is
            not a dict is skipped.

    Returns:
        The same ``results`` list.
    """
    import re  # local import keeps this block independent of the file header

    for item in results:
        if not isinstance(item, dict):
            continue
        if "match" not in item or "team_1" in item:
            continue

        match = item.pop("match")
        if not isinstance(match, str):
            # Nothing to split (e.g. the model emitted null for the match).
            item["team_1"] = None
            item.setdefault("team_2", None)
            continue

        if " - " in match:
            parts = match.split(" - ", 1)
        else:
            parts = re.split(r"\s+vs\.?\s+", match, maxsplit=1, flags=re.IGNORECASE)

        item["team_1"] = parts[0].strip() or None
        item["team_2"] = (parts[1].strip() or None) if len(parts) > 1 else None
    return results
def normalize(result: list) -> list:
    """Coerce a decoded model response into a list of record dicts.

    Handled shapes:

    * a list of dicts - returned as a new list of the same dicts;
    * a list of lists - each inner list is zipped positionally with the
      expected keys;
    * a list containing JSON-encoded strings - strings that decode to an
      object or array are decoded first and then treated as above;
    * a flat list with no dict or list in it (after decoding) - treated as
      the values of one single record, zipped with the expected keys.

    In a list that contains at least one record, items that are neither a
    dict nor a list (including strings that are not JSON objects/arrays)
    are dropped.

    Args:
        result: The decoded JSON list.

    Returns:
        A list of dicts; empty when ``result`` is empty.
    """
    keys = ("league", "team_1", "team_2", "prediction", "date", "odds")

    # Decode JSON-encoded records up front so that the flat-row check below
    # looks at what the items really are, not only at the first raw element.
    decoded = []
    for item in result:
        if isinstance(item, str):
            try:
                parsed = json.loads(item)
            except ValueError:  # json.JSONDecodeError is a ValueError
                parsed = None
            if isinstance(parsed, (dict, list)):
                item = parsed
        decoded.append(item)

    # No record-like item at all: the list itself is one row of values.
    if decoded and not any(isinstance(item, (dict, list)) for item in decoded):
        return [dict(zip(keys, decoded))]

    normalized = []
    for item in decoded:
        if isinstance(item, list):
            item = dict(zip(keys, item))
        if isinstance(item, dict):
            normalized.append(item)
    return normalized
# ─────────────────────────────────────────────
# INFERENCE
# ─────────────────────────────────────────────
def _parse_model_json(raw: str):
    """Decode *raw* as JSON, retrying on its outermost bracketed span."""
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # The model may wrap the payload in markdown fences or prose despite
        # the prompt; retry on the text between the first opening and the
        # last closing bracket.
        openers = [pos for pos in (raw.find("["), raw.find("{")) if pos != -1]
        closer = max(raw.rfind("]"), raw.rfind("}"))
        if not openers or closer < min(openers):
            raise
        return json.loads(raw[min(openers):closer + 1])


def extract(text: str, debug: bool = False) -> list:
    """Run the model on one message and return the extracted records.

    The message is passed through ``clean_input``, sent to the module-level
    ``llm`` together with ``SYSTEM_PROMPT``, and the reply is decoded as
    JSON. A single decoded value is wrapped in a list before it goes through
    ``normalize`` and ``fix_keys``.

    Args:
        text: Raw message text.
        debug: When true, print the ``repr`` of the raw model reply.

    Returns:
        The list produced by ``fix_keys``; an empty list when the reply
        cannot be decoded as JSON (the raw reply is printed in that case).
    """
    text = clean_input(text)
    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": text},
        ],
        temperature=0.0,
        max_tokens=512,
        stop=["<|im_end|>", "<|endoftext|>"],
    )
    # Guard against a missing/None content field before stripping.
    raw = (response["choices"][0]["message"]["content"] or "").strip()
    if debug:
        print(f"[raw] {repr(raw)}")

    # Only the decode step is expected to fail with JSONDecodeError, so it is
    # the only statement inside the try.
    try:
        result = _parse_model_json(raw)
    except json.JSONDecodeError:
        print(f"[!] Could not parse JSON:\n{raw}")
        return []

    result = normalize(result if isinstance(result, list) else [result])
    return fix_keys(result)
# ─────────────────────────────────────────────
# TEST
# ─────────────────────────────────────────────
if __name__ == "__main__":
    # Manual smoke test: run extract() on a few sample messages and print
    # the resulting records as JSON.
    #
    # NOTE(review): the emoji and bold letters in these fixtures were
    # mis-encoded and have been reconstructed from the garbled bytes (soccer
    # ball + variation selector, check mark, mathematical bold and sans-serif
    # bold letters). Spacing after the check marks and any blank lines
    # between tips could not be recovered - confirm against real messages.
    tests = [
        # single tip
        """⚽️ Prediction of the Day ⚽️
Date: 24/03/2026
League: Eerste divisie Netherlands
Match: FC Emmen - SC Cambuur
Kick off: 20:00 WAT
✅ Over 1.5
✅ Odds @1.13 on BETANO""",
        # multi tip real format (bold Unicode labels exercise clean_input)
        """⚽️ 𝐏𝐫𝐞𝐝𝐢𝐜𝐭𝐢𝐨𝐧 𝐨𝐟 𝐭𝐡𝐞 𝐃𝐚𝐲 ⚽️
𝐃𝐚𝐭𝐞: 24/03/2026
𝐋𝐞𝐚𝐠𝐮𝐞: League 1 England
𝐌𝐚𝐭𝐜𝐡: Doncaster Rovers - Port Vale
𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: 20:45 WAT
✅ Under 3.5
✅ Odds @1.36 on BETANO
⚽️ 𝗙𝗼𝗼𝘁𝗯𝗮𝗹𝗹 𝗧𝗶𝗽 𝟮 ⚽️
𝐃𝐚𝐭𝐞: 24/03/2026
𝐋𝐞𝐚𝐠𝐮𝐞: La Liga
𝐌𝐚𝐭𝐜𝐡: Real Madrid - Barcelona
𝐊𝐢𝐜𝐤 𝐨𝐟𝐟: 21:00 WAT
✅ 1X
✅ Odds @1.42 on BETANO""",
        # noisy missing date
        """wow predictions
MATCH: Juventus VS Napoli
League: Serie A
we forecast Over 2.5
Odds 1.75""",
    ]
    for i, test in enumerate(tests, 1):
        print(f"\n{'='*50}")
        # Show only the first 80 characters of the message as a preview.
        print(f"TEST {i}: {test[:80]}...")
        result = extract(test)
        print(json.dumps(result, indent=2, ensure_ascii=False))
|