File size: 2,755 Bytes
16c19b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from __future__ import annotations

import json
from pathlib import Path

import pandas as pd

from ingest.sofascore.compact_stats import compact_match_stats_json
from ingest.sofascore.paths import MATCH_STATS_PARQUET


def _write_json(path: Path, payload: dict) -> None:
    """Dump *payload* as JSON text into *path*.

    The file is written as UTF-8 and non-ASCII characters are kept literal
    (``ensure_ascii=False``) instead of being escaped.
    """
    serialized = json.dumps(payload, ensure_ascii=False)
    path.write_text(serialized, encoding="utf-8")


def test_compact_match_stats_json_dedup_by_event_id(tmp_path: Path):
    """Three JSON files covering two distinct event ids compact to two rows.

    Event 1 appears in two files; the row read back for it must carry
    ``home_xg == 2.2``, the value from the second file written (which is
    also the one with the later ``fetched_at``).
    """
    stats_dir = tmp_path / "sofascore"
    stats_dir.mkdir()

    # Fields shared by both snapshots of event 1.
    brasil_argentina = {
        "event_id": 1,
        "home_team": "Brasil",
        "away_team": "Argentina",
        "match_date": "2022-11-20",
    }
    # Insertion order is the order in which the files are written.
    fixtures = {
        "1_Brasil_x_Argentina_stats.json": {
            **brasil_argentina,
            "home_xg": 1.1,
            "fetched_at": "2026-01-01T00:00:00+00:00",
        },
        "1_Brasil_x_Argentina_v2_stats.json": {
            **brasil_argentina,
            "home_xg": 2.2,
            "fetched_at": "2026-06-01T00:00:00+00:00",
        },
        "2_Portugal_x_Suíça_stats.json": {
            "event_id": 2,
            "home_team": "Portugal",
            "away_team": "Suíça",
            "match_date": "2022-12-06",
            "home_xg": 2.36,
            "fetched_at": "2026-06-01T00:00:00+00:00",
        },
    }
    for filename, payload in fixtures.items():
        _write_json(stats_dir / filename, payload)

    report = compact_match_stats_json(stats_dir=stats_dir, merge_existing=False)

    assert report.json_files == 3
    assert report.rows_written == 2
    compacted = pd.read_parquet(stats_dir / MATCH_STATS_PARQUET)
    assert len(compacted) == 2
    is_event_one = compacted["event_id"] == 1
    kept_row = compacted[is_event_one].iloc[0]
    assert float(kept_row["home_xg"]) == 2.2


def test_compact_merges_existing_parquet(tmp_path: Path):
    """With ``merge_existing=True`` a pre-existing parquet row is kept.

    The parquet file starts with event 99; after compacting one JSON file for
    event 1, the report must count two rows and the file must hold both ids.
    """
    stats_dir = tmp_path / "sofascore"
    stats_dir.mkdir()
    parquet_path = stats_dir / MATCH_STATS_PARQUET

    # Seed the parquet file with a row that has no matching JSON file.
    prior_row = {
        "event_id": 99,
        "home_team": "França",
        "away_team": "Alemanha",
        "match_date": "2018-07-15",
        "home_xg": 1.0,
    }
    pd.DataFrame([prior_row]).to_parquet(parquet_path, index=False)

    new_match = {
        "event_id": 1,
        "home_team": "Brasil",
        "away_team": "Argentina",
        "match_date": "2022-11-20",
        "home_xg": 1.5,
    }
    _write_json(stats_dir / "1_Brasil_x_Argentina_stats.json", new_match)

    report = compact_match_stats_json(stats_dir=stats_dir, merge_existing=True)

    assert report.rows_written == 2
    merged_ids = pd.read_parquet(parquet_path)["event_id"].tolist()
    assert set(merged_ids) == {1, 99}