File size: 4,722 Bytes
d64fd55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# src/ingestion/strava_tcx_parser.py
import gzip
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import List, Dict, Any, Optional

from pathlib import Path

# Prefix -> namespace URI map handed to ElementTree's find/findall calls so
# lookups can be written as "tcx:Activity" instead of "{uri}Activity".
NS = {"tcx": "http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2"}


def _open_tcx(path: str) -> bytes:
    path = str(path)
    if path.endswith(".gz"):
        with gzip.open(path, "rb") as f:
            return f.read()
    else:
        with open(path, "rb") as f:
            return f.read()


def parse_tcx_file(path: str, allow_nonrunning: bool = False) -> Optional[Dict[str, Any]]:
    """Parse one TCX file (plain or gzipped) into an activity summary dict.

    Only the first ``Activity`` element of the file is read. Individual
    trackpoint values that are missing or malformed are stored as ``None``
    instead of aborting the whole file.

    Args:
        path: Location of the ``.tcx`` / ``.tcx.gz`` file.
        allow_nonrunning: When False (default), activities whose ``Sport``
            attribute is not "running" or "trailrunning" (compared
            case-insensitively) are skipped.

    Returns:
        A dict with the keys ``id``, ``sport``, ``start_time``,
        ``total_distance_m``, ``total_duration_s``, ``records`` and
        ``source_path``, or ``None`` when the file is not well-formed XML,
        contains no activity, is filtered out by sport, or has no trackpoints.
    """
    raw = _open_tcx(path)
    try:
        # NOTE(review): some exporters (reportedly Strava's bulk export) emit
        # whitespace before the XML declaration, which the parser rejects;
        # stripping it is harmless for well-formed files.
        root = ET.fromstring(raw.lstrip())
    except ET.ParseError:
        print(f"⚠️ Skipping invalid TCX file: {path}")
        return None

    activity = root.find("tcx:Activities/tcx:Activity", NS)
    if activity is None:
        return None

    sport = activity.attrib.get("Sport", "Unknown")
    if not allow_nonrunning and sport.lower() not in ("running", "trailrunning"):
        # skip non-running
        return None

    # The activity's Id element carries its start timestamp.
    start_time = _tcx_time(activity.findtext("tcx:Id", None, NS))

    # iterfind walks Lap -> Track -> Trackpoint in document order.
    records = [
        _trackpoint_record(tp)
        for tp in activity.iterfind("tcx:Lap/tcx:Track/tcx:Trackpoint", NS)
    ]
    if not records:
        return None

    # Normalize id: Path.stem of "x.tcx.gz" is "x.tcx", so drop that suffix too.
    file_stem = Path(path).stem
    if file_stem.endswith(".tcx"):
        file_stem = file_stem[:-4]

    # Use the last trackpoint that actually reports a distance; trailing
    # points without DistanceMeters must not erase the total.
    total_distance = next(
        (r["distance_m"] for r in reversed(records) if r["distance_m"] is not None),
        None,
    )

    # Duration spans the first and last trackpoints that carry a timestamp.
    timestamps = [r["time"] for r in records if r["time"] is not None]
    total_duration = None
    if timestamps:
        try:
            total_duration = (timestamps[-1] - timestamps[0]).total_seconds()
        except TypeError:
            # Naive and timezone-aware timestamps cannot be subtracted.
            total_duration = None

    return {
        "id": file_stem,
        "sport": sport,
        "start_time": start_time,
        "total_distance_m": total_distance,
        "total_duration_s": total_duration,
        "records": records,
        "source_path": str(path),
    }


def _tcx_time(text: Optional[str]) -> Optional[datetime]:
    """Parse an ISO-8601 timestamp ("Z" suffix allowed); None if absent or invalid."""
    if text is None or not text.strip():
        return None
    try:
        return datetime.fromisoformat(text.strip().replace("Z", "+00:00"))
    except ValueError:
        return None


def _tcx_float(text: Optional[str]) -> Optional[float]:
    """Convert element text to float; None if absent or not a number."""
    if text is None or not text.strip():
        return None
    try:
        return float(text)
    except ValueError:
        return None


def _tcx_int(text: Optional[str]) -> Optional[int]:
    """Convert element text to int; None if absent or not an integer."""
    if text is None or not text.strip():
        return None
    try:
        return int(text)
    except ValueError:
        return None


def _trackpoint_record(tp: ET.Element) -> Dict[str, Any]:
    """Flatten one ``Trackpoint`` element into a record dict (missing values are None)."""
    return {
        "time": _tcx_time(tp.findtext("tcx:Time", None, NS)),
        "lat": _tcx_float(tp.findtext("tcx:Position/tcx:LatitudeDegrees", None, NS)),
        "lon": _tcx_float(tp.findtext("tcx:Position/tcx:LongitudeDegrees", None, NS)),
        "altitude_m": _tcx_float(tp.findtext("tcx:AltitudeMeters", None, NS)),
        "distance_m": _tcx_float(tp.findtext("tcx:DistanceMeters", None, NS)),
        "hr_bpm": _tcx_int(tp.findtext("tcx:HeartRateBpm/tcx:Value", None, NS)),
        "cadence_rpm": _tcx_int(tp.findtext("tcx:Cadence", None, NS)),
    }


def parse_strava_directory(folder: str, allow_nonrunning: bool = False) -> List[Dict[str, Any]]:
    """Parse every ``.tcx`` / ``.tcx.gz`` file directly inside *folder*.

    Args:
        folder: Directory to scan (not recursive).
        allow_nonrunning: Forwarded to ``parse_tcx_file``.

    Returns:
        The successfully parsed activities sorted by ``start_time``; runs
        without a start time sort first. An empty list is returned when
        *folder* does not exist or is not a directory.
    """
    # Local import so this function does not depend on the module's import list.
    from datetime import timezone

    root = Path(folder)
    if not root.is_dir():
        return []

    runs = []
    for entry in sorted(root.iterdir()):
        if not entry.name.endswith((".tcx", ".tcx.gz")):
            continue
        if not entry.is_file():
            # A directory that merely looks like a TCX file cannot be opened.
            continue
        parsed = parse_tcx_file(str(entry), allow_nonrunning=allow_nonrunning)
        if parsed:
            runs.append(parsed)

    # Naive and timezone-aware datetimes cannot be compared with each other,
    # so every key is made aware: missing start times use the earliest
    # representable instant and naive ones are treated as UTC.
    earliest = datetime.min.replace(tzinfo=timezone.utc)

    def _start_key(run: Dict[str, Any]) -> datetime:
        start = run.get("start_time")
        if start is None:
            return earliest
        if start.utcoffset() is None:
            return start.replace(tzinfo=timezone.utc)
        return start

    # sort by start_time
    runs.sort(key=_start_key)
    return runs