Spaces:
Running
Running
File size: 4,722 Bytes
# src/ingestion/strava_tcx_parser.py
import gzip
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import List, Dict, Any, Optional
from pathlib import Path
# Prefix -> URI map handed to every ElementTree find()/findall() call in this
# module so element paths can be written as "tcx:<Tag>".
NS = {"tcx": "http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2"}
def _open_tcx(path: str) -> bytes:
    """Return the full contents of *path* as bytes.

    Files whose name ends in ``.gz`` are read through ``gzip.open`` (and so
    come back decompressed); everything else is read verbatim in binary mode.
    """
    filename = str(path)
    # Both callables accept (name, "rb") and yield a context-managed reader.
    opener = gzip.open if filename.endswith(".gz") else open
    with opener(filename, "rb") as handle:
        return handle.read()
def _tcx_time(element: Optional[ET.Element]) -> Optional[datetime]:
    """Parse an element's ISO-8601 text into a datetime; None if absent or malformed."""
    if element is None or not element.text:
        return None
    text = element.text.strip()
    if not text:
        return None
    try:
        # fromisoformat() on older Pythons rejects a trailing "Z", so spell
        # the UTC offset out explicitly.
        return datetime.fromisoformat(text.replace("Z", "+00:00"))
    except ValueError:
        return None


def _tcx_number(element: Optional[ET.Element], cast):
    """Convert an element's text with *cast* (``int`` or ``float``); None if absent or malformed."""
    if element is None or not element.text:
        return None
    try:
        return cast(element.text.strip())
    except ValueError:
        return None


def _parse_trackpoint(tp: ET.Element) -> Dict[str, Any]:
    """Flatten one ``<Trackpoint>`` element into a record dict (missing fields become None)."""
    lat = lon = None
    position = tp.find("tcx:Position", NS)
    if position is not None:
        lat = _tcx_number(position.find("tcx:LatitudeDegrees", NS), float)
        lon = _tcx_number(position.find("tcx:LongitudeDegrees", NS), float)

    hr = None
    hr_el = tp.find("tcx:HeartRateBpm", NS)
    if hr_el is not None:
        # The heart-rate number lives in a nested <Value> child.
        hr = _tcx_number(hr_el.find("tcx:Value", NS), int)

    return {
        "time": _tcx_time(tp.find("tcx:Time", NS)),
        "lat": lat,
        "lon": lon,
        "altitude_m": _tcx_number(tp.find("tcx:AltitudeMeters", NS), float),
        "distance_m": _tcx_number(tp.find("tcx:DistanceMeters", NS), float),
        "hr_bpm": hr,
        "cadence_rpm": _tcx_number(tp.find("tcx:Cadence", NS), int),
    }


def parse_tcx_file(path: str, allow_nonrunning: bool = False) -> Optional[Dict[str, Any]]:
    """Parse a single TCX (optionally gzipped) file into an activity dict.

    Only the first ``<Activity>`` in the file is read.

    Args:
        path: Location of the ``.tcx`` / ``.tcx.gz`` file.
        allow_nonrunning: When False (default), activities whose ``Sport``
            attribute is not "running"/"trailrunning" (case-insensitive) are
            skipped.

    Returns:
        A dict with keys ``id``, ``sport``, ``start_time``,
        ``total_distance_m``, ``total_duration_s``, ``records`` and
        ``source_path``; or None when the XML is invalid, no activity is
        present, the sport is filtered out, or there are no trackpoints.
        Individual values that are missing or cannot be parsed are None.

    Raises:
        Whatever ``_open_tcx`` raises while reading the file (e.g. OSError).
    """
    # Leading whitespace before the XML declaration makes expat reject an
    # otherwise valid document (Strava bulk exports are commonly reported to
    # contain it), so drop it before parsing.
    raw = _open_tcx(path).lstrip()
    try:
        root = ET.fromstring(raw)
    except ET.ParseError:
        print(f"⚠️ Skipping invalid TCX file: {path}")
        return None

    activities = root.find("tcx:Activities", NS)
    activity = activities.find("tcx:Activity", NS) if activities is not None else None
    if activity is None:
        return None

    sport = activity.attrib.get("Sport", "Unknown")
    if not allow_nonrunning and sport.lower() not in ("running", "trailrunning"):
        # skip non-running
        return None

    # The activity <Id> element carries the start timestamp.
    start_time = _tcx_time(activity.find("tcx:Id", NS))

    # Document-order walk over Lap -> Track -> Trackpoint.
    records = [
        _parse_trackpoint(tp)
        for tp in activity.findall("tcx:Lap/tcx:Track/tcx:Trackpoint", NS)
    ]
    if not records:
        return None

    # Normalize id: Path.stem only removes ".gz" from "name.tcx.gz".
    file_stem = Path(path).stem
    if file_stem.endswith(".tcx"):
        file_stem = file_stem[:-4]

    # Use the last trackpoint that actually reports a distance, so trailing
    # points without <DistanceMeters> do not blank out the total.
    total_distance = next(
        (r["distance_m"] for r in reversed(records) if r["distance_m"] is not None),
        None,
    )

    # Likewise measure duration between the first and last timestamped points.
    timestamps = [r["time"] for r in records if r["time"] is not None]
    total_duration = None
    if timestamps:
        try:
            total_duration = (timestamps[-1] - timestamps[0]).total_seconds()
        except TypeError:
            # Mixed naive/aware timestamps cannot be subtracted.
            total_duration = None

    return {
        "id": file_stem,
        "sport": sport,
        "start_time": start_time,
        "total_distance_m": total_distance,
        "total_duration_s": total_duration,
        "records": records,
        "source_path": str(path),
    }
def parse_strava_directory(folder: str, allow_nonrunning: bool = False) -> List[Dict[str, Any]]:
    """Parse every ``.tcx`` / ``.tcx.gz`` file directly inside *folder*.

    Args:
        folder: Directory to scan (not recursive).
        allow_nonrunning: Forwarded to ``parse_tcx_file``.

    Returns:
        The successfully parsed activities ordered by ``start_time``;
        activities without a start time sort first. An empty list is returned
        when *folder* does not exist or is not a directory.
    """
    runs: List[Dict[str, Any]] = []
    folder = Path(folder)
    if not folder.is_dir():
        return runs

    for file in sorted(folder.iterdir()):
        if not file.name.endswith((".tcx", ".tcx.gz")):
            continue
        if not file.is_file():
            # A sub-directory that happens to be named "*.tcx" is not an activity.
            continue
        parsed = parse_tcx_file(str(file), allow_nonrunning=allow_nonrunning)
        if parsed:
            runs.append(parsed)

    def _start_key(run: Dict[str, Any]) -> datetime:
        # Reduce every start time to a naive UTC datetime: comparing the naive
        # datetime.min fallback against timezone-aware values raises TypeError.
        start = run.get("start_time")
        if start is None:
            return datetime.min
        offset = start.utcoffset()
        if offset is None:
            return start
        return start.replace(tzinfo=None) - offset

    # sort by start_time
    runs.sort(key=_start_key)
    return runs
|