# src/ingestion/strava_tcx_parser.py
"""Parse TCX activity files (optionally gzip-compressed) into plain dicts."""
import gzip
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from typing import List, Dict, Any, Optional
from pathlib import Path

NS = {
    "tcx": "http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2",
}


def _open_tcx(path: str) -> bytes:
    """Return the raw bytes of *path*, decompressing it when it ends in ``.gz``."""
    path = str(path)
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path, "rb") as f:
        return f.read()


def _child_text(parent: Optional[ET.Element], tag: str) -> Optional[str]:
    """Return the stripped text of child *tag* under *parent*, or None if absent/empty."""
    if parent is None:
        return None
    el = parent.find(tag, NS)
    if el is None or not el.text:
        return None
    return el.text.strip() or None


def _parse_time(text: Optional[str]) -> Optional[datetime]:
    """Parse an ISO-8601 timestamp (a trailing ``Z`` is read as UTC); None if invalid."""
    if not text:
        return None
    try:
        return datetime.fromisoformat(text.replace("Z", "+00:00"))
    except ValueError:
        return None


def _parse_float(text: Optional[str]) -> Optional[float]:
    """Convert *text* to float, returning None when it is missing or malformed."""
    if not text:
        return None
    try:
        return float(text)
    except ValueError:
        return None


def _parse_int(text: Optional[str]) -> Optional[int]:
    """Convert *text* to int, returning None when it is missing or malformed."""
    if not text:
        return None
    try:
        return int(text)
    except ValueError:
        return None


def _parse_trackpoint(tp: ET.Element) -> Dict[str, Any]:
    """Build one record dict from a ``Trackpoint`` element.

    Every field is optional: a missing or unparseable value becomes None
    instead of aborting the whole file.
    """
    pos = tp.find("tcx:Position", NS)
    hr_el = tp.find("tcx:HeartRateBpm", NS)
    return {
        "time": _parse_time(_child_text(tp, "tcx:Time")),
        "lat": _parse_float(_child_text(pos, "tcx:LatitudeDegrees")),
        "lon": _parse_float(_child_text(pos, "tcx:LongitudeDegrees")),
        "altitude_m": _parse_float(_child_text(tp, "tcx:AltitudeMeters")),
        "distance_m": _parse_float(_child_text(tp, "tcx:DistanceMeters")),
        # The heart rate is wrapped in a nested <Value> element.
        "hr_bpm": _parse_int(_child_text(hr_el, "tcx:Value")),
        "cadence_rpm": _parse_int(_child_text(tp, "tcx:Cadence")),
    }


def _start_time_sort_key(run: Dict[str, Any]):
    """Sort key that orders runs by start time without mixing naive/aware datetimes.

    Runs without a start time sort first. Naive datetimes are treated as UTC
    here purely for ordering (assumption; the stored value is not modified).
    """
    start = run.get("start_time")
    if start is None:
        return (0, 0.0)
    if start.tzinfo is None:
        start = start.replace(tzinfo=timezone.utc)
    return (1, start.timestamp())


def parse_tcx_file(path: str, allow_nonrunning: bool = False) -> Optional[Dict[str, Any]]:
    """Parse the first activity of a TCX file into a summary dict.

    Args:
        path: Path to a ``.tcx`` or ``.tcx.gz`` file.
        allow_nonrunning: When False, activities whose ``Sport`` attribute is
            not ``running``/``trailrunning`` (case-insensitive) are skipped.

    Returns:
        A dict with keys ``id``, ``sport``, ``start_time``, ``total_distance_m``,
        ``total_duration_s``, ``records`` and ``source_path``; or None when the
        XML is invalid, no activity is present, the sport is filtered out, or
        the activity has no trackpoints.
    """
    raw = _open_tcx(path)
    try:
        # Leading whitespace before the XML declaration makes the parser reject
        # an otherwise valid document, so strip it first.
        root = ET.fromstring(raw.lstrip())
    except ET.ParseError:
        print(f"⚠️ Skipping invalid TCX file: {path}")
        return None

    activities = root.find("tcx:Activities", NS)
    if activities is None:
        return None

    activity = activities.find("tcx:Activity", NS)
    if activity is None:
        return None

    sport = activity.attrib.get("Sport", "Unknown")
    if not allow_nonrunning and sport.lower() not in ("running", "trailrunning"):
        # skip non-running
        return None

    start_time = _parse_time(_child_text(activity, "tcx:Id"))

    # Walks Lap -> Track -> Trackpoint in document order.
    records = [
        _parse_trackpoint(tp)
        for tp in activity.iterfind("tcx:Lap/tcx:Track/tcx:Trackpoint", NS)
    ]
    if not records:
        return None

    # Normalize id: "name.tcx.gz" has stem "name.tcx", so drop that suffix too.
    file_stem = Path(path).stem
    if file_stem.endswith(".tcx"):
        file_stem = file_stem[:-4]

    # Use the last trackpoint that actually carries a distance, so a trailing
    # point without one does not erase the total.
    total_distance = next(
        (r["distance_m"] for r in reversed(records) if r["distance_m"] is not None),
        None,
    )

    # Same idea for duration: span between first and last timestamped points.
    times = [r["time"] for r in records if r["time"] is not None]
    total_duration = None
    if times:
        try:
            total_duration = (times[-1] - times[0]).total_seconds()
        except TypeError:
            # Naive and aware timestamps cannot be subtracted from each other.
            total_duration = None

    return {
        "id": file_stem,
        "sport": sport,
        "start_time": start_time,
        "total_distance_m": total_distance,
        "total_duration_s": total_duration,
        "records": records,
        "source_path": str(path),
    }


def parse_strava_directory(folder: str, allow_nonrunning: bool = False) -> List[Dict[str, Any]]:
    """Parse every ``.tcx`` / ``.tcx.gz`` file directly inside *folder*.

    Args:
        folder: Directory to scan (not recursive).
        allow_nonrunning: Forwarded to :func:`parse_tcx_file`.

    Returns:
        The successfully parsed activities sorted by ``start_time``; activities
        without a start time come first. An empty list if *folder* is not an
        existing directory.
    """
    runs: List[Dict[str, Any]] = []
    folder = Path(folder)
    if not folder.is_dir():
        return runs

    for file in sorted(folder.iterdir()):
        if not str(file).endswith((".tcx", ".tcx.gz")):
            continue
        if not file.is_file():
            continue
        parsed = parse_tcx_file(str(file), allow_nonrunning=allow_nonrunning)
        if parsed:
            runs.append(parsed)

    # sort by start_time
    runs.sort(key=_start_time_sort_key)
    return runs