Spaces:

DavMelchi
/

db_query

Running

App Files Community

DavMelchi committed on May 4

Commit

cc3d5fb

1 Parent(s): b75a50f

Parse structured KML descriptions into TAB fields

Browse files

Files changed (2) hide show

tests/test_kml_to_tab.py +58 -0
utils/kml_to_tab.py +114 -12

tests/test_kml_to_tab.py CHANGED Viewed

@@ -80,6 +80,29 @@ RSRP_KML = b"""<?xml version="1.0" encoding="UTF-8"?>
 </kml>
 """
 def test_convert_kml_to_tab_zip_splits_mixed_geometries():
     result = convert_kml_to_tab_zip(MIXED_KML, "mixed sample.kml")
@@ -123,6 +146,41 @@ def test_convert_kml_to_tab_zip_preserves_measurement_values():
     assert data["value"].tolist() == [-105.599998474121, -116.099998474121]
 def test_convert_kml_to_tab_zip_rejects_empty_input():
     with pytest.raises(KmlToTabError, match="vide"):
         convert_kml_to_tab_zip(b"", "empty.kml")

 </kml>
 """
+STRUCTURED_DESCRIPTION_KML = b"""<?xml version="1.0" encoding="UTF-8"?>
+<kml xmlns="http://www.opengis.net/kml/2.2">
+  <Document>
+    <Placemark>
+      <name>Busaiteen_1</name>
+      <description>&lt;b&gt;Sector Details:&lt;/b&gt;&lt;br&gt;&lt;b&gt;code:&lt;/b&gt; 1004&lt;br&gt;&lt;b&gt;name:&lt;/b&gt; Busaiteen_1&lt;br&gt;&lt;b&gt;Azimut:&lt;/b&gt; 30&lt;br&gt;&lt;b&gt;Longitude:&lt;/b&gt; 50.606051&lt;br&gt;&lt;b&gt;Latitude:&lt;/b&gt; 26.261322&lt;br&gt;&lt;b&gt;size:&lt;/b&gt; 1000&lt;br&gt;&lt;b&gt;color:&lt;/b&gt; 7fff0000&lt;br&gt;</description>
+      <Polygon>
+        <outerBoundaryIs>
+          <LinearRing>
+            <coordinates>
+              50.606051,26.261322,0
+              50.607051,26.261322,0
+              50.607051,26.262322,0
+              50.606051,26.261322,0
+            </coordinates>
+          </LinearRing>
+        </outerBoundaryIs>
+      </Polygon>
+    </Placemark>
+  </Document>
+</kml>
+"""
 def test_convert_kml_to_tab_zip_splits_mixed_geometries():
     result = convert_kml_to_tab_zip(MIXED_KML, "mixed sample.kml")
     assert data["value"].tolist() == [-105.599998474121, -116.099998474121]
+def test_convert_kml_to_tab_zip_expands_structured_description_fields():
+    result = convert_kml_to_tab_zip(STRUCTURED_DESCRIPTION_KML, "sectors.kml")
+    with tempfile.TemporaryDirectory() as tmp_dir_name:
+        tmp_dir = Path(tmp_dir_name)
+        with zipfile.ZipFile(BytesIO(result.zip_bytes)) as zf:
+            zf.extractall(tmp_dir)
+        metadata, _fids, _geometry, field_data = pyogrio.raw.read(
+            tmp_dir / "sectors.tab"
+        )
+    fields = metadata["fields"].tolist()
+    data = dict(zip(fields, field_data))
+    assert "value" not in fields
+    for field in [
+        "code",
+        "description_name",
+        "Azimut",
+        "Longitude",
+        "Latitude",
+        "size",
+        "color",
+    ]:
+        assert field in fields
+    assert data["code"].tolist() == ["1004"]
+    assert data["description_name"].tolist() == ["Busaiteen_1"]
+    assert data["Azimut"].tolist() == ["30"]
+    assert data["Longitude"].tolist() == ["50.606051"]
+    assert data["Latitude"].tolist() == ["26.261322"]
+    assert data["size"].tolist() == ["1000"]
+    assert data["color"].tolist() == ["7fff0000"]
 def test_convert_kml_to_tab_zip_rejects_empty_input():
     with pytest.raises(KmlToTabError, match="vide"):
         convert_kml_to_tab_zip(b"", "empty.kml")

utils/kml_to_tab.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
 import io
 import re
 import struct
 import tempfile
@@ -165,18 +166,24 @@ def _append_kml_fields(fields, field_data, kml_bytes: bytes, feature_count: int)
     if len(records) != feature_count:
         return fields, field_data
-    existing = {str(field).lower() for field in fields}
-    extra_names = [name for name in ["value"] if name.lower() not in existing]
-    if not extra_names:
         return fields, field_data
     extra_values = {
-        "value": np.array([record["value"] for record in records], dtype=float),
     }
     return (
-        np.array([*fields, *extra_names], dtype=object),
-        [*field_data, *[extra_values[name] for name in extra_names]],
     )
@@ -194,17 +201,80 @@ def _extract_kml_records(kml_bytes: bytes) -> list[dict[str, object]]:
             continue
         description = _child_text(element, "description")
-        value = _parse_measurement_value(description)
-        records.append(
-            {
-                "value": value,
-            }
-        )
     return records
 def _placemark_has_geometry(placemark) -> bool:
     geometry_tags = {
         "Point",
@@ -247,6 +317,38 @@ def _parse_measurement_value(description: str) -> float:
     return float(match.group(1))
 def _wkb_geometry_type(wkb: bytes) -> str:
     if not wkb or len(wkb) < 5:
         raise KmlToTabError("Geometrie WKB invalide dans le KML.")

 from __future__ import annotations
 import io
+import html
 import re
 import struct
 import tempfile
     if len(records) != feature_count:
         return fields, field_data
+    field_pairs = _output_field_pairs(records, fields)
+    if not field_pairs:
         return fields, field_data
     extra_values = {
+        output_name: _record_column(records, source_name)
+        for source_name, output_name in field_pairs
     }
     return (
+        np.array(
+            [*fields, *[output_name for _source, output_name in field_pairs]],
+            dtype=object,
+        ),
+        [
+            *field_data,
+            *[extra_values[output_name] for _source, output_name in field_pairs],
+        ],
     )
             continue
         description = _child_text(element, "description")
+        record = _parse_structured_description(description)
+        value = float("nan") if record else _parse_measurement_value(description)
+        if not record and np.isfinite(value):
+            record["value"] = value
+        records.append(record)
     return records
+def _output_field_pairs(
+    records: list[dict[str, object]],
+    existing_fields,
+) -> list[tuple[str, str]]:
+    used = {str(field).lower() for field in existing_fields}
+    pairs: list[tuple[str, str]] = []
+    for source_name in _record_field_names(records):
+        output_name = source_name
+        if output_name.lower() in used:
+            output_name = _unique_field_name(f"description_{source_name}", used)
+        else:
+            used.add(output_name.lower())
+        pairs.append((source_name, output_name))
+    return pairs
+def _unique_field_name(name: str, used: set[str]) -> str:
+    base = _safe_column_name(name)[:31]
+    candidate = base
+    suffix = 2
+    while candidate.lower() in used:
+        suffix_text = f"_{suffix}"
+        candidate = f"{base[: 31 - len(suffix_text)]}{suffix_text}"
+        suffix += 1
+    used.add(candidate.lower())
+    return candidate
+def _record_field_names(records: list[dict[str, object]]) -> list[str]:
+    names: list[str] = []
+    if any("value" in record for record in records):
+        names.append("value")
+    for record in records:
+        for name in record:
+            if name != "value" and name not in names:
+                names.append(name)
+    return names
+def _record_column(records: list[dict[str, object]], name: str):
+    values = [record.get(name, "") for record in records]
+    if name == "value":
+        return np.array(
+            [value if value != "" else float("nan") for value in values],
+            dtype=float,
+        )
+    return np.array([_stringify_field_value(value) for value in values], dtype=object)
+def _stringify_field_value(value: object) -> str:
+    if value is None:
+        return ""
+    if isinstance(value, float) and not np.isfinite(value):
+        return ""
+    return str(value)
 def _placemark_has_geometry(placemark) -> bool:
     geometry_tags = {
         "Point",
     return float(match.group(1))
+def _parse_structured_description(description: str) -> dict[str, str]:
+    text = html.unescape(description or "")
+    text = re.sub(r"(?i)<br\s*/?>", "\n", text)
+    text = re.sub(r"(?i)</?b>", "", text)
+    text = re.sub(r"<[^>]+>", "", text)
+    record: dict[str, str] = {}
+    for line in text.splitlines():
+        if ":" not in line:
+            continue
+        key, value = line.split(":", maxsplit=1)
+        column = _safe_column_name(key)
+        value = value.strip()
+        if not column or not value:
+            continue
+        record[column] = value
+    return record
+def _safe_column_name(name: str) -> str:
+    column = html.unescape(name or "").strip()
+    column = re.sub(r"\s+", "_", column)
+    column = re.sub(r"[^A-Za-z0-9_]+", "_", column).strip("_")
+    if not column:
+        return ""
+    if column[0].isdigit():
+        column = f"field_{column}"
+    return column[:31]
 def _wkb_geometry_type(wkb: bytes) -> str:
     if not wkb or len(wkb) < 5:
         raise KmlToTabError("Geometrie WKB invalide dans le KML.")