"""
parse_edm.py
------------
Parses the Volve F.edm.xml (Landmark Engineering Data Model) into
structured CSVs, extracting well/wellbore metadata, casing configurations,
BHA (Bottom Hole Assembly) details, and daily cost records.

Outputs to data/processed/edm/
"""
|
|
import logging
import xml.etree.ElementTree as ET
from collections import Counter
from pathlib import Path

import pandas as pd
|
|
# Script-wide logging: timestamp, level, then the message.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(__name__)

# BASE_DIR is two directories above the folder that contains this file
# (presumably the project root -- confirm against the repository layout).
BASE_DIR = Path(__file__).resolve().parents[2]
EDM_FILE = BASE_DIR.joinpath(
    "data", "raw", "Well_technical_data", "EDM.XML", "Volve F.edm.xml"
)
OUT_DIR = BASE_DIR.joinpath("data", "processed", "edm")
# Created at import time so the CSV writes below never hit a missing folder.
OUT_DIR.mkdir(exist_ok=True, parents=True)
|
|
|
|
| def _strip_ns(tag: str) -> str: |
| return tag.split("}")[-1] if "}" in tag else tag |
|
|
|
|
def elem_to_dict(elem: ET.Element, prefix: str = "") -> dict:
    """
    Flatten an XML element and its descendants into a single-level dict.

    Keys are built by joining the namespace-stripped tags on the path from
    ``elem`` down to each node with underscores, so an attribute ``x`` found
    at ``<b><c x="1"/></b>`` beneath ``elem`` is stored under ``b_c_x``.

    Args:
        elem: Element to flatten. Its own tag is not part of any key.
        prefix: String prepended to every key produced for ``elem`` itself;
            descendants extend it with their own tag.

    Returns:
        Dict of flattened key -> string value. Attributes keep their
        namespace-stripped name; non-blank element text is stored under
        ``<prefix>value`` with surrounding whitespace removed.

    Note:
        Sibling elements that share a tag produce the same keys, so the
        later sibling overwrites the earlier one.
    """
    result = {}
    for attr_k, attr_v in elem.attrib.items():
        result[f"{prefix}{_strip_ns(attr_k)}"] = attr_v

    text = (elem.text or "").strip()
    if text:
        result[f"{prefix}value"] = text

    for child in elem:
        # Extend the accumulated prefix rather than restarting it at the
        # child's tag; otherwise same-named descendants under different
        # parents map to the same key and silently overwrite each other.
        child_prefix = f"{prefix}{_strip_ns(child.tag)}_"
        result.update(elem_to_dict(child, prefix=child_prefix))
    return result
|
|
|
|
def collect_elements(root: ET.Element, element_type: str) -> list[dict]:
    """Flatten every element in the tree whose tag matches ``element_type``.

    The search covers ``root`` itself and all of its descendants. Tags are
    compared without their namespace and without regard to letter case.
    Each match is converted with ``elem_to_dict``; results follow the order
    in which ``root.iter()`` yields the elements.
    """
    wanted = element_type.lower()
    return [
        elem_to_dict(node)
        for node in root.iter()
        if _strip_ns(node.tag).lower() == wanted
    ]
|
|
|
|
def parse_edm():
    """
    Parse the EDM XML export and write one CSV per entity type.

    Steps:
      1. Load ``EDM_FILE``; log an error and return early if the file is
         missing or is not well-formed XML.
      2. Count every namespace-stripped tag in the document, log the 30
         most frequent, and save the full inventory to
         ``_edm_element_types.csv``.
      3. For each name in ``ENTITIES``, flatten all matching elements and
         save them to ``edm_<ENTITY>.csv``; entities without matches are
         only logged.

    Returns:
        None. All output is written under ``OUT_DIR``.
    """
    if not EDM_FILE.exists():
        log.error("EDM file not found: %s", EDM_FILE)
        return

    log.info("Parsing EDM file: %s", EDM_FILE)
    try:
        root = ET.parse(EDM_FILE).getroot()
    except ET.ParseError as e:
        log.error("XML parse error: %s", e)
        return

    # Inventory of element types, most frequent first. most_common() is a
    # stable descending sort, so tags with equal counts stay in first-seen
    # order. Sorted once and reused for both the log and the CSV.
    tag_counts = Counter(_strip_ns(elem.tag) for elem in root.iter())
    inventory = tag_counts.most_common()

    log.info("Top element types in EDM.XML:")
    for tag, count in inventory[:30]:
        log.info(" %s: %s", tag, count)

    inv_df = pd.DataFrame(inventory, columns=["element_type", "count"])
    inv_df.to_csv(OUT_DIR / "_edm_element_types.csv", index=False)

    # Element types exported to their own CSV file.
    ENTITIES = (
        "CD_WELL",
        "CD_WELLBORE",
        "CD_ASSEMBLY",
        "CD_ASSEMBLY_COMP",
        "CD_HOLE_SECT",
        "CD_HOLE_SECT_GROUP",
        "CD_WELLBORE_FORMATION",
        "CD_BHA_COMP_MWD",
        "CD_BHA_COMP_STAB",
        "CD_BHA_COMP_NOZZLE",
        "CD_BHA_COMP_DP_HW",
        "CD_SURVEY_STATION",
        "CD_DEFINITIVE_SURVEY_STATION",
        "CD_PORE_PRESSURE",
        "CD_FRAC_GRADIENT",
        "CD_CASE",
        "WP_TDA_DRAGCHART",
    )

    for entity in ENTITIES:
        rows = collect_elements(root, entity)
        if not rows:
            log.info(" %s: no rows found", entity)
            continue

        df = pd.DataFrame(rows)
        out_path = OUT_DIR / f"edm_{entity}.csv"
        df.to_csv(out_path, index=False)
        log.info(" Saved %s: %d rows → %s", entity, len(df), out_path.name)


if __name__ == "__main__":
    parse_edm()
|
|