import re from collections.abc import Sequence import pandas as pd def _normalize_header_value(value: object) -> str: return str(value).strip().replace("\xa0", " ").lower() def _is_numeric_like(value: str) -> bool: return bool(re.fullmatch(r"[+-]?\d+(?:\.\d+)?", value)) def _header_row_score(values: Sequence[object], expected_columns: Sequence[str] | None) -> int: normalized = [_normalize_header_value(value) for value in values] non_empty = [value for value in normalized if value and value != "nan"] expected = { _normalize_header_value(column) for column in (expected_columns or []) if column } expected_matches = len(expected.intersection(non_empty)) text_like_count = sum(any(char.isalpha() for char in value) for value in non_empty) numeric_like_count = sum(_is_numeric_like(value) for value in non_empty) info_penalty = 6 if "info" in non_empty and len(non_empty) <= 2 else 0 return ( expected_matches * 100 + text_like_count * 3 + len(non_empty) - numeric_like_count * 2 - info_penalty ) def _probe_sheet_name(sheet_name) -> str | int: if isinstance(sheet_name, (str, int)): return sheet_name if isinstance(sheet_name, Sequence) and sheet_name: return sheet_name[0] raise ValueError("sheet_name must be a sheet label or a non-empty sequence of sheets") def detect_dump_header_row( file_path, sheet_name, *, expected_columns: Sequence[str] | None = None, engine: str = "calamine", ) -> int: """ Detect whether a dump sheet header is on the first or second row. """ probe_sheet = _probe_sheet_name(sheet_name) if hasattr(file_path, "seek"): file_path.seek(0) preview = pd.read_excel( file_path, sheet_name=probe_sheet, engine=engine, header=None, nrows=2, ) if hasattr(file_path, "seek"): file_path.seek(0) if preview.empty: return 0 best_row = 0 best_score = _header_row_score(preview.iloc[0].tolist(), expected_columns) max_candidate_row = min(len(preview.index), 2) for row_index in range(1, max_candidate_row): score = _header_row_score(preview.iloc[row_index].tolist(), expected_columns) if score > best_score: best_row = row_index best_score = score return best_row def read_dump_excel( file_path, *, sheet_name, expected_columns: Sequence[str] | None = None, engine: str = "calamine", **kwargs, ): """ Read dump sheets while supporting headers placed on row 1 or row 2. """ if "header" in kwargs or "skiprows" in kwargs: raise ValueError("read_dump_excel manages header detection internally") header_row = detect_dump_header_row( file_path, sheet_name, expected_columns=expected_columns, engine=engine, ) if hasattr(file_path, "seek"): file_path.seek(0) return pd.read_excel( file_path, sheet_name=sheet_name, engine=engine, header=header_row, **kwargs, )