Spaces:
Sleeping
Sleeping
| import re | |
| from collections.abc import Sequence | |
| import pandas as pd | |
| def _normalize_header_value(value: object) -> str: | |
| return str(value).strip().replace("\xa0", " ").lower() | |
| def _is_numeric_like(value: str) -> bool: | |
| return bool(re.fullmatch(r"[+-]?\d+(?:\.\d+)?", value)) | |
| def _header_row_score(values: Sequence[object], expected_columns: Sequence[str] | None) -> int: | |
| normalized = [_normalize_header_value(value) for value in values] | |
| non_empty = [value for value in normalized if value and value != "nan"] | |
| expected = { | |
| _normalize_header_value(column) for column in (expected_columns or []) if column | |
| } | |
| expected_matches = len(expected.intersection(non_empty)) | |
| text_like_count = sum(any(char.isalpha() for char in value) for value in non_empty) | |
| numeric_like_count = sum(_is_numeric_like(value) for value in non_empty) | |
| info_penalty = 6 if "info" in non_empty and len(non_empty) <= 2 else 0 | |
| return ( | |
| expected_matches * 100 | |
| + text_like_count * 3 | |
| + len(non_empty) | |
| - numeric_like_count * 2 | |
| - info_penalty | |
| ) | |
| def _probe_sheet_name(sheet_name) -> str | int: | |
| if isinstance(sheet_name, (str, int)): | |
| return sheet_name | |
| if isinstance(sheet_name, Sequence) and sheet_name: | |
| return sheet_name[0] | |
| raise ValueError("sheet_name must be a sheet label or a non-empty sequence of sheets") | |
| def detect_dump_header_row( | |
| file_path, | |
| sheet_name, | |
| *, | |
| expected_columns: Sequence[str] | None = None, | |
| engine: str = "calamine", | |
| ) -> int: | |
| """ | |
| Detect whether a dump sheet header is on the first or second row. | |
| """ | |
| probe_sheet = _probe_sheet_name(sheet_name) | |
| if hasattr(file_path, "seek"): | |
| file_path.seek(0) | |
| preview = pd.read_excel( | |
| file_path, | |
| sheet_name=probe_sheet, | |
| engine=engine, | |
| header=None, | |
| nrows=2, | |
| ) | |
| if hasattr(file_path, "seek"): | |
| file_path.seek(0) | |
| if preview.empty: | |
| return 0 | |
| best_row = 0 | |
| best_score = _header_row_score(preview.iloc[0].tolist(), expected_columns) | |
| max_candidate_row = min(len(preview.index), 2) | |
| for row_index in range(1, max_candidate_row): | |
| score = _header_row_score(preview.iloc[row_index].tolist(), expected_columns) | |
| if score > best_score: | |
| best_row = row_index | |
| best_score = score | |
| return best_row | |
| def read_dump_excel( | |
| file_path, | |
| *, | |
| sheet_name, | |
| expected_columns: Sequence[str] | None = None, | |
| engine: str = "calamine", | |
| **kwargs, | |
| ): | |
| """ | |
| Read dump sheets while supporting headers placed on row 1 or row 2. | |
| """ | |
| if "header" in kwargs or "skiprows" in kwargs: | |
| raise ValueError("read_dump_excel manages header detection internally") | |
| header_row = detect_dump_header_row( | |
| file_path, | |
| sheet_name, | |
| expected_columns=expected_columns, | |
| engine=engine, | |
| ) | |
| if hasattr(file_path, "seek"): | |
| file_path.seek(0) | |
| return pd.read_excel( | |
| file_path, | |
| sheet_name=sheet_name, | |
| engine=engine, | |
| header=header_row, | |
| **kwargs, | |
| ) | |