File size: 3,149 Bytes
7efc151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import re
from collections.abc import Sequence

import pandas as pd


def _normalize_header_value(value: object) -> str:
    return str(value).strip().replace("\xa0", " ").lower()


def _is_numeric_like(value: str) -> bool:
    return bool(re.fullmatch(r"[+-]?\d+(?:\.\d+)?", value))


def _header_row_score(values: Sequence[object], expected_columns: Sequence[str] | None) -> int:
    normalized = [_normalize_header_value(value) for value in values]
    non_empty = [value for value in normalized if value and value != "nan"]
    expected = {
        _normalize_header_value(column) for column in (expected_columns or []) if column
    }

    expected_matches = len(expected.intersection(non_empty))
    text_like_count = sum(any(char.isalpha() for char in value) for value in non_empty)
    numeric_like_count = sum(_is_numeric_like(value) for value in non_empty)
    info_penalty = 6 if "info" in non_empty and len(non_empty) <= 2 else 0

    return (
        expected_matches * 100
        + text_like_count * 3
        + len(non_empty)
        - numeric_like_count * 2
        - info_penalty
    )


def _probe_sheet_name(sheet_name) -> str | int:
    if isinstance(sheet_name, (str, int)):
        return sheet_name
    if isinstance(sheet_name, Sequence) and sheet_name:
        return sheet_name[0]
    raise ValueError("sheet_name must be a sheet label or a non-empty sequence of sheets")


def detect_dump_header_row(
    file_path,
    sheet_name,
    *,
    expected_columns: Sequence[str] | None = None,
    engine: str = "calamine",
) -> int:
    """
    Detect whether a dump sheet header is on the first or second row.
    """
    probe_sheet = _probe_sheet_name(sheet_name)

    if hasattr(file_path, "seek"):
        file_path.seek(0)

    preview = pd.read_excel(
        file_path,
        sheet_name=probe_sheet,
        engine=engine,
        header=None,
        nrows=2,
    )

    if hasattr(file_path, "seek"):
        file_path.seek(0)

    if preview.empty:
        return 0

    best_row = 0
    best_score = _header_row_score(preview.iloc[0].tolist(), expected_columns)

    max_candidate_row = min(len(preview.index), 2)
    for row_index in range(1, max_candidate_row):
        score = _header_row_score(preview.iloc[row_index].tolist(), expected_columns)
        if score > best_score:
            best_row = row_index
            best_score = score

    return best_row


def read_dump_excel(
    file_path,
    *,
    sheet_name,
    expected_columns: Sequence[str] | None = None,
    engine: str = "calamine",
    **kwargs,
):
    """
    Read dump sheets while supporting headers placed on row 1 or row 2.
    """
    if "header" in kwargs or "skiprows" in kwargs:
        raise ValueError("read_dump_excel manages header detection internally")

    header_row = detect_dump_header_row(
        file_path,
        sheet_name,
        expected_columns=expected_columns,
        engine=engine,
    )

    if hasattr(file_path, "seek"):
        file_path.seek(0)

    return pd.read_excel(
        file_path,
        sheet_name=sheet_name,
        engine=engine,
        header=header_row,
        **kwargs,
    )