File size: 6,861 Bytes
d5e4667
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/usr/bin/env python3
"""
Excel File Parser for SDR Status Tracker

Reads Excel files (.xlsx) and converts them to the same row format
that the Google Sheets API returns, enabling local file fallback
when Google Sheet access is not available.

Usage:
    # As a module
    from excel_parser import read_excel_file
    rows = read_excel_file("data/Case status_ February 2026.xlsx")

    # CLI for testing
    python excel_parser.py data/Case_status_February_2026.xlsx
"""
import os
import sys
from typing import List, Optional
from openpyxl import load_workbook


# Default tab name to look for
DEFAULT_TAB_NAME = "DAILY - for SDR to add data🌟"


def read_excel_file(
    file_path: str,
    tab_name: str = DEFAULT_TAB_NAME,
    max_col: str = "AI"
) -> List[List]:
    """
    Read an Excel file and return rows as list of lists.

    Matches the format returned by Google Sheets API:
    - Returns list of rows, where each row is a list of cell values
    - Empty cells are represented as empty strings ""
    - Rows are trimmed to remove trailing empty cells (like Sheets API)

    Args:
        file_path: Path to the .xlsx file
        tab_name: Name of the worksheet tab to read
        max_col: Maximum column to read (e.g., "AI" = column 35)

    Returns:
        List of rows, each row is a list of cell values (strings)

    Raises:
        FileNotFoundError: If the Excel file doesn't exist
        ValueError: If the specified tab is not found
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Excel file not found: {file_path}")

    # Load workbook (data_only=True to get calculated values, not formulas)
    wb = load_workbook(file_path, data_only=True)

    # Find the tab
    if tab_name not in wb.sheetnames:
        # Try partial match
        matching_tabs = [name for name in wb.sheetnames if tab_name.lower() in name.lower()]
        if matching_tabs:
            tab_name = matching_tabs[0]
            print(f"Using matching tab: {tab_name}")
        else:
            available_tabs = ", ".join(wb.sheetnames)
            raise ValueError(f"Tab '{tab_name}' not found. Available tabs: {available_tabs}")

    ws = wb[tab_name]

    # Convert column letter to number (e.g., "AI" -> 35)
    max_col_num = column_letter_to_number(max_col)

    # Read all rows
    rows = []
    for row in ws.iter_rows(min_row=1, max_col=max_col_num):
        row_values = []
        for cell in row:
            # Convert cell value to string (matching Google Sheets behavior)
            if cell.value is None:
                row_values.append("")
            elif isinstance(cell.value, (int, float)):
                # Handle percentages (stored as decimals in Excel)
                if cell.number_format and '%' in cell.number_format:
                    # Convert decimal to percentage integer (0.5 -> 50)
                    row_values.append(str(int(cell.value * 100)))
                else:
                    # Keep numbers as-is
                    if cell.value == int(cell.value):
                        row_values.append(str(int(cell.value)))
                    else:
                        row_values.append(str(cell.value))
            else:
                row_values.append(str(cell.value))

        # Trim trailing empty cells (like Sheets API does)
        while row_values and row_values[-1] == "":
            row_values.pop()

        rows.append(row_values)

    # Remove trailing empty rows
    while rows and not any(rows[-1]):
        rows.pop()

    wb.close()
    return rows


def column_letter_to_number(col_letter: str) -> int:
    """Convert column letter(s) to 1-based number. E.g., 'A'->1, 'Z'->26, 'AI'->35"""
    result = 0
    for char in col_letter.upper():
        result = result * 26 + (ord(char) - ord('A') + 1)
    return result


def get_excel_file_path(month_id: str, data_dir: str = "data") -> Optional[str]:
    """
    Get the Excel file path for a given month.

    Tries several naming patterns:
    - data/Case status_ February 2026.xlsx
    - data/Case_status_February_2026.xlsx
    - data/2026-02.xlsx

    Args:
        month_id: Month ID in format "2026-02"
        data_dir: Directory where Excel files are stored

    Returns:
        Path to Excel file if found, None otherwise
    """
    # Parse month_id to get month name and year
    try:
        year, month_num = month_id.split("-")
        month_names = {
            "01": "January", "02": "February", "03": "March",
            "04": "April", "05": "May", "06": "June",
            "07": "July", "08": "August", "09": "September",
            "10": "October", "11": "November", "12": "December"
        }
        month_name = month_names.get(month_num, "")
    except ValueError:
        month_name = ""
        year = ""

    # List of patterns to try
    patterns = [
        f"Case status_ {month_name} {year}.xlsx",
        f"Case_status_{month_name}_{year}.xlsx",
        f"{month_id}.xlsx",
        f"Case status {month_name} {year}.xlsx",
    ]

    for pattern in patterns:
        path = os.path.join(data_dir, pattern)
        if os.path.exists(path):
            return path

    return None


def list_available_excel_files(data_dir: str = "data") -> List[str]:
    """List all .xlsx files in the data directory."""
    if not os.path.exists(data_dir):
        return []

    return [f for f in os.listdir(data_dir) if f.endswith('.xlsx')]


def main():
    """CLI for testing Excel parsing."""
    if len(sys.argv) < 2:
        print("Usage: python excel_parser.py <excel_file> [tab_name]")
        print("\nExample: python excel_parser.py data/Case_status_February_2026.xlsx")

        # List available files
        files = list_available_excel_files()
        if files:
            print(f"\nAvailable Excel files in data/:")
            for f in files:
                print(f"  - {f}")
        return 1

    file_path = sys.argv[1]
    tab_name = sys.argv[2] if len(sys.argv) > 2 else DEFAULT_TAB_NAME

    try:
        print(f"Reading: {file_path}")
        print(f"Tab: {tab_name}")
        print("-" * 50)

        rows = read_excel_file(file_path, tab_name)

        print(f"Total rows: {len(rows)}")
        print(f"Header row 1: {rows[0][:10] if rows else 'empty'}...")
        print(f"Header row 4: {rows[3][:10] if len(rows) > 3 else 'empty'}...")

        # Show a few data rows
        print("\nFirst data row (row 5):")
        if len(rows) > 4:
            print(f"  {rows[4][:15]}...")

        # Count rows with data in column C (activity)
        activity_rows = sum(1 for r in rows[4:] if len(r) > 2 and r[2])
        print(f"\nRows with activity data: {activity_rows}")

        return 0

    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    exit(main())