Spaces:

DavMelchi
/

db_query

Running

File size: 3,101 Bytes

d9f7219

import re

import pandas as pd


TECH_SHEET_KEYWORDS: dict[str, tuple[str, ...]] = {
    "2g": ("2g", "gsm", "geran"),
    "3g": ("3g", "wcdma", "umts", "utra"),
    "4g": ("4g", "lte", "eutran"),
}


def _normalize_sheet_name(sheet_name: object) -> tuple[str, str]:
    normalized = re.sub(r"[^a-z0-9]+", " ", str(sheet_name).strip().lower()).strip()
    collapsed = normalized.replace(" ", "")
    return normalized, collapsed


def _score_sheet_name(sheet_name: str, technology: str) -> int:
    keywords = TECH_SHEET_KEYWORDS.get(technology.lower(), (technology.lower(),))
    normalized, collapsed = _normalize_sheet_name(sheet_name)
    tokens = normalized.split()

    best_score = 0
    for keyword in keywords:
        if normalized == keyword:
            best_score = max(best_score, 100)
        elif keyword in tokens:
            best_score = max(best_score, 80)
        elif collapsed.endswith(keyword) or collapsed.startswith(keyword):
            best_score = max(best_score, 60)
        elif keyword in collapsed:
            best_score = max(best_score, 40)

    return best_score


def _get_workbook_sheet_names(ciq_file) -> list[str]:
    if hasattr(ciq_file, "seek"):
        ciq_file.seek(0)

    workbook = pd.ExcelFile(ciq_file, engine="calamine")
    return list(workbook.sheet_names)


def get_ciq_sheet_selection_details(ciq_file, technology: str) -> tuple[str, list[str]]:
    sheet_names = _get_workbook_sheet_names(ciq_file)

    if not sheet_names:
        raise ValueError("The uploaded Excel file does not contain any sheet.")

    if len(sheet_names) == 1:
        return sheet_names[0], sheet_names

    scored_matches = []
    for idx, sheet_name in enumerate(sheet_names):
        score = _score_sheet_name(sheet_name, technology)
        if score > 0:
            scored_matches.append((score, idx, sheet_name))

    if not scored_matches:
        joined = ", ".join(str(name) for name in sheet_names)
        raise ValueError(
            f"Unable to detect the {technology.upper()} sheet automatically. "
            f"Available sheets: {joined}"
        )

    scored_matches.sort(key=lambda item: (-item[0], item[1]))
    return scored_matches[0][2], sheet_names


def select_ciq_sheet_name(ciq_file, technology: str) -> str:
    selected_sheet, _sheet_names = get_ciq_sheet_selection_details(
        ciq_file, technology=technology
    )
    return selected_sheet


def read_ciq_excel(
    ciq_file, technology: str, sheet_name: str | None = None
) -> pd.DataFrame:
    if sheet_name is None:
        sheet_name = select_ciq_sheet_name(ciq_file, technology=technology)
    else:
        sheet_names = _get_workbook_sheet_names(ciq_file)
        if sheet_name not in sheet_names:
            joined = ", ".join(str(name) for name in sheet_names)
            raise ValueError(
                f"Sheet '{sheet_name}' does not exist in the uploaded Excel file. "
                f"Available sheets: {joined}"
            )

    if hasattr(ciq_file, "seek"):
        ciq_file.seek(0)

    return pd.read_excel(ciq_file, sheet_name=sheet_name, engine="calamine")