db_query / utils /ciq_excel.py
DavMelchi's picture
Add selectable CIQ sheet detection
d9f7219
import re
import pandas as pd
TECH_SHEET_KEYWORDS: dict[str, tuple[str, ...]] = {
"2g": ("2g", "gsm", "geran"),
"3g": ("3g", "wcdma", "umts", "utra"),
"4g": ("4g", "lte", "eutran"),
}
def _normalize_sheet_name(sheet_name: object) -> tuple[str, str]:
normalized = re.sub(r"[^a-z0-9]+", " ", str(sheet_name).strip().lower()).strip()
collapsed = normalized.replace(" ", "")
return normalized, collapsed
def _score_sheet_name(sheet_name: str, technology: str) -> int:
keywords = TECH_SHEET_KEYWORDS.get(technology.lower(), (technology.lower(),))
normalized, collapsed = _normalize_sheet_name(sheet_name)
tokens = normalized.split()
best_score = 0
for keyword in keywords:
if normalized == keyword:
best_score = max(best_score, 100)
elif keyword in tokens:
best_score = max(best_score, 80)
elif collapsed.endswith(keyword) or collapsed.startswith(keyword):
best_score = max(best_score, 60)
elif keyword in collapsed:
best_score = max(best_score, 40)
return best_score
def _get_workbook_sheet_names(ciq_file) -> list[str]:
if hasattr(ciq_file, "seek"):
ciq_file.seek(0)
workbook = pd.ExcelFile(ciq_file, engine="calamine")
return list(workbook.sheet_names)
def get_ciq_sheet_selection_details(ciq_file, technology: str) -> tuple[str, list[str]]:
sheet_names = _get_workbook_sheet_names(ciq_file)
if not sheet_names:
raise ValueError("The uploaded Excel file does not contain any sheet.")
if len(sheet_names) == 1:
return sheet_names[0], sheet_names
scored_matches = []
for idx, sheet_name in enumerate(sheet_names):
score = _score_sheet_name(sheet_name, technology)
if score > 0:
scored_matches.append((score, idx, sheet_name))
if not scored_matches:
joined = ", ".join(str(name) for name in sheet_names)
raise ValueError(
f"Unable to detect the {technology.upper()} sheet automatically. "
f"Available sheets: {joined}"
)
scored_matches.sort(key=lambda item: (-item[0], item[1]))
return scored_matches[0][2], sheet_names
def select_ciq_sheet_name(ciq_file, technology: str) -> str:
selected_sheet, _sheet_names = get_ciq_sheet_selection_details(
ciq_file, technology=technology
)
return selected_sheet
def read_ciq_excel(
ciq_file, technology: str, sheet_name: str | None = None
) -> pd.DataFrame:
if sheet_name is None:
sheet_name = select_ciq_sheet_name(ciq_file, technology=technology)
else:
sheet_names = _get_workbook_sheet_names(ciq_file)
if sheet_name not in sheet_names:
joined = ", ".join(str(name) for name in sheet_names)
raise ValueError(
f"Sheet '{sheet_name}' does not exist in the uploaded Excel file. "
f"Available sheets: {joined}"
)
if hasattr(ciq_file, "seek"):
ciq_file.seek(0)
return pd.read_excel(ciq_file, sheet_name=sheet_name, engine="calamine")