Spaces:

DavMelchi
/

db_query

Running

App Files Files Community

db_query / utils /ciq_excel.py

DavMelchi

Add selectable CIQ sheet detection

d9f7219 3 months ago

raw

history blame contribute delete

3.1 kB

	import re

	import pandas as pd


	TECH_SHEET_KEYWORDS: dict[str, tuple[str, ...]] = {
	"2g": ("2g", "gsm", "geran"),
	"3g": ("3g", "wcdma", "umts", "utra"),
	"4g": ("4g", "lte", "eutran"),
	}


	def _normalize_sheet_name(sheet_name: object) -> tuple[str, str]:
	normalized = re.sub(r"[^a-z0-9]+", " ", str(sheet_name).strip().lower()).strip()
	collapsed = normalized.replace(" ", "")
	return normalized, collapsed


	def _score_sheet_name(sheet_name: str, technology: str) -> int:
	keywords = TECH_SHEET_KEYWORDS.get(technology.lower(), (technology.lower(),))
	normalized, collapsed = _normalize_sheet_name(sheet_name)
	tokens = normalized.split()

	best_score = 0
	for keyword in keywords:
	if normalized == keyword:
	best_score = max(best_score, 100)
	elif keyword in tokens:
	best_score = max(best_score, 80)
	elif collapsed.endswith(keyword) or collapsed.startswith(keyword):
	best_score = max(best_score, 60)
	elif keyword in collapsed:
	best_score = max(best_score, 40)

	return best_score


	def _get_workbook_sheet_names(ciq_file) -> list[str]:
	if hasattr(ciq_file, "seek"):
	ciq_file.seek(0)

	workbook = pd.ExcelFile(ciq_file, engine="calamine")
	return list(workbook.sheet_names)


	def get_ciq_sheet_selection_details(ciq_file, technology: str) -> tuple[str, list[str]]:
	sheet_names = _get_workbook_sheet_names(ciq_file)

	if not sheet_names:
	raise ValueError("The uploaded Excel file does not contain any sheet.")

	if len(sheet_names) == 1:
	return sheet_names[0], sheet_names

	scored_matches = []
	for idx, sheet_name in enumerate(sheet_names):
	score = _score_sheet_name(sheet_name, technology)
	if score > 0:
	scored_matches.append((score, idx, sheet_name))

	if not scored_matches:
	joined = ", ".join(str(name) for name in sheet_names)
	raise ValueError(
	f"Unable to detect the {technology.upper()} sheet automatically. "
	f"Available sheets: {joined}"
	)

	scored_matches.sort(key=lambda item: (-item[0], item[1]))
	return scored_matches[0][2], sheet_names


	def select_ciq_sheet_name(ciq_file, technology: str) -> str:
	selected_sheet, _sheet_names = get_ciq_sheet_selection_details(
	ciq_file, technology=technology
	)
	return selected_sheet


	def read_ciq_excel(
	ciq_file, technology: str, sheet_name: str \| None = None
	) -> pd.DataFrame:
	if sheet_name is None:
	sheet_name = select_ciq_sheet_name(ciq_file, technology=technology)
	else:
	sheet_names = _get_workbook_sheet_names(ciq_file)
	if sheet_name not in sheet_names:
	joined = ", ".join(str(name) for name in sheet_names)
	raise ValueError(
	f"Sheet '{sheet_name}' does not exist in the uploaded Excel file. "
	f"Available sheets: {joined}"
	)

	if hasattr(ciq_file, "seek"):
	ciq_file.seek(0)

	return pd.read_excel(ciq_file, sheet_name=sheet_name, engine="calamine")