ELLS / pattern_detection.py

Create pattern_detection.py

eaa982f verified 1 day ago

10.5 kB

	# pattern_detection.py
	import numpy as np
	import pandas as pd
	from collections import Counter
	from sklearn.decomposition import PCA
	from sklearn.cluster import DBSCAN
	from sklearn.neighbors import LocalOutlierFactor
	import hdbscan
	from scipy.stats import circvar

	from utils import add_zone_labels


	def get_dominant_zone(df: pd.DataFrame) -> str:
	    """Return the most frequent value of the ``zone_label`` column.

	    Missing labels (NaN/None) are ignored so that a missing value can never
	    be reported as the dominant zone. When several labels share the highest
	    count, the one that appears first in ``df`` wins.

	    Args:
	        df: Frame expected to carry a ``zone_label`` column.

	    Returns:
	        The dominant label, or ``"N/A"`` when ``df`` is empty, has no
	        ``zone_label`` column, or holds only missing labels.
	    """
	    if df.empty or 'zone_label' not in df.columns:
	        return "N/A"
	    labels = df['zone_label'].dropna()
	    if labels.empty:
	        return "N/A"
	    most_common_zone, _ = Counter(labels).most_common(1)[0]
	    return most_common_zone


	def circular_range_deg(angles_deg: np.ndarray) -> float:
	    """Return the width, in degrees, of the smallest arc containing all angles.

	    The angles are wrapped into [0, 360), sorted, and the largest empty gap
	    between neighbours (including the gap across the 0/360 seam) is
	    subtracted from the full circle.

	    Args:
	        angles_deg: Angles in degrees; any array-like is accepted. Non-finite
	            entries (NaN/inf) are ignored, because a single NaN would turn
	            the result into NaN and silently defeat ``<`` comparisons made
	            by callers.

	    Returns:
	        The covered angular range in degrees, or ``0.0`` when fewer than two
	        finite angles are supplied.
	    """
	    angles = np.asarray(angles_deg, dtype=float).ravel()
	    angles = angles[np.isfinite(angles)]
	    if angles.size < 2:
	        return 0.0
	    wrapped = np.sort(angles % 360.0)
	    interior_gaps = np.diff(wrapped)
	    # Gap that crosses the 0/360 seam, from the largest angle to the smallest.
	    seam_gap = 360.0 - wrapped[-1] + wrapped[0]
	    largest_gap = max(float(interior_gaps.max()), float(seam_gap))
	    return 360.0 - largest_gap


	def check_sector_coverage(theta_deg: np.ndarray, min_sectors: int = 8, n_sectors: int = 12) -> bool:
	    """Check whether the angles occupy enough equal-width angular sectors.

	    The circle is divided into ``n_sectors`` equal sectors (30 degrees each
	    for the default of 12) and the number of distinct sectors that contain
	    at least one angle is compared against ``min_sectors``.

	    Args:
	        theta_deg: Angles in degrees; any array-like is accepted. Non-finite
	            entries are ignored.
	        min_sectors: Minimum number of distinct occupied sectors required.
	        n_sectors: Number of equal sectors the circle is divided into.

	    Returns:
	        ``True`` when at least ``min_sectors`` sectors are occupied,
	        ``False`` otherwise (including when no finite angle is supplied).

	    Raises:
	        ValueError: If ``n_sectors`` is not positive.
	    """
	    if n_sectors <= 0:
	        raise ValueError("n_sectors must be a positive integer")
	    theta = np.asarray(theta_deg, dtype=float).ravel()
	    theta = theta[np.isfinite(theta)]
	    if theta.size == 0:
	        return False
	    sector_width = 360.0 / n_sectors
	    # The trailing modulo folds the rare case where ``theta % 360`` rounds up
	    # to exactly 360.0 (tiny negative inputs) back into sector 0.
	    sector_indices = ((theta % 360.0) // sector_width).astype(int) % n_sectors
	    return bool(np.unique(sector_indices).size >= min_sectors)


	def fit_circle_least_squares(x: np.ndarray, y: np.ndarray):
	    """Fit a circle to 2-D points with an algebraic least-squares solve.

	    Solves ``x**2 + y**2 = a*x + b*y + c`` in the least-squares sense, from
	    which the centre is ``(a/2, b/2)`` and the squared radius is
	    ``(a**2 + b**2)/4 + c``.

	    Args:
	        x: X coordinates of the points (array-like, flattened).
	        y: Y coordinates of the points (array-like, flattened).

	    Returns:
	        ``(center_x, center_y, radius, rmse)`` where ``rmse`` is the
	        root-mean-square difference between each point's distance to the
	        fitted centre and the fitted radius. When no circle can be fitted
	        (fewer than three points, mismatched lengths, collinear points, a
	        non-positive squared radius, or a failed solve) the result is
	        ``(None, None, None, np.inf)``.
	    """
	    failure = (None, None, None, np.inf)
	    xs = np.asarray(x, dtype=float).ravel()
	    ys = np.asarray(y, dtype=float).ravel()
	    if xs.size < 3 or xs.size != ys.size:
	        return failure

	    design = np.column_stack([xs, ys, np.ones_like(xs)])
	    target = xs**2 + ys**2
	    try:
	        solution, _, rank, _ = np.linalg.lstsq(design, target, rcond=None)
	    except (np.linalg.LinAlgError, ValueError):
	        return failure
	    # A rank-deficient system means the points are collinear (or coincide),
	    # so no unique circle exists.
	    if rank < 3:
	        return failure

	    a, b, c = solution
	    center_x = a / 2
	    center_y = b / 2
	    radius_sq = (a**2 + b**2) / 4 + c
	    # Guard the square root: a non-positive value would yield NaN, which
	    # callers comparing ``rmse`` with ``>`` would not reject.
	    if not np.isfinite(radius_sq) or radius_sq <= 0:
	        return failure
	    radius = np.sqrt(radius_sq)

	    fitted_dists = np.hypot(xs - center_x, ys - center_y)
	    rmse = np.sqrt(np.mean((fitted_dists - radius) ** 2))
	    return float(center_x), float(center_y), float(radius), float(rmse)


	def filter_main_ring_band(df: pd.DataFrame, r_bin_width: float = 5.0, top_n_bins: int = 1, r_limit: float = 150.0) -> pd.DataFrame:
	    """Keep only the rows whose radius falls in the most populated radial bins.

	    Radii in ``[0, r_limit]`` are histogrammed with bins of width
	    ``r_bin_width`` and the rows belonging to the ``top_n_bins`` fullest bins
	    are returned.

	    Args:
	        df: Frame with a radius column named ``r``.
	        r_bin_width: Width of each radial histogram bin; must be positive.
	        top_n_bins: Number of most populated bins to keep.
	        r_limit: Largest radius considered; must be positive. Rows outside
	            ``[0, r_limit]`` are never returned.

	    Returns:
	        A copy of the selected rows. A copy of ``df`` is returned unchanged
	        when it is empty or has no ``r`` column, and an empty frame with the
	        same columns when no radius lies inside ``[0, r_limit]``.

	    Raises:
	        ValueError: If ``r_bin_width`` or ``r_limit`` is not positive.
	    """
	    if len(df) == 0 or 'r' not in df.columns:
	        return df.copy()
	    if r_bin_width <= 0 or r_limit <= 0:
	        raise ValueError("r_bin_width and r_limit must be positive")

	    radii = df['r'].to_numpy(dtype=float)
	    in_range = (radii >= 0) & (radii <= r_limit)
	    if not in_range.any():
	        return df.iloc[0:0].copy()

	    bin_edges = np.arange(0, r_limit + r_bin_width, r_bin_width)
	    counts, bin_edges = np.histogram(radii[in_range], bins=bin_edges)
	    top_bin_indices = np.argsort(counts)[::-1][:top_n_bins]

	    last_bin = len(counts) - 1
	    mask = np.zeros(len(df), dtype=bool)
	    for bin_idx in top_bin_indices:
	        lower, upper = bin_edges[bin_idx], bin_edges[bin_idx + 1]
	        # np.histogram closes the last bin on the right, so mirror that here;
	        # otherwise points exactly on the final edge are counted but dropped.
	        below_upper = (radii <= upper) if bin_idx == last_bin else (radii < upper)
	        mask |= in_range & (radii >= lower) & below_upper
	    return df[mask].copy()


	def is_ring_pattern_robust(inlier_df: pd.DataFrame, cfg: dict) -> bool:
	    """Decide whether the inlier points form a ring.

	    The points are reduced to the single most populated radial band and must
	    then pass, in order: a minimum point count, a maximum radial spread, a
	    minimum angular coverage, a minimum number of occupied angular sectors,
	    a maximum circle-fit RMSE, and a maximum distance between the fitted
	    centre and the coordinate origin.

	    Args:
	        inlier_df: Frame with ``r``, ``theta_deg``, ``coor_x`` and ``coor_y``
	            columns.
	        cfg: Configuration whose ``'ring'`` entry provides
	            ``ring_min_points``, ``ring_band_width``,
	            ``ring_r_absolute_tolerance``, ``ring_min_angular_coverage``,
	            ``ring_min_sectors`` and ``ring_fit_rmse_max``. The optional key
	            ``ring_center_offset_max`` (default ``10.0``) bounds the fitted
	            centre's distance from the origin.

	    Returns:
	        ``True`` only when every check passes.
	    """
	    ring_cfg = cfg['ring']
	    if len(inlier_df) < ring_cfg['ring_min_points']:
	        return False

	    main_ring_df = filter_main_ring_band(inlier_df, r_bin_width=ring_cfg['ring_band_width'], top_n_bins=1)
	    if len(main_ring_df) == 0 or len(main_ring_df) < ring_cfg['ring_min_points']:
	        return False

	    r = main_ring_df['r'].values
	    theta_deg = main_ring_df['theta_deg'].values
	    x = main_ring_df['coor_x'].values
	    y = main_ring_df['coor_y'].values

	    if r.max() - r.min() > ring_cfg['ring_r_absolute_tolerance']:
	        return False
	    if circular_range_deg(theta_deg) < ring_cfg['ring_min_angular_coverage']:
	        return False
	    if not check_sector_coverage(theta_deg, min_sectors=ring_cfg['ring_min_sectors']):
	        return False

	    cx, cy, _, rmse = fit_circle_least_squares(x, y)
	    # Reject failed fits explicitly: a NaN RMSE or centre would otherwise
	    # slip through the ``>`` comparisons below.
	    if cx is None or cy is None or not np.isfinite(rmse):
	        return False
	    if rmse > ring_cfg['ring_fit_rmse_max']:
	        return False

	    max_center_offset = ring_cfg.get('ring_center_offset_max', 10.0)
	    return bool(np.hypot(cx, cy) <= max_center_offset)


	def _is_linear_set(coords: np.ndarray, cfg: dict) -> bool:
	    """Tell whether a 2-D point set is shaped like a single line segment.

	    The set must pass four tests driven by ``cfg['linear']``: its extent
	    (twice the largest distance to the centroid) reaches
	    ``linear_min_length``; the square root of the ratio of the two PCA
	    variances reaches ``linear_pca_ratio_min``; the mean absolute offset from
	    the principal axis stays within ``linear_max_deviation``; and the largest
	    gap between consecutive projections on that axis, relative to the total
	    projected length, stays within ``linear_max_gap_ratio``.

	    Args:
	        coords: Point coordinates, one row per point.
	        cfg: Configuration with a ``'linear'`` entry holding the thresholds.

	    Returns:
	        ``True`` when all tests pass; ``False`` otherwise, including when
	        fewer than three points are given.
	    """
	    n_points = len(coords)
	    if n_points < 3:
	        return False

	    center = np.mean(coords, axis=0)
	    farthest = np.max(np.linalg.norm(coords - center, axis=1))
	    if 2 * farthest < cfg['linear']['linear_min_length']:
	        return False

	    pca = PCA(n_components=min(2, n_points)).fit(coords)
	    variances = pca.explained_variance_
	    if len(variances) < 2:
	        return False
	    # The small constant keeps the ratio finite for perfectly collinear data.
	    elongation = np.sqrt(variances[0] / (variances[1] + 1e-9))
	    if elongation < cfg['linear']['linear_pca_ratio_min']:
	        return False

	    main_axis = pca.components_[0]
	    offsets = coords - pca.mean_
	    # Unit normal of the principal axis (the axis rotated by 90 degrees).
	    normal_vec = np.array([-main_axis[1], main_axis[0]])
	    mean_offset = np.mean(np.abs(np.dot(offsets, normal_vec)))
	    if mean_offset > cfg['linear']['linear_max_deviation']:
	        return False

	    along_axis = np.sort(np.dot(offsets, main_axis))
	    total_len = along_axis[-1] - along_axis[0]
	    if total_len > 0:
	        widest_gap_ratio = np.max(np.diff(along_axis)) / total_len
	        if widest_gap_ratio > cfg['linear']['linear_max_gap_ratio']:
	            return False
	    return True


	def _is_centroids_linear(sub_coords_list: list, cfg: dict) -> bool:
	    """Tell whether the centroids of several sub-clusters line up.

	    One centroid is computed per entry of ``sub_coords_list``. The centroids
	    must span at least ``centroid_linear_min_length`` (twice the largest
	    distance to their mean), have a PCA elongation of at least
	    ``centroid_linear_pca_min``, and deviate from the principal axis by at
	    most ``centroid_linear_dev_max`` on average. All thresholds are read from
	    ``cfg['linear']``.

	    Args:
	        sub_coords_list: One coordinate array per sub-cluster.
	        cfg: Configuration with a ``'linear'`` entry holding the thresholds.

	    Returns:
	        ``True`` when all tests pass; ``False`` otherwise, including when
	        fewer than three sub-clusters are given.
	    """
	    if len(sub_coords_list) < 3:
	        return False

	    centroids = np.array([np.mean(points, axis=0) for points in sub_coords_list])
	    overall_center = np.mean(centroids, axis=0)
	    max_span = 2 * np.max(np.linalg.norm(centroids - overall_center, axis=1))
	    if max_span < cfg['linear']['centroid_linear_min_length']:
	        return False

	    pca = PCA(n_components=2).fit(centroids)
	    variances = pca.explained_variance_
	    if len(variances) < 2:
	        return False
	    # The small constant keeps the ratio finite for perfectly collinear data.
	    elongation = np.sqrt(variances[0] / (variances[1] + 1e-9))
	    if elongation < cfg['linear']['centroid_linear_pca_min']:
	        return False

	    main_axis = pca.components_[0]
	    normal = np.array([-main_axis[1], main_axis[0]])
	    mean_offset = np.mean(np.abs(np.dot(centroids - pca.mean_, normal)))
	    if mean_offset > cfg['linear']['centroid_linear_dev_max']:
	        return False
	    return True


	def _classify_subcluster(sub_coords: np.ndarray, cfg: dict) -> str:
	    """Label one sub-cluster as a compact cluster or a linear streak.

	    The runtime labels are Korean strings: ``"군집"`` (cluster) and ``"선형"``
	    (linear). A sub-cluster is linear only when it is larger than
	    ``cfg['cluster']['cluster_compactness_radius']``, elongated enough
	    (``linear_pca_ratio_min``), thin enough (``linear_max_deviation``) and
	    long enough (``linear_min_length``); everything else is a cluster.

	    Args:
	        sub_coords: Point coordinates of the sub-cluster, one row per point.
	        cfg: Configuration with ``'cluster'`` and ``'linear'`` entries.

	    Returns:
	        ``"선형"`` or ``"군집"``.
	    """
	    n_points = len(sub_coords)
	    if n_points < 3:
	        return "군집"

	    center = np.mean(sub_coords, axis=0)
	    max_dist = np.max(np.linalg.norm(sub_coords - center, axis=1))
	    if max_dist <= cfg['cluster']['cluster_compactness_radius']:
	        return "군집"

	    pca = PCA(n_components=min(2, n_points)).fit(sub_coords)
	    variances = pca.explained_variance_
	    if len(variances) < 2:
	        return "군집"

	    # The small constant keeps the ratio finite for perfectly collinear data.
	    shape_idx = np.sqrt(variances[0] / (variances[1] + 1e-9))
	    if shape_idx < cfg['linear']['linear_pca_ratio_min']:
	        return "군집"

	    main_axis = pca.components_[0]
	    normal_vec = np.array([-main_axis[1], main_axis[0]])
	    mean_dev = np.mean(np.abs(np.dot(sub_coords - pca.mean_, normal_vec)))
	    if mean_dev <= cfg['linear']['linear_max_deviation'] and 2 * max_dist >= cfg['linear']['linear_min_length']:
	        return "선형"
	    return "군집"


	def _drop_lof_outliers(coords: np.ndarray, inlier_mask: np.ndarray, cfg: dict) -> np.ndarray:
	    """Return ``inlier_mask`` with Local Outlier Factor outliers switched off."""
	    lof_cfg = cfg['lof']
	    n_inlier = int(np.count_nonzero(inlier_mask))
	    if n_inlier < lof_cfg['lof_min_points']:
	        return inlier_mask
	    n_neighbors = min(lof_cfg['lof_n_neighbors'], n_inlier - 1)
	    if n_neighbors < 2:
	        return inlier_mask
	    lof = LocalOutlierFactor(
	        n_neighbors=n_neighbors,
	        contamination=lof_cfg['lof_contamination'],
	        metric="euclidean",
	    )
	    # fit_predict labels retained points with 1.
	    kept = lof.fit_predict(coords[inlier_mask]) == 1
	    refined = np.zeros(len(inlier_mask), dtype=bool)
	    refined[inlier_mask] = kept
	    return refined


	def _dominant_zone_and_centroid(inlier_df: pd.DataFrame, inlier_coords: np.ndarray) -> tuple:
	    """Return the dominant zone and the centroid of the inliers lying in it."""
	    dominant_zone = get_dominant_zone(inlier_df)
	    dom_points = inlier_df[inlier_df['zone_label'] == dominant_zone]
	    # Fall back to all inliers when no row carries the dominant label.
	    anchor = inlier_coords if dom_points.empty else dom_points[['coor_x', 'coor_y']].values
	    return dominant_zone, tuple(np.mean(anchor, axis=0))


	def classify_wafer_patterns(df: pd.DataFrame, cfg: dict) -> tuple:
	    """Cluster the points of ``df`` and label the resulting spatial pattern.

	    Steps: add zone labels; cluster ``coor_x``/``coor_y`` with HDBSCAN,
	    falling back to DBSCAN when HDBSCAN marks every point as noise; remove
	    Local Outlier Factor outliers from the clustered points; then test the
	    remaining inliers for a ring, a single line, several sub-clusters whose
	    centroids line up, and finally classify by sub-cluster shape.

	    Runtime labels are Korean strings: ``"환형"`` (ring), ``"선형"`` (linear),
	    ``"군집"`` (cluster), ``"데이터 없음"`` (no data) and ``"정상/미달"``
	    (normal / below threshold).

	    Args:
	        df: Frame with ``coor_x`` and ``coor_y`` columns; the ring test also
	            reads ``r`` and ``theta_deg``.
	        cfg: Configuration with ``'preprocessing'``, ``'misc'``,
	            ``'clustering'``, ``'lof'``, ``'ring'``, ``'linear'`` and
	            ``'cluster'`` entries.

	    Returns:
	        ``(labelled_df, dominant_zone, [pattern], centroid)``.
	        ``labelled_df`` is a re-indexed copy of ``df`` with a boolean
	        ``inlier`` column, except for an empty ``df``, which is returned
	        as-is. ``centroid`` is an ``(x, y)`` tuple, or ``None`` when no
	        pattern could be determined.
	    """
	    if df.empty:
	        return df, "데이터 없음", ["None"], None

	    df = df.copy().reset_index(drop=True)
	    df = add_zone_labels(df, inner_radius=cfg['preprocessing']['inner_radius_mm'])
	    coords = df[["coor_x", "coor_y"]].values
	    if len(df) < cfg['misc']['min_points_for_clustering']:
	        return df.assign(inlier=np.zeros(len(df), dtype=bool)), "데이터 없음", ["정상/미달"], None

	    cluster_cfg = cfg['clustering']
	    clusterer = hdbscan.HDBSCAN(
	        min_cluster_size=cluster_cfg['min_cluster_size'],
	        min_samples=cluster_cfg['min_samples'],
	        cluster_selection_method=cluster_cfg['cluster_selection_method'],
	        metric="euclidean",
	        gen_min_span_tree=True,
	    )
	    labels = np.asarray(clusterer.fit_predict(coords))
	    if np.all(labels == -1):
	        # HDBSCAN found only noise: retry with plain DBSCAN.
	        fallback = DBSCAN(eps=cluster_cfg['dbscan_eps'], min_samples=cluster_cfg['min_cluster_size'])
	        labels = fallback.fit(coords).labels_
	    inlier_mask = labels != -1
	    if not inlier_mask.any():
	        return df.assign(inlier=inlier_mask), "데이터 없음", ["Others"], None

	    inlier_mask = _drop_lof_outliers(coords, inlier_mask, cfg)
	    labelled = df.assign(inlier=inlier_mask)
	    inlier_df = df[inlier_mask].copy()
	    inlier_coords = coords[inlier_mask]
	    n_inlier = len(inlier_df)
	    if n_inlier < cluster_cfg['min_cluster_size']:
	        return labelled, "데이터 없음", ["Others"], None

	    if is_ring_pattern_robust(inlier_df, cfg):
	        # A ring is centred on all inliers, not only on the dominant zone.
	        ring_centroid = tuple(np.mean(inlier_coords, axis=0))
	        return labelled, get_dominant_zone(inlier_df), ["환형"], ring_centroid

	    dominant_zone, centroid = _dominant_zone_and_centroid(inlier_df, inlier_coords)
	    if _is_linear_set(inlier_coords, cfg):
	        return labelled, dominant_zone, ["선형"], centroid

	    if n_inlier >= 2:
	        sub_model = DBSCAN(eps=cluster_cfg['cluster_dbscan_eps'], min_samples=cluster_cfg['min_cluster_size'])
	        sub_labels = sub_model.fit(inlier_coords).labels_
	        # np.unique gives a sorted, deterministic order of sub-cluster ids.
	        cluster_ids = [lbl for lbl in np.unique(sub_labels) if lbl != -1]
	        if len(cluster_ids) >= 2:
	            sub_coords_list = [inlier_coords[sub_labels == lbl] for lbl in cluster_ids]
	            if _is_centroids_linear(sub_coords_list, cfg):
	                return labelled, dominant_zone, ["선형"], centroid
	            # Weight each sub-cluster's label by its point count.
	            pat_totals = {}
	            for sub_coords in sub_coords_list:
	                pat = _classify_subcluster(sub_coords, cfg)
	                pat_totals[pat] = pat_totals.get(pat, 0) + len(sub_coords)
	            dominant_pattern = max(pat_totals, key=pat_totals.get)
	            return labelled, dominant_zone, [dominant_pattern], centroid
	        pattern = _classify_subcluster(inlier_coords, cfg)
	        return labelled, dominant_zone, [pattern], centroid

	    return labelled, dominant_zone, ["Others"], None