| | """Plot pandas.DataFrame with DBSCAN clustering.""" |
| | |
| | |
| | import pandas as pd |
| | import matplotlib.pyplot as plt |
| | import seaborn as sns |
| | from sklearn.cluster import DBSCAN |
| |
|
| | from logzero import logger |
| |
|
| | |
# When ``get_ipython`` is defined in this module's own namespace (for example
# when the file is executed directly inside an IPython session), switch
# matplotlib to interactive mode.
if "get_ipython" in globals():
    plt.ion()
| |
|
| |
|
| | |
def plot_df(
    df_: pd.DataFrame,
    min_samples: int = 6,
    eps: float = 10,
    ylim: int = None,
    xlabel: str = "en",
    ylabel: str = "zh",
) -> plt:
    """Plot a three-column frame and its DBSCAN inliers in two stacked panels.

    The first three columns of ``df_`` are taken positionally and renamed to
    ``"x"``, ``"y"`` and ``"cos"``. DBSCAN is fitted once on those three
    columns; points labelled ``-1`` are treated as outliers.

    * Top panel: every point coloured by ``cos``, with the outliers
      over-plotted as red ``x`` markers.
    * Bottom panel: only the points DBSCAN assigned to a cluster; its title
      reports their share of all rows as a percentage.

    Args:
        df_: Anything ``pd.DataFrame`` accepts, with at least three columns
            (only the first three are used).
        min_samples: Passed to ``DBSCAN(min_samples=...)``.
        eps: Passed to ``DBSCAN(eps=...)``.
        ylim: Upper y-limit for both panels; a falsy value (``None`` or
            ``0``) leaves the y-axis autoscaled.
        xlabel: x-axis label of the bottom panel.
        ylabel: y-axis label of both panels.

    Returns:
        The ``matplotlib.pyplot`` module (for possible use in gradio), or
        ``None`` when the input has fewer than three columns or no rows.

    Example (``cmat2tset`` and ``smat`` are defined outside this module)::

        plot_df(pd.DataFrame(cmat2tset(smat), columns=['x', 'y', 'cos']))

    Note:
        Sorting the rows by ``'x'`` beforehand does not seem to impact the
        clustering, e.g.::

            df_s = df_.sort_values('x', axis=0, ignore_index=True)
            DBSCAN(1.5, min_samples=3).fit(df_).labels_
            DBSCAN(1.5, min_samples=3).fit(df_s).labels_
    """
    df_ = pd.DataFrame(df_)
    if len(df_.columns) < 3:
        logger.error(
            "expected 3 columns DataFram, got: %s, cant proceed, returninng None",
            df_.columns.tolist(),
        )
        return None

    if df_.empty:
        # DBSCAN cannot be fitted on zero samples and the inlier ratio below
        # would divide by zero; fail the same way as the column check.
        logger.error("expected a non-empty DataFrame, cant proceed, returning None")
        return None

    # Select positionally so duplicate column labels cannot yield more than
    # three columns, and copy so renaming never touches the caller's frame.
    df_ = df_.iloc[:, :3].copy()
    df_.columns = ["x", "y", "cos"]

    sns.set()
    sns.set_style("darkgrid")
    fig, (ax0, ax1) = plt.subplots(2, figsize=(11.69, 8.27))
    fig.suptitle("alignment projection")

    # Fit once; DBSCAN labels noise points -1, everything else is clustered.
    labels = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_
    inliers = labels > -1
    outliers = ~inliers

    # Top panel: all points, then the outliers drawn over them.
    df_.plot.scatter("x", "y", c="cos", cmap="viridis_r", ax=ax0)

    # Bottom panel: clustered points only.
    df_[inliers].plot.scatter("x", "y", c="cos", cmap="viridis_r", ax=ax1)

    df_[outliers].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)

    # The x-axis of both panels spans 0..number of rows.
    xlim = len(df_)

    ax0.set_xlabel("")
    ax0.set_ylabel(ylabel)
    ax0.set_xlim(0, xlim)
    if ylim:
        ax0.set_ylim(0, ylim)
    ax0.set_title("max similarity along columns (outliers denoted by 'x')")

    ax1.set_xlabel(xlabel)
    ax1.set_ylabel(ylabel)
    ax1.set_xlim(0, xlim)
    if ylim:
        ax1.set_ylim(0, ylim)
    inlier_ratio = round(inliers.sum() / len(df_), 2)
    ax1.set_title(f"potential aligned pairs ({inlier_ratio:.0%})")

    return plt
| |
|