Spaces:

EngBioNUS
/

BitConverter

Sleeping

App Files Community

wenjun99 committed on Mar 24

Commit

177e320

verified ·

1 Parent(s): f532ad3

Update src/app.py

Browse files

Files changed (1) hide show

src/app.py +224 -188

src/app.py CHANGED Viewed

@@ -10,7 +10,6 @@ import matplotlib
 import matplotlib.pyplot as plt
 import matplotlib.colors as mcolors
 from matplotlib.ticker import MultipleLocator
 from scipy.stats import gaussian_kde
 from PIL import Image
@@ -310,7 +309,7 @@ with tab1:
             index=0,
             key="enc_scheme",
             help=(
-                "**6-bit LNS** – Custom 57-character table (A-Z, 0-9, punctuation). 6 bits/char.\n\n"
                 "**Base64 (6-bit)** – Standard Base64 encoding of UTF-8 bytes. 6 bits/symbol.\n\n"
                 "**ASCII (7-bit)** – Standard 7-bit ASCII. 7 bits/char.\n\n"
                 "**UTF-8 (8-bit)** – Full UTF-8 byte encoding. 8 bits/byte. Supports all Unicode."
@@ -391,7 +390,7 @@ with tab1:
             columns = [f"Position {i+1}" for i in range(group_size)]
             df = pd.DataFrame(groups, columns=columns)
             df.insert(0, "Sample", range(1, len(df) + 1))
-            st.dataframe(df, width="stretch")
             st.download_button(
                 "⬇️ Download as CSV",
@@ -1015,213 +1014,250 @@ with tab2:
 # --------------------------------------------------
 with tab3:
     st.header("📊 Data Analytics")
-    st.markdown("""
-    Upload your sample data file (Excel or CSV) for a quick exploratory assessment of the editing rates distribution.
-    The file should contain Reactions as rows and position columns with editing values.
-    This tab provides visualizations **before** any binary labelling.
-    """)
-    analytics_uploaded = st.file_uploader(
-        "📤 Upload data file",
-        type=["xlsx", "csv"],
-        key="analytics_uploader"
     )
-    if analytics_uploaded is not None:
-        try:
-            if analytics_uploaded.name.endswith(".xlsx"):
-                adf = pd.read_excel(analytics_uploaded)
-            else:
-                adf = pd.read_csv(analytics_uploaded)
-            st.success(f"✅ Loaded file with {len(adf)} rows and {len(adf.columns)} columns")
-            adf.columns = [str(c).strip() for c in adf.columns]
-            non_pos_keywords = {"sample", "description", "descritpion", "total edited",
-                                'volume per "1"', "volume per 1", "id", "name"}
-            position_cols = [c for c in adf.columns
-                            if c.lower() not in non_pos_keywords
-                            and pd.to_numeric(adf[c], errors="coerce").notna().any()]
-            def pos_sort_key(col_name: str):
-                m = re.search(r"(\d+)", col_name)
-                return int(m.group(1)) if m else 10**9
-            position_cols = sorted(position_cols, key=pos_sort_key)
-            if not position_cols:
-                st.error("No numeric position columns detected.")
-                st.stop()
-            st.info(f"Detected **{len(position_cols)}** position columns and **{len(adf)}** Reactions.")
-            pos_data = adf[position_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
-            if "Total edited" in adf.columns:
-                total_edited = pd.to_numeric(adf["Total edited"], errors="coerce").fillna(0.0)
-            else:
-                total_edited = pos_data.sum(axis=1)
-            st.markdown("### 1️⃣ Raw Data Distribution")
-            st.caption("Visualize editing values across all positions and Reactions - before any binary labelling.")
-            transform_option = st.selectbox(
-                "Value transformation:",
-                ["Raw (linear)", "ln(1+x)", "ln(ln(1+x))"],
-                index=0,
-                key="transform_select",
-                help=(
-                    "**Raw** - No transformation.\n\n"
-                    "**ln(1+x)** - `ln(1 + x)`. Compresses high values, spreads low range.\n\n"
-                    "**ln(ln(1+x))** - Double ln. Even stronger compression.\n\n"
                 )
-            )
-            # def robust_pos_normalize_log1p(data: pd.DataFrame) -> pd.DataFrame:
-            #     logged = np.log1p(data)
-            #     result = logged.copy()
-            #     for col in result.columns:
-            #         med = result[col].median()
-            #         q75, q25 = result[col].quantile(0.75), result[col].quantile(0.25)
-            #         iqr = q75 - q25
-            #         if iqr > 0:
-            #             result[col] = (result[col] - med) / iqr
-            #         else:
-            #             result[col] = result[col] - med
-            #     return result
-            if transform_option == "ln(1+x)":
-                transformed = np.log1p(pos_data)
-                value_label = "Editing Value (ln(1+x))"
-                transform_tag = "ln(1+x)"
-            elif transform_option == "ln(ln(1+x))":
-                transformed = np.log1p(np.log1p(pos_data))
-                value_label = "Editing Value (ln(ln(1+x)))"
-                transform_tag = "ln(ln(1+x))"
-            # elif transform_option == "log1p → pos. norm.":
-            #     transformed = robust_pos_normalize_log1p(pos_data)
-            #     value_label = "Editing Value (log1p → pos. norm.)"
-            #     transform_tag = "log1p_posnorm"
-            else:
-                transformed = pos_data
-                value_label = "Editing Value"
-                transform_tag = "raw"
-            melted = transformed.melt(var_name="Position", value_name="Value")
-            melted["Position_idx"] = melted["Position"].apply(
-                lambda x: int(re.search(r"(\d+)", str(x)).group(1)) if re.search(r"(\d+)", str(x)) else 0
-            )
-            st.markdown("#### 📊 Histogram - All Values")
-            n_bins = st.number_input("Number of bins:", min_value=10, max_value=300, value=80, step=10, key="hist_bins")
-            matplotlib.rcParams["font.family"] = "Arial"
-            matplotlib.rcParams["font.sans-serif"] = ["Arial", "DejaVu Sans", "Liberation Sans", "sans-serif"]
-            fig2, ax2 = plt.subplots(figsize=(7, 5))
-            all_vals = melted["Value"].values
-            ax2.hist(all_vals, bins=n_bins, color="#808080", alpha=0.8, label="All")
-            ax2.set_xlim(left=0.0)
-            val_range = np.nanmax(all_vals) - np.nanmin(all_vals)
-            if val_range <= 5:
-                major_step, minor_step = 0.5, 0.1
-            elif val_range <= 20:
-                major_step, minor_step = 2, 0.5
-            elif val_range <= 50:
-                major_step, minor_step = 5, 1
-            else:
-                major_step, minor_step = 10, 2
-            ax2.xaxis.set_major_locator(MultipleLocator(major_step))
-            ax2.xaxis.set_minor_locator(MultipleLocator(minor_step))
-            ax2.tick_params(axis="x", which="major", length=6)
-            ax2.tick_params(axis="x", which="minor", length=3)
-            ax2.legend(fontsize=8)
-            ax2.set_title(f"Raw Values Distribution ({transform_tag})")
-            ax2.set_xlabel(value_label)
-            ax2.set_ylabel("Counts")
-            # Apply axis styling
-            for attr in [ax2.xaxis.label, ax2.yaxis.label, ax2.title]:
-                attr.set_fontsize(12)
-                attr.set_fontweight("bold")
-                attr.set_fontfamily("Arial")
-            for spine in ax2.spines.values():
-                spine.set_linewidth(1.5)
-            ax2.tick_params(axis="both", which="both", width=1.2, labelsize=10)
-            ax2.spines['top'].set_visible(False)
-            ax2.spines['right'].set_visible(False)
-            for ticklab in ax2.get_xticklabels() + ax2.get_yticklabels():
-                ticklab.set_fontfamily("Arial")
-                ticklab.set_fontweight("normal")
-            fig2.tight_layout()
-            st.pyplot(fig2)
-            # st.markdown("#### 📊 Histogram - All Values")
-            # n_bins = st.number_input("Number of bins:", min_value=10, max_value=300, value=80, step=10, key="hist_bins")
-            # fig2, ax2 = plt.subplots(figsize=(10, 4))
-            # ax2.hist(melted["Value"].values, bins=n_bins, color="#4F46E5", edgecolor="white", linewidth=0.3)
-            # ax2.set_xlabel(value_label)
-            # ax2.set_ylabel("Count")
-            # ax2.set_title(f"Raw Values Distribution ({transform_tag})")
-            # val_min = melted["Value"].min()
-            # val_max = melted["Value"].max()
-            # val_range = val_max - val_min
-            # if val_range <= 2:
-            #     tick_step = 0.1
-            # elif val_range <= 6:
-            #     tick_step = 0.2
-            # elif val_range <= 20:
-            #     tick_step = 1
-            # else:
-            #     tick_step = 5
-            # ax2.set_xticks(np.arange(np.floor(val_min / tick_step) * tick_step,
-            #                          val_max + tick_step, tick_step))
-            # ax2.tick_params(axis='x', labelsize=8, rotation=45)
-            # ax2.grid(axis='y', alpha=0.3)
-            # fig2.tight_layout()
-            # st.pyplot(fig2)
-            st.markdown("#### 2️⃣ Density Scatter Plot")
-            st.caption("Each dot = one measurement (sample × position). Color = local point density.")
-            x_vals = melted["Position_idx"].values.astype(float)
-            y_vals = melted["Value"].values.astype(float)
-            x_jittered = x_vals + np.random.default_rng(42).uniform(-0.3, 0.3, size=len(x_vals))
-            with st.spinner("Computing point density..."):
-                try:
-                    xy = np.vstack([x_jittered, y_vals])
-                    density = gaussian_kde(xy)(xy)
-                except np.linalg.LinAlgError:
-                    density = np.ones(len(x_vals))
-            sort_idx = density.argsort()
-            x_plot = x_jittered[sort_idx]
-            y_plot = y_vals[sort_idx]
-            d_plot = density[sort_idx]
-            fig3, ax3 = plt.subplots(figsize=(12, 6))
-            scatter = ax3.scatter(x_plot, y_plot, c=d_plot, cmap="jet", s=8, alpha=0.7, edgecolors="none")
-            cbar = fig3.colorbar(scatter, ax=ax3, label="Density")
-            ax3.set_xlabel("Position")
-            ax3.set_ylabel(value_label)
-            ax3.set_title(f"Density Scatter - {value_label} by Position")
-            ax3.set_xticks(sorted(melted["Position_idx"].unique()))
-            ax3.grid(alpha=0.2)
-            fig3.tight_layout()
-            st.pyplot(fig3)
-        except Exception as e:
-            st.error(f"❌ Error processing file: {e}")
-            import traceback
-            st.code(traceback.format_exc())
     else:
-        st.info("👆 Upload a data file (CSV or Excel) to start exploring.")
 # --------------------------------------------------
 # TAB 4: Pipetting Command Generator

 import matplotlib.pyplot as plt
 import matplotlib.colors as mcolors
 from matplotlib.ticker import MultipleLocator
 from scipy.stats import gaussian_kde
 from PIL import Image
             index=0,
             key="enc_scheme",
             help=(
+                "**6-bit LNS** – Custom 56-character table (A-Z, 0-9, punctuation). 6 bits/char.\n\n"
                 "**Base64 (6-bit)** – Standard Base64 encoding of UTF-8 bytes. 6 bits/symbol.\n\n"
                 "**ASCII (7-bit)** – Standard 7-bit ASCII. 7 bits/char.\n\n"
                 "**UTF-8 (8-bit)** – Full UTF-8 byte encoding. 8 bits/byte. Supports all Unicode."
             columns = [f"Position {i+1}" for i in range(group_size)]
             df = pd.DataFrame(groups, columns=columns)
             df.insert(0, "Sample", range(1, len(df) + 1))
+            st.dataframe(df, hide_index=True, width="stretch")
             st.download_button(
                 "⬇️ Download as CSV",
 # --------------------------------------------------
 with tab3:
     st.header("📊 Data Analytics")
+    section_choice = st.radio(
+        "Select analysis type:",
+        ["Experimental Editing Values", "Binary Values"],
+        horizontal=True
     )
+    # ── Section 1: Experimental Editing Values ──
+    if section_choice == "Experimental Editing Values":
+        st.markdown("""
+        Upload your sample data file (Excel or CSV) for a quick exploratory assessment of the editing rates distribution.
+        The file should contain Reactions as rows and position columns with editing values.
+        This section provides visualizations **before** any binary labelling.
+        """)
+        analytics_uploaded = st.file_uploader(
+            "📤 Upload data file",
+            type=["xlsx", "csv"],
+            key="analytics_uploader"
+        )
+        if analytics_uploaded is not None:
+            try:
+                if analytics_uploaded.name.endswith(".xlsx"):
+                    adf = pd.read_excel(analytics_uploaded)
+                else:
+                    adf = pd.read_csv(analytics_uploaded)
+                st.success(f"✅ Loaded file with {len(adf)} rows and {len(adf.columns)} columns")
+                adf.columns = [str(c).strip() for c in adf.columns]
+                non_pos_keywords = {"sample", "description", "descritpion", "total edited",
+                                    'volume per "1"', "volume per 1", "id", "name"}
+                position_cols = [c for c in adf.columns
+                                if c.lower() not in non_pos_keywords
+                                and pd.to_numeric(adf[c], errors="coerce").notna().any()]
+                def pos_sort_key(col_name: str):
+                    m = re.search(r"(\d+)", col_name)
+                    return int(m.group(1)) if m else 10**9
+                position_cols = sorted(position_cols, key=pos_sort_key)
+                if not position_cols:
+                    st.error("No numeric position columns detected.")
+                    st.stop()
+                st.info(f"Detected **{len(position_cols)}** position columns and **{len(adf)}** Reactions.")
+                pos_data = adf[position_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
+                if "Total edited" in adf.columns:
+                    total_edited = pd.to_numeric(adf["Total edited"], errors="coerce").fillna(0.0)
+                else:
+                    total_edited = pos_data.sum(axis=1)
+                st.markdown("### 1️⃣ Raw Data Distribution")
+                st.caption("Visualize editing values across all positions and Reactions - before any binary labelling.")
+                transform_option = st.selectbox(
+                    "Value transformation:",
+                    ["Raw (linear)", "log1p", "log1p → log1p"],
+                    index=0,
+                    key="transform_select",
+                    help=(
+                        "**Raw** - No transformation.\n\n"
+                        "**log1p** - `log(1 + x)`. Compresses high values, spreads low range.\n\n"
+                        "**log1p → log1p** - Double log1p. Even stronger compression.\n\n"
+                    )
                 )
+                if transform_option == "log1p":
+                    transformed = np.log1p(pos_data)
+                    value_label = "Editing Value (log1p)"
+                    transform_tag = "log1p"
+                elif transform_option == "log1p → log1p":
+                    transformed = np.log1p(np.log1p(pos_data))
+                    value_label = "Editing Value (log1p → log1p)"
+                    transform_tag = "log1p_log1p"
+                else:
+                    transformed = pos_data
+                    value_label = "Editing Value"
+                    transform_tag = "raw"
+                melted = transformed.melt(var_name="Position", value_name="Value")
+                melted["Position_idx"] = melted["Position"].apply(
+                    lambda x: int(re.search(r"(\d+)", str(x)).group(1)) if re.search(r"(\d+)", str(x)) else 0
+                )
+                st.markdown("#### 📊 Histogram - All Values")
+                n_bins = st.number_input("Number of bins:", min_value=10, max_value=300, value=80, step=10, key="hist_bins")
+                matplotlib.rcParams["font.family"] = "Arial"
+                matplotlib.rcParams["font.sans-serif"] = ["Arial", "DejaVu Sans", "Liberation Sans", "sans-serif"]
+                fig2, ax2 = plt.subplots(figsize=(7, 5))
+                all_vals = melted["Value"].values
+                ax2.hist(all_vals, bins=n_bins, color="#808080", alpha=0.8, label="All")
+                ax2.set_xlim(left=0.0)
+                val_range = np.nanmax(all_vals) - np.nanmin(all_vals)
+                if val_range <= 5:
+                    major_step, minor_step = 0.5, 0.1
+                elif val_range <= 20:
+                    major_step, minor_step = 2, 0.5
+                elif val_range <= 50:
+                    major_step, minor_step = 5, 1
+                else:
+                    major_step, minor_step = 10, 2
+                ax2.xaxis.set_major_locator(MultipleLocator(major_step))
+                ax2.xaxis.set_minor_locator(MultipleLocator(minor_step))
+                ax2.tick_params(axis="x", which="major", length=6)
+                ax2.tick_params(axis="x", which="minor", length=3)
+                ax2.legend(fontsize=8)
+                ax2.set_title(f"Raw Values Distribution ({transform_tag})")
+                ax2.set_xlabel(value_label)
+                ax2.set_ylabel("Counts")
+                for attr in [ax2.xaxis.label, ax2.yaxis.label, ax2.title]:
+                    attr.set_fontsize(12)
+                    attr.set_fontweight("bold")
+                    attr.set_fontfamily("Arial")
+                for spine in ax2.spines.values():
+                    spine.set_linewidth(1.5)
+                ax2.tick_params(axis="both", which="both", width=1.2, labelsize=10)
+                ax2.spines['top'].set_visible(False)
+                ax2.spines['right'].set_visible(False)
+                for ticklab in ax2.get_xticklabels() + ax2.get_yticklabels():
+                    ticklab.set_fontfamily("Arial")
+                    ticklab.set_fontweight("normal")
+                fig2.tight_layout()
+                st.pyplot(fig2)
+                st.markdown("#### 2️⃣ Density Scatter Plot")
+                st.caption("Each dot = one measurement (sample × position). Color = local point density.")
+                x_vals = melted["Position_idx"].values.astype(float)
+                y_vals = melted["Value"].values.astype(float)
+                x_jittered = x_vals + np.random.default_rng(42).uniform(-0.3, 0.3, size=len(x_vals))
+                with st.spinner("Computing point density..."):
+                    try:
+                        xy = np.vstack([x_jittered, y_vals])
+                        density = gaussian_kde(xy)(xy)
+                    except np.linalg.LinAlgError:
+                        density = np.ones(len(x_vals))
+                sort_idx = density.argsort()
+                x_plot = x_jittered[sort_idx]
+                y_plot = y_vals[sort_idx]
+                d_plot = density[sort_idx]
+                fig3, ax3 = plt.subplots(figsize=(12, 6))
+                scatter = ax3.scatter(x_plot, y_plot, c=d_plot, cmap="jet", s=8, alpha=0.7, edgecolors="none")
+                cbar = fig3.colorbar(scatter, ax=ax3, label="Density")
+                ax3.set_xlabel("Position")
+                ax3.set_ylabel(value_label)
+                ax3.set_title(f"Density Scatter - {value_label} by Position")
+                ax3.set_xticks(sorted(melted["Position_idx"].unique()))
+                ax3.grid(alpha=0.2)
+                fig3.tight_layout()
+                st.pyplot(fig3)
+            except Exception as e:
+                st.error(f"❌ Error processing file: {e}")
+                import traceback
+                st.code(traceback.format_exc())
+        else:
+            st.info("👆 Upload a data file (CSV or Excel) to start exploring.")
+    # ── Section 2: Binary Values ──
     else:
+        st.markdown("""
+        Upload a binary labels CSV file (rows = reactions, columns = positions with 0/1 values).
+        A **Total_Edited** column will be computed automatically as the sum of 1s per reaction,
+        and a box plot of Total Edited counts will be displayed.
+        """)
+        binary_uploaded = st.file_uploader(
+            "📤 Upload binary labels CSV",
+            type=["csv"],
+            key="binary_uploader"
+        )
+        if binary_uploaded is not None:
+            try:
+                bdf = pd.read_csv(binary_uploaded)
+                st.success(f"✅ Loaded file with {len(bdf)} rows and {len(bdf.columns)} columns")
+                # All columns should be position columns (0/1 values)
+                bdf.columns = [str(c).strip() for c in bdf.columns]
+                pos_cols = [c for c in bdf.columns
+                            if pd.to_numeric(bdf[c], errors="coerce").notna().any()]
+                pos_data_bin = bdf[pos_cols].apply(pd.to_numeric, errors="coerce").fillna(0).astype(int)
+                # Compute Total_Edited
+                pos_data_bin["Total_Edited"] = pos_data_bin.sum(axis=1)
+                st.info(f"Detected **{len(pos_cols)}** position columns across **{len(bdf)}** reactions.")
+                st.dataframe(pos_data_bin, hide_index=True)
+                # Box plot
+                matplotlib.rcParams["font.family"] = "Arial"
+                matplotlib.rcParams["font.sans-serif"] = ["Arial", "DejaVu Sans", "Liberation Sans", "sans-serif"]
+                fig_box, ax_box = plt.subplots(figsize=(7, 5))
+                bp = ax_box.boxplot(pos_data_bin["Total_Edited"].values, vert=True, patch_artist=True,
+                                    boxprops=dict(facecolor="#808080", alpha=0.8),
+                                    medianprops=dict(color="black", linewidth=1.5),
+                                    whiskerprops=dict(linewidth=1.2),
+                                    capprops=dict(linewidth=1.2))
+                ax_box.set_ylabel("Total Edited (sum of 1s)")
+                ax_box.set_title("Distribution of Total Edited per Reaction")
+                ax_box.set_xticklabels(["All Reactions"])
+                # Apply axis styling
+                for attr in [ax_box.xaxis.label, ax_box.yaxis.label, ax_box.title]:
+                    attr.set_fontsize(12)
+                    attr.set_fontweight("bold")
+                    attr.set_fontfamily("Arial")
+                for spine in ax_box.spines.values():
+                    spine.set_linewidth(1.5)
+                ax_box.tick_params(axis="both", which="both", width=1.2, labelsize=10)
+                ax_box.spines['top'].set_visible(False)
+                ax_box.spines['right'].set_visible(False)
+                for ticklab in ax_box.get_xticklabels() + ax_box.get_yticklabels():
+                    ticklab.set_fontfamily("Arial")
+                    ticklab.set_fontweight("normal")
+                fig_box.tight_layout()
+                st.pyplot(fig_box)
+                # Summary stats
+                st.markdown("#### Summary Statistics")
+                stats = pos_data_bin["Total_Edited"].describe()
+                st.dataframe(stats.to_frame("Total_Edited").T)
+            except Exception as e:
+                st.error(f"❌ Error processing file: {e}")
+                import traceback
+                st.code(traceback.format_exc())
+        else:
+            st.info("👆 Upload a binary labels CSV file to start exploring.")
 # --------------------------------------------------
 # TAB 4: Pipetting Command Generator