Spaces:

developmentwellmatix
/

PFAS-Analyzer

Running

App Files Community

tueniuu committed 9 days ago

Commit

b1e5a57

verified ·

1 Parent(s): e7d30e6

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -32

app.py CHANGED Viewed

@@ -8,13 +8,15 @@ import random
 import selfies as sf
 import matplotlib.pyplot as plt
 import seaborn as sns
 from rdkit import Chem
 from rdkit.Chem import SaltRemover
 from rdkit.Chem.MolStandardize import rdMolStandardize
 from transformers import AutoTokenizer, AutoModel, pipeline as hf_pipeline
 # =================================================================
-# PART 0: THE BRIDGE (Brain Setup)
 # =================================================================
 st.set_page_config(page_title="PFAS Discovery AI", layout="wide")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -96,7 +98,7 @@ def mutate_smart(s):
         # Action 2: Cap Ends to increase Solubility
         if random.random() < 0.6:
-            chars.append(random.choice(["[O]", "[N]", "[C][=O][O]"])) # Add Acid/Alcohol group
         return sf.decoder("".join(chars))
     except: return s
@@ -119,10 +121,10 @@ if clf is None:
     st.stop()
 # =================================================================
-# PART 2: THE UI (With Evolutionary Search)
 # =================================================================
 st.title("🧪 End-to-End PFAS Discovery AI")
-st.markdown("Powered by **Evolutionary Optimization** (Generating 20 $\\to$ Keeping Top 3)")
 st.sidebar.header("1. Input Data")
 input_type = st.sidebar.radio("Source:", ["Single Molecule", "Batch CSV"])
@@ -154,29 +156,26 @@ if st.sidebar.button("🚀 Run Pipeline") and data:
     # --- PATH A: EVOLUTIONARY DISCOVERY ---
     if mode == "Discovery (Optimize)":
         seeds = valid_df['Clean_SMILES'].tolist()
         progress_bar = st.progress(0)
         for i, s in enumerate(seeds):
-            # 1. SPAWN POPULATION (Generate 20 mutants)
-            population = [s] # Include original
             for _ in range(20):
                 new_mol = mutate_smart(s)
                 if new_mol not in population: population.append(new_mol)
-            # 2. SCORE POPULATION (The Filter)
             feats = get_descriptors(population)
-            preds = clf.predict(feats) # Class
-            scores_b = reg_b.predict(feats) # Bioaccumulation (Target)
             scores_p = reg_p.predict(feats)
             scores_m = reg_m.predict(feats)
-            # 3. RANK & SELECT (Survival of the Safest)
             ranked_candidates = []
             for j, cand in enumerate(population):
-                # Apply Logic Layer
                 final_cls = sanity_check_class(cand, preds[j])
                 ranked_candidates.append({
                     "Candidate": cand,
                     "Type": "Original" if cand == s else "Optimized",
@@ -186,12 +185,9 @@ if st.sidebar.button("🚀 Run Pipeline") and data:
                     "Mobility": scores_m[j]
                 })
-            # SORT: Lowest Bioaccumulation first
             ranked_candidates.sort(key=lambda x: x['Bioaccumulation'])
-            # KEEP: Only the Top 3 Best
             results.extend(ranked_candidates[:3])
             progress_bar.progress((i + 1) / len(seeds))
     # --- PATH B: SCREENING ---
@@ -217,22 +213,68 @@ if st.sidebar.button("🚀 Run Pipeline") and data:
                 "Tox_Result": tox
             })
-    # Results Display
     res_df = pd.DataFrame(results)
-    st.subheader("📊 Optimization Results")
     st.dataframe(res_df)
     st.download_button("Download CSV", res_df.to_csv(index=False).encode('utf-8'), "results.csv", "text/csv")
-    st.subheader("⚠️ Safety Dashboard")
-    fig, ax = plt.subplots(figsize=(10, 6))
-    palette = {"Non-PFAS": "green", "General PFAS": "red", "PFCA": "darkred", "PFSA": "purple"}
-    for u in res_df['Subclass'].unique():
-        if u not in palette: palette[u] = "gray"
-    sns.scatterplot(
-        data=res_df, x='Bioaccumulation', y='Mobility', hue='Subclass', style='Subclass',
-        size='Persistence', sizes=(50, 300), palette=palette, ax=ax, alpha=0.8, edgecolor='black'
-    )
-    plt.axvline(x=3.5, color='orange', linestyle='--', label='Bioacc Limit')
-    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
-    st.pyplot(fig)

 import selfies as sf
 import matplotlib.pyplot as plt
 import seaborn as sns
+import plotly.express as px  # Interactive Graphs
+import plotly.graph_objects as go
 from rdkit import Chem
 from rdkit.Chem import SaltRemover
 from rdkit.Chem.MolStandardize import rdMolStandardize
 from transformers import AutoTokenizer, AutoModel, pipeline as hf_pipeline
 # =================================================================
+# PART 0: THE BRIDGE (Automatic Brain Setup)
 # =================================================================
 st.set_page_config(page_title="PFAS Discovery AI", layout="wide")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
         # Action 2: Cap Ends to increase Solubility
         if random.random() < 0.6:
+            chars.append(random.choice(["[O]", "[N]", "[C][=O][O]"]))
         return sf.decoder("".join(chars))
     except: return s
     st.stop()
 # =================================================================
+# PART 2: THE UI (With 4-Graph Dashboard)
 # =================================================================
 st.title("🧪 End-to-End PFAS Discovery AI")
+st.markdown("Powered by **Evolutionary Optimization** & **Advanced Visualization**")
 st.sidebar.header("1. Input Data")
 input_type = st.sidebar.radio("Source:", ["Single Molecule", "Batch CSV"])
     # --- PATH A: EVOLUTIONARY DISCOVERY ---
     if mode == "Discovery (Optimize)":
         seeds = valid_df['Clean_SMILES'].tolist()
         progress_bar = st.progress(0)
         for i, s in enumerate(seeds):
+            # 1. SPAWN POPULATION
+            population = [s]
             for _ in range(20):
                 new_mol = mutate_smart(s)
                 if new_mol not in population: population.append(new_mol)
+            # 2. SCORE
             feats = get_descriptors(population)
+            preds = clf.predict(feats)
+            scores_b = reg_b.predict(feats)
             scores_p = reg_p.predict(feats)
             scores_m = reg_m.predict(feats)
+            # 3. RANK
             ranked_candidates = []
             for j, cand in enumerate(population):
                 final_cls = sanity_check_class(cand, preds[j])
                 ranked_candidates.append({
                     "Candidate": cand,
                     "Type": "Original" if cand == s else "Optimized",
                     "Mobility": scores_m[j]
                 })
+            # Select Top 3 Best
             ranked_candidates.sort(key=lambda x: x['Bioaccumulation'])
             results.extend(ranked_candidates[:3])
             progress_bar.progress((i + 1) / len(seeds))
     # --- PATH B: SCREENING ---
                 "Tox_Result": tox
             })
+    # ------------------------------------------------------------------
+    # VISUALIZATION DASHBOARD
+    # ------------------------------------------------------------------
     res_df = pd.DataFrame(results)
+    # 1. RESULTS TABLE
+    st.subheader("📊 Data Table")
     st.dataframe(res_df)
     st.download_button("Download CSV", res_df.to_csv(index=False).encode('utf-8'), "results.csv", "text/csv")
+    st.markdown("---")
+    st.header("📈 Advanced Analytics Dashboard")
+    col1, col2 = st.columns(2)
+    color_map = {"Non-PFAS": "green", "PFCA": "red", "PFSA": "purple", "General PFAS": "orange"}
+    # GRAPH 1: 3D DISCOVERY CUBE
+    with col1:
+        st.subheader("🧊 1. Multi-Dimensional Risk")
+        fig_3d = px.scatter_3d(
+            res_df,
+            x='Bioaccumulation', y='Mobility', z='Persistence',
+            color='Subclass', symbol='Type' if 'Type' in res_df.columns else 'Subclass',
+            color_discrete_map=color_map, opacity=0.8, size_max=10,
+            title="Bioacc vs Mobility vs Persistence"
+        )
+        fig_3d.update_layout(margin=dict(l=0, r=0, b=0, t=30))
+        st.plotly_chart(fig_3d, use_container_width=True)
+    # GRAPH 2: CLASS DISTRIBUTION (Bar Chart)
+    with col2:
+        st.subheader("📊 2. Class Composition")
+        fig_bar = px.bar(
+            res_df, x="Subclass", color="Subclass",
+            title="Count of Molecules by Class",
+            color_discrete_map=color_map
+        )
+        st.plotly_chart(fig_bar, use_container_width=True)
+    col3, col4 = st.columns(2)
+    # GRAPH 3: PARALLEL COORDINATES (The "Trace" Graph)
+    with col3:
+        st.subheader("📉 3. Property Tracing")
+        # Color the traces by Bioaccumulation; Subclass would first need integer encoding to be used as the color here
+        fig_para = px.parallel_coordinates(
+            res_df,
+            dimensions=['Persistence', 'Mobility', 'Bioaccumulation'],
+            color="Bioaccumulation",
+            color_continuous_scale=px.colors.diverging.TealRose,
+            title="Trace: Persist -> Mobile -> Bioacc"
+        )
+        st.plotly_chart(fig_para, use_container_width=True)
+    # GRAPH 4: DISTRIBUTION VIOLIN PLOT
+    with col4:
+        st.subheader("🎻 4. Risk Distribution")
+        fig_vio = px.violin(
+            res_df, y="Bioaccumulation", x="Subclass",
+            color="Subclass", box=True, points="all",
+            color_discrete_map=color_map,
+            title="Bioaccumulation Spread per Class"
+        )
+        st.plotly_chart(fig_vio, use_container_width=True)