Spaces:
Runtime error
Runtime error
Add property calculator, molecular clustering, and scaffold analysis features
Browse files- app.py +183 -0
- force_rebuild.txt +1 -1
- requirements.txt +2 -0
app.py
CHANGED
|
@@ -737,6 +737,138 @@ def name_to_3d_molecule(name: str) -> str:
|
|
| 737 |
raise gr.Error(f"Error creating 3D molecule: {str(e)}")
|
| 738 |
|
| 739 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 740 |
smiles_interface = gr.Interface(
|
| 741 |
fn=smiles_to_canonical,
|
| 742 |
inputs=gr.Textbox(label="SMILES"),
|
|
@@ -826,6 +958,51 @@ chemiscope_interface = gr.Interface(
|
|
| 826 |
cache_examples=False,
|
| 827 |
)
|
| 828 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 829 |
|
| 830 |
demo = gr.TabbedInterface(
|
| 831 |
[
|
|
@@ -833,6 +1010,9 @@ demo = gr.TabbedInterface(
|
|
| 833 |
molecule_3d_interface,
|
| 834 |
chemiscope_interface,
|
| 835 |
orbital_interface,
|
|
|
|
|
|
|
|
|
|
| 836 |
smiles_interface,
|
| 837 |
smiles_to_name_interface,
|
| 838 |
mw_interface,
|
|
@@ -844,6 +1024,9 @@ demo = gr.TabbedInterface(
|
|
| 844 |
"3D Molecule Viewer",
|
| 845 |
"Chemiscope Explorer",
|
| 846 |
"Molecular Orbitals",
|
|
|
|
|
|
|
|
|
|
| 847 |
"SMILES to Canonical",
|
| 848 |
"SMILES to Name",
|
| 849 |
"Molecular Weight",
|
|
|
|
| 737 |
raise gr.Error(f"Error creating 3D molecule: {str(e)}")
|
| 738 |
|
| 739 |
|
| 740 |
+
def calculate_properties_batch(smiles_list: str) -> str:
    """Calculate physicochemical properties for a batch of molecules.

    Args:
        smiles_list: Newline-separated SMILES strings, one molecule per line.
            Blank lines and surrounding whitespace are ignored; ``None`` is
            treated as empty input.

    Returns:
        A tab-separated table as one string: a header row followed by one row
        per processed molecule with MW, LogP, TPSA, H-bond donors, H-bond
        acceptors, rotatable bonds, ring count and aromatic ring count.
        A line that cannot be parsed yields an ``Invalid SMILES`` row and a
        failed calculation yields an ``Error: ...`` row, so one bad entry does
        not abort the batch. Only the first 50 non-blank lines are processed;
        when more are supplied a trailing ``# Note`` line reports it.
        If no SMILES are supplied, a prompt message is returned instead.
    """
    max_molecules = 50

    lines = [line.strip() for line in (smiles_list or "").splitlines() if line.strip()]
    if not lines:
        return "Please enter at least one SMILES string (one per line)"

    # Imported after the input check so an empty request returns immediately.
    from rdkit.Chem import Lipinski

    results = ["SMILES\tMW\tLogP\tTPSA\tHBD\tHBA\tRotBonds\tRings\tAromRings"]

    for smiles in lines[:max_molecules]:
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                results.append(f"{smiles}\tInvalid SMILES")
                continue

            mw = Descriptors.MolWt(mol)
            logp = Descriptors.MolLogP(mol)
            tpsa = Descriptors.TPSA(mol)
            hbd = Lipinski.NumHDonors(mol)
            hba = Lipinski.NumHAcceptors(mol)
            rotbonds = Lipinski.NumRotatableBonds(mol)
            rings = Lipinski.RingCount(mol)
            arom_rings = Lipinski.NumAromaticRings(mol)

            results.append(
                f"{smiles}\t{mw:.2f}\t{logp:.2f}\t{tpsa:.2f}\t{hbd}\t{hba}"
                f"\t{rotbonds}\t{rings}\t{arom_rings}"
            )
        except Exception as e:
            # Deliberately best-effort: report the failure on this row and
            # keep processing the remaining molecules.
            results.append(f"{smiles}\tError: {e}")

    if len(lines) > max_molecules:
        # Make the cap visible instead of silently dropping the extra input.
        results.append(
            f"# Note: only the first {max_molecules} of {len(lines)} molecules were processed"
        )

    return "\n".join(results)
|
| 772 |
+
|
| 773 |
+
|
| 774 |
+
def cluster_molecules(smiles_list: str, n_clusters: int = 5) -> str:
    """Cluster molecules by structural similarity using K-means on Morgan fingerprints.

    Args:
        smiles_list: Newline-separated SMILES strings, one molecule per line.
            Blank lines and surrounding whitespace are ignored; ``None`` is
            treated as empty input. Only the first 100 non-blank lines are used.
        n_clusters: Requested number of clusters. It is coerced to ``int`` and
            clamped to the range 1..(number of distinct fingerprints).

    Returns:
        A text report: a summary line, a tab-separated header, then one row per
        valid molecule with its 1-based cluster number, SMILES, molecular
        weight and LogP. Lines that cannot be parsed as SMILES are left out.
        A prompt/diagnostic message is returned instead when fewer than two
        usable SMILES are available.
    """
    max_molecules = 100

    lines = [line.strip() for line in (smiles_list or "").splitlines() if line.strip()]
    if not lines:
        return "Please enter at least one SMILES string (one per line)"

    if len(lines) < 2:
        return "Please enter at least 2 SMILES strings for clustering"

    # Imported after the input checks so trivial requests return immediately.
    from rdkit.Chem import AllChem
    from sklearn.cluster import KMeans

    # Parallel lists: molecule object, its original SMILES, its fingerprint.
    mols = []
    valid_smiles = []
    fps = []

    for smiles in lines[:max_molecules]:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue  # unparseable entries are excluded from clustering
        # Morgan fingerprint, radius 2, folded to 1024 bits.
        # NOTE(review): recent RDKit releases reportedly deprecate this call in
        # favour of rdFingerprintGenerator - confirm against the pinned version.
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
        mols.append(mol)
        valid_smiles.append(smiles)
        fps.append(fp)

    if len(fps) < 2:
        return "Need at least 2 valid SMILES for clustering"

    # One row per molecule, one 0/1 column per fingerprint bit.
    fp_array = np.array([list(fp) for fp in fps])

    # KMeans requires an integer count >= 1, and cannot form more clusters than
    # there are distinct points (duplicate molecules share a fingerprint).
    distinct_points = np.unique(fp_array, axis=0).shape[0]
    n_clusters = max(1, min(int(n_clusters), distinct_points))

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(fp_array)

    results = [
        f"Clustered {len(valid_smiles)} molecules into {n_clusters} groups\n",
        "Cluster\tSMILES\tMW\tLogP",
    ]

    for mol, smiles, cluster_id in zip(mols, valid_smiles, clusters):
        mw = Descriptors.MolWt(mol)
        logp = Descriptors.MolLogP(mol)
        # Labels from KMeans are 0-based; show them 1-based.
        results.append(f"{cluster_id + 1}\t{smiles}\t{mw:.2f}\t{logp:.2f}")

    return "\n".join(results)
|
| 823 |
+
|
| 824 |
+
|
| 825 |
+
def analyze_scaffolds(smiles_list: str) -> str:
    """Extract Bemis-Murcko scaffolds and report the most frequent ones.

    Args:
        smiles_list: Newline-separated SMILES strings, one molecule per line.
            Blank lines and surrounding whitespace are ignored; ``None`` is
            treated as empty input. Only the first 100 non-blank lines are used.

    Returns:
        A text report with the number of molecules analyzed, the number of
        unique scaffolds, and for up to the 10 most common scaffolds: the
        scaffold SMILES, its frequency (count and percentage of analyzed
        molecules) and up to 3 example input SMILES. Molecules that cannot be
        parsed or whose scaffold extraction fails are skipped. A message is
        returned instead when the input is empty or nothing could be extracted.
    """
    max_molecules = 100
    top_n = 10
    examples_per_scaffold = 3

    lines = [line.strip() for line in (smiles_list or "").splitlines() if line.strip()]
    if not lines:
        return "Please enter at least one SMILES string (one per line)"

    # Imported after the input check so an empty request returns immediately.
    from collections import Counter, defaultdict

    from rdkit.Chem.Scaffolds import MurckoScaffold

    # scaffold SMILES -> input SMILES that share it, in input order.
    members = defaultdict(list)

    for smiles in lines[:max_molecules]:
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                continue
            scaffold = MurckoScaffold.GetScaffoldForMol(mol)
            scaffold_smiles = Chem.MolToSmiles(scaffold)
        except Exception:
            # Best-effort: skip molecules that fail, but do not swallow
            # KeyboardInterrupt/SystemExit the way a bare ``except:`` would.
            continue
        members[scaffold_smiles].append(smiles)

    if not members:
        return "No valid scaffolds could be extracted"

    # Built in first-seen order so ties in most_common() keep input order.
    scaffold_counts = Counter({scaf: len(group) for scaf, group in members.items()})
    total = sum(scaffold_counts.values())

    results = [
        f"Analyzed {total} molecules",
        f"Found {len(scaffold_counts)} unique scaffolds\n",
        "=== Most Common Scaffolds ===",
    ]

    for scaffold, count in scaffold_counts.most_common(top_n):
        # An empty scaffold SMILES would otherwise print as a blank label.
        # NOTE(review): presumably this happens for molecules without rings - confirm.
        label = scaffold or "(empty scaffold - no ring system)"
        results.append(f"\nScaffold: {label}")
        results.append(f"Frequency: {count} molecules ({100*count/total:.1f}%)")

        results.append("Examples:")
        for ex in members[scaffold][:examples_per_scaffold]:
            results.append(f"  - {ex}")

    return "\n".join(results)
|
| 870 |
+
|
| 871 |
+
|
| 872 |
smiles_interface = gr.Interface(
|
| 873 |
fn=smiles_to_canonical,
|
| 874 |
inputs=gr.Textbox(label="SMILES"),
|
|
|
|
| 958 |
cache_examples=False,
|
| 959 |
)
|
| 960 |
|
| 961 |
+
# Property calculation interface
|
| 962 |
+
properties_interface = gr.Interface(
    fn=calculate_properties_batch,
    inputs=gr.Textbox(
        label="SMILES List (one per line)",
        placeholder="CCO\nc1ccccc1\nCC(=O)O\nCCN",
        lines=10,
    ),
    outputs=gr.Textbox(label="Properties (Tab-separated)", lines=15),
    title="Batch Property Calculator",
    description="Calculate physicochemical properties for multiple molecules. Enter one SMILES per line (max 50).",
    examples=[["CCO\nc1ccccc1\nCC(=O)O\nCN1C=NC2=C1C(=O)N(C(=O)N2C)C"]],
    # Same setting as chemiscope_interface: do not cache example outputs.
    cache_examples=False,
)
|
| 974 |
+
|
| 975 |
+
# Clustering interface
|
| 976 |
+
clustering_interface = gr.Interface(
    fn=cluster_molecules,
    inputs=[
        gr.Textbox(
            label="SMILES List (one per line)",
            placeholder="CCO\nc1ccccc1\nCC(=O)O\nCCN",
            lines=10,
        ),
        gr.Slider(minimum=2, maximum=10, value=5, step=1, label="Number of Clusters"),
    ],
    outputs=gr.Textbox(label="Clustering Results (Tab-separated)", lines=15),
    title="Molecular Clustering",
    description="Cluster molecules based on structural similarity using Morgan fingerprints and K-means (max 100 molecules).",
    examples=[["CCO\nCCCO\nCCCCO\nc1ccccc1\nc1ccc(O)cc1\nc1ccc(N)cc1\nCC(=O)O\nCCC(=O)O\nCCCC(=O)O", 3]],
    # Same setting as chemiscope_interface: do not cache example outputs.
    cache_examples=False,
)
|
| 991 |
+
|
| 992 |
+
# Scaffold analysis interface
|
| 993 |
+
scaffold_interface = gr.Interface(
    fn=analyze_scaffolds,
    inputs=gr.Textbox(
        label="SMILES List (one per line)",
        placeholder="c1ccc(CCN)cc1\nc1ccc(CCO)cc1\nc1ccc(CCC)cc1",
        lines=10,
    ),
    outputs=gr.Textbox(label="Scaffold Analysis", lines=15),
    title="Scaffold Analysis",
    description="Extract and analyze Bemis-Murcko scaffolds from molecules (max 100).",
    examples=[["c1ccc(CCN)cc1\nc1ccc(CCO)cc1\nc1ccc(CCC)cc1\nCCOc1ccc(CCN)cc1\nCCc1ccc(O)cc1"]],
    # Same setting as chemiscope_interface: do not cache example outputs.
    cache_examples=False,
)
|
| 1005 |
+
|
| 1006 |
|
| 1007 |
demo = gr.TabbedInterface(
|
| 1008 |
[
|
|
|
|
| 1010 |
molecule_3d_interface,
|
| 1011 |
chemiscope_interface,
|
| 1012 |
orbital_interface,
|
| 1013 |
+
properties_interface,
|
| 1014 |
+
clustering_interface,
|
| 1015 |
+
scaffold_interface,
|
| 1016 |
smiles_interface,
|
| 1017 |
smiles_to_name_interface,
|
| 1018 |
mw_interface,
|
|
|
|
| 1024 |
"3D Molecule Viewer",
|
| 1025 |
"Chemiscope Explorer",
|
| 1026 |
"Molecular Orbitals",
|
| 1027 |
+
"Property Calculator",
|
| 1028 |
+
"Molecular Clustering",
|
| 1029 |
+
"Scaffold Analysis",
|
| 1030 |
"SMILES to Canonical",
|
| 1031 |
"SMILES to Name",
|
| 1032 |
"Molecular Weight",
|
force_rebuild.txt
CHANGED
|
@@ -1,2 +1,2 @@
|
|
| 1 |
# Force rebuild
|
| 2 |
-
2025-11-08
|
|
|
|
| 1 |
# Force rebuild
|
| 2 |
+
2025-11-08 v16
|
requirements.txt
CHANGED
|
@@ -12,3 +12,5 @@ ase
|
|
| 12 |
plotly
|
| 13 |
matplotlib
|
| 14 |
pillow
|
|
|
|
|
|
|
|
|
| 12 |
plotly
|
| 13 |
matplotlib
|
| 14 |
pillow
|
| 15 |
+
scikit-learn
|
| 16 |
+
pandas
|