Nanny7 committed on
Commit
4eda0f0
·
1 Parent(s): 53712e3

Add property calculator, molecular clustering, and scaffold analysis features

Browse files
Files changed (3) hide show
  1. app.py +183 -0
  2. force_rebuild.txt +1 -1
  3. requirements.txt +2 -0
app.py CHANGED
@@ -737,6 +737,138 @@ def name_to_3d_molecule(name: str) -> str:
737
  raise gr.Error(f"Error creating 3D molecule: {str(e)}")
738
 
739
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
740
  smiles_interface = gr.Interface(
741
  fn=smiles_to_canonical,
742
  inputs=gr.Textbox(label="SMILES"),
@@ -826,6 +958,51 @@ chemiscope_interface = gr.Interface(
826
  cache_examples=False,
827
  )
828
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
829
 
830
  demo = gr.TabbedInterface(
831
  [
@@ -833,6 +1010,9 @@ demo = gr.TabbedInterface(
833
  molecule_3d_interface,
834
  chemiscope_interface,
835
  orbital_interface,
 
 
 
836
  smiles_interface,
837
  smiles_to_name_interface,
838
  mw_interface,
@@ -844,6 +1024,9 @@ demo = gr.TabbedInterface(
844
  "3D Molecule Viewer",
845
  "Chemiscope Explorer",
846
  "Molecular Orbitals",
 
 
 
847
  "SMILES to Canonical",
848
  "SMILES to Name",
849
  "Molecular Weight",
 
737
  raise gr.Error(f"Error creating 3D molecule: {str(e)}")
738
 
739
 
740
def calculate_properties_batch(smiles_list: str) -> str:
    """Build a tab-separated property table for a newline-separated SMILES list.

    Blank lines are ignored and only the first 50 entries are processed.
    Each processed entry yields one row: either the nine property columns,
    an "Invalid SMILES" marker when parsing returns None, or an "Error: ..."
    marker when a descriptor call raises.

    Args:
        smiles_list: SMILES strings, one per line.

    Returns:
        A header row followed by one row per processed entry, joined with
        newlines; or a prompt message when no non-blank line was supplied.
    """
    from rdkit.Chem import Lipinski

    entries = []
    for raw_line in smiles_list.strip().split('\n'):
        cleaned = raw_line.strip()
        if cleaned:
            entries.append(cleaned)

    if len(entries) == 0:
        return "Please enter at least one SMILES string (one per line)"

    rows = ["SMILES\tMW\tLogP\tTPSA\tHBD\tHBA\tRotBonds\tRings\tAromRings"]

    # Hard cap of 50 molecules per request.
    for smi in entries[:50]:
        try:
            parsed = Chem.MolFromSmiles(smi)
            if parsed is None:
                row = f"{smi}\tInvalid SMILES"
            else:
                mw = Descriptors.MolWt(parsed)
                logp = Descriptors.MolLogP(parsed)
                tpsa = Descriptors.TPSA(parsed)
                hbd = Lipinski.NumHDonors(parsed)
                hba = Lipinski.NumHAcceptors(parsed)
                rotbonds = Lipinski.NumRotatableBonds(parsed)
                rings = Lipinski.RingCount(parsed)
                arom_rings = Lipinski.NumAromaticRings(parsed)
                row = f"{smi}\t{mw:.2f}\t{logp:.2f}\t{tpsa:.2f}\t{hbd}\t{hba}\t{rotbonds}\t{rings}\t{arom_rings}"
        except Exception as exc:
            # Keep the batch going: a failing molecule becomes an error row.
            row = f"{smi}\tError: {str(exc)}"
        rows.append(row)

    return "\n".join(rows)
772
+
773
+
774
def cluster_molecules(smiles_list: str, n_clusters: int = 5) -> str:
    """Cluster molecules by structural similarity using Morgan fingerprints.

    Each valid SMILES is converted to a 1024-bit Morgan fingerprint
    (radius 2) and the fingerprints are grouped with K-means
    (``random_state=42``, ``n_init=10``). Blank lines and unparsable SMILES
    are skipped; only the first 100 non-blank lines are considered.

    Args:
        smiles_list: SMILES strings, one per line.
        n_clusters: Requested number of clusters. Coerced to ``int`` and
            clamped to the range ``1..number of valid molecules``.

    Returns:
        A summary line followed by a tab-separated table
        (``Cluster``, ``SMILES``, ``MW``, ``LogP``) with 1-based cluster ids,
        or a prompt message when fewer than two usable SMILES were supplied.
    """
    # Tolerate a missing value instead of crashing on ``None.strip()``.
    lines = [line.strip() for line in (smiles_list or "").strip().split('\n') if line.strip()]
    if not lines:
        return "Please enter at least one SMILES string (one per line)"

    if len(lines) < 2:
        return "Please enter at least 2 SMILES strings for clustering"

    # Import the heavy third-party modules only once the input is known to
    # be worth processing.
    from rdkit.Chem import AllChem
    from sklearn.cluster import KMeans

    # Generate fingerprints, keeping molecules, SMILES and fingerprints aligned.
    mols = []
    valid_smiles = []
    fps = []

    for smiles in lines[:100]:  # Limit to 100 molecules
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue
        fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024))
        mols.append(mol)
        valid_smiles.append(smiles)

    if len(fps) < 2:
        return "Need at least 2 valid SMILES for clustering"

    # Convert fingerprints to a (molecules x bits) numpy array.
    fp_array = np.array([list(fp) for fp in fps])

    # KMeans validates n_clusters as an integer >= 1 and cannot form more
    # clusters than there are samples. The value may arrive as a float
    # (e.g. from a UI slider), so coerce and clamp it before use.
    n_clusters = max(1, min(int(n_clusters), len(fps)))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(fp_array)

    # Create results
    results = [
        f"Clustered {len(valid_smiles)} molecules into {n_clusters} groups\n",
        "Cluster\tSMILES\tMW\tLogP",
    ]

    for smiles, mol, cluster_id in zip(valid_smiles, mols, clusters):
        mw = Descriptors.MolWt(mol)
        logp = Descriptors.MolLogP(mol)
        # Cluster labels are 0-based; report them 1-based.
        results.append(f"{cluster_id + 1}\t{smiles}\t{mw:.2f}\t{logp:.2f}")

    return "\n".join(results)
823
+
824
+
825
def analyze_scaffolds(smiles_list: str) -> str:
    """Extract Bemis-Murcko scaffolds and report the most frequent ones.

    Blank lines are ignored and only the first 100 non-blank lines are
    considered. Entries that cannot be parsed, or whose scaffold extraction
    raises, are skipped.

    Args:
        smiles_list: SMILES strings, one per line.

    Returns:
        A text report with the number of analysed molecules, the number of
        unique scaffolds and, for up to the ten most common scaffolds, the
        frequency and up to three example SMILES; or a message when no input
        or no scaffold was available.
    """
    from collections import Counter, defaultdict

    # Tolerate a missing value instead of crashing on ``None.strip()``.
    lines = [line.strip() for line in (smiles_list or "").strip().split('\n') if line.strip()]
    if not lines:
        return "Please enter at least one SMILES string (one per line)"

    # Import RDKit's scaffold module only once there is something to analyse.
    from rdkit.Chem.Scaffolds import MurckoScaffold

    scaffolds = []
    # scaffold SMILES -> input SMILES that produced it, in input order.
    members = defaultdict(list)

    for smiles in lines[:100]:  # Limit to 100 molecules
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                continue
            scaffold_smiles = Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(mol))
        except Exception:
            # Best effort: skip molecules RDKit cannot process, but do not
            # swallow KeyboardInterrupt/SystemExit as a bare ``except`` would.
            continue
        scaffolds.append(scaffold_smiles)
        members[scaffold_smiles].append(smiles)

    if not scaffolds:
        return "No valid scaffolds could be extracted"

    # Count scaffold frequencies
    scaffold_counts = Counter(scaffolds)
    total = len(scaffolds)

    results = [
        f"Analyzed {total} molecules",
        f"Found {len(scaffold_counts)} unique scaffolds\n",
        "=== Most Common Scaffolds ===",
    ]

    for scaffold, count in scaffold_counts.most_common(10):
        results.append(f"\nScaffold: {scaffold}")
        results.append(f"Frequency: {count} molecules ({100*count/total:.1f}%)")

        # Show up to three example molecules for this scaffold.
        results.append("Examples:")
        results.extend(f" - {ex}" for ex in members[scaffold][:3])

    return "\n".join(results)
870
+
871
+
872
  smiles_interface = gr.Interface(
873
  fn=smiles_to_canonical,
874
  inputs=gr.Textbox(label="SMILES"),
 
958
  cache_examples=False,
959
  )
960
 
961
# Tab: batch physicochemical property table (wired to calculate_properties_batch).
properties_interface = gr.Interface(
    title="Batch Property Calculator",
    description="Calculate physicochemical properties for multiple molecules. Enter one SMILES per line (max 50).",
    fn=calculate_properties_batch,
    inputs=gr.Textbox(
        lines=10,
        label="SMILES List (one per line)",
        placeholder="CCO\nc1ccccc1\nCC(=O)O\nCCN",
    ),
    outputs=gr.Textbox(lines=15, label="Properties (Tab-separated)"),
    # A single example row holding a four-molecule SMILES list.
    examples=[["CCO\nc1ccccc1\nCC(=O)O\nCN1C=NC2=C1C(=O)N(C(=O)N2C)C"]],
)
974
+
975
# Tab: K-means clustering of molecules (wired to cluster_molecules).
clustering_interface = gr.Interface(
    title="Molecular Clustering",
    description="Cluster molecules based on structural similarity using Morgan fingerprints and K-means (max 100 molecules).",
    fn=cluster_molecules,
    inputs=[
        # First input: the SMILES list.
        gr.Textbox(
            lines=10,
            label="SMILES List (one per line)",
            placeholder="CCO\nc1ccccc1\nCC(=O)O\nCCN",
        ),
        # Second input: requested cluster count, 2-10 in steps of 1.
        gr.Slider(label="Number of Clusters", minimum=2, maximum=10, step=1, value=5),
    ],
    outputs=gr.Textbox(lines=15, label="Clustering Results (Tab-separated)"),
    # One example row: a nine-molecule SMILES list and a cluster count of 3.
    examples=[["CCO\nCCCO\nCCCCO\nc1ccccc1\nc1ccc(O)cc1\nc1ccc(N)cc1\nCC(=O)O\nCCC(=O)O\nCCCC(=O)O", 3]],
)
991
+
992
# Tab: Bemis-Murcko scaffold frequency report (wired to analyze_scaffolds).
scaffold_interface = gr.Interface(
    title="Scaffold Analysis",
    description="Extract and analyze Bemis-Murcko scaffolds from molecules (max 100).",
    fn=analyze_scaffolds,
    inputs=gr.Textbox(
        lines=10,
        label="SMILES List (one per line)",
        placeholder="c1ccc(CCN)cc1\nc1ccc(CCO)cc1\nc1ccc(CCC)cc1",
    ),
    outputs=gr.Textbox(lines=15, label="Scaffold Analysis"),
    # A single example row holding a five-molecule SMILES list.
    examples=[["c1ccc(CCN)cc1\nc1ccc(CCO)cc1\nc1ccc(CCC)cc1\nCCOc1ccc(CCN)cc1\nCCc1ccc(O)cc1"]],
)
1005
+
1006
 
1007
  demo = gr.TabbedInterface(
1008
  [
 
1010
  molecule_3d_interface,
1011
  chemiscope_interface,
1012
  orbital_interface,
1013
+ properties_interface,
1014
+ clustering_interface,
1015
+ scaffold_interface,
1016
  smiles_interface,
1017
  smiles_to_name_interface,
1018
  mw_interface,
 
1024
  "3D Molecule Viewer",
1025
  "Chemiscope Explorer",
1026
  "Molecular Orbitals",
1027
+ "Property Calculator",
1028
+ "Molecular Clustering",
1029
+ "Scaffold Analysis",
1030
  "SMILES to Canonical",
1031
  "SMILES to Name",
1032
  "Molecular Weight",
force_rebuild.txt CHANGED
@@ -1,2 +1,2 @@
1
  # Force rebuild
2
- 2025-11-08 v15
 
1
  # Force rebuild
2
+ 2025-11-08 v16
requirements.txt CHANGED
@@ -12,3 +12,5 @@ ase
12
  plotly
13
  matplotlib
14
  pillow
 
 
 
12
  plotly
13
  matplotlib
14
  pillow
15
+ scikit-learn
16
+ pandas