Fredaaaaaa committed on
Commit
f21c58d
·
verified ·
1 Parent(s): c2c62e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +285 -1
app.py CHANGED
@@ -1,3 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Updated prediction function with improved confidence handling
2
  def predict_severity(drug1, drug2):
3
  if not drug1 or not drug2:
@@ -247,7 +531,7 @@ def predict_severity(drug1, drug2):
247
  print(f"Error during prediction: {e}")
248
  return f"Error making prediction: {e}"
249
 
250
- # Gradio Interface
251
  interface = gr.Interface(
252
  fn=predict_severity,
253
  inputs=[
 
1
+ import pickle
2
+ import requests
3
+ from huggingface_hub import hf_hub_download
4
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
5
+ import torch
6
+ import gradio as gr
7
+ import pandas as pd
8
+ import re
9
+ from sklearn.utils.class_weight import compute_class_weight
10
+ import numpy as np
11
+
12
+ # ✅ Device setup
13
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
+ print(f"Using device: {device}")
15
+
16
+ # Download label encoder from Hugging Face Hub
17
+ label_encoder_path = hf_hub_download(repo_id="Fredaaaaaa/hybrid_model", filename="label_encoder.pkl")
18
+ with open(label_encoder_path, 'rb') as f:
19
+ label_encoder = pickle.load(f)
20
+
21
+ # Load model and tokenizer
22
+ model_name = "Fredaaaaaa/hybrid_model"
23
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
24
+ model = AutoModelForSequenceClassification.from_pretrained(model_name)
25
+ model.to(device) # Move model to appropriate device
26
+ model.eval()
27
+
28
+ # Download the dataset from Hugging Face Hub
29
+ dataset_path = hf_hub_download(repo_id="Fredaaaaaa/hybrid_model", filename="labeled_severity.csv")
30
+
31
+ # Load the dataset with appropriate encoding
32
+ df = pd.read_csv(dataset_path, encoding='ISO-8859-1')
33
+ print(f"Dataset loaded successfully! Shape: {df.shape}")
34
+
35
+ # Check the columns and display first few rows for debugging
36
+ print(df.columns)
37
+ print(df.head())
38
+
39
+ # Get unique severity classes from the dataset
40
+ unique_classes = df['severity'].unique()
41
+ print(f"Unique severity classes in dataset: {unique_classes}")
42
+
43
+ # Calculate class weights to handle imbalanced classes
44
+ # Use the unique classes from the dataset for the `classes` parameter
45
+ class_weights = compute_class_weight('balanced', classes=np.unique(unique_classes), y=df['severity'])
46
+ class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
47
+ loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
48
+
49
+ # Extract unique drug names from the dataset to create a list of known drugs
50
+ all_drugs = set()
51
+ # Check the possible column names and add drugs to our set
52
+ for col in ['Drug1', 'Drug 1', 'drug1', 'drug_1', 'Drug 1_normalized']:
53
+ if col in df.columns:
54
+ # Convert to strings, clean and add to set
55
+ all_drugs.update(df[col].astype(str).str.lower().str.strip().tolist())
56
+ for col in ['Drug2', 'Drug 2', 'drug2', 'drug_2', 'Drug 2_normalized']:
57
+ if col in df.columns:
58
+ # Convert to strings, clean and add to set
59
+ all_drugs.update(df[col].astype(str).str.lower().str.strip().tolist())
60
+
61
+ # Remove any empty strings or NaN values
62
+ all_drugs = {drug for drug in all_drugs if drug and drug != 'nan'}
63
+ print(f"Loaded {len(all_drugs)} unique drug names from dataset")
64
+
65
+ # Function to properly clean drug names
66
+ def clean_drug_name(drug_name):
67
+ if not drug_name:
68
+ return ""
69
+ # Remove extra whitespace and standardize to lowercase
70
+ return re.sub(r'\s+', ' ', drug_name.strip().lower())
71
+
72
+ # Function to validate if input is a legitimate drug name
73
+ def validate_drug_input(drug_name):
74
+ # Clean the input
75
+ drug_name = clean_drug_name(drug_name)
76
+
77
+ if not drug_name or len(drug_name) <= 1:
78
+ return False, "Drug name is too short"
79
+
80
+ # Check if it's just a single letter or number
81
+ if len(drug_name) == 1 or drug_name.isdigit():
82
+ return False, "Not a valid drug name"
83
+
84
+ # Check if it contains weird characters
85
+ if not re.match(r'^[a-zA-Z0-9\s\-\+]+$', drug_name):
86
+ return False, "Drug name contains invalid characters"
87
+
88
+ # Check if it's in our known drug list
89
+ if drug_name in all_drugs:
90
+ return True, "Drug found in dataset"
91
+
92
+ # If we have a small drug list or need to be more forgiving, we can try fuzzy matching
93
+ for known_drug in all_drugs:
94
+ if drug_name in known_drug or known_drug in drug_name:
95
+ return True, f"Drug found in dataset (matched with '{known_drug}')"
96
+
97
+ # If not in dataset, we'll try the API validation
98
+ return None, "Drug not in dataset, needs API validation"
99
+
100
+ def validate_drug_via_api(drug_name):
101
+ """Validate a drug name using PubChem API"""
102
+ try:
103
+ # Clean the input
104
+ drug_name = clean_drug_name(drug_name)
105
+
106
+ # Use PubChem API to search for the drug
107
+ search_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{drug_name}/cids/JSON"
108
+ response = requests.get(search_url, timeout=10)
109
+
110
+ if response.status_code == 200:
111
+ data = response.json()
112
+ # Check if we got a valid CID (PubChem Compound ID)
113
+ if 'IdentifierList' in data and 'CID' in data['IdentifierList']:
114
+ return True, f"Drug validated via PubChem API (CID: {data['IdentifierList']['CID'][0]})"
115
+ else:
116
+ return False, "Drug not found in PubChem database"
117
+ else:
118
+ # Try a fallback for compounds with special characters
119
+ fallback_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{requests.utils.quote(drug_name)}/cids/JSON"
120
+ fallback_response = requests.get(fallback_url, timeout=10)
121
+
122
+ if fallback_response.status_code == 200:
123
+ data = fallback_response.json()
124
+ if 'IdentifierList' in data and 'CID' in data['IdentifierList']:
125
+ return True, f"Drug validated via PubChem API (CID: {data['IdentifierList']['CID'][0]})"
126
+
127
+ return False, f"Invalid drug name: API returned status {response.status_code}"
128
+
129
+ except Exception as e:
130
+ print(f"Error validating drug via API: {e}")
131
+ # Be more lenient if API validation fails
132
+ return True, "API validation failed, assuming valid drug"
133
+
134
+ def get_drug_features_from_api(drug_name):
135
+ """Get drug features from PubChem API"""
136
+ try:
137
+ # Clean the input
138
+ drug_name = clean_drug_name(drug_name)
139
+
140
+ # First get the CID from PubChem
141
+ search_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{drug_name}/cids/JSON"
142
+ response = requests.get(search_url, timeout=10)
143
+
144
+ if response.status_code != 200:
145
+ # Try URL encoding for drugs with special characters
146
+ search_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{requests.utils.quote(drug_name)}/cids/JSON"
147
+ response = requests.get(search_url, timeout=10)
148
+
149
+ if response.status_code != 200:
150
+ print(f"Drug {drug_name} not found in PubChem")
151
+ return None
152
+
153
+ # Extract the CID
154
+ data = response.json()
155
+ if 'IdentifierList' not in data or 'CID' not in data['IdentifierList']:
156
+ print(f"No CID found for drug {drug_name}")
157
+ return None
158
+
159
+ cid = data['IdentifierList']['CID'][0]
160
+
161
+ # Get the SMILES notation
162
+ smiles_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES/JSON"
163
+ smiles_response = requests.get(smiles_url, timeout=10)
164
+
165
+ # Initialize features dictionary
166
+ features = {
167
+ 'SMILES': 'No data',
168
+ 'pharmacodynamics': 'No data',
169
+ 'toxicity': 'No data'
170
+ }
171
+
172
+ # Extract SMILES if available
173
+ if smiles_response.status_code == 200:
174
+ smiles_data = smiles_response.json()
175
+ if 'PropertyTable' in smiles_data and 'Properties' in smiles_data['PropertyTable']:
176
+ properties = smiles_data['PropertyTable']['Properties']
177
+ if properties and 'CanonicalSMILES' in properties[0]:
178
+ features['SMILES'] = properties[0]['CanonicalSMILES']
179
+
180
+ # Get pharmacological information (we'll use this for both pharmacodynamics and toxicity)
181
+ info_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON"
182
+ info_response = requests.get(info_url, timeout=15) # Increased timeout
183
+
184
+ if info_response.status_code == 200:
185
+ info_data = info_response.json()
186
+ if 'Record' in info_data and 'Section' in info_data['Record']:
187
+ # Search through sections for pharmacology information
188
+ for section in info_data['Record']['Section']:
189
+ if 'TOCHeading' in section:
190
+ # Look for Pharmacology section
191
+ if section['TOCHeading'] == 'Pharmacology':
192
+ if 'Section' in section:
193
+ for subsection in section['Section']:
194
+ if 'TOCHeading' in subsection:
195
+ # Extract pharmacodynamics
196
+ if subsection['TOCHeading'] == 'Mechanism of Action':
197
+ if 'Information' in subsection:
198
+ for info in subsection['Information']:
199
+ if 'Value' in info and 'StringWithMarkup' in info['Value']:
200
+ for text in info['Value']['StringWithMarkup']:
201
+ if 'String' in text:
202
+ features['pharmacodynamics'] = text['String'][:500] # Limit to 500 chars
203
+ break
204
+
205
+ # Look for toxicity information
206
+ if section['TOCHeading'] == 'Toxicity':
207
+ if 'Information' in section:
208
+ for info in section['Information']:
209
+ if 'Value' in info and 'StringWithMarkup' in info['Value']:
210
+ for text in info['Value']['StringWithMarkup']:
211
+ if 'String' in text:
212
+ features['toxicity'] = text['String'][:500] # Limit to 500 chars
213
+ break
214
+
215
+ return features
216
+
217
+ except Exception as e:
218
+ print(f"Error getting drug features from API: {e}")
219
+ return None
220
+
221
+ # Function to check if drugs are in the dataset
222
+ def get_drug_features_from_dataset(drug1, drug2, df):
223
+ if df.empty:
224
+ print("Dataset is empty, cannot search for drugs")
225
+ return None
226
+
227
+ # Normalize drug names for matching
228
+ drug1 = clean_drug_name(drug1)
229
+ drug2 = clean_drug_name(drug2)
230
+
231
+ print(f"Checking for drugs in dataset: '{drug1}', '{drug2}'")
232
+
233
+ try:
234
+ # First try with normalized columns
235
+ if 'Drug 1_normalized' in df.columns and 'Drug 2_normalized' in df.columns:
236
+ # Apply cleaning function to dataframe columns for comparison
237
+ drug_data = df[
238
+ (df['Drug 1_normalized'].str.lower().str.strip() == drug1) &
239
+ (df['Drug 2_normalized'].str.lower().str.strip() == drug2)
240
+ ]
241
+
242
+ # Also check the reverse combination
243
+ reversed_drug_data = df[
244
+ (df['Drug 1_normalized'].str.lower().str.strip() == drug2) &
245
+ (df['Drug 2_normalized'].str.lower().str.strip() == drug1)
246
+ ]
247
+
248
+ # Combine the results
249
+ drug_data = pd.concat([drug_data, reversed_drug_data])
250
+ else:
251
+ # Try with regular Drug1/Drug2 columns if normalized not available
252
+ possible_column_pairs = [
253
+ ('Drug1', 'Drug2'),
254
+ ('Drug 1', 'Drug 2'),
255
+ ('drug1', 'drug2'),
256
+ ('drug_1', 'drug_2')
257
+ ]
258
+
259
+ drug_data = pd.DataFrame() # Initialize as empty
260
+
261
+ for col1, col2 in possible_column_pairs:
262
+ if col1 in df.columns and col2 in df.columns:
263
+ # Clean the strings in the dataframe columns for comparison
264
+ matches = df[
265
+ ((df[col1].astype(str).str.lower().str.strip() == drug1) &
266
+ (df[col2].astype(str).str.lower().str.strip() == drug2)) |
267
+ ((df[col1].astype(str).str.lower().str.strip() == drug2) &
268
+ (df[col2].astype(str).str.lower().str.strip() == drug1))
269
+ ]
270
+ if not matches.empty:
271
+ drug_data = matches
272
+ break
273
+
274
+ if not drug_data.empty:
275
+ print(f"Found drugs '{drug1}' and '{drug2}' in the dataset!")
276
+ return drug_data.iloc[0] # Returns the first match
277
+ else:
278
+ print(f"Drugs '{drug1}' and '{drug2}' not found in the dataset.")
279
+ return None
280
+
281
+ except Exception as e:
282
+ print(f"Error searching for drugs in dataset: {e}")
283
+ return None
284
+
285
  # Updated prediction function with improved confidence handling
286
  def predict_severity(drug1, drug2):
287
  if not drug1 or not drug2:
 
531
  print(f"Error during prediction: {e}")
532
  return f"Error making prediction: {e}"
533
 
534
+ # Gradio Interface
535
  interface = gr.Interface(
536
  fn=predict_severity,
537
  inputs=[