Spaces:

Fredaaaaaa
/

HM

Sleeping

App Files Files Community

Fredaaaaaa committed on Apr 24, 2025

Commit

80a0bab

verified ·

1 Parent(s): 3df3aeb

Update app.py

Browse files

Files changed (1) hide show

app.py +341 -100

app.py CHANGED Viewed

@@ -1,100 +1,341 @@
-import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-import requests
-import gradio as gr
-# Load the pre-trained model and tokenizer from the Hugging Face directory where the model is saved
-model = AutoModelForSequenceClassification.from_pretrained("Fredaaaaaa/hybrid_model")
-tokenizer = AutoTokenizer.from_pretrained("hybrid_model")
-# Function to fetch drug features from an external API (e.g., PubChem)
-def get_drug_features(drug1, drug2):
-    # You can modify this function to fetch additional features based on your API choice.
-    api_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{drug1}/property/SMILES,Pharmacology,Toxicity,Mechanism-of-action,Route-of-elimination,Metabolism/JSON'
-    response = requests.get(api_url)
-    if response.status_code == 200:
-        data = response.json()
-        drug_features = {
-            'SMILES': data['PropertyTable']['Properties'][0].get('SMILES', 'No data'),
-            'pharmacology': data['PropertyTable']['Properties'][0].get('Pharmacology', 'No data'),
-            'toxicity': data['PropertyTable']['Properties'][0].get('Toxicity', 'No data'),
-            'mechanism-of-action': data['PropertyTable']['Properties'][0].get('Mechanism-of-action', 'No data'),
-            'route-of-elimination': data['PropertyTable']['Properties'][0].get('Route-of-elimination', 'No data'),
-            'metabolism': data['PropertyTable']['Properties'][0].get('Metabolism', 'No data'),
-        }
-        return drug_features
-    else:
-        return None  # If no data is returned, handle the missing values gracefully
-# Define the Hybrid Model (already trained)
-class HybridModel(torch.nn.Module):
-    def __init__(self, text_model, input_size, dropout_rate=0.3):
-        super(HybridModel, self).__init__()
-        self.text_model = text_model
-        self.fc1 = torch.nn.Linear(input_size, 128)  # Fully connected layer for drug features
-        self.fc2 = torch.nn.Linear(128, 64)         # Additional fully connected layer
-        self.fc3 = torch.nn.Linear(64, 4)           # Output layer (4 classes for severity)
-        self.dropout = torch.nn.Dropout(dropout_rate)  # Dropout layer to prevent overfitting
-    def forward(self, input_ids, attention_mask, drug_features):
-        # Process the text data (interaction description) through BioBERT
-        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
-        text_features = text_outputs.logits  # Shape: (batch_size, num_labels)
-        # Pass the drug features through the fully connected layers
-        x = torch.relu(self.fc1(drug_features))  # Apply ReLU activation
-        x = self.dropout(x)  # Apply Dropout
-        x = torch.relu(self.fc2(x))  # Additional ReLU layer
-        x = self.fc3(x)  # Output layer (4 classes)
-        # Combine text features and drug features (can use addition or concatenation)
-        combined = text_features + x  # Simple addition, can experiment with concatenation
-        return combined
-# Initialize the model
-text_model = AutoModelForSequenceClassification.from_pretrained("hybrid_model")
-hybrid_model = HybridModel(text_model, input_size=3)  # Example size, adjust based on your data
-hybrid_model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
-# Function to process the input and predict severity
-def predict_severity(drug1, drug2):
-    # Fetch drug features from external API
-    drug_features = get_drug_features(drug1, drug2)
-    if not drug_features:
-        return "Error: Could not fetch drug features from the API."
-    # Preprocess text data (interaction description)
-    interaction_description = f"{drug1} interacts with {drug2}"  # Example interaction description
-    inputs = tokenizer(interaction_description, return_tensors="pt", padding=True, truncation=True, max_length=128)
-    input_ids = inputs['input_ids'].to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
-    attention_mask = inputs['attention_mask'].to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
-    # Process drug features (SMILES, pharmacology, toxicity, etc.)
-    drug_feature_values = [drug_features['SMILES'], drug_features['pharmacology'], drug_features['toxicity']]
-    drug_features_tensor = torch.tensor(drug_feature_values, dtype=torch.float32).unsqueeze(0).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
-    # Run the model to get predictions
-    hybrid_model.eval()
-    with torch.no_grad():
-        outputs = hybrid_model(input_ids, attention_mask, drug_features_tensor)
-    # Get the predicted class
-    prediction = torch.argmax(outputs, dim=1).item()
-    # Map the predicted class index to the severity label
-    severity_labels = ["No interaction", "Mild", "Moderate", "Severe"]
-    return severity_labels[prediction]
-# Gradio Interface
-interface = gr.Interface(
-    fn=predict_severity,
-    inputs=[gr.Textbox(label="Drug 1"), gr.Textbox(label="Drug 2")],
-    outputs="text",
-    live=True
-)
-# Launch the interface
-interface.launch()

+import pickle
+import requests
+from huggingface_hub import hf_hub_download
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+import gradio as gr
+import pandas as pd
+import re
+# ---- Start-up: fetch every artefact the app needs from the Hub repository ----
+# Download label encoder from Hugging Face Hub
+label_encoder_path = hf_hub_download(repo_id="Fredaaaaaa/hybrid_model", filename="label_encoder.pkl")
+# NOTE(review): pickle.load can execute arbitrary code stored in the file, so
+# this is only safe for as long as the Hub repository above is fully trusted.
+with open(label_encoder_path, 'rb') as f:
+    label_encoder = pickle.load(f)
+# Load model and tokenizer (both come from the same Hub repository)
+model_name = "Fredaaaaaa/hybrid_model"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+# The app only runs inference, so put the model in evaluation mode once here.
+model.eval()
+# Download the dataset from Hugging Face Hub
+dataset_path = hf_hub_download(repo_id="Fredaaaaaa/hybrid_model", filename="labeled_severity.csv")
+# Load the dataset with appropriate encoding.
+# On success this defines `df` (the full table) and `all_drugs` (a set of
+# stripped, lower-cased names); on any failure both fall back to empty values
+# so the rest of the module can still be imported.
+try:
+    df = pd.read_csv(dataset_path, encoding='ISO-8859-1')
+    print(f"Dataset loaded successfully! Shape: {df.shape}")
+    # Create a set of all unique drugs in the dataset for validation
+    all_drugs = set()
+    # Check which columns contain drug names: any column whose header contains
+    # "drug" or "medication" (case-insensitive) is treated as a name column.
+    drug_columns = []
+    for col in df.columns:
+        if 'drug' in col.lower() or 'medication' in col.lower():
+            drug_columns.append(col)
+            # Add all drugs from this column to our set after cleaning
+            # (missing cells are dropped, everything else is stringified).
+            clean_drugs = df[col].dropna().astype(str).apply(lambda x: x.strip().lower())
+            all_drugs.update(clean_drugs.unique())
+    print(f"Found {len(all_drugs)} unique drugs in the dataset")
+    print(f"Drug name columns found: {drug_columns}")
+except Exception as e:
+    # Best-effort start-up: report the problem and continue without a dataset.
+    # `drug_columns` is only used for the log line above, so it is not reset here.
+    print(f"Error reading the dataset: {e}")
+    df = pd.DataFrame()  # Empty dataframe as fallback
+    all_drugs = set()
+# Function to properly clean drug names
+def clean_drug_name(drug_name):
+    if not drug_name:
+        return ""
+    # Remove extra whitespace and standardize to lowercase
+    return re.sub(r'\s+', ' ', drug_name.strip().lower())
+# Function to validate if input is a legitimate drug name
+def validate_drug_input(drug_name):
+    # Clean the input
+    drug_name = clean_drug_name(drug_name)
+    if not drug_name or len(drug_name) <= 1:
+        return False, "Drug name is too short"
+    # Check if it's just a single letter or number
+    if len(drug_name) == 1 or drug_name.isdigit():
+        return False, "Not a valid drug name"
+    # Check if it contains weird characters
+    if not re.match(r'^[a-zA-Z0-9\s\-\+]+$', drug_name):
+        return False, "Drug name contains invalid characters"
+    # Print for debugging
+    print(f"Validating drug: '{drug_name}'")
+    print(f"Drug in dataset: {drug_name in all_drugs}")
+    # Check if it's in our known drug list
+    if drug_name in all_drugs:
+        return True, "Drug found in dataset"
+    # If we have a small drug list or need to be more forgiving, we can try fuzzy matching
+    for known_drug in all_drugs:
+        if drug_name in known_drug or known_drug in drug_name:
+            return True, f"Drug found in dataset (matched with '{known_drug}')"
+    # If not in dataset, we'll try the API validation
+    return None, "Drug not in dataset, needs API validation"
+# Function to validate drug via PubChem API
+def validate_drug_via_api(drug_name):
+    drug_name = clean_drug_name(drug_name)
+    try:
+        # Try to get basic info about the drug from PubChem
+        api_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{drug_name}/cids/JSON'
+        print(f"Calling PubChem API: {api_url}")
+        response = requests.get(api_url, timeout=10)
+        if response.status_code == 200:
+            data = response.json()
+            if 'IdentifierList' in data and 'CID' in data['IdentifierList'] and len(data['IdentifierList']['CID']) > 0:
+                return True, f"Drug validated via PubChem (CID: {data['IdentifierList']['CID'][0]})"
+        # Try the full name version with any suffixes that may have been removed
+        return False, f"Not found in PubChem database (Status: {response.status_code})"
+    except Exception as e:
+        print(f"API validation error: {e}")
+        return False, f"API validation error: {e}"
+# Function to check if drugs are in the dataset
+def get_drug_features_from_dataset(drug1, drug2, df):
+    if df.empty:
+        print("Dataset is empty, cannot search for drugs")
+        return None
+    # Normalize drug names for matching
+    drug1 = clean_drug_name(drug1)
+    drug2 = clean_drug_name(drug2)
+    print(f"Checking for drugs in dataset: '{drug1}', '{drug2}'")
+    try:
+        # First try with normalized columns
+        if 'Drug 1_normalized' in df.columns and 'Drug 2_normalized' in df.columns:
+            # Apply cleaning function to dataframe columns for comparison
+            drug_data = df[
+                (df['Drug 1_normalized'].str.lower().str.strip() == drug1) &
+                (df['Drug 2_normalized'].str.lower().str.strip() == drug2)
+            ]
+            # Also check the reverse combination
+            reversed_drug_data = df[
+                (df['Drug 1_normalized'].str.lower().str.strip() == drug2) &
+                (df['Drug 2_normalized'].str.lower().str.strip() == drug1)
+            ]
+            # Combine the results
+            drug_data = pd.concat([drug_data, reversed_drug_data])
+        else:
+            # Try with regular Drug1/Drug2 columns if normalized not available
+            possible_column_pairs = [
+                ('Drug1', 'Drug2'),
+                ('Drug 1', 'Drug 2'),
+                ('drug1', 'drug2'),
+                ('drug_1', 'drug_2')
+            ]
+            drug_data = pd.DataFrame()  # Initialize as empty
+            for col1, col2 in possible_column_pairs:
+                if col1 in df.columns and col2 in df.columns:
+                    # Clean the strings in the dataframe columns for comparison
+                    matches = df[
+                        ((df[col1].astype(str).str.lower().str.strip() == drug1) &
+                         (df[col2].astype(str).str.lower().str.strip() == drug2)) |
+                        ((df[col1].astype(str).str.lower().str.strip() == drug2) &
+                         (df[col2].astype(str).str.lower().str.strip() == drug1))
+                    ]
+                    if not matches.empty:
+                        drug_data = matches
+                        break
+        if not drug_data.empty:
+            print(f"Found drugs '{drug1}' and '{drug2}' in the dataset!")
+            return drug_data.iloc[0]  # Returns the first match
+        else:
+            print(f"Drugs '{drug1}' and '{drug2}' not found in the dataset.")
+            return None
+    except Exception as e:
+        print(f"Error searching for drugs in dataset: {e}")
+        return None
+# Function to predict the severity based on the drugs' data
+def predict_severity(drug1, drug2):
+    if not drug1 or not drug2:
+        return "Please enter both drugs to predict interaction severity."
+    # Clean input before processing
+    drug1 = clean_drug_name(drug1)
+    drug2 = clean_drug_name(drug2)
+    print(f"Processing request for drugs: '{drug1}' and '{drug2}'")
+    # For drugs in the dataset, we'll bypass validation
+    drug_data = get_drug_features_from_dataset(drug1, drug2, df)
+    if drug_data is not None:
+        print(f"Found drugs in dataset, bypassing validation")
+        is_valid_drug1 = True
+        is_valid_drug2 = True
+    else:
+        # Step 1: Validate the inputs are actual drug names if not found in dataset
+        print("Drugs not found in dataset, validating through other means")
+        validation_results = []
+        for drug_name in [drug1, drug2]:
+            # Try dataset validation first (individual drug)
+            is_valid, message = validate_drug_input(drug_name)
+            # If not in dataset, try API validation
+            if is_valid is None:
+                is_valid, message = validate_drug_via_api(drug_name)
+            validation_results.append((drug_name, is_valid, message))
+        # If either drug failed validation, return error
+        invalid_drugs = [(name, msg) for name, valid, msg in validation_results if not valid]
+        if invalid_drugs:
+            invalid_names = ", ".join([f"'{name}' ({msg})" for name, msg in invalid_drugs])
+            return f"Invalid drug name(s): {invalid_names}. Please enter valid drug names."
+        is_valid_drug1 = validation_results[0][1]
+        is_valid_drug2 = validation_results[1][1]
+    # If we've made it here, both drugs are valid
+    # If we already have the drug data from the dataset check
+    if drug_data is not None:
+        print(f"Using dataset features for '{drug1}' and '{drug2}'")
+        # Extract features based on available columns
+        try:
+            # Prepare feature dictionary based on available columns
+            drug_features = {}
+            # Map potential column names to expected feature names
+            column_mappings = {
+                'SMILES': ['SMILES', 'smiles'],
+                'pharmacodynamics': ['pharmacodynamics', 'Pharmacodynamics', 'pharmacology'],
+                'toxicity': ['toxicity', 'Toxicity']
+            }
+            # Get features from dataset using flexible column matching
+            for feature, possible_cols in column_mappings.items():
+                feature_found = False
+                for col in possible_cols:
+                    if col in drug_data.index or col in drug_data:
+                        try:
+                            drug_features[feature] = drug_data[col]
+                            feature_found = True
+                            break
+                        except Exception as e:
+                            print(f"Error accessing column {col}: {e}")
+                            continue
+                if not feature_found:
+                    drug_features[feature] = 'No data'
+        except Exception as e:
+            print(f"Error extracting features from dataset: {e}")
+            return f"Error processing drug data: {e}"
+    else:
+        print(f"Fetching API data for '{drug1}' and '{drug2}'")
+        # If drugs not found in dataset, fetch from API
+        drug1_features = get_drug_features_from_api(drug1)
+        if drug1_features is None and is_valid_drug1:
+            # Try again with a fallback approach for special characters
+            drug1_features = {
+                'SMILES': 'No data from API',
+                'pharmacodynamics': 'No data from API',
+                'toxicity': 'No data from API'
+            }
+        drug2_features = get_drug_features_from_api(drug2)
+        if drug2_features is None and is_valid_drug2:
+            # Try again with a fallback approach for special characters
+            drug2_features = {
+                'SMILES': 'No data from API',
+                'pharmacodynamics': 'No data from API',
+                'toxicity': 'No data from API'
+            }
+        # Verify we got data for both drugs
+        if drug1_features is None or drug2_features is None:
+            return "Couldn't retrieve sufficient data for one or both drugs. Please try different drugs or check your spelling."
+        # Combine features from both drugs
+        drug_features = {
+            'SMILES': f"{drug1}: {drug1_features['SMILES']}; {drug2}: {drug2_features['SMILES']}",
+            'pharmacodynamics': f"{drug1}: {drug1_features.get('pharmacodynamics', 'No data')}; {drug2}: {drug2_features.get('pharmacodynamics', 'No data')}",
+            'toxicity': f"{drug1}: {drug1_features.get('toxicity', 'No data')}; {drug2}: {drug2_features.get('toxicity', 'No data')}"
+        }
+    # Create interaction description
+    interaction_description = f"{drug1} interacts with {drug2}"
+    # Tokenize the input for the model
+    inputs = tokenizer(interaction_description, return_tensors="pt", padding=True, truncation=True, max_length=128)
+    # Move inputs to appropriate device
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    input_ids = inputs['input_ids'].to(device)
+    attention_mask = inputs['attention_mask'].to(device)
+    try:
+        # Run the model to get predictions
+        with torch.no_grad():
+            outputs = model(input_ids, attention_mask=attention_mask)
+        # Get the predicted class
+        prediction = torch.argmax(outputs.logits, dim=1).item()
+        # Map the predicted class index to the severity label using label encoder if available
+        if hasattr(label_encoder, 'classes_'):
+            severity_label = label_encoder.classes_[prediction]
+        else:
+            # Fallback labels if encoder doesn't work
+            severity_labels = ["No interaction", "Mild", "Moderate", "Severe"]
+            severity_label = severity_labels[prediction]
+        # Calculate confidence score
+        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
+        confidence = probabilities[0][prediction].item() * 100
+        result = f"Predicted interaction severity: {severity_label} (Confidence: {confidence:.1f}%)"
+        # Add source information
+        if drug_data is not None:
+            result += "\nData source: Features from dataset"
+        else:
+            result += "\nData source: Features from PubChem API"
+        return result
+    except Exception as e:
+        print(f"Error during prediction: {e}")
+        return f"Error making prediction: {e}"
+# Gradio Interface
+# The two text inputs are wired to predict_severity, and the string it
+# returns is shown in the single output textbox.
+interface = gr.Interface(
+    fn=predict_severity,
+    inputs=[
+        gr.Textbox(label="Drug 1 (e.g., Aspirin)", placeholder="Enter first drug name"),
+        gr.Textbox(label="Drug 2 (e.g., Warfarin)", placeholder="Enter second drug name")
+    ],
+    outputs=gr.Textbox(label="Prediction Result"),
+    title="Drug Interaction Severity Predictor",
+    description="Enter two drug names to predict the severity of their interaction.",
+    # Sample inputs: one [drug1, drug2] pair per example.
+    examples=[["Aspirin", "Warfarin"], ["Ibuprofen", "Naproxen"], ["Hydralazine", "Amphetamine"]]
+)
+# Launch the interface
+# Only start the server when the file is executed as a script, not on import.
+if __name__ == "__main__":
+    interface.launch(debug=True)