Spaces:
Running
Running
Added duplicate deletion, limit to input SMILES and updated infos
Browse files
README.md
CHANGED
|
@@ -63,7 +63,9 @@ C1CCCCC1O
|
|
| 63 |
C1=CC=C(C=C1)NC(=O)C2=CC=CC=N2
|
| 64 |
```
|
| 65 |
|
| 66 |
-
The model
|
|
|
|
|
|
|
| 67 |
|
| 68 |
### Programmatic usage
|
| 69 |
|
|
|
|
| 63 |
C1=CC=C(C=C1)NC(=O)C2=CC=CC=N2
|
| 64 |
```
|
| 65 |
|
| 66 |
+
The model processes each drug–target pair in sequence to predict binding affinity. For visualization, only the final SMILES entry is shown.
|
| 67 |
+
If “Remove duplicate SMILES” is enabled, any repeated SMILES strings are filtered out before analysis.
|
| 68 |
+
Final results follow the original SMILES order, with adjusted index numbers when duplicates have been removed.
|
| 69 |
|
| 70 |
### Programmatic usage
|
| 71 |
|
app.py
CHANGED
|
@@ -334,19 +334,41 @@ class DrugTargetInteractionApp:
|
|
| 334 |
# Initialize the app
|
| 335 |
app = DrugTargetInteractionApp()
|
| 336 |
|
| 337 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
"""Wrapper function for Gradio interface"""
|
| 339 |
if not target_seq.strip() or not drug_smiles.strip():
|
| 340 |
return "Please provide both target sequence and drug SMILES."
|
| 341 |
|
| 342 |
-
|
|
|
|
|
|
|
| 343 |
|
| 344 |
-
def visualize_wrapper(target_seq, drug_smiles):
|
| 345 |
"""Wrapper function for visualization"""
|
| 346 |
if not target_seq.strip() or not drug_smiles.strip():
|
| 347 |
return None, None, None, "Please provide both target sequence and drug SMILES."
|
| 348 |
|
| 349 |
-
|
|
|
|
|
|
|
| 350 |
|
| 351 |
def load_model_wrapper(model_path):
|
| 352 |
"""Wrapper function to load model"""
|
|
@@ -389,9 +411,15 @@ with gr.Blocks(title="Drug-Target Interaction Predictor", theme=gr.themes.Soft()
|
|
| 389 |
"For multiple SMILES, enter each on a new line:\n"
|
| 390 |
"CC(C)CC1=CC=C(C=C1)C(C)C(=O)O\n"
|
| 391 |
"C1CCCCC1O",
|
| 392 |
-
lines=
|
|
|
|
| 393 |
)
|
| 394 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
with gr.Row():
|
| 396 |
predict_btn = gr.Button("🚀 Predict Interaction", variant="primary", size="lg")
|
| 397 |
visualize_btn = gr.Button("📊 Generate Visualizations", variant="secondary", size="lg")
|
|
@@ -417,7 +445,7 @@ with gr.Blocks(title="Drug-Target Interaction Predictor", theme=gr.themes.Soft()
|
|
| 417 |
"C1=CC=C(C=C1)NC(=O)C2=CC=CC=N2"
|
| 418 |
]
|
| 419 |
],
|
| 420 |
-
inputs=[target_input, drug_input],
|
| 421 |
outputs=prediction_output,
|
| 422 |
fn=predict_wrapper,
|
| 423 |
cache_examples=False
|
|
@@ -426,13 +454,14 @@ with gr.Blocks(title="Drug-Target Interaction Predictor", theme=gr.themes.Soft()
|
|
| 426 |
# Button click events
|
| 427 |
predict_btn.click(
|
| 428 |
fn=predict_wrapper,
|
| 429 |
-
inputs=[target_input, drug_input],
|
| 430 |
outputs=prediction_output
|
| 431 |
)
|
|
|
|
| 432 |
|
| 433 |
-
def visualize_and_update(target_seq, drug_smiles):
|
| 434 |
"""Generate visualizations and update both status and state"""
|
| 435 |
-
img1, img2, img3, status = visualize_wrapper(target_seq, drug_smiles)
|
| 436 |
# Combine prediction result with visualization status
|
| 437 |
combined_status = status + "\n\nVisualization analysis complete. Please navigate to the Visualizations tab to view the generated images."
|
| 438 |
if len(drug_smiles) > 1:
|
|
@@ -442,11 +471,11 @@ with gr.Blocks(title="Drug-Target Interaction Predictor", theme=gr.themes.Soft()
|
|
| 442 |
|
| 443 |
visualize_btn.click(
|
| 444 |
fn=visualize_and_update,
|
| 445 |
-
inputs=[target_input, drug_input],
|
| 446 |
outputs=[viz_state1, viz_state2, viz_state3, prediction_output],
|
| 447 |
-
api_name="visualize_and_update"
|
| 448 |
)
|
| 449 |
-
|
| 450 |
with gr.Tab("📊 Visualizations"):
|
| 451 |
gr.HTML("""
|
| 452 |
<div style="text-align: center; margin-bottom: 20px;">
|
|
@@ -588,33 +617,34 @@ with gr.Blocks(title="Drug-Target Interaction Predictor", theme=gr.themes.Soft()
|
|
| 588 |
with gr.Tab("ℹ️ About"):
|
| 589 |
gr.Markdown("""
|
| 590 |
## About this application
|
| 591 |
-
|
| 592 |
-
This application implements DLRNA-BERTa, a Dual Language RoBERTa Transformer model for predicting drug
|
| 593 |
-
|
| 594 |
-
- **Target encoder**:
|
| 595 |
-
- **Drug encoder**:
|
| 596 |
-
- **Cross-attention mechanism**: Captures interactions between
|
| 597 |
-
- **Regression head**: Predicts binding affinity
|
| 598 |
-
|
| 599 |
-
### Input requirements
|
| 600 |
-
- **Target sequence**: RNA sequence
|
| 601 |
-
- **Drug SMILES**:
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
-
|
| 607 |
-
-
|
| 608 |
-
-
|
| 609 |
-
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
|
|
|
| 618 |
|
| 619 |
For best results, ensure your input sequences are properly formatted and within reasonable length limits (max 512 tokens).
|
| 620 |
|
|
|
|
| 334 |
# Initialize the app
|
| 335 |
app = DrugTargetInteractionApp()
|
| 336 |
|
| 337 |
+
def smiles_preprocessing(drug_smiles, remove_dups=True):
    """Split a newline-separated SMILES text block into a list of entries.

    Args:
        drug_smiles: Raw text holding one SMILES string per line.
        remove_dups: When true (the default), repeated SMILES strings are
            dropped and only the first occurrence of each is kept. When
            false, every non-blank entry is returned as entered.

    Returns:
        A list of SMILES strings in their original input order. Surrounding
        whitespace is stripped from every entry and blank lines are skipped.
    """
    # splitlines() also copes with "\r\n" line endings; blank lines carry no
    # molecule, so they are dropped rather than passed on as empty SMILES.
    drugs = [line.strip() for line in drug_smiles.splitlines() if line.strip()]

    if not remove_dups:
        return drugs

    # dict keys are unique and keep insertion order, so this removes
    # duplicates in O(n) while preserving the order of first occurrences.
    unique_drugs = list(dict.fromkeys(drugs))

    # Number of dropped entries is total minus kept (never negative).
    logger.info("%d duplicate smiles removed!", len(drugs) - len(unique_drugs))
    return unique_drugs
|
| 354 |
+
|
| 355 |
+
def predict_wrapper(target_seq, drug_smiles, remove_dups):
    """Gradio callback that validates the inputs and runs a prediction.

    Args:
        target_seq: Target sequence text from the UI.
        drug_smiles: Raw SMILES text from the UI (one entry per line).
        remove_dups: State of the "Remove duplicate SMILES" checkbox,
            forwarded to ``smiles_preprocessing``.

    Returns:
        A prompt message when either input is blank; otherwise whatever
        ``app.predict_interaction`` returns for the cleaned inputs.
    """
    cleaned_target = target_seq.strip()

    # Guard clause: both fields must hold something besides whitespace.
    if not (cleaned_target and drug_smiles.strip()):
        return "Please provide both target sequence and drug SMILES."

    smiles_list = smiles_preprocessing(drug_smiles, remove_dups)
    return app.predict_interaction(cleaned_target, smiles_list)
|
| 363 |
|
| 364 |
+
def visualize_wrapper(target_seq, drug_smiles, remove_dups):
    """Validate the inputs and delegate to the app's visualization routine.

    Args:
        target_seq: Target sequence text from the UI.
        drug_smiles: Raw SMILES text from the UI (one entry per line).
        remove_dups: State of the "Remove duplicate SMILES" checkbox,
            forwarded to ``smiles_preprocessing``.

    Returns:
        ``(None, None, None, message)`` when either input is blank;
        otherwise whatever ``app.visualize_interaction`` returns (the
        caller unpacks it as three images plus a status string).
    """
    cleaned_target = target_seq.strip()

    # Guard clause: both fields must hold something besides whitespace.
    if not (cleaned_target and drug_smiles.strip()):
        return None, None, None, "Please provide both target sequence and drug SMILES."

    smiles_list = smiles_preprocessing(drug_smiles, remove_dups)
    return app.visualize_interaction(cleaned_target, smiles_list)
|
| 372 |
|
| 373 |
def load_model_wrapper(model_path):
|
| 374 |
"""Wrapper function to load model"""
|
|
|
|
| 411 |
"For multiple SMILES, enter each on a new line:\n"
|
| 412 |
"CC(C)CC1=CC=C(C=C1)C(C)C(=O)O\n"
|
| 413 |
"C1CCCCC1O",
|
| 414 |
+
lines=4,
|
| 415 |
+
max_lines=2000
|
| 416 |
)
|
| 417 |
|
| 418 |
+
remove_dups_checkbox = gr.Checkbox(
|
| 419 |
+
label="Remove duplicate SMILES",
|
| 420 |
+
value=True
|
| 421 |
+
)
|
| 422 |
+
|
| 423 |
with gr.Row():
|
| 424 |
predict_btn = gr.Button("🚀 Predict Interaction", variant="primary", size="lg")
|
| 425 |
visualize_btn = gr.Button("📊 Generate Visualizations", variant="secondary", size="lg")
|
|
|
|
| 445 |
"C1=CC=C(C=C1)NC(=O)C2=CC=CC=N2"
|
| 446 |
]
|
| 447 |
],
|
| 448 |
+
inputs=[target_input, drug_input, remove_dups_checkbox],
|
| 449 |
outputs=prediction_output,
|
| 450 |
fn=predict_wrapper,
|
| 451 |
cache_examples=False
|
|
|
|
| 454 |
# Button click events
|
| 455 |
predict_btn.click(
|
| 456 |
fn=predict_wrapper,
|
| 457 |
+
inputs=[target_input, drug_input, remove_dups_checkbox],
|
| 458 |
outputs=prediction_output
|
| 459 |
)
|
| 460 |
+
|
| 461 |
|
| 462 |
+
def visualize_and_update(target_seq, drug_smiles, remove_dups):
|
| 463 |
"""Generate visualizations and update both status and state"""
|
| 464 |
+
img1, img2, img3, status = visualize_wrapper(target_seq, drug_smiles, remove_dups)
|
| 465 |
# Combine prediction result with visualization status
|
| 466 |
combined_status = status + "\n\nVisualization analysis complete. Please navigate to the Visualizations tab to view the generated images."
|
| 467 |
if len(drug_smiles) > 1:
|
|
|
|
| 471 |
|
| 472 |
visualize_btn.click(
|
| 473 |
fn=visualize_and_update,
|
| 474 |
+
inputs=[target_input, drug_input, remove_dups_checkbox],
|
| 475 |
outputs=[viz_state1, viz_state2, viz_state3, prediction_output],
|
| 476 |
+
api_name="visualize_and_update"
|
| 477 |
)
|
| 478 |
+
|
| 479 |
with gr.Tab("📊 Visualizations"):
|
| 480 |
gr.HTML("""
|
| 481 |
<div style="text-align: center; margin-bottom: 20px;">
|
|
|
|
| 617 |
with gr.Tab("ℹ️ About"):
|
| 618 |
gr.Markdown("""
|
| 619 |
## About this application
|
| 620 |
+
|
| 621 |
+
This application implements DLRNA-BERTa, a Dual Language RoBERTa Transformer model for predicting drug-to-RNA target interactions. The architecture combines:
|
| 622 |
+
|
| 623 |
+
- **Target encoder**: RNA-BERTa for processing RNA sequences
|
| 624 |
+
- **Drug encoder**: ChemBERTa for SMILES representation
|
| 625 |
+
- **Cross-attention mechanism**: Captures interactions between drug and target
|
| 626 |
+
- **Regression head**: Predicts binding affinity (pKd)
|
| 627 |
+
|
| 628 |
+
### Input requirements
|
| 629 |
+
- **Target sequence**: RNA sequence (A, U, G, C)
|
| 630 |
+
- **Drug SMILES**: One or more SMILES strings
|
| 631 |
+
- For batch mode, enter each SMILES on a new line (max 2000 entries)
|
| 632 |
+
- A checkbox option allows automatic removal of duplicate SMILES before prediction
|
| 633 |
+
|
| 634 |
+
### Model features
|
| 635 |
+
- Cross-attention for drug-target interaction modeling
|
| 636 |
+
- Regularization via dropout
|
| 637 |
+
- Layer normalization for stable training
|
| 638 |
+
- Dedicated interpretability mode for visualization
|
| 639 |
+
- Batch prediction with optional de-duplication
|
| 640 |
+
|
| 641 |
+
### Usage tips
|
| 642 |
+
1. Load a model (optional) in the Model Settings tab
|
| 643 |
+
2. Enter an RNA sequence and one or more SMILES strings
|
| 644 |
+
3. Use the **“Remove duplicate SMILES”** checkbox if you want duplicates filtered automatically
|
| 645 |
+
4. Click *Predict Interaction* for affinity scores
|
| 646 |
+
5. Click *Generate Visualizations* for interpretability plots
|
| 647 |
+
6. Visualizations are produced only for the final SMILES entry in batch mode
|
| 648 |
|
| 649 |
For best results, ensure your input sequences are properly formatted and within reasonable length limits (max 512 tokens).
|
| 650 |
|