yassinekolsi commited on
Commit
5770d80
·
1 Parent(s): b25a5e5

fix: PR review fixes - dockerfile, encoders, orchestrator, paths

Browse files

- dockerfile: use conda run for pip installs, remove hardcoded proxy, add COPY, fix ENV PATH syntax
- orchestrator: remove duplicate Optional import, fix retriever API (modality param), safe getattr
- encoders: batch_encode honors pooling setting in protein/text encoders
- qdrant_retriever: safe modality mapping for legacy values
- plugins/__init__: replace print with logging
- run_summary.json: convert absolute paths to relative (5 files)
- BIOFLOW_README: fix license badge (Apache->MIT)
- USE_POLICY: fix spelling (sperate->separate)
- model_customization.ipynb: fix config path
- enhanced_search: sort by score after MMR, qdrant API fallback
- UI: database filter state management, result display improvements

BIOFLOW_README.md CHANGED
@@ -1,7 +1,7 @@
1
  # BioFlow - AI-Powered Drug Discovery Platform
2
 
3
  [![Version](https://img.shields.io/badge/version-2.0.0-blue.svg)]()
4
- [![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)
5
 
6
  **BioFlow** is a unified AI platform for drug discovery, combining molecular encoding, protein analysis, and drug-target interaction prediction in a modern web interface.
7
 
 
1
  # BioFlow - AI-Powered Drug Discovery Platform
2
 
3
  [![Version](https://img.shields.io/badge/version-2.0.0-blue.svg)]()
4
+ [![License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
5
 
6
  **BioFlow** is a unified AI platform for drug discovery, combining molecular encoding, protein analysis, and drug-target interaction prediction in a modern web interface.
7
 
USE_POLICY.md CHANGED
@@ -6,7 +6,7 @@ You have the right to use BioMedGPT pursuant to relevant agreements, but you can
6
 
7
  1. inciting to resist or undermine the implementation of the Constitution, laws and administrative regulations;
8
  2. inciting to subvert the state power and the overthrow of the political system;
9
- 3. inciting to sperate the state or undermine unity of the country;
10
  4. inciting national enmity or discrimination, undermine the unity of nations;
11
  5. content involving discrimination on the basis of race, sex, religion, geographical content, etc.;
12
  6. fabricating or distorting facts, spreading disinformation, or disturbing the public order;
 
6
 
7
  1. inciting to resist or undermine the implementation of the Constitution, laws and administrative regulations;
8
  2. inciting to subvert the state power and the overthrow of the political system;
9
+ 3. inciting to separate the state or undermine unity of the country;
10
  4. inciting national enmity or discrimination, undermine the unity of nations;
11
  5. content involving discrimination on the basis of race, sex, religion, geographical content, etc.;
12
  6. fabricating or distorting facts, spreading disinformation, or disturbing the public order;
bioflow/api/server.py CHANGED
@@ -539,7 +539,12 @@ async def enhanced_search(request: dict = None):
539
  collection = request.get("collection")
540
  use_mmr = request.get("use_mmr", True)
541
  lambda_param = request.get("lambda_param", 0.7)
542
- filters = request.get("filters")
 
 
 
 
 
543
 
544
  # Map old type names to new modality names
545
  type_to_modality = {
@@ -584,9 +589,9 @@ async def enhanced_search(request: dict = None):
584
  _log_event(
585
  "search",
586
  request_id,
587
- query=request.query[:200],
588
- top_k=request.top_k,
589
- use_mmr=request.use_mmr,
590
  returned=payload.get("returned"),
591
  total_found=payload.get("total_found"),
592
  duration_ms=round((time.perf_counter() - start) * 1000, 2),
 
539
  collection = request.get("collection")
540
  use_mmr = request.get("use_mmr", True)
541
  lambda_param = request.get("lambda_param", 0.7)
542
+ filters = request.get("filters") or {}
543
+ dataset = request.get("dataset") # Optional dataset filter (davis, kiba)
544
+
545
+ # Add dataset filter if specified
546
+ if dataset:
547
+ filters["source"] = dataset.lower()
548
 
549
  # Map old type names to new modality names
550
  type_to_modality = {
 
589
  _log_event(
590
  "search",
591
  request_id,
592
+ query=query[:200] if query else "",
593
+ top_k=top_k,
594
+ use_mmr=use_mmr,
595
  returned=payload.get("returned"),
596
  total_found=payload.get("total_found"),
597
  duration_ms=round((time.perf_counter() - start) * 1000, 2),
bioflow/core/orchestrator.py CHANGED
@@ -12,14 +12,10 @@ from dataclasses import dataclass, field
12
  from datetime import datetime
13
  from collections import defaultdict
14
 
15
- from typing import Optional as OptionalType
16
  from bioflow.core.base import BioEncoder, BioPredictor, BioGenerator, Modality
17
  from bioflow.core.config import NodeConfig, WorkflowConfig, NodeType
18
  from bioflow.core.registry import ToolRegistry
19
 
20
- # Re-import Optional with a different name to avoid conflicts
21
- from typing import Optional
22
-
23
  logging.basicConfig(level=logging.INFO)
24
  logger = logging.getLogger(__name__)
25
 
@@ -174,10 +170,11 @@ class BioFlowOrchestrator:
174
  if self._retriever is None:
175
  raise ValueError("No retriever configured. Call set_retriever() first.")
176
  limit = node.params.get("limit", 5)
177
- modality = node.params.get("modality", "text")
 
178
  return self._retriever.search(
179
  query=node_input,
180
- query_modality=modality,
181
  limit=limit
182
  )
183
 
@@ -191,7 +188,13 @@ class BioFlowOrchestrator:
191
  threshold = node.params.get("threshold", 0.5)
192
  key = node.params.get("key", "score")
193
  if isinstance(node_input, list):
194
- return [x for x in node_input if getattr(x, key, x.get(key, 0)) >= threshold]
 
 
 
 
 
 
195
  return node_input
196
 
197
  elif node.type == NodeType.CUSTOM:
 
12
  from datetime import datetime
13
  from collections import defaultdict
14
 
 
15
  from bioflow.core.base import BioEncoder, BioPredictor, BioGenerator, Modality
16
  from bioflow.core.config import NodeConfig, WorkflowConfig, NodeType
17
  from bioflow.core.registry import ToolRegistry
18
 
 
 
 
19
  logging.basicConfig(level=logging.INFO)
20
  logger = logging.getLogger(__name__)
21
 
 
170
  if self._retriever is None:
171
  raise ValueError("No retriever configured. Call set_retriever() first.")
172
  limit = node.params.get("limit", 5)
173
+ modality_name = node.params.get("modality", "text")
174
+ modality = Modality(modality_name)
175
  return self._retriever.search(
176
  query=node_input,
177
+ modality=modality,
178
  limit=limit
179
  )
180
 
 
188
  threshold = node.params.get("threshold", 0.5)
189
  key = node.params.get("key", "score")
190
  if isinstance(node_input, list):
191
+ def _get_value(item: Any) -> float:
192
+ if hasattr(item, key):
193
+ return getattr(item, key)
194
+ if isinstance(item, dict):
195
+ return item.get(key, 0)
196
+ return 0
197
+ return [x for x in node_input if _get_value(x) >= threshold]
198
  return node_input
199
 
200
  elif node.type == NodeType.CUSTOM:
bioflow/plugins/__init__.py CHANGED
@@ -46,13 +46,21 @@ def register_all(registry=None):
46
 
47
  Args:
48
  registry: ToolRegistry instance (uses global if None)
 
 
 
49
  """
 
 
 
50
  from bioflow.core import ToolRegistry
51
  registry = registry or ToolRegistry
52
 
53
- # Note: Encoders are lazy-loaded, so we don't instantiate here
54
- # They will be registered when first used
55
- print("Plugins available for registration:")
56
- print(" Encoders: OBMEncoder, TextEncoder, MoleculeEncoder, ProteinEncoder")
57
- print(" Retrievers: QdrantRetriever")
58
- print(" Predictors: DeepPurposePredictor")
 
 
 
46
 
47
  Args:
48
  registry: ToolRegistry instance (uses global if None)
49
+
50
+ Returns:
51
+ dict: Available plugin classes by category
52
  """
53
+ import logging
54
+ logger = logging.getLogger(__name__)
55
+
56
  from bioflow.core import ToolRegistry
57
  registry = registry or ToolRegistry
58
 
59
+ available = {
60
+ "encoders": ["OBMEncoder", "TextEncoder", "MoleculeEncoder", "ProteinEncoder"],
61
+ "retrievers": ["QdrantRetriever"],
62
+ "predictors": ["DeepPurposePredictor"],
63
+ }
64
+
65
+ logger.info(f"Plugins available for registration: {available}")
66
+ return available
bioflow/plugins/encoders/protein_encoder.py CHANGED
@@ -173,8 +173,13 @@ class ProteinEncoder(BioEncoder):
173
  with torch.no_grad():
174
  outputs = self.model(**inputs)
175
  hidden_states = outputs.last_hidden_state
176
- attention_mask = inputs["attention_mask"].unsqueeze(-1)
177
- embeddings = (hidden_states * attention_mask).sum(1) / attention_mask.sum(1)
 
 
 
 
 
178
 
179
  results = []
180
  for i, emb in enumerate(embeddings):
 
173
  with torch.no_grad():
174
  outputs = self.model(**inputs)
175
  hidden_states = outputs.last_hidden_state
176
+
177
+ # Apply same pooling strategy as encode()
178
+ if self.pooling == "cls":
179
+ embeddings = hidden_states[:, 0, :]
180
+ else: # mean pooling
181
+ attention_mask = inputs["attention_mask"].unsqueeze(-1)
182
+ embeddings = (hidden_states * attention_mask).sum(1) / attention_mask.sum(1)
183
 
184
  results = []
185
  for i, emb in enumerate(embeddings):
bioflow/plugins/encoders/text_encoder.py CHANGED
@@ -159,11 +159,16 @@ class TextEncoder(BioEncoder):
159
  outputs = self.model(**inputs)
160
  hidden_states = outputs.last_hidden_state
161
 
162
- if self.pooling == "mean":
 
 
 
163
  attention_mask = inputs["attention_mask"].unsqueeze(-1)
164
  embeddings = (hidden_states * attention_mask).sum(1) / attention_mask.sum(1)
 
 
165
  else:
166
- embeddings = hidden_states[:, 0, :]
167
 
168
  results = []
169
  for i, emb in enumerate(embeddings):
 
159
  outputs = self.model(**inputs)
160
  hidden_states = outputs.last_hidden_state
161
 
162
+ # Apply same pooling strategy as encode()
163
+ if self.pooling == "cls":
164
+ embeddings = hidden_states[:, 0, :]
165
+ elif self.pooling == "mean":
166
  attention_mask = inputs["attention_mask"].unsqueeze(-1)
167
  embeddings = (hidden_states * attention_mask).sum(1) / attention_mask.sum(1)
168
+ elif self.pooling == "max":
169
+ embeddings = hidden_states.max(dim=1).values
170
  else:
171
+ raise ValueError(f"Unknown pooling: {self.pooling}")
172
 
173
  results = []
174
  for i, emb in enumerate(embeddings):
bioflow/plugins/qdrant_retriever.py CHANGED
@@ -193,13 +193,29 @@ class QdrantRetriever(BioRetriever):
193
  query_filter=qdrant_filter
194
  )
195
 
196
- # Convert to RetrievalResult
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  return [
198
  RetrievalResult(
199
  id=str(r.id),
200
  score=r.score,
201
  content=r.payload.get("content", ""),
202
- modality=Modality(r.payload.get("modality", "text")),
203
  payload=r.payload
204
  )
205
  for r in results
 
193
  query_filter=qdrant_filter
194
  )
195
 
196
+ # Convert to RetrievalResult with safe modality mapping
197
+ def _safe_modality(payload: dict) -> Modality:
198
+ raw = payload.get("modality")
199
+ if isinstance(raw, Modality):
200
+ return raw
201
+ if not isinstance(raw, str):
202
+ return Modality.TEXT
203
+ norm = raw.strip().lower()
204
+ # Map legacy/synonym values
205
+ synonym_map = {"molecule": "smiles", "drug": "smiles"}
206
+ if norm in synonym_map:
207
+ norm = synonym_map[norm]
208
+ try:
209
+ return Modality(norm)
210
+ except ValueError:
211
+ return Modality.TEXT
212
+
213
  return [
214
  RetrievalResult(
215
  id=str(r.id),
216
  score=r.score,
217
  content=r.payload.get("content", ""),
218
+ modality=_safe_modality(r.payload),
219
  payload=r.payload
220
  )
221
  for r in results
bioflow/runs/20260125_080409_BindingDB_Kd/run_summary.json CHANGED
@@ -20,12 +20,12 @@
20
  "ci_approx": 0.8053618329014657
21
  },
22
  "files": {
23
- "predictions_test_csv": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_080409_BindingDB_Kd\\predictions_test.csv",
24
- "scatter_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_080409_BindingDB_Kd\\scatter.png",
25
- "curves_sorted_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_080409_BindingDB_Kd\\curves_sorted.png",
26
- "residuals_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_080409_BindingDB_Kd\\residuals.png",
27
- "hist_true_pred_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_080409_BindingDB_Kd\\hist_true_pred.png",
28
- "ecdf_true_pred_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_080409_BindingDB_Kd\\ecdf_true_pred.png"
29
  },
30
  "timing": {
31
  "total_seconds": 6569.407529115677,
 
20
  "ci_approx": 0.8053618329014657
21
  },
22
  "files": {
23
+ "predictions_test_csv": "predictions_test.csv",
24
+ "scatter_png": "scatter.png",
25
+ "curves_sorted_png": "curves_sorted.png",
26
+ "residuals_png": "residuals.png",
27
+ "hist_true_pred_png": "hist_true_pred.png",
28
+ "ecdf_true_pred_png": "ecdf_true_pred.png"
29
  },
30
  "timing": {
31
  "total_seconds": 6569.407529115677,
bioflow/runs/20260125_104915_KIBA/run_summary.json CHANGED
@@ -20,12 +20,12 @@
20
  "ci_approx": 0.7031028951074637
21
  },
22
  "files": {
23
- "predictions_test_csv": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_104915_KIBA\\predictions_test.csv",
24
- "scatter_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_104915_KIBA\\scatter.png",
25
- "curves_sorted_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_104915_KIBA\\curves_sorted.png",
26
- "residuals_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_104915_KIBA\\residuals.png",
27
- "hist_true_pred_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_104915_KIBA\\hist_true_pred.png",
28
- "ecdf_true_pred_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_104915_KIBA\\ecdf_true_pred.png"
29
  },
30
  "timing": {
31
  "total_seconds": 26236.644289016724,
 
20
  "ci_approx": 0.7031028951074637
21
  },
22
  "files": {
23
+ "predictions_test_csv": "predictions_test.csv",
24
+ "scatter_png": "scatter.png",
25
+ "curves_sorted_png": "curves_sorted.png",
26
+ "residuals_png": "residuals.png",
27
+ "hist_true_pred_png": "hist_true_pred.png",
28
+ "ecdf_true_pred_png": "ecdf_true_pred.png"
29
  },
30
  "timing": {
31
  "total_seconds": 26236.644289016724,
bioflow/search/enhanced_search.py CHANGED
@@ -252,6 +252,12 @@ class EnhancedSearchService:
252
  diversity_score = None
253
  enhanced_results = self._raw_to_enhanced(raw_results[:top_k])
254
 
 
 
 
 
 
 
255
  # Apply evidence linking
256
  enhanced_results = self._add_evidence_links(enhanced_results)
257
 
@@ -413,14 +419,26 @@ class EnhancedSearchService:
413
 
414
  for coll in collections:
415
  try:
416
- results = client.query_points(
417
- collection_name=coll,
418
- query=query_embedding,
419
- limit=limit,
420
- query_filter=query_filter,
421
- with_payload=True,
422
- with_vectors=with_vectors,
423
- ).points
 
 
 
 
 
 
 
 
 
 
 
 
424
 
425
  for r in results:
426
  payload_modality = r.payload.get('modality', 'unknown')
 
252
  diversity_score = None
253
  enhanced_results = self._raw_to_enhanced(raw_results[:top_k])
254
 
255
+ # Sort by original score for display (MMR selection already done)
256
+ enhanced_results.sort(key=lambda x: x.score, reverse=True)
257
+ # Update ranks after sorting
258
+ for i, r in enumerate(enhanced_results):
259
+ r.rank = i + 1
260
+
261
  # Apply evidence linking
262
  enhanced_results = self._add_evidence_links(enhanced_results)
263
 
 
419
 
420
  for coll in collections:
421
  try:
422
+ # Use search() for qdrant-client < 1.10, query_points() for >= 1.10
423
+ try:
424
+ results = client.query_points(
425
+ collection_name=coll,
426
+ query=query_embedding,
427
+ limit=limit,
428
+ query_filter=query_filter,
429
+ with_payload=True,
430
+ with_vectors=with_vectors,
431
+ ).points
432
+ except AttributeError:
433
+ # Fallback to older API (qdrant-client < 1.10)
434
+ results = client.search(
435
+ collection_name=coll,
436
+ query_vector=query_embedding,
437
+ limit=limit,
438
+ query_filter=query_filter,
439
+ with_payload=True,
440
+ with_vectors=with_vectors,
441
+ )
442
 
443
  for r in results:
444
  payload_modality = r.payload.get('modality', 'unknown')
dockerfile CHANGED
@@ -28,33 +28,36 @@ RUN conda init bash \
28
  && conda activate OpenBioMed \
29
  && pip install --upgrade pip setuptools
30
 
31
- # Installing PyTorch and torchvision
32
- RUN pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu117 \
33
- && pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-1.13.1+cu117.html \
34
- && pip install pytorch_lightning==2.0.8 peft==0.9.0 accelerate==1.3.0 --no-deps -i https://pypi.tuna.tsinghua.edu.cn/simple
 
 
 
 
35
 
36
  # Install additional packages from requirements.txt
37
- RUN pip install -r requirements.txt
38
 
39
  # Install visualization tools
40
- RUN conda install -c conda-forge pymol-open-source -y \
41
- && pip install imageio
42
 
43
- # Install AutoDockVina tools
44
- RUN git config --global http.proxy http://100.68.173.241:3128 \
45
- && git config --global https.proxy http://100.68.173.241:3128 \
46
- && pip install meeko==0.1.dev3 pdb2pqr vina==1.2.2 \
47
- && pip install git+https://github.com/Valdes-Tresanco-MS/AutoDockTools_py3
48
 
49
  # Install NLTK
50
- RUN pip install spacy rouge_score nltk \
51
- && python -c "import nltk; nltk.download('wordnet'); nltk.download('omw-1.4')"
52
-
53
- # Set working directory
54
- WORKDIR /app
55
 
56
  # Activate the OpenBioMed environment by default
57
  RUN echo "source activate OpenBioMed" >> ~/.bashrc
 
 
 
 
58
 
59
  # Set default command
60
  ENTRYPOINT ["./scripts/run_server.sh"]
 
28
  && conda activate OpenBioMed \
29
  && pip install --upgrade pip setuptools
30
 
31
+ # Installing PyTorch and torchvision (using conda run to install in OpenBioMed env)
32
+ RUN conda run -n OpenBioMed pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu117 \
33
+ && conda run -n OpenBioMed pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-1.13.1+cu117.html \
34
+ && conda run -n OpenBioMed pip install pytorch_lightning==2.0.8 peft==0.9.0 accelerate==1.3.0 --no-deps
35
+
36
+ # Set working directory and copy application files
37
+ WORKDIR /app
38
+ COPY . /app/
39
 
40
  # Install additional packages from requirements.txt
41
+ RUN conda run -n OpenBioMed pip install -r requirements.txt
42
 
43
  # Install visualization tools
44
+ RUN conda install -n OpenBioMed -c conda-forge pymol-open-source -y \
45
+ && conda run -n OpenBioMed pip install imageio
46
 
47
+ # Install AutoDockVina tools (proxy removed - use Docker build args if needed)
48
+ RUN conda run -n OpenBioMed pip install meeko==0.1.dev3 pdb2pqr vina==1.2.2 \
49
+ && conda run -n OpenBioMed pip install git+https://github.com/Valdes-Tresanco-MS/AutoDockTools_py3
 
 
50
 
51
  # Install NLTK
52
+ RUN conda run -n OpenBioMed pip install spacy rouge_score nltk \
53
+ && conda run -n OpenBioMed python -c "import nltk; nltk.download('wordnet'); nltk.download('omw-1.4')"
 
 
 
54
 
55
  # Activate the OpenBioMed environment by default
56
  RUN echo "source activate OpenBioMed" >> ~/.bashrc
57
+ ENV PATH="/root/miniconda3/envs/OpenBioMed/bin:$PATH"
58
+
59
+ # Make entrypoint executable
60
+ RUN chmod +x ./scripts/run_server.sh || true
61
 
62
  # Set default command
63
  ENTRYPOINT ["./scripts/run_server.sh"]
examples/model_customization.ipynb CHANGED
@@ -9,7 +9,8 @@
9
  "from open_biomed.core.pipeline import InferencePipeline\n",
10
  "from open_biomed.data import Molecule, Text\n",
11
  "\n",
12
- "cfg_path = \"./configs/text_based_molecule_editing/molt5.json\"\n",
 
13
  "pipeline = InferencePipeline(cfg_path)\n",
14
  "mol = [Molecule.from_smiles(\"CCCCC\")]\n",
15
  "text = [Text.from_str(\"wow\")]\n",
 
9
  "from open_biomed.core.pipeline import InferencePipeline\n",
10
  "from open_biomed.data import Molecule, Text\n",
11
  "\n",
12
+ "# Path relative to repo root (run from examples/ directory)\n",
13
+ "cfg_path = \"../configs/text_based_molecule_editing/molt5.json\"\n",
14
  "pipeline = InferencePipeline(cfg_path)\n",
15
  "mol = [Molecule.from_smiles(\"CCCCC\")]\n",
16
  "text = [Text.from_str(\"wow\")]\n",
ingest_dti_data.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Ingest KIBA/DAVIS Drug-Target Interaction datasets into Qdrant.
3
+
4
+ Uses OBMEncoder (768-dim) to create searchable vectors from real DTI data.
5
+ """
6
+ import sys
7
+ import os
8
+ import argparse
9
+
10
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
11
+ sys.path.insert(0, ROOT_DIR)
12
+
13
+ import pandas as pd
14
+ from tqdm import tqdm
15
+
16
+ from bioflow.api.qdrant_service import get_qdrant_service
17
+
18
+
19
def load_dataset(dataset_name: str, limit: int = None) -> pd.DataFrame:
    """Load the KIBA or DAVIS dataset from a local tab-separated file.

    Args:
        dataset_name: Dataset identifier ("kiba" or "davis"); case-insensitive,
            lowered to build the `data/<name>.tab` path under ROOT_DIR.
        limit: If truthy, keep only the first `limit` rows after de-duplication.

    Returns:
        DataFrame with columns: drug_id, smiles, target_id, target_seq, affinity.

    Raises:
        FileNotFoundError: If the expected .tab file does not exist.
        ValueError: If the file does not have exactly the 5 expected columns.
    """
    filepath = os.path.join(ROOT_DIR, "data", f"{dataset_name.lower()}.tab")

    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Dataset not found: {filepath}")

    print(f"Loading {dataset_name} from {filepath}...")
    df = pd.read_csv(filepath, sep='\t')

    # Expected format: ID1, X1 (SMILES), ID2, X2 (sequence), Y (affinity).
    # Fail fast on schema drift instead of silently mislabeling columns
    # with the blind positional rename below.
    expected_cols = ['drug_id', 'smiles', 'target_id', 'target_seq', 'affinity']
    if len(df.columns) != len(expected_cols):
        raise ValueError(
            f"Expected {len(expected_cols)} columns in {filepath}, "
            f"got {len(df.columns)}: {list(df.columns)}"
        )
    df.columns = expected_cols

    # Keep one row per unique drug-target pair.
    df = df.drop_duplicates(subset=['smiles', 'target_id'])

    if limit:
        df = df.head(limit)

    print(f" Loaded {len(df)} unique drug-target pairs")
    return df
41
+
42
+
43
def get_affinity_class(affinity: float, dataset: str) -> str:
    """Bucket an affinity value into "high", "medium", or "low".

    Lower values are treated as stronger binding for both datasets.
    KIBA uses cut-offs of 6 and 8; any other dataset name (i.e. DAVIS)
    uses 6 and 7. NOTE(review): whether "lower is better" matches the
    units actually stored in the .tab files should be confirmed against
    the data source.
    """
    # (high_cutoff, medium_cutoff) per dataset family.
    high_cut, medium_cut = (6, 8) if dataset.upper() == "KIBA" else (6, 7)

    if affinity < high_cut:
        return "high"
    if affinity < medium_cut:
        return "medium"
    return "low"
61
+
62
+
63
def get_drug_name(drug_id, smiles: str) -> str:
    """Return a human-readable drug name derived from *drug_id*.

    Purely numeric IDs are assumed to be PubChem CIDs and rendered as
    "CID-<id>"; anything else is returned verbatim as a string. The
    *smiles* argument is currently unused but kept for interface
    compatibility with callers.
    """
    name = str(drug_id)
    # All-digit IDs get the PubChem-style prefix for readability.
    return f"CID-{name}" if name.isdigit() else name
71
+
72
+
73
def ingest_molecules(qdrant, df: pd.DataFrame, dataset: str, batch_size: int = 50):
    """Ingest unique drugs (one point per distinct SMILES) into Qdrant.

    Args:
        qdrant: Service object exposing ingest(content, modality, metadata).
        df: DTI frame with smiles/drug_id/affinity/target_id columns
            (as produced by load_dataset).
        dataset: Dataset name ("kiba"/"davis"); stored in metadata and used
            to pick affinity-classification thresholds.
        batch_size: Currently unused; kept for interface compatibility.

    Returns:
        int: Number of molecules successfully ingested.
    """
    print("\n[1/2] Ingesting molecules (drugs)...")

    # Aggregate per unique SMILES: best (minimum) affinity and target count.
    unique_drugs = df.groupby('smiles').agg({
        'drug_id': 'first',
        'affinity': 'min',  # Best affinity
        'target_id': 'count'  # Number of targets
    }).reset_index()
    unique_drugs.columns = ['smiles', 'drug_id', 'best_affinity', 'num_targets']

    print(f" Found {len(unique_drugs)} unique molecules")

    success_count = 0
    error_count = 0
    for idx, row in tqdm(unique_drugs.iterrows(), total=len(unique_drugs), desc=" Molecules"):
        try:
            affinity_class = get_affinity_class(row['best_affinity'], dataset)
            drug_name = get_drug_name(row['drug_id'], row['smiles'])

            qdrant.ingest(
                content=row['smiles'],
                modality="molecule",
                metadata={
                    "name": drug_name,
                    "drug_id": str(row['drug_id']),  # Keep original ID
                    "smiles": row['smiles'],
                    "description": f"Drug from {dataset.upper()} dataset",
                    "source": dataset.lower(),
                    "dataset": dataset.lower(),
                    "best_affinity": float(row['best_affinity']),
                    "affinity_class": affinity_class,
                    "num_targets": int(row['num_targets']),
                }
            )
            success_count += 1
        except Exception as e:
            # Best-effort ingestion: keep going, but count every failure
            # instead of silently dropping errors after the first success.
            error_count += 1
            if error_count == 1:
                print(f"\n First error: {e}")  # Show first error for debugging

    if error_count:
        print(f" ⚠ {error_count} molecules failed to ingest (first error shown above)")
    print(f" ✓ Ingested {success_count}/{len(unique_drugs)} molecules")
    return success_count
115
+
116
+
117
def ingest_proteins(qdrant, df: pd.DataFrame, dataset: str, batch_size: int = 50):
    """Ingest unique proteins (one point per distinct target_id) into Qdrant.

    Args:
        qdrant: Service object exposing ingest(content, modality, metadata).
        df: DTI frame with smiles/target_id/target_seq/affinity columns
            (as produced by load_dataset).
        dataset: Dataset name ("kiba"/"davis"); stored in metadata and used
            to pick affinity-classification thresholds.
        batch_size: Currently unused; kept for interface compatibility.

    Returns:
        int: Number of proteins successfully ingested.
    """
    print("\n[2/2] Ingesting proteins (targets)...")

    # Aggregate per unique target: best (minimum) affinity and drug count.
    unique_targets = df.groupby('target_id').agg({
        'target_seq': 'first',
        'affinity': 'min',  # Best affinity
        'smiles': 'count'  # Number of drugs
    }).reset_index()
    unique_targets.columns = ['target_id', 'target_seq', 'best_affinity', 'num_drugs']

    print(f" Found {len(unique_targets)} unique proteins")

    success_count = 0
    error_count = 0
    for idx, row in tqdm(unique_targets.iterrows(), total=len(unique_targets), desc=" Proteins"):
        try:
            # Truncate very long sequences for embedding; keep the original
            # length in metadata so truncation is detectable downstream.
            sequence = str(row['target_seq'])[:1000]
            affinity_class = get_affinity_class(row['best_affinity'], dataset)

            qdrant.ingest(
                content=sequence,
                modality="protein",
                metadata={
                    "name": row['target_id'],
                    "uniprot_id": row['target_id'],
                    "sequence": sequence,
                    "full_length": len(str(row['target_seq'])),
                    "description": f"Target from {dataset.upper()} dataset",
                    "source": dataset.lower(),
                    "dataset": dataset.lower(),
                    "best_affinity": float(row['best_affinity']),
                    "affinity_class": affinity_class,
                    "num_drugs": int(row['num_drugs']),
                }
            )
            success_count += 1
        except Exception as e:
            # Best-effort ingestion: keep going, but count every failure
            # instead of silently dropping errors after the first success.
            error_count += 1
            if error_count == 1:
                print(f"\n First error: {e}")  # Show first error for debugging

    if error_count:
        print(f" ⚠ {error_count} proteins failed to ingest (first error shown above)")
    print(f" ✓ Ingested {success_count}/{len(unique_targets)} proteins")
    return success_count
161
+
162
+
163
def main() -> None:
    """CLI entry point: load KIBA/DAVIS .tab files and ingest them into Qdrant.

    Parses --dataset/--limit/--clear, optionally wipes existing collections,
    then ingests molecules and proteins per dataset, printing a summary.
    """
    parser = argparse.ArgumentParser(description="Ingest KIBA/DAVIS datasets into Qdrant")
    parser.add_argument("--dataset", choices=["kiba", "davis", "both"], default="davis",
                        help="Dataset to ingest (default: davis)")
    parser.add_argument("--limit", type=int, default=1000,
                        help="Limit number of records per dataset (default: 1000, 0 for all)")
    parser.add_argument("--clear", action="store_true",
                        help="Clear existing collections before ingesting")
    args = parser.parse_args()

    print("=" * 60)
    print(" KIBA/DAVIS -> QDRANT INGESTION")
    print("=" * 60)

    qdrant = get_qdrant_service()

    if args.clear:
        print("\nClearing existing collections...")
        # NOTE(review): reaches into private members of the Qdrant service
        # (_get_client, _initialized_collections) — consider a public
        # clear/reset API on the service instead.
        try:
            client = qdrant._get_client()
            for coll in qdrant.list_collections():
                client.delete_collection(coll)
                print(f" Deleted: {coll}")
            # Clear the cache so collections will be recreated
            qdrant._initialized_collections.clear()
        except Exception as e:
            # Best-effort cleanup: a failed clear should not abort ingestion.
            print(f" Warning: {e}")

    datasets = ["kiba", "davis"] if args.dataset == "both" else [args.dataset]
    # --limit 0 means "no limit" (load_dataset skips truncation on None).
    limit = args.limit if args.limit > 0 else None

    total_molecules = 0
    total_proteins = 0

    for dataset in datasets:
        print(f"\n{'='*60}")
        print(f" Processing {dataset.upper()}")
        print("=" * 60)

        try:
            df = load_dataset(dataset, limit=limit)
            total_molecules += ingest_molecules(qdrant, df, dataset)
            total_proteins += ingest_proteins(qdrant, df, dataset)
        except FileNotFoundError as e:
            # A missing .tab file skips that dataset rather than aborting the run.
            print(f" ERROR: {e}")
            continue

    print("\n" + "=" * 60)
    print(" INGESTION COMPLETE")
    print("=" * 60)
    print(f" Total molecules: {total_molecules}")
    print(f" Total proteins: {total_proteins}")
    print(f"\nSearch at: http://localhost:3000/dashboard/discovery")
216
+
217
+
218
+ if __name__ == "__main__":
219
+ main()
220
+
runs/20260125_080409_BindingDB_Kd/run_summary.json CHANGED
@@ -20,12 +20,12 @@
20
  "ci_approx": 0.8053618329014657
21
  },
22
  "files": {
23
- "predictions_test_csv": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_080409_BindingDB_Kd\\predictions_test.csv",
24
- "scatter_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_080409_BindingDB_Kd\\scatter.png",
25
- "curves_sorted_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_080409_BindingDB_Kd\\curves_sorted.png",
26
- "residuals_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_080409_BindingDB_Kd\\residuals.png",
27
- "hist_true_pred_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_080409_BindingDB_Kd\\hist_true_pred.png",
28
- "ecdf_true_pred_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_080409_BindingDB_Kd\\ecdf_true_pred.png"
29
  },
30
  "timing": {
31
  "total_seconds": 6569.407529115677,
 
20
  "ci_approx": 0.8053618329014657
21
  },
22
  "files": {
23
+ "predictions_test_csv": "predictions_test.csv",
24
+ "scatter_png": "scatter.png",
25
+ "curves_sorted_png": "curves_sorted.png",
26
+ "residuals_png": "residuals.png",
27
+ "hist_true_pred_png": "hist_true_pred.png",
28
+ "ecdf_true_pred_png": "ecdf_true_pred.png"
29
  },
30
  "timing": {
31
  "total_seconds": 6569.407529115677,
runs/20260125_104915_KIBA/run_summary.json CHANGED
@@ -20,12 +20,12 @@
20
  "ci_approx": 0.7031028951074637
21
  },
22
  "files": {
23
- "predictions_test_csv": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_104915_KIBA\\predictions_test.csv",
24
- "scatter_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_104915_KIBA\\scatter.png",
25
- "curves_sorted_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_104915_KIBA\\curves_sorted.png",
26
- "residuals_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_104915_KIBA\\residuals.png",
27
- "hist_true_pred_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_104915_KIBA\\hist_true_pred.png",
28
- "ecdf_true_pred_png": "c:\\Users\\hamza\\Downloads\\Free Spitfire 3D printed RC plane - 2118624\\files\\runs\\20260125_104915_KIBA\\ecdf_true_pred.png"
29
  },
30
  "timing": {
31
  "total_seconds": 26236.644289016724,
 
20
  "ci_approx": 0.7031028951074637
21
  },
22
  "files": {
23
+ "predictions_test_csv": "predictions_test.csv",
24
+ "scatter_png": "scatter.png",
25
+ "curves_sorted_png": "curves_sorted.png",
26
+ "residuals_png": "residuals.png",
27
+ "hist_true_pred_png": "hist_true_pred.png",
28
+ "ecdf_true_pred_png": "ecdf_true_pred.png"
29
  },
30
  "timing": {
31
  "total_seconds": 26236.644289016724,
ui/app/api/agents/workflow/route.ts CHANGED
@@ -1,4 +1,3 @@
1
- <<<<<<< HEAD
2
  import { NextResponse } from "next/server"
3
  import { API_CONFIG } from "@/config/api.config"
4
 
@@ -67,44 +66,3 @@ export async function POST(request: Request) {
67
  const mockResult = generateMockWorkflowResult(query, num_candidates)
68
  return NextResponse.json(mockResult)
69
  }
70
- =======
71
- import { NextResponse } from "next/server"
72
-
73
- import { API_CONFIG } from "@/config/api.config"
74
-
75
- export async function POST(request: Request) {
76
- const body = await request.json().catch(() => ({}))
77
-
78
- try {
79
- const response = await fetch(`${API_CONFIG.baseUrl}/api/agents/workflow`, {
80
- method: "POST",
81
- headers: { "Content-Type": "application/json" },
82
- body: JSON.stringify(body),
83
- cache: "no-store",
84
- })
85
-
86
- const data = await response.json().catch(() => null)
87
- if (!response.ok) {
88
- return NextResponse.json(
89
- { error: data?.detail || data?.error || `Backend returned ${response.status}` },
90
- { status: response.status }
91
- )
92
- }
93
-
94
- return NextResponse.json(data)
95
- } catch (error) {
96
- console.warn("Workflow API error, using mock response:", error)
97
- return NextResponse.json({
98
- success: true,
99
- status: "mock",
100
- steps_completed: 0,
101
- total_steps: 0,
102
- execution_time_ms: 0,
103
- top_candidates: [],
104
- all_outputs: {},
105
- errors: ["Backend unavailable"],
106
- })
107
- }
108
- }
109
-
110
- >>>>>>> Rami
 
 
1
  import { NextResponse } from "next/server"
2
  import { API_CONFIG } from "@/config/api.config"
3
 
 
66
  const mockResult = generateMockWorkflowResult(query, num_candidates)
67
  return NextResponse.json(mockResult)
68
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui/app/dashboard/discovery/page.tsx CHANGED
@@ -16,15 +16,24 @@ const API_BASE = process.env.NEXT_PUBLIC_API_URL || "http://localhost:8000";
16
  interface SearchResult {
17
  id: string;
18
  score: number;
19
- smiles: string;
20
- target_seq: string;
21
- label: number;
22
- affinity_class: string;
 
 
 
 
 
 
 
 
23
  }
24
 
25
  export default function DiscoveryPage() {
26
  const [query, setQuery] = React.useState("")
27
  const [searchType, setSearchType] = React.useState("Similarity")
 
28
  const [isSearching, setIsSearching] = React.useState(false)
29
  const [step, setStep] = React.useState(0)
30
  const [results, setResults] = React.useState<SearchResult[]>([])
@@ -70,7 +79,8 @@ export default function DiscoveryPage() {
70
  body: JSON.stringify({
71
  query: query.trim(),
72
  type: apiType,
73
- limit: 10
 
74
  })
75
  });
76
 
@@ -146,13 +156,14 @@ export default function DiscoveryPage() {
146
  </div>
147
  <div className="space-y-2">
148
  <Label>Database</Label>
149
- <Select defaultValue="KIBA">
150
  <SelectTrigger>
151
  <SelectValue placeholder="Select database" />
152
  </SelectTrigger>
153
  <SelectContent>
154
- <SelectItem value="KIBA">KIBA (23.5K pairs)</SelectItem>
155
- <SelectItem value="DAVIS">DAVIS Kinase</SelectItem>
 
156
  </SelectContent>
157
  </Select>
158
  </div>
@@ -221,19 +232,40 @@ export default function DiscoveryPage() {
221
  <Card key={result.id}>
222
  <CardContent className="p-4 flex items-center justify-between">
223
  <div className="flex-1">
224
- <div className="font-mono text-sm font-medium">
225
- {result.smiles?.slice(0, 50)}{result.smiles?.length > 50 ? '...' : ''}
226
  </div>
227
- <div className="flex gap-4 text-sm text-muted-foreground mt-1">
228
- <span>Affinity: {result.affinity_class}</span>
229
- <span>Label: {result.label?.toFixed(2)}</span>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  </div>
231
  </div>
232
  <div className="text-right">
233
  <div className="text-sm text-muted-foreground">Similarity</div>
234
  <div className={`text-xl font-bold ${
235
  result.score >= 0.9 ? 'text-green-600' :
236
- result.score >= 0.7 ? 'text-green-500' : 'text-amber-500'
 
237
  }`}>
238
  {result.score.toFixed(3)}
239
  </div>
 
16
  interface SearchResult {
17
  id: string;
18
  score: number;
19
+ mmr_score?: number;
20
+ content: string;
21
+ modality: string;
22
+ metadata: {
23
+ name?: string;
24
+ smiles?: string;
25
+ description?: string;
26
+ source?: string;
27
+ label_true?: number;
28
+ affinity_class?: string;
29
+ [key: string]: unknown;
30
+ };
31
  }
32
 
33
  export default function DiscoveryPage() {
34
  const [query, setQuery] = React.useState("")
35
  const [searchType, setSearchType] = React.useState("Similarity")
36
+ const [database, setDatabase] = React.useState("both")
37
  const [isSearching, setIsSearching] = React.useState(false)
38
  const [step, setStep] = React.useState(0)
39
  const [results, setResults] = React.useState<SearchResult[]>([])
 
79
  body: JSON.stringify({
80
  query: query.trim(),
81
  type: apiType,
82
+ limit: 10,
83
+ dataset: database !== "both" ? database.toLowerCase() : undefined
84
  })
85
  });
86
 
 
156
  </div>
157
  <div className="space-y-2">
158
  <Label>Database</Label>
159
+ <Select value={database} onValueChange={setDatabase}>
160
  <SelectTrigger>
161
  <SelectValue placeholder="Select database" />
162
  </SelectTrigger>
163
  <SelectContent>
164
+ <SelectItem value="both">All Datasets</SelectItem>
165
+ <SelectItem value="kiba">KIBA (Kinase Inhibitors)</SelectItem>
166
+ <SelectItem value="davis">DAVIS (Kinase Targets)</SelectItem>
167
  </SelectContent>
168
  </Select>
169
  </div>
 
232
  <Card key={result.id}>
233
  <CardContent className="p-4 flex items-center justify-between">
234
  <div className="flex-1">
235
+ <div className="font-semibold text-base mb-1">
236
+ {result.metadata?.name || `Result ${i + 1}`}
237
  </div>
238
+ <div className="font-mono text-sm text-muted-foreground">
239
+ {(result.metadata?.smiles || result.content)?.slice(0, 60)}
240
+ {(result.metadata?.smiles || result.content)?.length > 60 ? '...' : ''}
241
+ </div>
242
+ {result.metadata?.description && (
243
+ <div className="text-sm text-muted-foreground mt-1">
244
+ {result.metadata.description}
245
+ </div>
246
+ )}
247
+ <div className="flex gap-4 text-xs text-muted-foreground mt-2">
248
+ {result.metadata?.affinity_class && (
249
+ <span className="bg-muted px-2 py-0.5 rounded">
250
+ Affinity: {result.metadata.affinity_class}
251
+ </span>
252
+ )}
253
+ {result.metadata?.label_true != null && (
254
+ <span className="bg-muted px-2 py-0.5 rounded">
255
+ Label: {result.metadata.label_true.toFixed(2)}
256
+ </span>
257
+ )}
258
+ <span className="bg-muted px-2 py-0.5 rounded">
259
+ {result.modality}
260
+ </span>
261
  </div>
262
  </div>
263
  <div className="text-right">
264
  <div className="text-sm text-muted-foreground">Similarity</div>
265
  <div className={`text-xl font-bold ${
266
  result.score >= 0.9 ? 'text-green-600' :
267
+ result.score >= 0.7 ? 'text-green-500' :
268
+ result.score >= 0.5 ? 'text-amber-500' : 'text-muted-foreground'
269
  }`}>
270
  {result.score.toFixed(3)}
271
  </div>