Spaces:

GrimSqueaker
/

OTRec

Sleeping

App Files Files Community

GrimSqueaker committed on Dec 13, 2025

Commit

3d9bb2a

verified ·

1 Parent(s): 27eb6dc

Upload folder using huggingface_hub

Browse files

Files changed (9) hide show

.github/workflows/update_space.yml +28 -0
README.md +36 -12
__pycache__/dl_model_def.cpython-310.pyc +0 -0
app.py +282 -0
data/proc/df_learn_sub.parquet +3 -0
data/proc/disease_df.parquet +3 -0
data/proc/target_df.parquet +3 -0
dl_model_def.py +208 -0
requirements.txt +6 -0

.github/workflows/update_space.yml ADDED Viewed

	@@ -0,0 +1,28 @@

+# Deploys this repository to its Hugging Face Space on every push to `main`.
+name: Run Python script
+on:
+  push:
+    branches:
+      - main
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.9'
+    - name: Install Gradio
+      run: python -m pip install gradio
+    - name: Log in to Hugging Face
+      # Hand the secret over through the environment instead of expanding it
+      # into the command text, so its value is never parsed by the shell.
+      env:
+        SPACE_DEPLOY_TOKEN: ${{ secrets.hf_token }}
+      run: python -c 'import os, huggingface_hub; huggingface_hub.login(token=os.environ["SPACE_DEPLOY_TOKEN"])'
+    - name: Deploy to Spaces
+      run: gradio deploy

README.md CHANGED Viewed

@@ -1,12 +1,36 @@
----
-title: OTRec
-emoji: 🔥
-colorFrom: yellow
-colorTo: green
-sdk: gradio
-sdk_version: 6.1.0
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: OTRec
+app_file: app.py
+sdk: gradio
+sdk_version: 6.0.1
+---
+# Disease–Target Recommender (Open Targets)
+This Space exposes a two-tower recommender model trained on Open Targets–derived
+disease–target data. Given a **disease ID** (matching the `diseaseId` column from
+the preprocessed data), it returns a ranked list of predicted **target IDs**.
+The backend is a TensorFlow / Keras model with:
+- A **query tower** for diseases (disease text + disease ID embedding)
+- A **key tower** for targets (target text only)
+- Cosine similarity between disease and target embeddings
+All candidate target embeddings are currently precomputed at startup for fast inference.
+---
+## Files and structure
+Expected repo layout:
+```text
+.
+├── app.py
+├── requirements.txt
+├── model.weights.h5
+└── data/
+    └── proc/
+        ├── disease_df.parquet
+        ├── target_df.parquet
+        └── df_learn_sub.parquet

__pycache__/dl_model_def.cpython-310.pyc ADDED Viewed

Binary file (3.82 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,282 @@

+import os
+import gradio as gr
+import h5py
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from huggingface_hub import hf_hub_download
+from tensorflow import keras
+# from keras.layers import ...
+from dl_model_def import make_fs, TwoTowerDual, build_two_tower_model
+# ============================================
+#  CONFIG
+# ============================================
+# Directory holding the preprocessed parquet tables that are read further down.
+DATA_DIR = "./data/proc"
+# The weights are not read from the working directory: they are fetched from the
+# Hub repo below at import time. WEIGHTS_FILE is whatever hf_hub_download
+# (from the huggingface_hub package) returns; it is later handed to
+# model.load_weights and h5py.File as a file path.
+print("Downloading model weights from Hugging Face Hub...")
+WEIGHTS_FILE = hf_hub_download(
+    repo_id="GrimSqueaker/OTRec",
+    filename="model.weights.h5"
+)
+print(f"Weights downloaded to: {WEIGHTS_FILE}")
+# ============================================
+#  LOAD TRAINING DATA
+# ============================================
+df_learn   = pd.read_parquet(f"{DATA_DIR}/df_learn_sub.parquet")
+disease_df = pd.read_parquet(f"{DATA_DIR}/disease_df.parquet")
+target_df  = pd.read_parquet(f"{DATA_DIR}/target_df.parquet")
+# Ensure column names match training
+df_learn = df_learn.rename(columns={
+    "disease_text_embed": "disease_text",
+    "target_text_embed": "target_text"
+}, errors="ignore")
+disease_df.rename(columns={"disease_text_embed": "disease_text"}, errors="ignore",inplace=True)
+target_df.rename(columns={"target_text_embed":"target_text"}, errors="ignore",inplace=True)
+# ============================================
+#  BUILD MODEL + LOAD WEIGHTS
+# ============================================
+# The architecture is rebuilt from df_learn, then the downloaded weights are
+# loaded into it. If the plain load raises ValueError, one retry is made after
+# renaming some model layers to the names found in the weights file.
+print("Building TwoTowerDual...")
+# 1. Reset Keras Session to ensure layer names start at index 0 (matches clean training)
+tf.keras.backend.clear_session()
+# 2. Rebuild architecture
+model = build_two_tower_model(df_learn)
+print("Loading weights...")
+try:
+    # Try standard load
+    model.load_weights(WEIGHTS_FILE)
+except ValueError as e:
+    print(f"Standard load failed ({e}). Attempting name-mismatch fix...")
+    # FALLBACK: presumably the training notebook produced suffixed layer names
+    # such as 'dise_emb_1' after repeated runs (not verifiable from this file).
+    # The top-level keys of the .h5 file are listed and used as rename targets.
+    with h5py.File(WEIGHTS_FILE, 'r') as f:
+        h5_keys = list(f.keys())
+        print(f"Weights file contains layers: {h5_keys}")
+        # Helper: rename the layer stored on model.<target_attr> to the FIRST
+        # h5 key that starts with `prefix`; does nothing if no key matches or
+        # the attribute does not exist.
+        def match_layer_name(target_attr, prefix):
+            # Find key in h5 that starts with prefix (e.g. 'dise_emb')
+            match = next((k for k in h5_keys if k.startswith(prefix)), None)
+            if match and hasattr(model, target_attr):
+                layer = getattr(model, target_attr)
+                print(f"Renaming model layer '{layer.name}' to '{match}' to match file.")
+                # NOTE(review): writes the private `_name` attribute; this relies
+                # on Keras internals rather than a public API.
+                layer._name = match
+        # Apply renames for known components
+        match_layer_name('dise_emb', 'dise_emb')
+        match_layer_name('q_tower', 'tower') # Attempt to catch tower/tower_1
+        # Only q_tower is renamed here. k_tower is left alone because both towers
+        # would match the same 'tower' prefix (and therefore the same first key).
+    # Retry load after renaming; a second failure propagates and stops startup.
+    model.load_weights(WEIGHTS_FILE)
+print("Weights loaded successfully.")
+# ============================================
+#  PRECOMPUTE CANDIDATE EMBEDDINGS
+# ============================================
+# # Note: In TF 2.16+, Ensure inputs are tf.constant or numpy compatible
+# cand_embs = model.encode_k(target_texts, target_ids)
+# cand_embs = tf.nn.l2_normalize(cand_embs, axis=1).numpy()
+# print("Candidate embeddings ready.")
+print("Precomputing candidate embeddings (batched)...")
+target_texts = target_df["target_text"].astype(str).to_numpy()
+target_ids   = target_df["targetId"].astype(str).to_numpy()
+# FIX: Process in batches to avoid OOM
+BATCH_SIZE = 1024 # Conservative batch size for wide inputs
+cand_embs_list = []
+total = len(target_texts)
+for i in range(0, total, BATCH_SIZE):
+    # Slice the batch
+    end = min(i + BATCH_SIZE, total)
+    batch_txt = target_texts[i:end]
+    batch_ids = target_ids[i:end]
+    # Run inference on the batch (keeps memory usage low)
+    # Using tf.device conversion is optional but good for safety if GPU is fragmented
+    emb_batch = model.encode_k(batch_txt, batch_ids)
+    cand_embs_list.append(emb_batch)
+    if i % 5000 == 0:
+        print(f"  Processed {i}/{total} candidates...")
+# Concatenate all batches back into one tensor
+cand_embs = tf.concat(cand_embs_list, axis=0)
+# Normalize the final result
+cand_embs = tf.nn.l2_normalize(cand_embs, axis=1).numpy()
+print(f"Candidate embeddings ready. Shape: {cand_embs.shape}")
+# ============================================
+#  RECOMMENDATION FUNCTION
+# ============================================
+def recommend_targets(disease_id, top_k=10):
+    # 1. Validate Input
+    if not disease_id:
+        return pd.DataFrame(), None
+    row = disease_df.loc[disease_df["diseaseId"] == disease_id]
+    if row.empty:
+        return pd.DataFrame(), None
+    # 2. Encode Query
+    disease_text = row["disease_text"].iloc[0]
+    q_emb = model.encode_q(
+        tf.constant([disease_text]),
+        tf.constant([disease_id])
+    )
+    q_emb = tf.nn.l2_normalize(q_emb, axis=1).numpy()[0]
+    # 3. Calculate Raw Cosine Similarity
+    # Shape: (N_targets,)
+    raw_sim = cand_embs @ q_emb
+    # 4. Convert to Probability (Fixes negative scores)
+    # The model has a trained 'cls_head' (Sigmoid) that maps Similarity -> Probability
+    # We reshape to (N, 1) because the Keras Dense layer expects a matrix
+    scores = model.cls_head(raw_sim.reshape(-1, 1)).numpy().flatten()
+    # 5. Get Top K
+    k = int(top_k)
+    idx = np.argsort(scores)[::-1][:k]
+    # 6. Build Result DataFrame
+    results = target_df.iloc[idx].copy()
+    # Force standard python float for clean rounding
+    raw_scores = scores[idx]
+    results["score"] = [round(float(x), 4) for x in raw_scores]
+    # 7. Select Columns
+    desc_col = "functionDescription" if "functionDescription" in results.columns else "functionDescriptions"
+    desired_cols = [
+        "targetId",
+        "approvedSymbol",
+        "approvedName",
+        desc_col,
+        "score"
+    ]
+    final_cols = [c for c in desired_cols if c in results.columns]
+    results = results[final_cols]
+    # 8. Save to CSV for download
+    csv_path = "recommendations.csv"
+    results.to_csv(csv_path, index=False)
+    return results, csv_path
+# ============================================
+#  GRADIO APP
+# ============================================
+def search_diseases(query):
+    if not query or len(query) < 2:
+        return gr.update(choices=[], value=None)
+    mask = (
+        disease_df["name"].str.contains(query, case=False, na=False) |
+        disease_df["diseaseId"].str.contains(query, case=False, na=False)
+    )
+    matches = disease_df.loc[mask].head(30)
+    choices = [
+        (f"{row['name']} ({row['diseaseId']})", row['diseaseId'])
+        for _, row in matches.iterrows()
+    ]
+    first_val = choices[0][1] if choices else None
+    return gr.update(choices=choices, value=first_val)
+def launch():
+    examples = ["synuclein", "diabetes", "doid_0050890"]
+    with gr.Blocks() as demo:
+        gr.Markdown("# Disease → Target Recommender")
+        gr.Markdown("Search for a disease by **Name** or **ID** to get target recommendations.")
+        with gr.Row():
+            search_box = gr.Textbox(
+                label="1. Search Disease",
+                placeholder="Type name (e.g., 'Parkinson') or ID...",
+                lines=1
+            )
+            did_dropdown = gr.Dropdown(
+                label="2. Select Disease",
+                choices=[],
+                interactive=True
+            )
+            topk = gr.Slider(1, 400, value=10, step=5, label="Top K Targets")
+        # Search Logic (Updates dropdown options and default value)
+        search_box.change(fn=search_diseases, inputs=search_box, outputs=did_dropdown)
+        # Output Components (Stacked vertically for full width)
+        out_df = gr.Dataframe(
+            label="Predictions",
+            interactive=False,
+            wrap=True,
+            show_search="filter",
+        )
+        out_file = gr.File(label="Download CSV")
+        # === TRIGGER LOGIC ===
+        # 1. Manual Trigger (Keep the button just in case)
+        btn = gr.Button("Recommend Targets", variant="primary")
+        btn.click(
+            fn=recommend_targets,
+            inputs=[did_dropdown, topk],
+            outputs=[out_df, out_file]
+        )
+        # 2. Auto-Trigger on Change
+        # This handles the Examples too: Example -> Search -> Dropdown Update -> Trigger
+        did_dropdown.change(
+            fn=recommend_targets,
+            inputs=[did_dropdown, topk],
+            outputs=[out_df, out_file]
+        )
+        # Also update when slider moves
+        topk.change(
+            fn=recommend_targets,
+            inputs=[did_dropdown, topk],
+            outputs=[out_df, out_file]
+        )
+        gr.Examples(examples=examples, inputs=search_box)
+    demo.launch()
+if __name__ == "__main__":
+    launch()

data/proc/df_learn_sub.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:20f1834245178fbf27f385aaef6a757921ced1c6a37fce1fe29d86b1d11a4854
+size 25162164

data/proc/disease_df.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:def4b7f42efca118bdbc84745249ff37fa8c9dc1ac8740feff17ffb99ac3c316
+size 13255480

data/proc/target_df.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3182b52698a4513eeb88cf678189d62897815263ff170fd23d4978e3a869f823
+size 27290365

dl_model_def.py ADDED Viewed

	@@ -0,0 +1,208 @@

+# dl_model_def.py
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras.utils import FeatureSpace
+# REMOVED: from keras.layers import ...
+MAX_TOK = 160_000
+EMB_ID  = 64
+@keras.utils.register_keras_serializable(package="OTRec")
+def make_fs():
+    return FeatureSpace(
+        {
+            "text": FeatureSpace.feature(
+                preprocessor=keras.layers.TextVectorization(
+                    max_tokens=MAX_TOK,
+                    output_mode="count",
+                ),
+                dtype="string",
+                output_mode="float",
+            )
+        },
+        output_mode="concat",
+    )
+# @keras.utils.register_keras_serializable() # added to here instead of inside the model
+# def build_tower(input_dim: int,EMB_ID:int=64) -> keras.Model:
+#     inp = keras.Input(shape=(input_dim + EMB_ID,))
+#     x   = keras.layers.LayerNormalization()(inp)
+#     # x   = keras.layers.BatchNormalization()(inp)
+#     ## BatchNormalization
+#     x = keras.layers.Dropout(0.2)(x)
+#     # x  = keras.layers.Dense(768, activation="gelu")(x)
+#     # out = keras.layers.Dense(256, activation="tanh")(x)
+#     # out = keras.layers.Dense(256, activation="gelu")(inp)
+#     # out = keras.layers.Dense(256, activation="linear")(x) # orig, 95.9 auc
+#     # out = keras.layers.Dense(256, activation="gelu")(x) #
+#     out = keras.layers.Dense(512, activation="elu")(x)
+#     return keras.Model(inp, out, name="tower")
+@keras.utils.register_keras_serializable()
+def build_tower(input_dim: int, EMB_ID: int = 64) -> keras.Model:
+    """Build one encoder tower: a linear branch plus a small deep branch, summed.
+    Args:
+        input_dim: Width of the text features. The model input is
+            ``input_dim + EMB_ID`` wide, so callers without an ID embedding
+            must subtract ``EMB_ID`` themselves.
+        EMB_ID: Width reserved for the ID embedding (shadows the module constant).
+    Returns:
+        A functional ``keras.Model`` named "tower" with a 384-wide output.
+    """
+    # NOTE: layers are unnamed, so their auto-generated names depend on creation
+    # order; keep the statement order stable if saved weights must still load.
+    inp = keras.Input(shape=(input_dim + EMB_ID,))
+    norm_x = keras.layers.LayerNormalization()(inp)
+    # Path 1: The Linear Projection (Wide)
+    linear_out = keras.layers.Dense(384, activation="linear")(norm_x)
+    # Path 2: non-linear branch, 384 -> 64 bottleneck -> 384, with dropout
+    deep = keras.layers.Dense(384, activation="elu")(norm_x)
+    deep = keras.layers.LayerNormalization()(deep) # Norm inside deep block is fine
+    deep = keras.layers.Dropout(0.35)(deep)
+    deep = keras.layers.Dense(64, activation="elu")(deep)
+    deep = keras.layers.Dropout(0.15)(deep)
+    # (An extra LayerNormalization on the deep branch was tried here and is disabled.)
+    deep = keras.layers.Dense(384, activation="linear")(deep)
+    # Add them (Residual style)
+    out = keras.layers.Add()([linear_out, deep])
+    # (A final LayerNormalization named "final_norm" is likewise disabled.)
+    return keras.Model(inp, out, name="tower")
+@keras.utils.register_keras_serializable(package="OTRec")
+class TwoTowerDual(keras.Model):
+    """Two-tower disease/target model with a classification and a regression head.
+    The query tower encodes disease text together with a disease-ID embedding;
+    the key tower encodes target text only. Both heads are fed the cosine
+    similarity of the two encodings.
+    """
+    def __init__(self,
+                 dise_lookup,
+                 dise_emb,
+                 q_fs,
+                 k_fs,
+                 q_tower,
+                 k_tower,
+                 concat_layer,
+                 **kwargs):
+        """Store the prebuilt sub-layers and create the similarity layer and heads.
+        Args:
+            dise_lookup: Layer applied to raw disease IDs before the embedding.
+            dise_emb: Embedding applied to the lookup output.
+            q_fs: Feature space called with ``{"text": ...}`` for disease text.
+            k_fs: Feature space called with ``{"text": ...}`` for target text.
+            q_tower: Encoder for the concatenated disease features.
+            k_tower: Encoder for the target text features.
+            concat_layer: Layer joining text features and ID embedding.
+            **kwargs: Forwarded to ``keras.Model``.
+        """
+        super().__init__(**kwargs)
+        self.dise_lookup = dise_lookup
+        self.dise_emb    = dise_emb
+        self.q_fs        = q_fs
+        self.k_fs        = k_fs
+        self.q_tower     = q_tower
+        self.k_tower     = k_tower
+        self.concat      = concat_layer
+        # normalize=True makes the dot product a cosine similarity
+        self.dot         = keras.layers.Dense if False else keras.layers.Dot(axes=-1, normalize=True, name="cosine")
+        self.cls_head    = keras.layers.Dense(1, activation="sigmoid",
+            name="cls",
+            # Disabled experiment: constant initialisers giving the sigmoid a
+            # steeper, shifted start (both weights stay trainable either way).
+        # kernel_initializer=tf.keras.initializers.Constant(5.0),
+        #     bias_initializer=tf.keras.initializers.Constant(-2.2)
+            )
+        self.score_head  = keras.layers.Dense(
+            1,
+            activation=None,
+            name="score",
+            bias_initializer=tf.keras.initializers.Constant(0.049),
+        )
+        # NOTE(review): keeps a reference to the module-level tower factory; no
+        # code visible in this file reads it back.
+        self.build_tower = build_tower # added new!
+    def encode_q(self, txt, did):
+        """Encode a batch of diseases from their text ``txt`` and IDs ``did``."""
+        return self.q_tower(
+            self.concat([
+                self.q_fs({"text": txt}),
+                self.dise_emb(self.dise_lookup(did)),
+            ])
+        )
+    def encode_k(self, txt, tid):
+        """Encode a batch of targets from their text; ``tid`` is accepted but unused."""
+        txt_vec = self.k_fs({"text": txt})
+        return self.k_tower(txt_vec)
+    def call(self, feats):
+        """Score query/candidate pairs.
+        Args:
+            feats: Nested dict with ``feats["query"]`` holding ``disease_text``
+                and ``diseaseId``, and ``feats["candidate"]`` holding
+                ``target_text`` and ``targetId``.
+        Returns:
+            ``{"cls": sigmoid output, "score": linear output}``, both computed
+            from the cosine similarity of the two encodings.
+        """
+        q = self.encode_q(
+            feats["query"]["disease_text"],
+            feats["query"]["diseaseId"],
+        )
+        k = self.encode_k(
+            feats["candidate"]["target_text"],
+            feats["candidate"]["targetId"],
+        )
+        sim  = self.dot([q, k])
+        prob = self.cls_head(sim)
+        reg  = self.score_head(sim)
+        return {"cls": prob, "score": reg}
+@keras.utils.register_keras_serializable() # added
+def build_two_tower_model(df_learn) -> TwoTowerDual:
+    """Construct a TwoTowerDual whose preprocessing is adapted to ``df_learn``.
+    Args:
+        df_learn: Table with ``disease_text``, ``target_text``, ``diseaseId``
+            and ``targetId`` columns. The text vocabularies and the disease-ID
+            lookup are fitted on it, so it determines the layer shapes.
+    Returns:
+        The model, already called once on a dummy batch so its weights exist.
+    """
+    # 1) Feature spaces: one per side, each adapted to its own text column
+    q_fs = make_fs()
+    k_fs = make_fs()
+    q_fs.adapt(
+        tf.data.Dataset.from_tensor_slices({"text": df_learn["disease_text"]})
+          .batch(4096)
+          .prefetch(tf.data.AUTOTUNE)
+    )
+    k_fs.adapt(
+        tf.data.Dataset.from_tensor_slices({"text": df_learn["target_text"]})
+          .batch(4096)
+          .prefetch(tf.data.AUTOTUNE)
+    )
+    # 2) Lookup + embedding for disease IDs; embedding rows = lookup vocabulary size
+    dise_lookup = keras.layers.StringLookup(name="disease_lookup")
+    dise_lookup.adapt(df_learn["diseaseId"])
+    dise_emb = keras.layers.Embedding(
+        input_dim=dise_lookup.vocabulary_size(),
+        output_dim=EMB_ID,
+        name="dise_emb",
+    )
+    # 3) Towers (earlier inline tower definitions were replaced by the
+    #    module-level build_tower).
+    #    build_tower always adds EMB_ID to input_dim. The key tower receives text
+    #    features only (see encode_k), so EMB_ID is subtracted up front to cancel that.
+    q_tower = build_tower(q_fs.get_encoded_features().shape[-1])
+    k_tower = build_tower(k_fs.get_encoded_features().shape[-1] - EMB_ID)
+    concat = keras.layers.Concatenate(name="concat")
+    # 4) Build model
+    model = TwoTowerDual(
+        dise_lookup=dise_lookup,
+        dise_emb=dise_emb,
+        q_fs=q_fs,
+        k_fs=k_fs,
+        q_tower=q_tower,
+        k_tower=k_tower,
+        concat_layer=concat,
+        name="two_tower_dual",
+    )
+    # Dummy build: one forward pass on a single-row batch so every layer
+    # (including the two heads) creates its variables before weights are loaded.
+    dummy = {
+        "query": {
+            "disease_text": tf.constant(["dummy"]),
+            "diseaseId": tf.constant([df_learn["diseaseId"].iloc[0]]),
+        },
+        "candidate": {
+            "target_text": tf.constant(["dummy target"]),
+            "targetId": tf.constant([df_learn["targetId"].iloc[0]]),
+        },
+    }
+    _ = model(dummy)
+    return model

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+# Accept any 2.16 patch release rather than the exact "2.16" version only.
+tensorflow==2.16.*
+numpy
+pandas
+pyarrow
+gradio
+huggingface-hub
+# Imported directly by app.py for the weight-file fallback.
+h5py