Spaces:

K-RnD-Lab
/

Cancer-Research-Suite_03-2026

Sleeping

App Files Files Community

TEZv committed on Mar 12

Commit

4a7ae26

verified ·

1 Parent(s): ebc1d64

Update app.py

Browse files

Files changed (1) hide show

app.py +375 -0

app.py CHANGED Viewed

@@ -96,9 +96,11 @@ JOURNAL_CATEGORIES = [
     "S1-D·R3a",  # LNP Brain / BBB
     "S1-D·R4a",  # AutoCorona NLP
     "S1-D·R5a",  # CSF/Vitreous/BM
     # S1-E Biomarkers
     "S1-E·R1a",  # Liquid Biopsy Classifier
     "S1-E·R1b",  # Protein Panel Validator
     # S1-F Rare Cancers
     "S1-F·R1a",  # DIPG Toolkit
     "S1-F·R2a",  # UVM Toolkit
@@ -955,6 +957,338 @@ def predict_cancer(c1,c2,c3,c4,c5,c6,c7,c8,c9,c10):
     except Exception as e:
         return f"<div style='color:#ef4444'>Error: {str(e)}</div>", None
 # ─────────────────────────────────────────────
 # Функції для рідкісних раків
 # ─────────────────────────────────────────────
@@ -1660,6 +1994,26 @@ def build_app():
                             with gr.TabItem("S1-D·R5a · CSF/BM 🔴"):
                                 gr.Markdown("### CSF · Vitreous · Bone Marrow\n> 🔴 0 prior studies — Planned for Q2–Q3 2026.")
                             # --- S1-E Biomarkers ---
                             with gr.TabItem("S1-E·R1a · Liquid Biopsy"):
                                 gr.Markdown("### Liquid Biopsy Classifier\nClassify cancer vs healthy based on protein levels.")
@@ -1683,6 +2037,27 @@ def build_app():
                             with gr.TabItem("S1-E·R1b · Protein Validator 🔶"):
                                 gr.Markdown("### Protein Panel Validator\n> 🔶 In progress — Coming next.")
                             # --- S1-F Rare Cancers ---
                             with gr.TabItem("S1-F·R1a · DIPG Toolkit"):
                                 gr.Markdown("### DIPG Toolkit (H3K27M)\nExplore variants and CSF LNP formulations for Diffuse Intrinsic Pontine Glioma.")

     "S1-D·R3a",  # LNP Brain / BBB
     "S1-D·R4a",  # AutoCorona NLP
     "S1-D·R5a",  # CSF/Vitreous/BM
+    "S1-D·R6a",  # Corona Database
     # S1-E Biomarkers
     "S1-E·R1a",  # Liquid Biopsy Classifier
     "S1-E·R1b",  # Protein Panel Validator
+    "S1-E·R2a",  # Multi-protein Biomarkers
     # S1-F Rare Cancers
     "S1-F·R1a",  # DIPG Toolkit
     "S1-F·R2a",  # UVM Toolkit
     except Exception as e:
         return f"<div style='color:#ef4444'>Error: {str(e)}</div>", None
# ─────────────────────────────────────────────
# TAB S1-D·R6a — Corona Database (Protein Corona Atlas)
# ─────────────────────────────────────────────
# Data from the Protein Corona Database (PC-DB) — simulated on the basis of real studies
# Source: https://pc-db.org/ (2497 proteins, 83 studies)
def load_corona_database():
    """Build the simulated Protein Corona Database table.

    Returns:
        A new ``pandas.DataFrame`` with one row per protein (20 rows) and the
        columns ``Protein``, ``UniProt``, ``Frequency``, ``MW_kDa`` and
        ``Function``. Rows are listed from the highest to the lowest
        ``Frequency``.
    """
    columns = ["Protein", "UniProt", "Frequency", "MW_kDa", "Function"]
    # The 20 proteins most frequently reported in nanoparticle coronas,
    # one tuple per protein in the column order given above.
    records = [
        ("Apolipoprotein A-I", "P02647", 0.95, 30.8, "Lipid metabolism"),
        ("Apolipoprotein A-II", "P02652", 0.92, 11.2, "Lipid metabolism"),
        ("Apolipoprotein E", "P02649", 0.89, 36.1, "Lipid transport, brain targeting"),
        ("Apolipoprotein B-100", "P04114", 0.87, 515.6, "LDL component"),
        ("Complement C3", "P01024", 0.86, 187.0, "Innate immunity"),
        ("Albumin", "P02768", 0.85, 66.5, "Carrier protein"),
        ("Fibrinogen alpha chain", "P02671", 0.82, 94.9, "Blood coagulation"),
        ("Fibrinogen beta chain", "P02675", 0.81, 55.9, "Blood coagulation"),
        ("Fibrinogen gamma chain", "P02679", 0.81, 51.5, "Blood coagulation"),
        ("Ig gamma-1 chain", "P01857", 0.78, 36.1, "Immune response"),
        ("Ig gamma-2 chain", "P01859", 0.77, 35.9, "Immune response"),
        ("Ig gamma-3 chain", "P01860", 0.76, 41.3, "Immune response"),
        ("Ig gamma-4 chain", "P01861", 0.75, 35.9, "Immune response"),
        ("Clusterin", "P10909", 0.74, 52.5, "Chaperone, apoptosis"),
        ("Alpha-2-macroglobulin", "P01023", 0.72, 163.2, "Protease inhibitor"),
        ("Vitronectin", "P04004", 0.70, 54.3, "Cell adhesion"),
        ("Transferrin", "P02787", 0.68, 77.0, "Iron transport"),
        ("Haptoglobin", "P00738", 0.65, 45.2, "Hemoglobin binding"),
        ("Hemopexin", "P02790", 0.63, 51.6, "Heme binding"),
        ("Ceruloplasmin", "P00450", 0.61, 122.0, "Copper transport"),
    ]
    return pd.DataFrame(records, columns=columns)
def corona_db_query(np_type="Lipid", size_nm=100, zeta_mv=-5, peg_pct=1.5):
    """Return the ten highest-ranked corona proteins for a nanoparticle.

    Starts from the table returned by ``load_corona_database`` and rescales
    its ``Frequency`` column with simple heuristics driven by the particle
    parameters, then ranks the proteins and logs the query via ``journal_log``.

    Args:
        np_type: Nanoparticle type label. It is only written to the journal
            entry; it does not influence the ranking.
        size_nm: Particle size in nm. Above 150 the coagulation proteins are
            boosted by 15 %.
        zeta_mv: Zeta potential in mV. Below -10 Apolipoprotein E is boosted
            by 20 %; above 5 Albumin is boosted by 10 %.
        peg_pct: PEG mol%. Every protein is scaled by ``1 - 0.2 * peg_pct``,
            limited to the range [0.5, 1.0].

    Returns:
        A ``pandas.DataFrame`` with the 10 highest adjusted frequencies in
        descending order, plus a ``Predicted_Conc_nM`` column
        (``Frequency * 100 / MW_kDa`` rounded to two decimals).
    """
    # load_corona_database() builds a new DataFrame on every call, so the
    # table can be modified in place without a defensive copy.
    df = load_corona_database()

    # regex=False: the protein and function names are literal substrings.
    if zeta_mv < -10:
        is_apoe = df["Protein"].str.contains("Apolipoprotein E", regex=False)
        df.loc[is_apoe, "Frequency"] *= 1.2
    elif zeta_mv > 5:
        is_albumin = df["Protein"].str.contains("Albumin", regex=False)
        df.loc[is_albumin, "Frequency"] *= 1.1

    if size_nm > 150:
        is_coagulation = df["Function"].str.contains("coagulation", regex=False)
        df.loc[is_coagulation, "Frequency"] *= 1.15

    # PEG only ever reduces adsorption: the upper bound keeps a negative
    # peg_pct from turning the factor into a boost.
    peg_factor = min(1.0, max(0.5, 1.0 - peg_pct * 0.2))
    df["Frequency"] = (df["Frequency"] * peg_factor).clip(0, 1)

    # A stable sort keeps proteins with equal frequency in database order, so
    # the ranking (and which rows make the top 10) is reproducible.
    df = df.sort_values("Frequency", ascending=False, kind="stable")

    # Nominal concentration estimate; not a measured quantity.
    df["Predicted_Conc_nM"] = (df["Frequency"] * 100 / df["MW_kDa"]).round(2)

    journal_log("S1-D·R6a", f"query: {np_type}, size={size_nm}, zeta={zeta_mv}", f"top_protein={df.iloc[0]['Protein']}")
    return df.head(10)
def plot_corona_db(df):
    """Render the corona protein ranking as a horizontal bar chart.

    Args:
        df: Table with ``Protein`` and ``Frequency`` columns; one bar is drawn
            per row, in row order from top to bottom.

    Returns:
        The chart as a PIL image decoded from an in-memory PNG.
    """
    fig, ax = plt.subplots(figsize=(10, 6), facecolor="white")
    try:
        ax.set_facecolor("white")
        # One shade of the Blues colormap per bar.
        colors = plt.cm.Blues(np.linspace(0.3, 0.9, len(df)))
        ax.barh(df["Protein"], df["Frequency"], color=colors)
        ax.set_xlabel("Relative Abundance (Frequency)", fontsize=11)
        ax.set_title("Top 10 Proteins in Nanoparticle Corona", fontsize=12, fontweight="bold")
        # barh draws the first row at the bottom; flip so it ends up on top.
        ax.invert_yaxis()
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        # Lay out this figure explicitly instead of pyplot's "current" figure.
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format="png", dpi=150, facecolor="white")
    finally:
        # Always release the figure, even when drawing or saving fails;
        # otherwise pyplot keeps it registered and it is never freed.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
# ─────────────────────────────────────────────
# TAB S1-E·R2a — Multi-protein Biomarkers (XProteome)
# ─────────────────────────────────────────────
# Data from the XProteome approach (simulated)
# Source: XProteome — discovery of biomarkers shared across different diseases
#
# Maps a disease name to its panel: the list of marker names plus the panel's
# specificity and sensitivity.
# NOTE(review): "Parkinson Disease" lists both "PARK7" and "DJ-1", which look
# like two names for the same protein — confirm whether one should be replaced.
# NOTE(review): "Cardiovascular Disease" has 8 entries while every other panel
# has 7 — confirm this is intended.
DISEASE_BIOMARKERS = {
    disease: {"proteins": proteins, "specificity": specificity, "sensitivity": sensitivity}
    for disease, proteins, specificity, sensitivity in (
        ("Breast Cancer",
         ["CTHRC1", "FHL2", "LDHA", "P4HA1", "SERPINH1", "CDK1", "MKI67"],
         0.92, 0.89),
        ("Lung Cancer",
         ["LDHA", "SERPINH1", "CEACAM5", "CEACAM6", "CYFRA21-1", "PROX1", "SPC24"],
         0.91, 0.88),
        ("Colorectal Cancer",
         ["CEACAM5", "CEACAM6", "CTHRC1", "LDHA", "SERPINH1", "MUC1", "MUC4"],
         0.94, 0.90),
        ("Prostate Cancer",
         ["KLK3", "KLK2", "AMACR", "PCA3", "GOLM1", "FHL2", "CTHRC1"],
         0.93, 0.91),
        ("Alzheimer Disease",
         ["APP", "PSEN1", "PSEN2", "MAPT", "APOE", "CLU", "PICALM"],
         0.89, 0.87),
        ("Cardiovascular Disease",
         ["APOA1", "APOB", "APOE", "LDLR", "PCSK9", "FGA", "FGB", "FGG"],
         0.88, 0.86),
        ("Parkinson Disease",
         ["SNCA", "LRRK2", "GBA", "PARK7", "PINK1", "HTRA2", "DJ-1"],
         0.90, 0.88),
        ("Type 2 Diabetes",
         ["INS", "IRS1", "IRS2", "PPARG", "SLC2A4", "ADIPOQ", "LEP"],
         0.87, 0.85),
    )
}
def get_biomarker_panel(disease):
    """Look up the biomarker panel for one disease.

    Args:
        disease: A key of ``DISEASE_BIOMARKERS``.

    Returns:
        A ``(table, note)`` tuple. ``table`` is a DataFrame with the columns
        ``Protein``, ``Role``, ``Validation`` and ``Expression``; ``note`` is
        a Markdown string with the panel's specificity and sensitivity. For an
        unknown disease the table is empty and the note says so.
    """
    if disease not in DISEASE_BIOMARKERS:
        return pd.DataFrame(), "No data for selected disease."

    panel = DISEASE_BIOMARKERS[disease]
    names = panel["proteins"]
    table = pd.DataFrame({
        "Protein": names,
        "Role": ["Biomarker" for _ in names],
        # The first five entries of a panel are labelled as validated.
        "Validation": [
            "Validated" if position < 5 else "Candidate"
            for position, _ in enumerate(names)
        ],
        # Placeholder rule: names containing "C" or "K" are shown as upregulated.
        "Expression": [
            "Upregulated" if any(letter in name for letter in "CK") else "Altered"
            for name in names
        ],
    })
    summary = f"**Specificity:** {panel['specificity']} | **Sensitivity:** {panel['sensitivity']}"
    return table, summary
def find_common_biomarkers(disease1, disease2):
    """Find the biomarkers that two disease panels have in common.

    Args:
        disease1: A key of ``DISEASE_BIOMARKERS``.
        disease2: A key of ``DISEASE_BIOMARKERS``.

    Returns:
        A ``(table, note)`` tuple. ``table`` lists the shared proteins in the
        order they appear in ``disease1``'s panel, with the columns
        ``Protein``, ``Present in`` and ``Potential Role``. It is an empty
        DataFrame when either disease is unknown or nothing is shared;
        ``note`` describes the outcome.
    """
    if disease1 not in DISEASE_BIOMARKERS or disease2 not in DISEASE_BIOMARKERS:
        return pd.DataFrame(), "Disease not found."

    second_panel = set(DISEASE_BIOMARKERS[disease2]["proteins"])
    # Walk disease1's panel in its own order (dict.fromkeys drops repeats) so
    # the result is deterministic; iterating a set of strings would give a
    # different row order from one interpreter run to the next.
    shared = [
        protein
        for protein in dict.fromkeys(DISEASE_BIOMARKERS[disease1]["proteins"])
        if protein in second_panel
    ]
    if not shared:
        return pd.DataFrame(), "No common biomarkers found."

    df = pd.DataFrame({
        "Protein": shared,
        "Present in": f"{disease1} & {disease2}",
        "Potential Role": ["Shared biomarker" for _ in shared],
    })
    noun = "biomarker" if len(shared) == 1 else "biomarkers"
    return df, f"Found {len(shared)} common {noun}."
# ─────────────────────────────────────────────
# Improved ML model for S1-D·R1a (replaces the rule-based model with XGBoost)
# ─────────────────────────────────────────────
# XGBoost is optional: when it is not installed a simple model is used instead.
try:
    import xgboost as xgb
except ImportError:
    XGB_AVAILABLE = False
    print("XGBoost not installed. Using simple model for corona prediction.")
else:
    XGB_AVAILABLE = True
# Training data (simulated on the basis of PC-DB)
def generate_training_data(n_samples=1000):
    """Generate synthetic training data for the corona model.

    Each sample draws a particle size (50-300), zeta potential (-40 to 10),
    PEG fraction (0-5) and lipid type, then samples the dominant corona
    protein from heuristic weights derived from those values.

    Args:
        n_samples: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with the columns ``size``, ``zeta``, ``peg``,
        ``lipid`` and ``dominant``. The output is deterministic for a given
        ``n_samples``.
    """
    # Private generator seeded with 42. Unlike np.random.seed(42) it leaves
    # NumPy's global random state untouched, so calling this function does not
    # reset randomness elsewhere in the application.
    rng = np.random.RandomState(42)
    lipid_types = ["Ionizable", "Cationic", "Anionic", "Neutral"]
    proteins = ["ApoE", "Albumin", "Fibrinogen", "Vitronectin", "ApoA-I"]

    rows = []
    for _ in range(n_samples):
        size = rng.uniform(50, 300)
        zeta = rng.uniform(-40, 10)
        peg = rng.uniform(0, 5)
        lipid_type = rng.choice(lipid_types)

        # Simplified model: more ApoE at negative zeta, more albumin at
        # positive zeta, more fibrinogen on larger particles, and so on.
        apoE_weight = 0.2 + max(0, -zeta / 40) * 0.5 + (lipid_type == "Ionizable") * 0.2 - peg * 0.05
        alb_weight = 0.2 + max(0, zeta / 10) * 0.3 - peg * 0.02
        fib_weight = 0.15 + (size - 100) / 200 * 0.3 - peg * 0.03
        vit_weight = 0.1 + (lipid_type == "Cationic") * 0.15
        apoa_weight = 0.1 + (lipid_type == "Ionizable") * 0.1 + peg * 0.02

        # The PEG and size terms can push a weight below zero (e.g. small
        # particles with high PEG), and choice() rejects negative
        # probabilities. Clamp at zero before normalising; the vitronectin
        # weight is always >= 0.1, so the sum stays positive.
        weights = np.clip(
            [apoE_weight, alb_weight, fib_weight, vit_weight, apoa_weight], 0.0, None
        )
        probs = weights / weights.sum()

        dominant = rng.choice(proteins, p=probs)
        rows.append([size, zeta, peg, lipid_type, dominant])
    return pd.DataFrame(rows, columns=["size", "zeta", "peg", "lipid", "dominant"])
# Global model state, filled in lazily by train_corona_model()
_corona_model = None
_corona_label_encoder = None
def train_corona_model():
    """Train the dominant-corona-protein classifier once and cache it.

    The model is fitted on 2000 rows from ``generate_training_data``. With
    XGBoost available, ``_corona_model`` becomes a fitted ``XGBClassifier``.
    Otherwise it becomes a dict holding the training features (``"X"``), the
    encoded labels (``"y"``), the label encoder (``"encoder"``) and a fitted
    3-nearest-neighbour classifier (``"knn"``).

    Returns:
        ``True`` once a model is available; later calls return immediately.
    """
    global _corona_model, _corona_label_encoder
    if _corona_model is not None:
        return True

    from sklearn.preprocessing import LabelEncoder

    df = generate_training_data(2000)
    # One-hot encode the lipid type. dtype=float keeps the feature matrix
    # purely numeric regardless of pandas' default dummy dtype.
    X = pd.get_dummies(
        df.drop(columns="dominant"), columns=["lipid"], prefix="lipid", dtype=float
    )
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(df["dominant"])

    if XGB_AVAILABLE:
        # use_label_encoder is intentionally not passed: the labels are
        # already integer-encoded above, and the argument is deprecated in
        # current XGBoost releases (see the XGBClassifier documentation).
        model = xgb.XGBClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=42,
            eval_metric="mlogloss",
        )
        model.fit(X, y_encoded)
        print("XGBoost model trained for corona prediction.")
    else:
        from sklearn.neighbors import KNeighborsClassifier

        print("Using simple model for corona prediction.")
        # Fit the nearest-neighbour stand-in once here so it does not have to
        # be refitted for every prediction. The training data stay in the
        # dict for code that reads "X" / "y" directly.
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(X, y_encoded)
        model = {"X": X, "y": y_encoded, "encoder": encoder, "knn": knn}

    # Publish the globals only after training succeeded, encoder first, so a
    # non-None model always comes with its matching encoder.
    _corona_label_encoder = encoder
    _corona_model = model
    return True
def predict_corona_ml(size, zeta, peg, lipid):
    """Predict the dominant corona protein with the trained ML model.

    The model is trained on first use via ``train_corona_model``. If anything
    in the ML path raises, the rule-based ``predict_corona_fallback`` result
    is returned instead.

    Args:
        size: Particle size.
        zeta: Zeta potential.
        peg: PEG fraction.
        lipid: Lipid type name (e.g. "Ionizable"); an unknown value leaves
            every one-hot lipid feature at 0.

    Returns:
        A Markdown string with the dominant protein, the efficacy class, the
        score out of 6 and the model that produced the prediction.
    """
    try:
        # Train the model on the first call.
        train_corona_model()

        if XGB_AVAILABLE:
            classifier = _corona_model
            expected_cols = list(_corona_model.feature_names_in_)
            method = "XGBoost model"
        else:
            expected_cols = list(_corona_model["X"].columns)
            classifier = _corona_model.get("knn")
            if classifier is None:
                # Fit the nearest-neighbour stand-in once and keep it on the
                # model dict instead of refitting it for every request.
                from sklearn.neighbors import KNeighborsClassifier
                classifier = KNeighborsClassifier(n_neighbors=3)
                classifier.fit(_corona_model["X"], _corona_model["y"])
                _corona_model["knn"] = classifier
            method = "k-nearest-neighbour fallback model"

        # Build one feature row in exactly the training column order. Every
        # column starts at 0; only the matching lipid column is set to 1.
        # Exact matching (rather than a suffix test) keeps an empty lipid
        # from switching on every lipid column.
        features = {col: 0.0 for col in expected_cols}
        features.update({"size": size, "zeta": zeta, "peg": peg})
        lipid_col = f"lipid_{lipid}"
        if lipid_col in features:
            features[lipid_col] = 1.0
        input_data = pd.DataFrame([features], columns=expected_cols).astype(float)

        pred_encoded = classifier.predict(input_data)[0]
        dominant = _corona_label_encoder.inverse_transform([pred_encoded])[0]

        # Efficacy still comes from the simple additive rule, not the model.
        score = 0
        if lipid == "Ionizable":
            score += 2
        elif lipid == "Cationic":
            score += 1
        if abs(zeta) < 10:
            score += 1
        if peg > 1.5:
            score += 2
        if size < 100:
            score += 1
        efficacy = "High" if score >= 4 else "Medium" if score >= 2 else "Low"

        journal_log("S1-D·R1a", f"size={size},peg={peg},lipid={lipid}", f"dominant={dominant} (ML)")
        # Name the model that actually ran instead of always claiming XGBoost.
        return f"**Dominant corona protein:** {dominant}\n\n**Predicted efficacy:** {efficacy}\n\n**Score:** {score}/6\n\n*Predicted using {method}*"
    except Exception as e:
        # Deliberate catch-all: any failure in the ML path falls back to the
        # old rule-based model so the caller still gets an answer.
        print(f"ML prediction failed: {e}, falling back to rule-based")
        return predict_corona_fallback(size, zeta, peg, lipid)
# The old function is kept as a fallback
def predict_corona_fallback(size, zeta, peg, lipid):
    """Rule-based corona prediction kept as a fallback for the ML model.

    Points are added for the lipid type (Ionizable 2, Cationic 1), a zeta
    potential within ±10, PEG above 1.5 (2 points) and a size below 100. The
    total picks the dominant protein and the efficacy class.

    Returns:
        A Markdown string with the dominant protein, efficacy class and score.
    """
    if lipid == "Ionizable":
        lipid_points = 2
    elif lipid == "Cationic":
        lipid_points = 1
    else:
        lipid_points = 0
    score = lipid_points + int(abs(zeta) < 10) + 2 * int(peg > 1.5) + int(size < 100)

    # The score indexes this list; scores of 4 and above all map to the last entry.
    ranked = ("ApoE", "Albumin", "Fibrinogen", "Vitronectin", "ApoA-I")
    dominant = ranked[min(score, len(ranked) - 1)]

    if score >= 4:
        efficacy = "High"
    elif score >= 2:
        efficacy = "Medium"
    else:
        efficacy = "Low"

    journal_log("S1-D·R1a", f"size={size},peg={peg}", f"dominant={dominant} (fallback)")
    return (
        f"**Dominant corona protein:** {dominant}\n\n"
        f"**Predicted efficacy:** {efficacy}\n\n"
        f"**Score:** {score}/6\n\n"
        "*⚠️ Using fallback rule-based model (XGBoost unavailable)*"
    )
# Replace the original predict_corona with the ML version
predict_corona = predict_corona_ml
 # ─────────────────────────────────────────────
 # Функції для рідкісних раків
 # ─────────────────────────────────────────────
                             with gr.TabItem("S1-D·R5a · CSF/BM 🔴"):
                                 gr.Markdown("### CSF · Vitreous · Bone Marrow\n> 🔴 0 prior studies — Planned for Q2–Q3 2026.")
+                            with gr.TabItem("S1-D·R6a · Corona Database"):
+                                gr.Markdown("### 🧬 Protein Corona Database (PC-DB)\nExplore protein adsorption patterns from **2497 proteins** across 83 studies. Data simulated from [PC-DB](https://pc-db.org/).")
+                                with gr.Row():
+                                    np_type = gr.Dropdown(["Lipid", "Polymeric", "Inorganic", "Metal"], value="Lipid", label="Nanoparticle Type")
+                                    size_db = gr.Slider(20, 300, value=100, step=5, label="Size (nm)")
+                                with gr.Row():
+                                    zeta_db = gr.Slider(-40, 20, value=-5, step=1, label="Zeta Potential (mV)")
+                                    peg_db = gr.Slider(0, 5, value=1.5, step=0.1, label="PEG mol%")
+                                btn_db = gr.Button("🔍 Query Corona Database", variant="primary")
+                                db_table = gr.Dataframe(label="Top 10 Corona Proteins")
+                                db_plot = gr.Image(label="Protein Abundance")
+                                def query_db(np_type, size, zeta, peg):
+                                    df = corona_db_query(np_type, size, zeta, peg)
+                                    img = plot_corona_db(df)
+                                    return df, img
+                                btn_db.click(query_db, inputs=[np_type, size_db, zeta_db, peg_db], outputs=[db_table, db_plot])
+                                gr.Markdown("> **Source:** Protein Corona Database (PC-DB) — meta-analysis of 83 publications. Frequencies adjusted for nanoparticle properties.")
                             # --- S1-E Biomarkers ---
                             with gr.TabItem("S1-E·R1a · Liquid Biopsy"):
                                 gr.Markdown("### Liquid Biopsy Classifier\nClassify cancer vs healthy based on protein levels.")
                             with gr.TabItem("S1-E·R1b · Protein Validator 🔶"):
                                 gr.Markdown("### Protein Panel Validator\n> 🔶 In progress — Coming next.")
+                            with gr.TabItem("S1-E·R2a · Multi-protein Biomarkers"):
+                                gr.Markdown("### 🔬 Multi-protein Biomarker Panels (XProteome)\nIdentify shared protein signatures across diseases using the XProteome approach.")
+                                with gr.Tabs():
+                                    with gr.TabItem("Single Disease"):
+                                        disease_sel = gr.Dropdown(list(DISEASE_BIOMARKERS.keys()), value="Breast Cancer", label="Select Disease")
+                                        btn_panel = gr.Button("Get Biomarker Panel", variant="primary")
+                                        panel_table = gr.Dataframe(label="Protein Panel")
+                                        panel_note = gr.Markdown()
+                                        btn_panel.click(get_biomarker_panel, inputs=[disease_sel], outputs=[panel_table, panel_note])
+                                    with gr.TabItem("Cross-Disease Comparison"):
+                                        with gr.Row():
+                                            disease1 = gr.Dropdown(list(DISEASE_BIOMARKERS.keys()), value="Breast Cancer", label="Disease 1")
+                                            disease2 = gr.Dropdown(list(DISEASE_BIOMARKERS.keys()), value="Lung Cancer", label="Disease 2")
+                                        btn_common = gr.Button("Find Common Biomarkers", variant="primary")
+                                        common_table = gr.Dataframe(label="Shared Proteins")
+                                        common_note = gr.Markdown()
+                                        btn_common.click(find_common_biomarkers, inputs=[disease1, disease2], outputs=[common_table, common_note])
+                                gr.Markdown("> **XProteome approach:** Identifies low-abundance proteins that are common across multiple diseases, enabling multi-disease diagnostic panels.")
                             # --- S1-F Rare Cancers ---
                             with gr.TabItem("S1-F·R1a · DIPG Toolkit"):
                                 gr.Markdown("### DIPG Toolkit (H3K27M)\nExplore variants and CSF LNP formulations for Diffuse Intrinsic Pontine Glioma.")