# revise streamlit temp files (commit aab3f3d, author: tong)
import streamlit as st
import pandas as pd
import time
import numpy as np
import sys
import os
import json
import torch
sys.path.append(os.path.dirname(__file__))
#
# import pdb;pdb.set_trace()
from server_infer import retrieve_topk, PATH_CONFIG
from server_train import (
load_dual_tower_model,
recompute_candidate_vectors,
continuous_train
)
# 1.Core Model Interfaces
class DualTowerModel:
    """Facade over the dual-tower retrieval / training backend.

    Thin wrapper around the ``server_infer`` / ``server_train`` entry
    points. All user-tunable configuration is kept in
    ``st.session_state`` so it survives Streamlit's top-to-bottom
    script reruns.
    """

    def __init__(self):
        """
        Initialize Dual tower model.

        Seeds session-state config keys only when absent, so reruns keep
        whatever the user already selected in the UI.
        """
        # check config
        if 'checkpoint_path' not in st.session_state:
            st.session_state['checkpoint_path'] = None
        if 'protein_model_path' not in st.session_state:
            st.session_state['protein_model_path'] = './SaProt_650M_AF2'
        if 'molecule_model_path' not in st.session_state:
            st.session_state['molecule_model_path'] = './ChemBERTa-zinc-base-v1'
        if 'vectors_ready' not in st.session_state:
            st.session_state['vectors_ready'] = self._check_vectors_exist()
        st.session_state['model_loaded'] = True

    def _check_vectors_exist(self):
        """Return True when both precomputed candidate vector files exist."""
        protein_pt = PATH_CONFIG['protein']['pt']
        molecule_pt = PATH_CONFIG['molecule']['pt']
        return os.path.exists(protein_pt) and os.path.exists(molecule_pt)

    def retrieve_molecules_for_protein(self, protein_query, top_k=5):
        """
        Task 1: Protein -> Molecule.

        Args:
            protein_query: Protein sequence / UniProt ID / gene symbol.
            top_k: Number of candidates to return.

        Returns:
            List of dicts with ``smiles``/``score``/``name``/
            ``drugbank_id``/``cas`` keys; empty list on failure
            (the error is surfaced to the user via st.error).
        """
        try:
            # Call retrieve_topk from server_infer
            results = retrieve_topk(
                input_type="protein",
                input_query=protein_query,
                topk=top_k,
                device='cpu'  # Using CPU for web interface
            )
            # Reshape backend records into the flat dicts the UI expects.
            return [
                {
                    "smiles": res['id'],
                    "score": res['score'],
                    "name": res['info'].get('compound__name', 'Unknown'),
                    "drugbank_id": res['info'].get('compound__drugbank_id', 'N/A'),
                    "cas": res['info'].get('compound__cas', 'N/A'),
                }
                for res in results
            ]
        except Exception as e:
            st.error(f"Retrieval failed: {str(e)}")
            return []

    def retrieve_proteins_for_molecule(self, molecule_query, top_k=5):
        """
        Task 2: Molecule -> Protein.

        Args:
            molecule_query: Molecule SMILES / compound name / DrugBank ID.
            top_k: Number of candidates to return.

        Returns:
            List of dicts with ``protein_id``/``desc``/``score``/
            ``class``/``full_id`` keys; empty list on failure.
        """
        try:
            results = retrieve_topk(
                input_type="molecule",
                input_query=molecule_query,
                topk=top_k,
                device='cpu'
            )
            # protein_id and full_id currently carry the same value; the
            # UI uses full_id for the detail view.
            return [
                {
                    "protein_id": res['id'],
                    "desc": res['info'].get('target__gene', 'Unknown'),
                    "score": res['score'],
                    "class": res['info'].get('target__class', 'N/A'),
                    "full_id": res['id'],
                }
                for res in results
            ]
        except Exception as e:
            st.error(f"Retrieval failed: {str(e)}")
            return []

    def fine_tune(self, uploaded_file, epochs, learning_rate, batch_size):
        """
        Training Interface: Fine-tune model with uploaded data.

        Args:
            uploaded_file: Streamlit UploadedFile holding the dataset.
            epochs: Number of training epochs.
            learning_rate: Optimizer learning rate.
            batch_size: Training batch size.

        Returns:
            The (truthy) result of ``continuous_train`` on success,
            False when any step raises.
        """
        import tempfile
        temp_dataset_path = None
        try:
            # Persist the upload to a real file because continuous_train
            # expects a filesystem path, not a file-like object.
            # NOTE(review): the suffix is always ".csv" even for JSON
            # uploads — confirm continuous_train sniffs the format.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
                tmp.write(uploaded_file.getbuffer())
                temp_dataset_path = tmp.name
            # NOTE(review): the progress bar is created but never advanced;
            # it stays at 0% for the whole run.
            progress_bar = st.progress(0)
            status_text = st.empty()
            status_text.text("Initializing training environment...")
            # Ensure the checkpoint output directory exists.
            model_save_dir = 'Dual_Tower_Model/customized_checkpoints'
            os.makedirs(model_save_dir, exist_ok=True)
            checkpoint_path = st.session_state.get('checkpoint_path', None)
            status_text.text("Starting actual training process...")
            # call the actual training function
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            if device == 'cpu':
                st.warning("⚠️ Detected CPU environment. Training might be very slow.")
            success = continuous_train(
                dataset_path=temp_dataset_path,
                model_save_dir=model_save_dir,
                best_model_path=checkpoint_path,
                epochs=epochs,
                lr=learning_rate,
                batch_size=batch_size,
                device=device
            )
            if success:
                status_text.success("βœ… Training completed successfully!")
            return success
        except Exception as e:
            st.error(f"Training failed: {str(e)}")
            return False
        finally:
            # Fix: remove the temporary dataset even when training raises
            # (the original only cleaned it up on the non-exception path,
            # leaking a temp file per failed run).
            if temp_dataset_path and os.path.exists(temp_dataset_path):
                os.remove(temp_dataset_path)

    def update_vectors(self, protein_json, molecule_json, output_dir, checkpoint_path=None):
        """
        Update candidate vector library.

        Args:
            protein_json: Path to the protein candidate JSON.
            molecule_json: Path to the molecule candidate JSON.
            output_dir: Directory where new vector files are written.
            checkpoint_path: Optional trained checkpoint; None means the
                default backbones are used.

        Returns:
            True on success (and marks vectors_ready in session state),
            False when recomputation raises.
        """
        try:
            recompute_candidate_vectors(
                protein_json_path=protein_json,
                molecule_json_path=molecule_json,
                output_dir=output_dir,
                pt_path=checkpoint_path,
                batch_size=32,
                device='cpu'
            )
            st.session_state['vectors_ready'] = True
            return True
        except Exception as e:
            st.error(f"Vector update failed: {str(e)}")
            return False
# Fix: st.set_page_config() must be the FIRST Streamlit UI command of a
# script run; the original called st.spinner() (model init) before it,
# which raises StreamlitAPIException.
st.set_page_config(page_title="Bio-Retrieval Service", page_icon="🧬", layout="wide")
# 2. Page Layout (UI Layout)
# Initialize model (Singleton pattern): build once per session, reuse on reruns.
if 'model' not in st.session_state:
    with st.spinner('Initializing model...'):
        st.session_state['model'] = DualTowerModel()
model = st.session_state['model']
# Sidebar Configuration
# `mode` selected here drives which page section renders below; the radio
# value is compared by exact string in the if/elif chain further down.
with st.sidebar:
    st.title("🧬 Dual-Tower Service")
    st.markdown("---")
    mode = st.radio(
        "Select Mode:",
        ["πŸ” Inference (Retrieval)", "βš™οΈ Training (Fine-tune)", "πŸ“Š Configuration", "πŸ‘₯ Team Info"]
    )
    st.markdown("---")
    st.info("**Project Info**\n\nBidirectional Protein-Molecule Retrieval.\nBased on SaProt & Molecular Encoder.")
# Header Section (shown on every page, independent of the selected mode)
st.title("🧬 Bidirectional Protein-Molecule Retrieval")
st.markdown("""
This service implements a **Dual-Tower Architecture** to bridge the gap between Proteins and molecules.
""")
# ==========================================
# 3. Inference Mode
# ==========================================
if mode == "πŸ” Inference (Retrieval)":
    st.subheader("Interactive Retrieval System")
    # Check if vectors are ready: retrieval needs the precomputed candidate
    # vectors; st.stop() halts this script run so no tabs render without them.
    if not st.session_state.get('vectors_ready', False):
        st.error("❌ Vectors not ready. Please generate them in 'Configuration' first.")
        st.stop()
    # Use Tabs for tasks (one tab per retrieval direction)
    tab1, tab2 = st.tabs(["Protein β†’ Molecule", "Molecule β†’ Protein"])
    # --- Task 1: Protein to Molecule ---
    with tab1:
        st.markdown("### Recommend molecules for a specific protein target")
        # Quick Fill for Protein: example buttons pre-fill the text area via
        # session_state so the value survives the rerun the click triggers.
        st.markdown("**Quick Fill Examples:**")
        q_col1, q_col2, q_col3 = st.columns([1, 1, 2])
        if q_col1.button("Example: ROS1", key="btn_ros1"):
            st.session_state['p_input_val'] = "ROS1"
        if q_col2.button("Example: P08922", key="btn_p08922"):
            st.session_state['p_input_val'] = "P08922"
        col1, col2 = st.columns([2, 1])
        with col1:
            protein_input = st.text_area(
                "Enter Protein Sequence (FASTA format or raw sequence):",
                height=150,
                value=st.session_state.get('p_input_val', ""),
                placeholder="e.g., ROS1, P08922, or full sequence...",
                help="Supports Gene Symbols, UniProt IDs, or raw sequences"
            )
            top_k_p2m = st.slider("Top K Results", 1, 20, 5, key="slider_p2m")
        with col2:
            st.markdown("#### Configuration")
            st.write(f"Encoder: **SaProt (650M)**")
            st.write("Search Space: **ChEMBL / ZINC15**")
            search_btn_p2m = st.button("πŸš€ Retrieve Molecules", type="primary")
        # Run retrieval only when the button was clicked AND input is non-empty.
        if search_btn_p2m and protein_input:
            with st.spinner("Encoding protein and searching vector database..."):
                results = model.retrieve_molecules_for_protein(protein_input, top_k_p2m)
                if results:
                    st.success(f"Found {len(results)} potential molecules.")
                    for idx, res in enumerate(results):
                        # Only the top-ranked hit starts expanded.
                        with st.expander(
                            f"πŸ† Rank {idx+1}: {res['name']} | Similarity: {res['score']:.4f}",
                            expanded=(idx < 1)
                        ):
                            col_a, col_b = st.columns([3, 2])
                            with col_a:
                                st.markdown("**SMILES Structure:**")
                                st.code(res['smiles'], language="text")
                            with col_b:
                                st.markdown("**Compound Info:**")
                                st.markdown(f"- **Name:** {res['name']}")
                                st.markdown(f"- **DrugBank ID:** {res.get('drugbank_id', 'N/A')}")
                                st.markdown(f"- **CAS No.:** {res.get('cas', 'N/A')}")
                            st.markdown(f"**Similarity Score:**")
                            # Ensure score is within [0, 1] for st.progress
                            display_score = max(0.0, min(1.0, float(res['score'])))
                            st.progress(display_score)
                else:
                    st.warning("No matches found. Please check your input.")
    # --- Task 2: Molecule to Protein ---
    with tab2:
        st.markdown("### Retrieve potential protein targets for a specific molecule")
        # Quick Fill for Molecule: buttons pre-fill the input through
        # session_state so the value survives the button-click rerun.
        st.markdown("**Quick Fill Examples:**")
        m_q_col1, m_q_col2 = st.columns([1, 3])
        if m_q_col1.button("Example: Lorlatinib", key="btn_lor"):
            st.session_state['m_input_val'] = "Lorlatinib"
        if m_q_col2.button("Example: SMILES", key="btn_smiles"):
            st.session_state['m_input_val'] = "C[C@H]1OC2=C(N)N=CC(=C2)C2=C(C#N)N(C)N=C2CN(C)C(=O)C2=C1C=C(F)C=C2"
        col1, col2 = st.columns([2, 1])
        with col1:
            mol_input = st.text_input(
                "Enter Molecule SMILES:",
                value=st.session_state.get('m_input_val', ""),
                placeholder="e.g., Lorlatinib, DB12130, or SMILES string",
                help="Supports Compound Names, DrugBank IDs, or SMILES strings"
            )
            top_k_m2p = st.slider("Top K Results", 1, 20, 5, key="slider_m2p")
        with col2:
            st.markdown("#### Configuration")
            st.write(f"Encoder: **ChemBERTa**")
            st.write("Search Space: **UniProt / PDB**")
            search_btn_m2p = st.button("πŸš€ Retrieve Proteins", type="primary")
        # Run retrieval only when the button was clicked AND input is non-empty.
        if search_btn_m2p and mol_input:
            with st.spinner("Encoding molecule and searching vector database..."):
                results = model.retrieve_proteins_for_molecule(mol_input, top_k_m2p)
                if results:
                    st.success(f"Found {len(results)} potential protein targets.")
                    st.markdown("---")
                    st.markdown("#### πŸ† Retrieval Results")
                    for idx, res in enumerate(results):
                        # Only the top-ranked target starts expanded.
                        with st.expander(
                            f"πŸ† Rank {idx+1}: {res['desc']} | Similarity: {res['score']:.4f}",
                            expanded=(idx < 1)
                        ):
                            st.markdown(f"**Full ID:** `{res['full_id']}`")
                            st.markdown(f"**Gene Symbol:** {res['desc']}")
                            st.markdown(f"**Class:** {res.get('class', 'N/A')}")
                            st.markdown(f"**Similarity Score:**")
                            # Ensure score is within [0, 1] for st.progress
                            display_score = max(0.0, min(1.0, float(res['score'])))
                            st.progress(display_score)
                else:
                    st.warning("No matches found. Please check your input.")
# 4. Training Mode
elif mode == "βš™οΈ Training (Fine-tune)":
    st.subheader("Fine-tune the Dual-Tower Model")
    st.warning("This module allows you to input custom data to fine-tune the encoders.")
    # File Upload
    st.markdown("### πŸ“ Upload Training Data")
    uploaded_file = st.file_uploader(
        "Upload dataset (CSV/JSON)",
        type=["csv", "json"],
        help="Dataset should include: compound__smiles, target__foldseek_seq, outcome_potency_pxc50"
    )
    if uploaded_file:
        st.success("βœ… File uploaded successfully")
        st.markdown("**Data Preview:**")
        # NOTE(review): the preview always parses as CSV even though the
        # uploader also accepts JSON; JSON uploads fall into the except
        # branch below. Consider branching on the file extension.
        try:
            df = pd.read_csv(uploaded_file)
            st.dataframe(df.head(10), use_container_width=True)
            st.info(f"πŸ“Š Dataset Size: {len(df)} records, {len(df.columns)} fields")
        except Exception as e:
            st.error(f"⚠️ Parsing failed: {str(e)}. Ensure format is correct.")
    # Hyperparameters (forwarded verbatim to model.fine_tune below)
    st.markdown("---")
    st.markdown("### βš™οΈ Hyperparameters")
    col1, col2, col3 = st.columns(3)
    with col1:
        epochs = st.number_input("Epochs", min_value=1, max_value=100, value=5)
    with col2:
        lr = st.number_input("Learning Rate", min_value=0.0001, max_value=0.1, value=0.001, format="%.4f")
    with col3:
        batch_size = st.selectbox("Batch Size", [16, 32, 64, 128], index=1)
    # Checkpoint Config: fine_tune() reads the chosen path back out of
    # session_state rather than taking it as an argument.
    st.markdown("---")
    st.markdown("### πŸ“¦ Model Checkpoint")
    checkpoint_option = st.radio(
        "Start training from:",
        ["Pre-trained Backbones", "Existing Checkpoint"],
        help="Use backbones for first run, or checkpoint for incremental training"
    )
    if checkpoint_option == "Existing Checkpoint":
        checkpoint_path = st.text_input(
            "Checkpoint Path:",
            value="Dual_Tower_Model/output_checkpoints_ddp/model_epoch_7_acc_0.3259.pt",
            help="Full path to .pt file"
        )
        st.session_state['checkpoint_path'] = checkpoint_path
    else:
        st.session_state['checkpoint_path'] = None
        st.info("Using default SaProt & ChemBERTa backbones.")
    st.markdown("---")
    start_train = st.button("πŸ”₯ Start Fine-tuning", type="primary", use_container_width=True)
    if start_train:
        if uploaded_file is None:
            st.error("❌ Please upload a dataset first.")
        else:
            st.markdown("---")
            st.markdown("### πŸ“ˆ Training Progress")
            success = model.fine_tune(uploaded_file, epochs, lr, batch_size)
            if success:
                st.balloons()
                st.success("πŸŽ‰ Training pipeline initialized!")
                st.info("""
                **Notes:**
                - Real training requires a GPU-enabled environment.
                - Remember to update vector libraries after training.
                - Manage models in 'Configuration' section.
                """)
# 5. Configuration Mode
elif mode == "πŸ“Š Configuration":
    st.subheader("System Configuration")
    # Vector Library Management: show readiness and size of both libraries.
    st.markdown("### πŸ—„οΈ Vector Management")
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("#### Protein Vector Library")
        # Paths come from server_infer.PATH_CONFIG and are reused by the
        # recompute button further down this page.
        protein_json = PATH_CONFIG['protein']['json']
        protein_pt = PATH_CONFIG['protein']['pt']
        if os.path.exists(protein_pt):
            st.success("βœ… Ready")
            # NOTE(review): torch.load on a pickle can execute arbitrary
            # code if the file is untrusted; these files are generated
            # locally, but consider weights_only=True where supported.
            data = torch.load(protein_pt, map_location='cpu')
            st.metric("Total Vectors", len(data['ids']))
        else:
            st.warning("⚠️ Missing")
    with col2:
        st.markdown("#### Molecule Vector Library")
        molecule_json = PATH_CONFIG['molecule']['json']
        molecule_pt = PATH_CONFIG['molecule']['pt']
        if os.path.exists(molecule_pt):
            st.success("βœ… Ready")
            data = torch.load(molecule_pt, map_location='cpu')
            st.metric("Total Vectors", len(data['ids']))
        else:
            st.warning("⚠️ Missing")
    st.markdown("---")
    # Recompute Vectors (overwrites the files checked above)
    st.markdown("### πŸ”„ Recompute Vector Library")
    st.warning("⚠️ This will overwrite existing vector files.")
    checkpoint_for_vector = st.text_input(
        "Model Checkpoint (optional):",
        placeholder="Leave empty for default backbones",
        help="Specify path to a trained checkpoint if available"
    )
    if st.button("πŸš€ Start Vector Recomputation", type="primary"):
        with st.spinner("Recomputing vectors, this may take a few minutes..."):
            # update_vectors already catches and reports its own errors;
            # this try/except is a second safety net around the UI calls.
            try:
                success = model.update_vectors(
                    protein_json=protein_json,
                    molecule_json=molecule_json,
                    output_dir='drug_target_activity/candidates',
                    checkpoint_path=checkpoint_for_vector if checkpoint_for_vector else None
                )
                if success:
                    st.success("βœ… Vector library updated!")
                    st.rerun()
            except Exception as e:
                st.error(f"❌ Recomputation failed: {str(e)}")
    st.markdown("---")
    # Model Info (static display of configured encoder paths)
    st.markdown("### πŸ€– Model Pipeline Info")
    info_col1, info_col2 = st.columns(2)
    with info_col1:
        st.markdown("**Protein Encoder**")
        st.markdown("- Model: SaProt 650M")
        st.markdown(f"- Path: `{st.session_state.get('protein_model_path', 'N/A')}`")
    with info_col2:
        st.markdown("**Molecule Encoder**")
        st.markdown("- Model: ChemBERTa ZINC Base")
        st.markdown(f"- Path: `{st.session_state.get('molecule_model_path', 'N/A')}`")
# 6. Team Info Mode: static project/author credits, no interactive widgets.
elif mode == "πŸ‘₯ Team Info":
    st.subheader("πŸŽ“ Project Information")
    st.markdown("""
    **Course:** Recommender System
    **Final Project:** Bidirectional Protein-Molecule Retrieval (Choice Two)
    **Date:** January 2026
    ---
    #### πŸ‘¨β€πŸ”¬ Members:
    - **Zhiyun Jiang**
    - **Xinyang Tong**
    ---
    #### πŸ› οΈ Technical Contributions:
    - **Architecture Design:** Dual-Tower Encoder Integration
    - **Model Training:** SaProt & ChemBERTa Fine-tuning
    - **Frontend:** Streamlit Academic UI Development
    """)
# Footer: rendered on every page regardless of the selected mode.
st.markdown("---")
st.caption("Final Project | Dual-Tower Architecture Implementation | Zhiyun Jiang, Xinyang Tong")