"""
Phase 2 Verification Script
===========================

Tests the real open-source encoder implementations.

Usage:
    python scripts/verify_phase2.py [--full]

    --full  Run the full tests, including model downloads (slow; a GPU is
            recommended).

Without --full, only the quick tests run, using fallback/mock behavior.

The process exit code is 0 when every executed check passed and 1 otherwise.
"""

import hashlib
import logging
import os
import sys
import traceback
from pathlib import Path
from typing import Callable, Optional, Sequence

# Add project root so that ``bioflow`` is importable when this file is run
# directly as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))

logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

# Only the entry point is exported: the ``test_*`` functions below report
# success by *returning* a bool, so they must not be picked up by a
# star-import and mistaken for assertion-style tests.
__all__ = ["main"]


def test_registry() -> bool:
    """Check that the core registry types and all plugins are importable.

    Only importability is verified; the imported names are not exercised.

    Returns:
        True when every import succeeded.

    Raises:
        ImportError: if ``bioflow`` or one of its plugins cannot be imported.
    """
    print("\n" + "=" * 60)
    print("🛠️ TEST 1: Plugin Registry")
    print("=" * 60)

    # The imports themselves are the test; the names are intentionally unused.
    from bioflow.core import ToolRegistry, Modality  # noqa: F401
    from bioflow.plugins import (  # noqa: F401
        OBMEncoder,
        TextEncoder,
        MoleculeEncoder,
        ProteinEncoder,
        QdrantRetriever,
        DeepPurposePredictor,
    )

    print("✅ All plugins imported successfully")

    # List available
    print("\nAvailable plugins:")
    print(" • OBMEncoder (multimodal)")
    print(" • TextEncoder (PubMedBERT/SciBERT)")
    print(" • MoleculeEncoder (ChemBERTa/RDKit)")
    print(" • ProteinEncoder (ESM-2/ProtBERT)")
    print(" • QdrantRetriever (Qdrant)")
    print(" • DeepPurposePredictor (DTI)")

    return True


def test_rdkit_fallback() -> bool:
    """Encode a few SMILES strings with the ``rdkit_morgan`` backend.

    Returns:
        True when all molecules were encoded, False when a required import
        failed (an install hint is printed in that case).
    """
    print("\n" + "=" * 60)
    print("🧪 TEST 2: RDKit Molecule Encoder (CPU-only)")
    print("=" * 60)

    try:
        from bioflow.plugins.encoders.molecule_encoder import MoleculeEncoder
        from bioflow.core import Modality

        encoder = MoleculeEncoder(backend="rdkit_morgan", fp_size=2048)

        test_molecules = [
            ("CCO", "Ethanol"),
            ("CC(=O)Oc1ccccc1C(=O)O", "Aspirin"),
            ("CN1C=NC2=C1C(=O)N(C(=O)N2C)C", "Caffeine"),
        ]

        print(f"Encoder dimension: {encoder.dimension}")
        print("\nEncoding molecules:")

        for smiles, name in test_molecules:
            result = encoder.encode(smiles, Modality.SMILES)
            # Count the strictly positive entries of the returned vector.
            nonzero = sum(1 for v in result.vector if v > 0)
            print(f" • {name}: {nonzero} bits set (of {len(result.vector)})")

        print("✅ RDKit encoding works!")
        return True

    except ImportError as e:
        print(f"⚠️ RDKit not installed: {e}")
        print(" Install with: pip install rdkit")
        return False


def test_deeppurpose_predictor() -> bool:
    """Run one drug/target prediction through ``DeepPurposePredictor``.

    Returns:
        True when the prediction completed and its fields were printed.

    Raises:
        ImportError: if the predictor plugin cannot be imported.
    """
    print("\n" + "=" * 60)
    print("🔮 TEST 3: DeepPurpose Predictor")
    print("=" * 60)

    from bioflow.plugins.deeppurpose_predictor import DeepPurposePredictor

    predictor = DeepPurposePredictor()

    # Test data
    drug = "CC(=O)Oc1ccccc1C(=O)O"  # Aspirin
    target = "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"

    print(f"Drug: Aspirin (SMILES: {drug[:30]}...)")
    print(f"Target: Protein ({len(target)} amino acids)")

    result = predictor.predict(drug, target)

    print("\nPrediction:")
    print(f" • Score: {result.score:.3f}")
    print(f" • Label: {result.label}")
    print(f" • Confidence: {result.confidence:.2f}")
    print(f" • Method: {result.metadata.get('method', 'unknown')}")

    if result.metadata.get('warning'):
        print(f" ⚠️ {result.metadata['warning']}")

    print("✅ Predictor works (with fallback if DeepPurpose unavailable)")
    return True


def test_qdrant_retriever() -> bool:
    """Ingest and search a handful of molecules using a mock encoder.

    Returns:
        True when ingestion and search completed, False when a required
        import failed (an install hint is printed in that case).
    """
    print("\n" + "=" * 60)
    print("🗄️ TEST 4: Qdrant Retriever (In-Memory)")
    print("=" * 60)

    try:
        from bioflow.core import BioEncoder, Modality, EmbeddingResult
        from bioflow.plugins.qdrant_retriever import QdrantRetriever

        mock_dim = 768

        # Mock encoder for testing
        class MockEncoder(BioEncoder):
            """Deterministic stand-in encoder derived from an MD5 digest."""

            def encode(self, content, modality):
                # An MD5 hex digest has 32 characters; map each one to
                # [0, 1] and tile the pattern to fill exactly ``mock_dim``
                # entries (32 * 24 = 768).
                digest = hashlib.md5(content.encode()).hexdigest()
                pattern = [int(c, 16) / 15.0 for c in digest]
                vector = pattern * (mock_dim // len(pattern))
                return EmbeddingResult(
                    vector=vector, modality=modality, dimension=mock_dim
                )

            @property
            def dimension(self):
                return mock_dim

        encoder = MockEncoder()
        retriever = QdrantRetriever(encoder=encoder, collection="test_molecules")

        # Ingest test data
        test_data = [
            ("CCO", "Ethanol", {"type": "alcohol"}),
            ("CCCO", "Propanol", {"type": "alcohol"}),
            ("CC(=O)O", "Acetic acid", {"type": "acid"}),
            ("c1ccccc1", "Benzene", {"type": "aromatic"}),
        ]

        print("Ingesting molecules...")
        for smiles, name, payload in test_data:
            retriever.ingest(smiles, Modality.SMILES, {"name": name, **payload})

        print(f"Collection size: {retriever.count()}")

        # Search
        print("\nSearching for 'CCCCO' (Butanol)...")
        results = retriever.search("CCCCO", limit=3, modality=Modality.SMILES)

        print("Results:")
        for r in results:
            print(f" • {r.payload.get('name', 'Unknown')}: score={r.score:.3f}")

        print("✅ Qdrant retriever works!")
        return True

    except ImportError as e:
        print(f"⚠️ qdrant-client not installed: {e}")
        print(" Install with: pip install qdrant-client")
        return False


def test_full_obm_encoder() -> bool:
    """Exercise ``OBMEncoder`` on text, molecule and protein inputs.

    Intended for ``--full`` runs only; the printed banner warns that models
    are downloaded on first use.

    Returns:
        True when all encodings and the similarity call completed, False
        when any exception occurred (the traceback is printed).
    """
    print("\n" + "=" * 60)
    print("🚀 TEST 5: Full OBM Encoder (requires model downloads)")
    print("=" * 60)

    try:
        from bioflow.plugins.obm_encoder import OBMEncoder
        from bioflow.core import Modality

        print("Initializing OBMEncoder...")
        print("(This will download models on first run - ~500MB)")

        obm = OBMEncoder(
            text_model="pubmedbert",
            molecule_model="chemberta",
            protein_model="esm2_t6",  # Smallest ESM model
            lazy_load=True,
        )

        # Test text
        print("\n1. Encoding text...")
        text_result = obm.encode("EGFR inhibitor for lung cancer treatment", Modality.TEXT)
        print(f" Text embedding: {len(text_result.vector)} dims")

        # Test molecule
        print("2. Encoding molecule...")
        mol_result = obm.encode("CC(=O)Oc1ccccc1C(=O)O", Modality.SMILES)
        print(f" Molecule embedding: {len(mol_result.vector)} dims")

        # Test protein
        print("3. Encoding protein...")
        prot_result = obm.encode("MKTVRQERLKSIVRILERSKEPVSG", Modality.PROTEIN)
        print(f" Protein embedding: {len(prot_result.vector)} dims")

        # Cross-modal similarity
        print("\n4. Cross-modal similarity:")
        sim = obm.similarity(text_result, mol_result)
        print(f" Text-Molecule similarity: {sim:.3f}")

        print("\n✅ Full OBM Encoder works!")
        print(obm.get_encoder_info())
        return True

    except Exception as e:
        print(f"❌ Error: {e}")
        traceback.print_exc()
        return False


def _run_test(test_fn: Callable[[], bool]) -> bool:
    """Run one check, turning any uncaught exception into a failure.

    This is the top-level boundary of the script: without it, a single
    crashing check would abort the run before the summary is printed.
    """
    try:
        return bool(test_fn())
    except Exception as e:
        print(f"❌ Error: {e}")
        traceback.print_exc()
        return False


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Run the verification checks and print a pass/fail summary.

    Args:
        argv: Command-line arguments, excluding the program name. Defaults
            to ``sys.argv[1:]``. The flag ``--full`` additionally runs the
            OBM encoder check.

    Returns:
        0 when every executed check passed, 1 otherwise.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    print("=" * 60)
    print("🧬 BioFlow Phase 2 Verification")
    print("=" * 60)

    full_mode = "--full" in args

    if full_mode:
        print("Running FULL tests (with model downloads)")
    else:
        print("Running QUICK tests (no model downloads)")
        print("Add --full flag for complete testing")

    # Always run
    checks = [
        ("Registry", test_registry),
        ("RDKit", test_rdkit_fallback),
        ("DeepPurpose", test_deeppurpose_predictor),
        ("Qdrant", test_qdrant_retriever),
    ]

    # Only in full mode
    if full_mode:
        checks.append(("OBMEncoder", test_full_obm_encoder))

    # Dicts preserve insertion order, so the summary follows execution order.
    results = {label: _run_test(check) for label, check in checks}

    # Summary
    print("\n" + "=" * 60)
    print("📊 VERIFICATION SUMMARY")
    print("=" * 60)

    for test, passed in results.items():
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f" {test}: {status}")

    all_passed = all(results.values())
    print("\n" + ("✅ All tests passed!" if all_passed else "⚠️ Some tests failed"))

    return 0 if all_passed else 1


if __name__ == "__main__":
    sys.exit(main())