Commit
adecc9b
·
1 Parent(s): 0f68a94

Phase 1: FastAPI integration with DeepPurpose DTI predictor

Browse files

- Added bioflow/api/ with FastAPI server (port 8000)
- Integrated DeepPurposePredictor from deeppurpose002.py logic
- Updated Next.js API routes to call FastAPI backend
- Created launch_bioflow_full.bat for dual-server startup
- Merged OpenBioMed core + lacoste001 Next.js UI

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +0 -6
  2. .gitignore +159 -0
  3. .gitmodules +3 -0
  4. LICENSE +21 -0
  5. README-CN.md +265 -0
  6. README.md +270 -15
  7. USE_POLICY.md +19 -0
  8. bioflow/__init__.py +61 -0
  9. bioflow/api/__init__.py +7 -0
  10. bioflow/api/dti_predictor.py +346 -0
  11. bioflow/api/requirements.txt +29 -0
  12. bioflow/api/server.py +359 -0
  13. bioflow/app.py +570 -0
  14. bioflow/core/__init__.py +87 -0
  15. bioflow/core/base.py +247 -0
  16. bioflow/core/config.py +92 -0
  17. bioflow/core/nodes.py +465 -0
  18. bioflow/core/orchestrator.py +303 -0
  19. bioflow/core/registry.py +154 -0
  20. bioflow/demo.py +261 -0
  21. bioflow/obm_wrapper.py +355 -0
  22. bioflow/pipeline.py +370 -0
  23. bioflow/plugins/__init__.py +58 -0
  24. bioflow/plugins/deeppurpose_predictor.py +220 -0
  25. bioflow/plugins/encoders/__init__.py +17 -0
  26. bioflow/plugins/encoders/molecule_encoder.py +226 -0
  27. bioflow/plugins/encoders/protein_encoder.py +188 -0
  28. bioflow/plugins/encoders/text_encoder.py +177 -0
  29. bioflow/plugins/obm_encoder.py +294 -0
  30. bioflow/plugins/obm_plugin.py +40 -0
  31. bioflow/plugins/qdrant_retriever.py +312 -0
  32. bioflow/qdrant_manager.py +365 -0
  33. bioflow/ui/__init__.py +15 -0
  34. bioflow/ui/app.py +61 -0
  35. bioflow/ui/components.py +481 -0
  36. bioflow/ui/config.py +583 -0
  37. bioflow/ui/pages/__init__.py +5 -0
  38. bioflow/ui/pages/data.py +163 -0
  39. bioflow/ui/pages/discovery.py +165 -0
  40. bioflow/ui/pages/explorer.py +127 -0
  41. bioflow/ui/pages/home.py +213 -0
  42. bioflow/ui/pages/settings.py +192 -0
  43. bioflow/ui/requirements.txt +31 -0
  44. bioflow/visualizer.py +386 -0
  45. bioflow/workflows/__init__.py +49 -0
  46. bioflow/workflows/discovery.py +400 -0
  47. bioflow/workflows/drug_discovery.yaml +54 -0
  48. bioflow/workflows/ingestion.py +276 -0
  49. bioflow/workflows/literature_mining.yaml +41 -0
  50. checkpoints/.placeholder +0 -0
.gitattributes CHANGED
@@ -1,9 +1,3 @@
1
- *.pth filter=lfs diff=lfs merge=lfs -text
2
- *.ckpt filter=lfs diff=lfs merge=lfs -text
3
- *.pkl filter=lfs diff=lfs merge=lfs -text
4
- *.npy filter=lfs diff=lfs merge=lfs -text
5
- *.pt filter=lfs diff=lfs merge=lfs -text
6
- *.tfevents.* filter=lfs diff=lfs merge=lfs -text
7
  *.tab filter=lfs diff=lfs merge=lfs -text
8
  *.stl filter=lfs diff=lfs merge=lfs -text
9
  *.zip filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
1
  *.tab filter=lfs diff=lfs merge=lfs -text
2
  *.stl filter=lfs diff=lfs merge=lfs -text
3
  *.zip filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
133
+
134
+ # pytype static type analyzer
135
+ .pytype/
136
+
137
+ # Cython debug symbols
138
+ cython_debug/
139
+
140
+ /logs/*
141
+ /.vscode
142
+ /assets/*
143
+ /checkpoints/**/*
144
+ /datasets/**/*
145
+ /misc/*
146
+ /tmp/*
147
+ /third_party/*
148
+ !/third_party/.placeholder
149
+ !/third_party/p2rank_2.5
150
+ !/checkpoints/.placeholder
151
+ !/datasets/**/.placeholder
152
+ !/third_party/.placeholder
153
+
154
+ #files
155
+ *.csv
156
+ *.pt
157
+ *.txt
158
+ !requirements.txt
159
+ *.pth
.gitmodules ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [submodule "third_party/p2rank_2.5"]
2
+ path = third_party/p2rank_2.5
3
+ url = https://github.com/rdk/p2rank.git
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Pharmolix
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README-CN.md ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center"><h1>OpenBioMed</h1></div>
2
+ <h4 align="center">
3
+ <p>
4
+ <b>中文</b> |
5
+ <a href="./README.md">English</a>
6
+ <p>
7
+ </h4>
8
+
9
+ [![GitHub Repo stars](https://img.shields.io/github/stars/PharMolix/OpenBioMed?style=social)](https://github.com/PharMolix/OpenBioMed/stargazers)
10
+ [![GitHub last commit](https://img.shields.io/github/last-commit/PharMolix/OpenBioMed)](https://github.com/PharMolix/OpenBioMed/commits/main)
11
+ [![GitHub contributors](https://img.shields.io/github/contributors/PharMolix/OpenBioMed?color=orange)](https://github.com/PharMolix/OpenBioMed/graphs/contributors)
12
+ [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/PharMolix/OpenBioMed/pulls)
13
+ [![Spaces](https://img.shields.io/badge/🤗-Open%20in%20Spaces-blue)](https://huggingface.co/PharMolix)
14
+ [![Docker Pulls](https://img.shields.io/docker/pulls/youngking0727/openbiomed_server)](https://hub.docker.com/repository/docker/youngking0727/openbiomed_server)
15
+
16
+ ![platform](images/platform.png)
17
+
18
+ 欢迎用户在[该网站](http://openbiomed.pharmolix.com)使用我们的生物医药智能体开发平台!
19
+
20
+ ## 更新信息 🎉
21
+
22
+ - [2025/05/26] 🔥 我们的框架进行了功能更新,包括新的工具、数据集和模型。我们实现了**LangCell** (📃[论文](https://arxiv.org/abs/2405.06708), 🤖[模型](https://drive.google.com/drive/folders/1cuhVG9v0YoAnjW-t_WMpQQguajumCBTp?usp=sharing), 📎[引用](#to-cite-langcell)) 和细胞数据处理接口(见[示例](./examples/cell_annotation.ipynb))。我们还推出了ADMET、QED、SA、LogP、Lipinski、相似性等分子性质预测工具。
23
+
24
+ - [2025/03/07] 🔥 发布**OpenBioMed生物医药智能体开发平台**,可通过[该链接](http://openbiomed.pharmolix.com)访问。该平台能帮助研发人员零门槛使用AI模型定制化自己的科学研究助手(**AutoPilot**)。平台的[使用文档](https://www.zybuluo.com/icycookies/note/2587490)已经同步发布。
25
+
26
+ - [2025/03/07] 🔥 发布**OpenBioMed v2**. 我们在这次更新中适配了更多的生物医药下游任务,开放了更加易用的数据接口,并集成了更前沿的AI模型。同时,我们发布了试用版**PharmolixFM**模型(📃[技术报告](https://arxiv.org/abs/2503.21788), 🤖[模型](https://cloud.tsinghua.edu.cn/f/8f337ed5b58f45138659/), 📎[引用](#to-cite-pharmolixfm)),并完成了BioMedGPT-R1模型的推理支持。我们预计于本月内开放BioMedGPT-R1的微调代码。
27
+
28
+ > PharmolixFM是由水木分子与清华大学智能产业研究院联合研发的全原子基础大模型。该模型使用最先进的非自回归式多模态生成模型,在原子尺度上实现了对分子、抗体和蛋白质的联合建模。PharmolixFM能够适配多种下游任务,如分子对接、基于口袋的分子设计、抗体设计、分子构象生成等。在给定口袋位置的分子对接任务中,PharMolixFM的预测精度可与AlphaFold3媲美 (83.9 vs 90.2, RMSD < 2Å) 。
29
+
30
+ - [2025/02/20] 发布**BioMedGPT-R1** (🤗[Huggingface模型](https://huggingface.co/PharMolix/BioMedGPT-R1)).
31
+
32
+ > BioMedGPT-R1-17B是由水木分子与清华大学智能产业研究院(AIR)联合发布的生物医药多模态推理模型。其在上一版本的基础上,用DeepSeek-R1-Distill-Qwen-14B更新了原采用的文本基座模型,并通过跨模态对齐和多模态推理SFT实现模型微调,在生物医药问答任务上效果逼近闭源商用大模型和人类专家水平。
33
+
34
+ - [2024/05/16] 发布 **LangCell** (📃[论文](https://arxiv.org/abs/2405.06708), 💻[代码](https://github.com/PharMolix/LangCell), 🤖[模型](https://drive.google.com/drive/folders/1cuhVG9v0YoAnjW-t_WMpQQguajumCBTp?usp=sharing), 📎[引用](#to-cite-langcell)).
35
+
36
+ > LangCell是由水木分子与清华大学智能产业研究院联合研发的首个“自然语言-单细胞”多模态预训练模型。该模型通过学习富含细胞身份信息的知识性文本,有效提升了对单细胞转录组学的理解能力,并解决了数据匮乏场景下的细胞身份理解任务。LangCell是唯一能有效进行零样本细胞身份理解的单细胞模型,并且在少样本和微调场景下也取得SOTA。LangCell将很快被集成到OpenBioMed。
37
+
38
+ - [2023/08/14] 发布 **BioMedGPT-LM-7B** (🤗[HuggingFace模型](https://huggingface.co/PharMolix/BioMedGPT-LM-7B)) 、 **BioMedGPT-10B** (📃[论文](https://arxiv.org/abs/2308.09442v2), 🤖[模型](https://pan.baidu.com/s/1iAMBkuoZnNAylhopP5OgEg?pwd=7a6b#list/path=%2F), 📎[引用](#to-cite-biomedgpt)) 和 **DrugFM** (🤖[模型](https://pan.baidu.com/s/1iAMBkuoZnNAylhopP5OgEg?pwd=7a6b#list/path=%2F)).
39
+
40
+ > BioMedGPT-10B是由水木分子联合清华大学智能产业研究院联合发布的首个可商用的多模态生物医药大模型。该模型将以分子结构和蛋白质序列为代表的生命语言与人类的自然语言相结合,在生物医药专业问答能力比肩人类专家水平,在分子和蛋白质跨模态问答中表现出强大的性能。BioMedGPT-LM-7B是首个可商用、生物医药专用的Llama2大模型。
41
+
42
+ > DrugFM是由"清华AIR-智源联合研究中心"联合研发的多模态小分子基础模型。 该模型针对小分子药物的组织规律和表示学习进行了更细粒度的设计,形成了小分子药物预训练模型UniMap,并与多模态小分子基础模型MolFM有机结合。该模型在跨模态抽取任务中取得SOTA。
43
+
44
+ - [2023/06/12] 发布 **MolFM** (📃[论文](https://arxiv.org/abs/2307.09484), 🤖[模型](https://pan.baidu.com/s/1iAMBkuoZnNAylhopP5OgEg?pwd=7a6b#list/path=%2F), 📎[引用](#to-cite-molfm)) 和 **CellLM** (📃[论文](https://arxiv.org/abs/2306.04371), 🤖[模型](https://pan.baidu.com/s/1iAMBkuoZnNAylhopP5OgEg), 📎[引用](#to-cite-celllm)).
45
+
46
+ > MolFM是一个支持统一表示分子结构、生物医学文本和知识图谱的多模态小分子基础模型。在零样本和微调场景下,MolFM的跨模态检索能力分别比现有模型提升了12.03%和5.04%。在分子描述生成、基于文本的分子生成和分子性质预测中,MolFM也取得了显著的结果。
47
+
48
+ > CellLM是首个使用分支对比学习策略在正常细胞和癌症细胞数据上同时训练的大规模细胞表示学习模型。CellLM在细胞类型注释(71.8 vs 68.8)、少样本场景下的单细胞药物敏感性预测(88.9 vs 80.6)和单组学细胞系药物敏感性预测上均优于ScBERT(93.4 vs 87.2)。
49
+
50
+ - [2023/04/23] 发布 **BioMedGPT-1.6B** (🤖[模型](https://pan.baidu.com/s/1iAMBkuoZnNAylhopP5OgEg)) 和 **OpenBioMed**.
51
+
52
+ ## 目录
53
+
54
+ - [介绍](#介绍)
55
+ - [环境搭建](#环境搭建)
56
+ - [使用指南](#使用指南)
57
+ - [先前版本](#先前版本)
58
+ - [局限性](#局限性)
59
+ - [引用](#引用)
60
+
61
+ ## 介绍
62
+
63
+ OpenBioMed是一个面向生命科学研究和药物研发的Python深度学习工具包。OpenBioMed为小分子结构、蛋白质结构、单细胞转录组学数据、知识图谱和生物医学文本等多模态数据提供了**灵活的数据处理接口**。OpenBioMed构建了**20余个计算工具**,涵盖了大部分AI药物发现任务和最新的针对分子、蛋白质的多模态理解生成任务。此外,OpenBioMed为研究者提供了一套**易用的工作流构建界面**,支持以拖拽形式对接多个模型,并构建基于大语言模型的智能体以解决复杂的科研问题。
64
+
65
+ OpenBioMed为研究者提供了:
66
+
67
+ - **4种不同数据的处理接口**, 包括分子结构、蛋白结构、口袋结构和自然语言文本。我们将在未来加入DNA、RNA、单细胞转录组学数据和知识图谱的处理接口。
68
+ - **20余个工具**, 包括分子性质预测、蛋白折叠为代表的AI预测工具、分子结构的可视化工具和互联网信息、数据库查询工具。
69
+ - **超过20个深度学习模型**, 包括[PharmolixFM](https://cloud.tsinghua.edu.cn/f/8f337ed5b58f45138659/), [BioMedGPT-R1](https://huggingface.co/PharMolix/BioMedGPT-R1), [BioMedGPT](https://ieeexplore.ieee.org/document/10767279/) and [MutaPLM](https://arxiv.org/abs/2410.22949)等自研模型。
70
+
71
+ OpenBioMed的核心特色如下:
72
+
73
+ - **统一的数据处理框架**,能轻松加载不同模态的数据,并将其转换为统一的格式。
74
+ - **现成的模型预测模块**。我们整理并公开了各类模型的参数,并提供了使用案例,能够简便的迁移到其他数据或任务中。
75
+ - **易用的工作流与智能体构建方案**,以帮助研究者针对复杂的科研问题构建多工具协同工作流,通过反复执行工作流以模拟科学试验中的试错过程,并通过大语言模型归纳得到潜在的科学发现。
76
+
77
+ 下表显示了OpenBioMed中支持的工具,它们在未来会被进一步扩展。
78
+
79
+ | 工具名称 | 适配模型 | 简介 |
80
+ | :-----------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
81
+ | 分子性质预测 | [GraphMVP](https://arxiv.org/abs/2110.07728) | 针对给定分子预测其性质,如血脑屏障穿透性和药物副作用 |
82
+ | 分子问答 | [BioT5](https://arxiv.org/abs/2310.07276) | 针对给定分子和某个提问进行解答,如介绍分子结构、询问分子官能团、氢键供体的数量等 |
83
+ | 分子结构可视化 | 无 | 分子结构可视化 |
84
+ | 分子名称/ID检索 | 无 | 基于分子名称或ID,从PubChem数据库中检索分子 |
85
+ | 分子相似结构检索 | 无 | 从PubChem数据库中检索结构相似的分子 |
86
+ | 蛋白质问答 | [BioT5](https://arxiv.org/abs/2310.07276) | 针对给定蛋白和某个提问进行解答,如询问motif、蛋白功能、在细胞中的分布和相关疾病等 |
87
+ | 蛋白质折叠 | [ESMFold](https://www.science.org/doi/10.1126/science.ade2574) | 基于氨基酸序列预测蛋白质的三维结构 |
88
+ | 蛋白结合位点预测 | [P2Rank](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-018-0285-8) | 预测蛋白质中潜在的(与小分子的)结合位点 |
89
+ | 突变效应阐释 | [MutaPLM](https://arxiv.org/abs/2410.22949) | 给定氨基酸序列上的一个单点突变,使用自然语言描述可能的突变效应 |
90
+ | 突变设计 | [MutaPLM](https://arxiv.org/abs/2410.22949) | 基于初始蛋白质序列和自然语言描述的优化目标,生成符合优化目标的突变后蛋白质 |
91
+ | 蛋白质ID检索 | 无 | 基于ID,从UniProtKB数据库中检索蛋白质序列 |
92
+ | 蛋白质结构检索 | 无 | 基于ID,从PDB和AlphaFoldDB数据库中检索蛋白质结构 |
93
+ | 蛋白质结构可视化 | N/A | 蛋白质结构可视化 |
94
+ | 蛋白质-分子刚性对接 | [PharmolixFM](https://cloud.tsinghua.edu.cn/f/8f337ed5b58f45138659/) | 给定蛋白口袋结构和分子,生成对接后的分子构象 |
95
+ | 基于口袋的分子设计 | [PharmolixFM](https://cloud.tsinghua.edu.cn/f/8f337ed5b58f45138659/), [MolCRAFT](https://github.com/AlgoMole/MolCRAFT) | 给定蛋白口袋结构,生成能与该口袋对接的分子及其构象 |
96
+ | 复合物可视化 | N/A | 可视化蛋白质-小分子结合后的复合物结构 |
97
+ | 口袋可视化 | N/A | 可视化蛋白质的口袋结构 |
98
+ | 互联网搜索 | N/A | 在互联网中检索信息 |
99
+
100
+
101
+ ## 环境搭建
102
+
103
+ 为支持OpenBioMed的基本功能,请执行如下操作:
104
+
105
+ ```bash
106
+ conda create -n OpenBioMed python=3.9
107
+ conda activate OpenBioMed
108
+ pip install torch==1.13.1+{your_cuda_version} torchvision==0.14.1+{your_cuda_version} torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/{your_cuda_version}
109
+ pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-1.13.1+{your_cuda_version}.html
110
+ pip install pytorch_lightning==2.0.8 peft==0.9.0 accelerate==1.3.0 --no-deps -i https://pypi.tuna.tsinghua.edu.cn/simple
111
+ pip install -r requirements.txt
112
+ ```
113
+
114
+ 推荐使用11.7版本的cuda驱动来构建环境。开发者尚未测试使用其他版本的cuda驱动是否会产生问题。
115
+
116
+ 为支持可视化工具与vina分数计算工具,请按如下操作下载依赖包:
117
+
118
+ ```
119
+ # 可视化依赖
120
+ conda install -c conda-forge pymol-open-source
121
+ pip install imageio
122
+
123
+ # AutoDockVina依赖
124
+ pip install meeko==0.1.dev3 pdb2pqr vina==1.2.2
125
+ python -m pip install git+https://github.com/Valdes-Tresanco-MS/AutoDockTools_py3
126
+
127
+ # PoseBusters依赖
128
+ pip install posebusters==0.3.1
129
+
130
+ # 部分评估指标依赖
131
+ pip install spacy rouge_score nltk
132
+ python
133
+ >>> import nltk
134
+ >>> nltk.download('wordnet')
135
+ >>> nltk.download('omw-1.4')
136
+
137
+ # LangCell依赖
138
+ pip install geneformer
139
+ ```
140
+
141
+ 下载依赖后,您可以运行以下命令安装OpenBioMed包,从而更方便地使用我们的接口:
142
+
143
+ ```bash
144
+ pip install -e .
145
+ # 使用OpenBioMed的接口
146
+ python
147
+ >>> from open_biomed.data import Molecule
148
+ >>> molecule = Molecule(smiles="CC(=O)OC1=CC=CC=C1C(=O)O")
149
+ >>> print(molecule.calc_logp())
150
+ ```
151
+
152
+ ### 构建docker
153
+
154
+ 直接运行 `./scripts/docker_run.sh`,就可以构建docker镜像并运行容器,并在端口8082和8083运行后端服务。
155
+ ```
156
+ sh ./scripts/docker_run.sh
157
+ ```
158
+ 与此同时,我们也提供了build好的[docker镜像](https://hub.docker.com/repository/docker/youngking0727/openbiomed_server),可以直接拉取使用。
159
+
160
+ ## 使用指南
161
+
162
+ 请移步我们的 [使用案例与教程](./examples) 。
163
+
164
+ | 教程名称 | 简介 |
165
+ | ------------------------------------------------------------ | ------------------------------------------------------------ |
166
+ | [BioMedGPT推理](./examples/biomedgpt_r1.ipynb) | 使用BioMedGPT-10B回答分子与蛋白质相关问题和使用BioMedGPT-R1进行推理的示例。 |
167
+ | [分子与蛋白质数据处理](./examples/manipulate_molecules.ipynb) | 使用OpenBioMed中的接口加载、处理、导出分子与蛋白质数据的示例。 |
168
+ | [深度学习工具的使用](./examples/explore_ai4s_tools.ipynb) | 使用深度学习模型进行预测的示例。 |
169
+ | [可视化](./examples/visualization.ipynb) | 使用OpenBioMed中的接口对小分子、蛋白质、口袋和复合物进行可视化的示例。 |
170
+ | [工作流](./examples/workflow.ipynb) | 构建多工具协同工作流和大模型智能体的示例。 |
171
+ | [模型开发](./examples/model_customization.ipynb) | 在OpenBioMed框架中使用个人数据或模型结构开发新模型的教程。 |
172
+
173
+ ## 先前版本
174
+
175
+ 如果你想使用OpenBioMed先前版本的部分功能,请切换至该仓库的v1.0分支:
176
+
177
+ ```bash
178
+ git checkout v1.0
179
+ ```
180
+
181
+ ## 局限性
182
+
183
+ 本项目包含BioMedGPT-LM-7B,BioMedGPT-10B和BioMedGPT-R1,这些模型应当被负责任地使用。BioMedGPT不应用于向公众提供服务。我们严禁使用BioMedGPT生成任何违反适用法律法规的内容,如煽动颠覆国家政权、危害国家安全和利益、传播恐怖主义、极端主义、种族仇恨和歧视、暴力、色情或虚假有害信息等。BioMedGPT不对用户提供或发布的任何内容、数据或信息产生的任何后果负责。
184
+
185
+ ## 协议
186
+
187
+ 本项目代码依照[MIT](./LICENSE)协议开源。使用BioMedGPT-LM-7B、BioMedGPT-10B和BioMedGPT-R1模型,需要遵循[使用协议](./USE_POLICY.md)。
188
+
189
+ ## 联系方式
190
+
191
+ 我们期待您的反馈以帮助我们改进这一框架。若您在使用过程中有任何技术问题或建议,请随时在GitHub issue中提出。若您有商业合作的意向,请联系[opensource@pharmolix.com](mailto:opensource@pharmolix.com)。
192
+
193
+
194
+ ## 引用
195
+
196
+ 如果您认为我们的开源代码和模型对您的研究有帮助,请考虑给我们的项目点上星标🌟并引用📎以下文章。感谢您的支持!
197
+
198
+ ##### 引用OpenBioMed:
199
+
200
+ ```
201
+ @misc{OpenBioMed_code,
202
+ author={Luo, Yizhen and Yang, Kai and Fan, Siqi and Hong, Massimo and Zhao, Suyuan and Chen, Xinrui and Nie, Zikun and Luo, Wen and Xie, Ailin and Liu, Xing Yi and Zhang, Jiahuan and Wu, Yushuai and Nie, Zaiqing},
203
+ title={Code of OpenBioMed},
204
+ year={2023},
205
+ howpublished={\url{https://github.com/Pharmolix/OpenBioMed.git}}
206
+ }
207
+ ```
208
+
209
+ ##### 引用BioMedGPT:
210
+
211
+ ```
212
+ @article{luo2024biomedgpt,
213
+ title={Biomedgpt: An open multimodal large language model for biomedicine},
214
+ author={Luo, Yizhen and Zhang, Jiahuan and Fan, Siqi and Yang, Kai and Hong, Massimo and Wu, Yushuai and Qiao, Mu and Nie, Zaiqing},
215
+ journal={IEEE Journal of Biomedical and Health Informatics},
216
+ year={2024},
217
+ publisher={IEEE}
218
+ }
219
+ ```
220
+
221
+ ##### 引用PharMolixFM:
222
+
223
+ ```
+ @article{luo2025pharmolixfm,
224
+ title={PharMolixFM: All-Atom Foundation Models for Molecular Modeling and Generation},
225
+ author={Luo, Yizhen and Wang, Jiashuo and Fan, Siqi and Nie, Zaiqing},
226
+ journal={arXiv preprint arXiv:2503.21788},
227
+ year={2025}
228
+ }
+ ```
229
+
230
+ ##### 引用MolFM:
231
+ ```
232
+ @misc{luo2023molfm,
233
+ title={MolFM: A Multimodal Molecular Foundation Model},
234
+ author={Yizhen Luo and Kai Yang and Massimo Hong and Xing Yi Liu and Zaiqing Nie},
235
+ year={2023},
236
+ eprint={2307.09484},
237
+ archivePrefix={arXiv},
238
+ primaryClass={q-bio.BM}
239
+ }
240
+ ```
241
+
242
+ ##### 引用LangCell:
243
+ ```
244
+ @misc{zhao2024langcell,
245
+ title={LangCell: Language-Cell Pre-training for Cell Identity Understanding},
246
+ author={Suyuan Zhao and Jiahuan Zhang and Yizhen Luo and Yushuai Wu and Zaiqing Nie},
247
+ year={2024},
248
+ eprint={2405.06708},
249
+ archivePrefix={arXiv},
250
+ primaryClass={q-bio.GN}
251
+ }
252
+ ```
253
+
254
+ ##### 引用MutaPLM
255
+
256
+ ```
257
+ @article{luo2025mutaplm,
258
+ title={MutaPLM: Protein Language Modeling for Mutation Explanation and Engineering},
259
+ author={Luo, Yizhen and Nie, Zikun and Hong, Massimo and Zhao, Suyuan and Zhou, Hao and Nie, Zaiqing},
260
+ journal={Advances in Neural Information Processing Systems},
261
+ volume={37},
262
+ pages={79783--79818},
263
+ year={2025}
264
+ }
265
+ ```
README.md CHANGED
@@ -1,27 +1,282 @@
1
- DeepPurpose002 — Training & Prediction (DTI)
 
 
 
 
 
 
2
 
3
- Ce repo contient un pipeline DeepPurpose pour :
 
 
 
 
 
4
 
5
- entraîner un modèle Drug–Target Interaction (DTI) à partir de paires (SMILES, séquence protéique, label),
6
 
7
- évaluer le modèle (métriques + logs),
8
 
9
- prédire des interactions/affinités sur de nouvelles paires et exporter les résultats.
10
 
11
- Contenu
12
 
13
- deeppurpose002.py : chargement données preprocessing/encodage entraînement évaluation sauvegarde modèle + outputs
 
14
 
15
- prediction_test.py (ou équivalent) : chargement du modèle sauvegardé prédictions export CSV
16
 
17
- Utilisation
18
- python deeppurpose002.py
19
- python prediction_test.py
20
 
21
- Format attendu
22
 
23
- Train (supervisé) : drug_smiles, target_sequence, label
24
 
25
- Predict : drug_smiles, target_sequence
26
 
27
- Outputs : modèles dans models/, résultats/logs dans outputs/.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center"><h1>OpenBioMed</h1></div>
2
+ <h4 align="center">
3
+ <p>
4
+ <b>English</b> |
5
+ <a href="./README-CN.md">中文</a>
6
+ <p>
7
+ </h4>
8
 
9
+ [![GitHub Repo stars](https://img.shields.io/github/stars/PharMolix/OpenBioMed?style=social)](https://github.com/PharMolix/OpenBioMed/stargazers)
10
+ [![GitHub last commit](https://img.shields.io/github/last-commit/PharMolix/OpenBioMed)](https://github.com/PharMolix/OpenBioMed/commits/main)
11
+ [![GitHub contributors](https://img.shields.io/github/contributors/PharMolix/OpenBioMed?color=orange)](https://github.com/PharMolix/OpenBioMed/graphs/contributors)
12
+ [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/PharMolix/OpenBioMed/pulls)
13
+ [![Spaces](https://img.shields.io/badge/🤗-Open%20in%20Spaces-blue)](https://huggingface.co/PharMolix)
14
+ [![Docker Pulls](https://img.shields.io/docker/pulls/youngking0727/openbiomed_server)](https://hub.docker.com/repository/docker/youngking0727/openbiomed_server)
15
 
16
+ ![platform](images/platform.png)
17
 
18
+ Feel free to use our **Agent Platform for Biomedicine and Life Science** at this [website](http://openbiomed.pharmolix.com)!
19
 
20
+ ## News 🎉
21
 
22
+ - [2025/05/26] 🔥 Our framework has been updated with several new features including new tools, datasets, and models. We implement **LangCell** (📃[Paper](https://arxiv.org/abs/2405.06708), 🤖[Model](https://drive.google.com/drive/folders/1cuhVG9v0YoAnjW-t_WMpQQguajumCBTp?usp=sharing), 📎[Citation](#to-cite-langcell)) and APIs to manipulate cells (See [the Example](./examples/cell_annotation.ipynb)). We also introduce a wider range of tools to calculate molecular properties (ADMET, QED, SA, LogP, Lipinski, Similarity, etc.).
23
 
24
+ - [2025/03/07] 🔥 We present **OpenBioMed Agent Platform** at this [website](http://openbiomed.pharmolix.com) to customize workflows and LLM agents (**AutoPilots**) in solving complicated scientific research tasks. **Tutorials** for using this platform are also [available](https://www.zybuluo.com/icycookies/note/2587490).
25
+ - [2025/03/07] 🔥 Released **OpenBioMed v2**. We present new features including additional downstream biomedical tasks, more flexible data APIs, and advanced models. We also release a preview version of **PharmolixFM** (📃[Paper](https://arxiv.org/abs/2503.21788), 🤖[Model](https://cloud.tsinghua.edu.cn/f/8f337ed5b58f45138659/), 📎[Citation](#to-cite-pharmolixfm)). BioMedGPT-R1 inference is currently supported, and fine-tuning will be available in this month!
26
 
27
+ > PharmolixFM is an all-atom molecular foundation model jointly released by PharMolix Inc. and Institute of AI Industry Research (AIR), Tsinghua University. It unifies molecules, antibodies, and proteins by jointly modeling them at atom-level with cutting-edge non-autoregressive multi-modal generative models. PharmolixFM is capable of solving multiple downstream tasks such as docking, structure-based drug design, peptide design, and molecular conformation generation. PharmolixFM achieves competitive performance with AlphaFold3 (83.9 vs 90.2, RMSD < 2Å) on protein-molecule docking (given pocket).
28
 
 
 
 
29
 
30
+ - [2025/02/20] BioMedGPT-R1 (🤗[Huggingface Model](https://huggingface.co/PharMolix/BioMedGPT-R1)) has been released.
31
 
32
+ > BioMedGPT-R1-17B is a multimodal biomedical reasoning model jointly released by PharMolix and Institute of AI Industry Research (AIR). It updates the language model in the last version with DeepSeek-R1-Distill-Qwen-14B and adopts two-stage training for cross-modal alignment and multimodal reasoning SFT, performing on par with commercial models on biomedical QA benchmarks.
33
 
34
+ - [2024/05/16] Released implementation of **LangCell** (📃[Paper](https://arxiv.org/abs/2405.06708), 💻[Code](https://github.com/PharMolix/LangCell), 🤖[Model](https://drive.google.com/drive/folders/1cuhVG9v0YoAnjW-t_WMpQQguajumCBTp?usp=sharing), 📎[Citation](#to-cite-langcell)).
35
 
36
+ > LangCell is the first "language-cell" multimodal pre-trained model jointly developed by PharMolix and Institute for AI Industry Research (AIR). It effectively enhances the understanding of single-cell transcriptomics by learning knowledge-rich texts containing cell identity information, and addresses the task of cell identity understanding in data-scarce scenarios. LangCell is the only single-cell model capable of effective zero-shot cell identity understanding and has also achieved SOTA in few-shot and fine-tuning scenarios. LangCell will soon be integrated into OpenBioMed.
37
+
38
+
39
+ - [2023/08/14] Released implementation of **BioMedGPT-10B** (📃[Paper](https://arxiv.org/abs/2308.09442v2), 🤖[Model](https://pan.baidu.com/s/1iAMBkuoZnNAylhopP5OgEg?pwd=7a6b#list/path=%2F), 📎[Citation](#to-cite-biomedgpt)), **BioMedGPT-LM-7B** (🤗[HuggingFace Model](https://huggingface.co/PharMolix/BioMedGPT-LM-7B)) and **DrugFM** (🤖[Model](https://pan.baidu.com/s/1iAMBkuoZnNAylhopP5OgEg?pwd=7a6b#list/path=%2F)).
40
+
41
+ > BioMedGPT-10B is the first commercial-friendly multimodal biomedical foundation model jointly released by PharMolix and Institute of AI Industry Research (AIR). It aligns the language of life (molecular structures and protein sequences) with human natural language, performing on par with human experts on biomedical QA benchmarks, and demonstrating powerful performance in cross-modal molecule and protein question answering tasks. BioMedGPT-LM-7B is the first commercial-friendly generative foundation model tailored for biomedicine based on Llama-2.
42
+
43
+ > DrugFM is a multi-modal molecular foundation model jointly developed by Institute of AI Industry Research (AIR) and Beijing Academy of Artificial Intelligence, BAAI. It leverages UniMAP, a pre-trained molecular model that captures fine-grained properties and representations of molecules, and incorporates MolFM, our multimodal molecular foundation model. DrugFM achieves SOTA on cross-modal retrieval.
44
+
45
+
46
+ - [2023/06/12] Released implementation of **MolFM** (📃[Paper](https://arxiv.org/abs/2307.09484), 🤖[Model](https://pan.baidu.com/s/1iAMBkuoZnNAylhopP5OgEg?pwd=7a6b#list/path=%2F), 📎[Citation](#to-cite-molfm)) and **CellLM** (📃[Paper](https://arxiv.org/abs/2306.04371), 🤖[Model](https://pan.baidu.com/s/1iAMBkuoZnNAylhopP5OgEg), 📎[Citation](#to-cite-celllm)).
47
+
48
+ > MolFM is a multi-modal molecular foundation model that enables joint comprehension of molecular structures, biomedical documents and knowledge graphs. On cross-modal retrieval, MolFM outperforms existing models by 12.03% and 5.04% under zero-shot and fine-tuning settings. MolFM also excels in molecule captioning, text-to-molecule generation and molecule property prediction.
49
+
50
+ > CellLM is the first large-scale cell representation learning model trained on both normal cells and cancer cells with divide-and-conquer contrastive learning. CellLM beats ScBERT on cell type annotation (71.8 vs 68.8), few-shot single-cell drug sensitivity prediction (88.9 vs 80.6) and single-omics cell line drug sensitivity prediction (93.4 vs 87.2).
51
+
52
+
53
+ - [2023/04/23] Released implementation of **BioMedGPT-1.6B** (🤖[Model](https://pan.baidu.com/s/1iAMBkuoZnNAylhopP5OgEg)) and **OpenBioMed**.
54
+
55
+
56
+ ## Table of contents
57
+
58
+
59
+ - [Introduction](#introduction)
60
+ - [Installation](#installation)
61
+ - [Tutorials](#tutorials)
62
+ - [Previous version](#previous-version)
63
+ - [Limitations](#limitations)
64
+ - [Cite us](#cite-us)
65
+
66
+
67
+ ## Introduction
68
+
69
+
70
+ This repository holds OpenBioMed, a Python deep learning toolkit for AI-empowered biomedicine. OpenBioMed provides **flexible APIs to handle multi-modal biomedical data**, including molecules, proteins, single cells, natural language, and knowledge graphs. OpenBioMed builds **20+ tools that covers a wide range of downstream applications**, ranging from traditional AI drug discovery tasks to newly-emerged multi-modal challenges. Moreover, OpenBioMed provides **an easy-to-use interface for building workflows** that connect multiple tools and developing LLM-driven agents for solving complicated biomedical research tasks.
71
+
72
+
73
+ OpenBioMed provides researchers with access to:
74
+
75
+
76
+ - **4 types of data modalities**: OpenBioMed provides easy-to-use APIs for researchers to access and process different types of data including molecules, proteins, pockets, and texts. New data structures for DNAs, RNAs, single cells, and knowledge graphs will be available in future versions.
77
+ - **20+ tools**, ranging from ML-based prediction models for AIDD tasks including molecule property prediction and protein folding, to visualization tools and web-search APIs.
78
+ - **20+ deep learning models**, comprising exclusive models such as [PharmolixFM](https://cloud.tsinghua.edu.cn/f/8f337ed5b58f45138659/), [BioMedGPT-R1](https://huggingface.co/PharMolix/BioMedGPT-R1), [BioMedGPT](https://ieeexplore.ieee.org/document/10767279/) and [MutaPLM](https://arxiv.org/abs/2410.22949).
79
+
80
+
81
+ Key features of OpenBioMed include:
82
+
83
+
84
+ - **Unified Data Processing Pipeline**: easily load and transform the heterogeneous data from different biomedical entities and modalities into a unified format.
85
+ - **Off-the-shelf Inference**: publicly available pre-trained models and inference demos, readily to be transferred to your own data or task.
86
+ - **Easy-to-use Interface for Building Workflows and LLM Agents**: flexibly build solutions for complicated research tasks with multi-tool collaborative workflows, and harvest LLMs for simulating trial-and-errors and gaining scientific insights.
87
+
88
+
89
+ Here is a list of currently available tools. This is a continuing effort and we are working on further growing the list.
90
+
91
+
92
+ | Tool | Supported Model | Description |
93
+ | :----------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
94
+ | Molecular Property Prediction | [GraphMVP](https://arxiv.org/abs/2110.07728) | Predicting the properties of a given molecule (e.g. blood-brain barrier penetration and side effects) |
95
+ | Molecule Question Answering | [BioT5](https://arxiv.org/abs/2310.07276) | Answering textual queries of a given molecule (e.g. structural descriptions, functional groups, number of hydrogen bond donors) |
96
+ | Molecule Visualization | N/A | Visualize a molecule |
97
+ | Molecule Name/ID Request | N/A | Obtaining a molecule from PubChem using its name or PubChemID |
98
+ | Molecule Structure Request | N/A | Obtaining a molecule from PubChem based on similar structures |
99
+ | Protein Question Answering | [BioT5](https://arxiv.org/abs/2310.07276) | Answering textual queries of a given protein (e.g. motifs, functions, subcellular location, related diseases) |
100
+ | Protein Folding | [ESMFold](https://www.science.org/doi/10.1126/science.ade2574) | Predicting the 3D structure of a protein based on its amino acid sequence |
101
+ | Protein Pocket Prediction | [P2Rank](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-018-0285-8) | Predicting potential binding sites within a protein |
102
+ | Mutation Explanation | [MutaPLM](https://arxiv.org/abs/2410.22949) | Providing textual explanations of a single-site substitution mutation on a protein sequence |
103
+ | Mutation Engineering | [MutaPLM](https://arxiv.org/abs/2410.22949) | Generating a mutated protein to fit the textual instructions on the wild-type protein sequence. |
104
+ | Protein UniProtID Request | N/A | Obtaining a protein sequence from UniProtKB based on UniProt accession ID |
105
+ | Protein PDB Request | N/A | Obtaining a protein structure from PDB/AlphaFoldDB based on PDB/AlphaFoldDB accession ID |
106
+ | Protein Visualization | N/A | Visualize a protein |
107
+ | Protein-molecule Rigid Docking | [PharmolixFM](https://cloud.tsinghua.edu.cn/f/8f337ed5b58f45138659/) | Generate the binding pose of the molecule with a given pocket in a protein |
108
+ | Structure-based Drug Design | [PharmolixFM](https://cloud.tsinghua.edu.cn/f/8f337ed5b58f45138659/), [MolCRAFT](https://github.com/AlgoMole/MolCRAFT) | Generate a molecule that binds with a given pocket in a protein |
109
+ | Complex Visualization | N/A | Visualize a protein-molecule complex |
110
+ | Pocket Visualization | N/A | Visualize a pocket within a protein |
111
+ | Web Request | N/A | Obtaining information by web search |
112
+
113
+ ## Installation
114
+
115
+ To enable basic features of OpenBioMed, please execute the following:
116
+
117
+
118
+ ```bash
119
+ conda create -n OpenBioMed python=3.9
120
+ conda activate OpenBioMed
121
+ pip install torch==1.13.1+{your_cuda_version} torchvision==0.14.1+{your_cuda_version} torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/{your_cuda_version}
122
+ pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-1.13.1+{your_cuda_version}.html
123
+ pip install pytorch_lightning==2.0.8 peft==0.9.0 accelerate==1.3.0 --no-deps -i https://pypi.tuna.tsinghua.edu.cn/simple
124
+ pip install -r requirements.txt
125
+ ```
126
+
127
+
128
+ We recommend using cuda=11.7 to set up the environment. Other versions of cudatoolkits may lead to unexpected problems.
129
+
130
+
131
+ To enable visualization tools and vina score computation tools, you should install the following packages:
132
+
133
+ ```
134
+ # For visualization
135
+ conda install -c conda-forge pymol-open-source
136
+ pip install imageio
137
+
138
+ # For AutoDockVina
139
+ pip install meeko==0.1.dev3 pdb2pqr vina==1.2.2
140
+ python -m pip install git+https://github.com/Valdes-Tresanco-MS/AutoDockTools_py3
141
+
142
+ # For PoseBusters
143
+ pip install posebusters==0.3.1
144
+
145
+ # For overlap-based evaluation
146
+ pip install spacy rouge_score nltk
147
+ python
148
+ >>> import nltk
149
+ >>> nltk.download('wordnet')
150
+ >>> nltk.download('omw-1.4')
151
+
152
+ # For LangCell
153
+ pip install geneformer
154
+ ```
155
+
156
+ After downloading the dependencies, you can run the following command to install the package and use our APIs more conveniently:
157
+
158
+ ```bash
159
+ pip install -e .
160
+ # Try using OpenBioMed APIs
161
+ python
162
+ >>> from open_biomed.data import Molecule
163
+ >>> molecule = Molecule(smiles="CC(=O)OC1=CC=CC=C1C(=O)O")
164
+ >>> print(molecule.calc_logp())
165
+ ```
166
+
167
+ ### Build Docker
168
+
169
+ Executing ./scripts/docker_run.sh directly will build the Docker image and run the container, launching the backend services on ports 8082 and 8083.
170
+ ```
171
+ sh ./scripts/docker_run.sh
172
+ ```
173
+ At the same time, we also provide a pre-built [docker image](https://hub.docker.com/repository/docker/youngking0727/openbiomed_server), which can be pulled and used directly.
174
+
175
+ ## Tutorials
176
+
177
+ Check out our [Jupyter notebooks](./examples/) for a quick start!
178
+
179
+ | Name | Description |
180
+ | ------------------------------------------------------------ | ------------------------------------------------------------ |
181
+ | [BioMedGPT Inference](./examples/biomedgpt_r1.ipynb) | Examples of using BioMedGPT-10B to answer questions about molecules and proteins and BioMedGPT-R1 to perform reasoning. |
182
+ | [Molecule Processing](./examples/manipulate_molecules.ipynb) | Examples of using OpenBioMed APIs to load, process, and export molecules and proteins. |
183
+ | [ML Tool Usage](./examples/explore_ai4s_tools.ipynb) | Examples of using machine learning tools to perform inference. |
184
+ | [Visualization](./examples/visualization.ipynb) | Examples of using OpenBioMed APIs to visualize molecules, proteins, complexes, and pockets. |
185
+ | [Workflow Construction](./examples/workflow.ipynb) | Examples of building and executing workflows and developing LLM agents for complicated scientific tasks. |
186
+ | [Model Customization](./examples/model_customization.ipynb) | Tutorials on how to customize your own model and data using OpenBioMed training pipelines. |
187
+
188
+ ## Previous Version
189
+
190
+ If you hope to use the features of the previous version, please switch to the `v1.0` branch of this repository by running the following command:
191
+
192
+ ```bash
193
+ git checkout v1.0
194
+ ```
195
+
196
+ ## Limitations
197
+
198
+ This repository holds BioMedGPT-LM-7B, BioMedGPT-10B, and BioMedGPT-R1, and we emphasize the responsible and ethical use of these models. BioMedGPT should NOT be used to provide services to the general public. Generating any content that violates applicable laws and regulations, such as inciting subversion of state power, endangering national security and interests, propagating terrorism, extremism, ethnic hatred and discrimination, violence, pornography, or false and harmful information, etc. is strictly prohibited. BioMedGPT is not liable for any consequences arising from any content, data, or information provided or published by users.
199
+
200
+ ## License
201
+
202
+ This repository is licensed under the [MIT License](./LICENSE). The use of BioMedGPT-LM-7B and BioMedGPT-10B models is accompanied with [Acceptable Use Policy](./USE_POLICY.md).
203
+
204
+ ## Contact Us
205
+
206
+ We are looking forward to user feedback to help us improve our framework. If you have any technical questions or suggestions, please feel free to open an issue. For commercial support or collaboration, please contact [opensource@pharmolix.com](mailto:opensource@pharmolix.com).
207
+
208
+
209
+ ## Cite Us
210
+
211
+ If you find our open-sourced code and models helpful to your research, please consider giving this repository a 🌟star and 📎citing our research papers. Thank you for your support!
212
+
213
+ ##### To cite OpenBioMed:
214
+
215
+ ```
216
+ @misc{OpenBioMed_code,
217
+ author={Luo, Yizhen and Yang, Kai and Fan, Siqi and Hong, Massimo and Zhao, Suyuan and Chen, Xinrui and Nie, Zikun and Luo, Wen and Xie, Ailin and Liu, Xing Yi and Zhang, Jiahuan and Wu, Yushuai and Nie, Zaiqing},
218
+ title={Code of OpenBioMed},
219
+ year={2023},
220
+ howpublished={\url{https://github.com/Pharmolix/OpenBioMed.git}}
221
+ }
222
+ ```
223
+
224
+ ##### To cite BioMedGPT:
225
+
226
+ ```
227
+ @article{luo2024biomedgpt,
228
+ title={Biomedgpt: An open multimodal large language model for biomedicine},
229
+ author={Luo, Yizhen and Zhang, Jiahuan and Fan, Siqi and Yang, Kai and Hong, Massimo and Wu, Yushuai and Qiao, Mu and Nie, Zaiqing},
230
+ journal={IEEE Journal of Biomedical and Health Informatics},
231
+ year={2024},
232
+ publisher={IEEE}
233
+ }
234
+ ```
235
+
236
+ ##### To cite PharmolixFM:
237
+
238
+ @article{luo2025pharmolixfm,
239
+ title={PharMolixFM: All-Atom Foundation Models for Molecular Modeling and Generation},
240
+ author={Luo, Yizhen and Wang, Jiashuo and Fan, Siqi and Nie, Zaiqing},
241
+ journal={arXiv preprint arXiv:2503.21788},
242
+ year={2025}
243
+ }
244
+
245
+ ##### To cite MolFM:
246
+
247
+ ```
248
+ @misc{luo2023molfm,
249
+ title={MolFM: A Multimodal Molecular Foundation Model},
250
+ author={Yizhen Luo and Kai Yang and Massimo Hong and Xing Yi Liu and Zaiqing Nie},
251
+ year={2023},
252
+ eprint={2307.09484},
253
+ archivePrefix={arXiv},
254
+ primaryClass={q-bio.BM}
255
+ }
256
+ ```
257
+
258
+ ##### To cite LangCell:
259
+
260
+ ```
261
+ @misc{zhao2024langcell,
262
+ title={LangCell: Language-Cell Pre-training for Cell Identity Understanding},
263
+ author={Suyuan Zhao and Jiahuan Zhang and Yizhen Luo and Yushuai Wu and Zaiqing Nie},
264
+ year={2024},
265
+ eprint={2405.06708},
266
+ archivePrefix={arXiv},
267
+ primaryClass={q-bio.GN}
268
+ }
269
+ ```
270
+
271
+ ##### To cite MutaPLM:
272
+
273
+ ```
274
+ @article{luo2025mutaplm,
275
+ title={MutaPLM: Protein Language Modeling for Mutation Explanation and Engineering},
276
+ author={Luo, Yizhen and Nie, Zikun and Hong, Massimo and Zhao, Suyuan and Zhou, Hao and Nie, Zaiqing},
277
+ journal={Advances in Neural Information Processing Systems},
278
+ volume={37},
279
+ pages={79783--79818},
280
+ year={2025}
281
+ }
282
+ ```
USE_POLICY.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## BioMedGPT Acceptable Use Policy
2
+
3
+ BioMedGPT is only for internal use by registered users. You agree and acknowledge that you will use BioMedGPT solely for internal use purposes and undertake not to use it, directly or indirectly, to provide services to the general public within the territory of the PRC. Otherwise, you will be liable for all the damages caused to BioMedGPT.
4
+
5
+ You have the right to use BioMedGPT pursuant to relevant agreements, but you cannot engage in any unlawful activities or disturb the orderly operation of BioMedGPT. You are not allowed to generate any content through BioMedGPT or induce it to output any speech containing the following contents, or we will block or delete the information in accordance with the applicable laws and regulations and report the matter to the relevant authorities:
6
+
7
+ 1. inciting to resist or undermine the implementation of the Constitution, laws and administrative regulations;
8
+ 2. inciting to subvert the state power and the overthrow of the political system;
9
+ 3. inciting to separate the state or undermine the unity of the country;
10
+ 4. inciting national enmity or discrimination, undermine the unity of nations;
11
+ 5. content involving discrimination on the basis of race, sex, religion, geographical content, etc.;
12
+ 6. fabricating or distorting facts, spreading disinformation, or disturbing the public order;
13
+ 7. propagating heretical teachings or feudal superstitions, disseminating obscenity, pornography, gambling, violence, homicide, terror or instigating others to commit crimes;
14
+ 8. publicly humiliating others, inventing stories to defame others, or committing other malicious attacks;
15
+ 9. harming the credibility of state organs;
16
+ 10. violating the public interest or public morality or not suitable for publication on BioMedGPT in accordance with the provisions of the relevant BioMedGPT agreements and rules;
17
+ 11. violating the Constitution, laws and administrative regulations.
18
+
19
+ You fully understand and acknowledge that you are responsible for all your activities and consequences that occur in using the BioMedGPT services, including any content, data or information you provide or publish. BioMedGPT will not be responsible for any losses thereof.
bioflow/__init__.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow - Multimodal Biological Intelligence Framework
3
+ ========================================================
4
+
5
+ A modular, open-source platform for biological discovery integrating:
6
+ - Multimodal encoders (text, molecules, proteins, images)
7
+ - Vector database memory (Qdrant)
8
+ - Prediction tools (DTI, ADMET)
9
+ - Workflow orchestration
10
+
11
+ Core Modules:
12
+ - core: Abstract interfaces, registry, and orchestrator
13
+ - plugins: Tool implementations (OBM, DeepPurpose, etc.)
14
+ - workflows: YAML-based pipeline definitions
15
+
16
+ Open-Source Models Supported:
17
+ - Text: PubMedBERT, SciBERT, Specter
18
+ - Molecules: ChemBERTa, RDKit FP
19
+ - Proteins: ESM-2, ProtBERT
20
+ - Images: CLIP, BioMedCLIP
21
+ """
22
+
23
+ __version__ = "0.2.0"
24
+ __author__ = "BioFlow Team"
25
+
26
+ # Core abstractions
27
+ from bioflow.core import (
28
+ Modality,
29
+ BioEncoder,
30
+ BioPredictor,
31
+ BioGenerator,
32
+ BioRetriever,
33
+ ToolRegistry,
34
+ BioFlowOrchestrator,
35
+ WorkflowConfig,
36
+ NodeConfig,
37
+ )
38
+
39
+ # Legacy imports (for backward compatibility)
40
+ try:
41
+ from bioflow.obm_wrapper import OBMWrapper
42
+ from bioflow.qdrant_manager import QdrantManager
43
+ except ImportError:
44
+ OBMWrapper = None
45
+ QdrantManager = None
46
+
47
+ __all__ = [
48
+ # Core
49
+ "Modality",
50
+ "BioEncoder",
51
+ "BioPredictor",
52
+ "BioGenerator",
53
+ "BioRetriever",
54
+ "ToolRegistry",
55
+ "BioFlowOrchestrator",
56
+ "WorkflowConfig",
57
+ "NodeConfig",
58
+ # Wrappers
59
+ "OBMWrapper",
60
+ "QdrantManager",
61
+ ]
bioflow/api/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow API
3
+ ============
4
+ FastAPI backend bridging the Next.js UI with OpenBioMed core.
5
+ """
6
+
7
+ __version__ = "2.0.0"
bioflow/api/dti_predictor.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow DTI Predictor
3
+ ======================
4
+ Drug-Target Interaction prediction using DeepPurpose.
5
+ Integrated from lacoste001/deeppurpose002.py for the hackathon.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import logging
11
+ import numpy as np
12
+ import pandas as pd
13
+ from typing import Any, Dict, List, Optional, Tuple
14
+ from dataclasses import dataclass, field
15
+ from datetime import datetime
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ # ============================================================================
21
+ # Data Classes
22
+ # ============================================================================
23
@dataclass
class DTIPrediction:
    """Outcome of a single drug-target interaction prediction."""
    drug_smiles: str
    target_sequence: str
    binding_affinity: float  # pKd or similar
    confidence: float
    model_name: str
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-friendly dict, truncating long target sequences."""
        sequence = self.target_sequence
        if len(sequence) > 50:
            sequence = sequence[:50] + "..."
        return {
            "drug_smiles": self.drug_smiles,
            "target_sequence": sequence,
            "binding_affinity": self.binding_affinity,
            "confidence": self.confidence,
            "model_name": self.model_name,
            "metadata": self.metadata,
        }
42
+
43
+
44
@dataclass
class DTIMetrics:
    """Regression metrics produced by evaluating a DTI model."""
    mse: float
    rmse: float
    mae: float
    pearson: float
    spearman: float
    concordance_index: float

    def to_dict(self) -> Dict[str, float]:
        """Return every metric as a plain name -> value mapping."""
        metric_names = (
            "mse",
            "rmse",
            "mae",
            "pearson",
            "spearman",
            "concordance_index",
        )
        return {name: getattr(self, name) for name in metric_names}
63
+
64
+
65
+ # ============================================================================
66
+ # Metric Functions (from deeppurpose002.py)
67
+ # ============================================================================
68
def mse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Mean Squared Error between two (flattened) value arrays."""
    diff = np.asarray(y_true, dtype=float).ravel() - np.asarray(y_pred, dtype=float).ravel()
    return float(np.mean(diff * diff))
73
+
74
+
75
def mae(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Mean Absolute Error between two (flattened) value arrays."""
    err = np.asarray(y_true, dtype=float).ravel() - np.asarray(y_pred, dtype=float).ravel()
    return float(np.abs(err).mean())
80
+
81
+
82
def pearson(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Pearson correlation coefficient; NaN when undefined (n < 2 or zero variance)."""
    a = np.asarray(y_true, dtype=float).ravel()
    b = np.asarray(y_pred, dtype=float).ravel()
    degenerate = a.size < 2 or np.std(a) == 0 or np.std(b) == 0
    if degenerate:
        return float("nan")
    return float(np.corrcoef(a, b)[0, 1])
89
+
90
+
91
def spearman(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Spearman rank correlation: Pearson correlation of the average ranks."""
    def _ranks(values) -> np.ndarray:
        # Average rank handles ties the same way as the original implementation.
        flat = np.asarray(values, dtype=float).ravel()
        return pd.Series(flat).rank(method="average").to_numpy()

    return pearson(_ranks(y_true), _ranks(y_pred))
96
+
97
+
98
def concordance_index(y_true: np.ndarray, y_pred: np.ndarray, max_n: int = 2000, seed: int = 0) -> float:
    """
    Concordance Index (CI): the fraction of comparable pairs whose predicted
    ordering agrees with the true ordering; prediction ties earn 0.5 credit.

    For inputs larger than ``max_n`` a seeded random subsample is scored
    instead, bounding the quadratic pair count.

    Args:
        y_true: Ground-truth values (any shape; flattened).
        y_pred: Predicted values (any shape; flattened).
        max_n: Maximum number of samples scored exactly before subsampling.
        seed: RNG seed for the subsampling path (deterministic results).

    Returns:
        CI in [0, 1], or NaN when there are fewer than 2 samples or no
        comparable pairs (all true values equal).
    """
    y_true = np.asarray(y_true, dtype=float).reshape(-1)
    y_pred = np.asarray(y_pred, dtype=float).reshape(-1)
    n = len(y_true)

    if n < 2:
        return float("nan")

    # Sample if too large, keeping the O(n^2) pair count bounded.
    if n > max_n:
        rng = np.random.default_rng(seed)
        idx = rng.choice(n, size=max_n, replace=False)
        y_true = y_true[idx]
        y_pred = y_pred[idx]
        n = max_n

    # Vectorized upper-triangle pairwise differences replace the original
    # pure-Python double loop: identical results at C speed.
    iu, ju = np.triu_indices(n, k=1)
    dt = y_true[iu] - y_true[ju]
    dp = y_pred[iu] - y_pred[ju]

    comparable = dt != 0  # pairs with equal true values are not counted
    total = int(comparable.sum())
    if total == 0:
        return float("nan")

    prod = dt[comparable] * dp[comparable]
    conc = float(np.count_nonzero(prod > 0)) + 0.5 * float(np.count_nonzero(prod == 0))
    return float(conc / total)
136
+
137
+
138
+ # ============================================================================
139
+ # DeepPurpose Predictor Class
140
+ # ============================================================================
141
class DeepPurposePredictor:
    """
    Drug-Target Interaction predictor backed by DeepPurpose.

    Supports multiple encoding strategies:
    - Drug: Morgan, CNN, Transformer, MPNN
    - Target: CNN, Transformer, AAC

    When DeepPurpose is not installed (or loading/prediction fails), the
    predictor transparently falls back to a deterministic mock so the
    surrounding API remains usable.

    Example:
        >>> predictor = DeepPurposePredictor()
        >>> result = predictor.predict("CCO", "MKTVRQERLKSIVRILERSKEPVSG")
        >>> print(result.binding_affinity)
    """

    def __init__(
        self,
        drug_encoding: str = "Morgan",
        target_encoding: str = "CNN",
        model_path: Optional[str] = None,
        device: str = "cpu",
    ):
        """
        Args:
            drug_encoding: Featurization used for drug SMILES strings.
            target_encoding: Featurization used for protein sequences.
            model_path: Optional path to a pre-trained DeepPurpose model.
            device: Torch device string ("cpu" or "cuda").
        """
        self.drug_encoding = drug_encoding
        self.target_encoding = target_encoding
        self.model_path = model_path
        self.device = device
        self.model = None
        self._loaded = False
        # Guards predict() against re-running the (potentially expensive)
        # import/load attempt on every call after it has already failed.
        self._load_attempted = False

    @staticmethod
    def _utc_timestamp() -> str:
        """Timezone-aware ISO-8601 UTC timestamp (datetime.utcnow() is deprecated)."""
        from datetime import datetime, timezone
        return datetime.now(timezone.utc).isoformat()

    def load_model(self) -> bool:
        """Load the DeepPurpose model.

        Returns:
            True if a model was loaded or initialized; False when DeepPurpose
            is unavailable or loading fails (predictions then fall back to
            the deterministic mock).
        """
        self._load_attempted = True
        try:
            from DeepPurpose import DTI as dp_models
            from DeepPurpose import utils

            if self.model_path and os.path.exists(self.model_path):
                # Load pre-trained model weights from disk.
                self.model = dp_models.model_pretrained(self.model_path)
                logger.info("Loaded model from %s", self.model_path)
            else:
                # Initialize new model (for inference with pre-trained weights)
                config = utils.generate_config(
                    drug_encoding=self.drug_encoding,
                    target_encoding=self.target_encoding,
                    cls_hidden_dims=[1024, 1024, 512],
                )
                self.model = dp_models.model_initialize(**config)
                logger.info(
                    "Initialized new model: %s-%s",
                    self.drug_encoding,
                    self.target_encoding,
                )

            self._loaded = True
            return True

        except ImportError:
            logger.warning("DeepPurpose not installed. Using mock predictions.")
            self._loaded = False
            return False
        except Exception as e:
            logger.error("Failed to load model: %s", e)
            self._loaded = False
            return False

    def predict(self, drug_smiles: str, target_sequence: str) -> DTIPrediction:
        """
        Predict binding affinity between drug and target.

        Args:
            drug_smiles: SMILES string of the drug molecule.
            target_sequence: Amino acid sequence of the target protein.

        Returns:
            DTIPrediction with binding affinity and confidence; a mock
            prediction when the real model is unavailable or errors out.
        """
        # Only try loading once; a failed attempt would otherwise be
        # repeated (import + init) on every single prediction call.
        if not self._loaded and not self._load_attempted:
            self.load_model()

        if self.model is not None:
            try:
                from DeepPurpose import utils

                # DeepPurpose expects parallel drug/target/label lists; the
                # label is a dummy value, unused at inference time.
                data = utils.data_process(
                    [drug_smiles], [target_sequence], [0],
                    drug_encoding=self.drug_encoding,
                    target_encoding=self.target_encoding,
                    split_method="no_split",
                )

                pred = self.model.predict(data)
                affinity = float(pred[0]) if len(pred) > 0 else 0.0

                return DTIPrediction(
                    drug_smiles=drug_smiles,
                    target_sequence=target_sequence,
                    binding_affinity=affinity,
                    confidence=0.85,  # TODO: Implement uncertainty estimation
                    model_name=f"DeepPurpose-{self.drug_encoding}-{self.target_encoding}",
                    metadata={
                        "timestamp": self._utc_timestamp(),
                        "device": self.device,
                    }
                )

            except Exception as e:
                logger.error("Prediction failed: %s", e)

        # Fallback: deterministic mock prediction.
        return self._mock_predict(drug_smiles, target_sequence)

    def _mock_predict(self, drug_smiles: str, target_sequence: str) -> DTIPrediction:
        """Generate a deterministic mock prediction when the model is unavailable."""
        import hashlib

        # Deterministic "prediction" based on input hash.
        hash_input = f"{drug_smiles}:{target_sequence}"
        hash_val = int(hashlib.md5(hash_input.encode()).hexdigest()[:8], 16)

        # Generate realistic-looking pKd value (typically 4-10).
        affinity = 4.0 + (hash_val % 6000) / 1000.0
        confidence = 0.7 + (hash_val % 300) / 1000.0

        return DTIPrediction(
            drug_smiles=drug_smiles,
            target_sequence=target_sequence,
            binding_affinity=round(affinity, 3),
            confidence=round(confidence, 3),
            model_name="Mock-Predictor",
            metadata={
                "timestamp": self._utc_timestamp(),
                "note": "Mock prediction - DeepPurpose not loaded",
            }
        )

    def batch_predict(
        self,
        drug_target_pairs: List[Tuple[str, str]],
    ) -> List[DTIPrediction]:
        """Predict for multiple (drug SMILES, target sequence) pairs."""
        return [self.predict(d, t) for d, t in drug_target_pairs]

    def evaluate(
        self,
        y_true: np.ndarray,
        y_pred: np.ndarray,
    ) -> DTIMetrics:
        """Evaluate predictions against ground truth using the module metrics."""
        import math

        m_mse = mse(y_true, y_pred)

        return DTIMetrics(
            mse=m_mse,
            rmse=math.sqrt(m_mse),
            mae=mae(y_true, y_pred),
            pearson=pearson(y_true, y_pred),
            spearman=spearman(y_true, y_pred),
            concordance_index=concordance_index(y_true, y_pred),
        )
302
+
303
+
304
+ # ============================================================================
305
+ # Factory function
306
+ # ============================================================================
307
def get_dti_predictor(
    drug_encoding: str = "Morgan",
    target_encoding: str = "CNN",
    model_path: Optional[str] = None,
) -> DeepPurposePredictor:
    """Factory returning a configured DTI predictor (model is loaded lazily)."""
    return DeepPurposePredictor(
        drug_encoding=drug_encoding,
        target_encoding=target_encoding,
        model_path=model_path,
    )
319
+
320
+
321
+ # ============================================================================
322
+ # CLI for standalone usage
323
+ # ============================================================================
324
if __name__ == "__main__":
    import argparse

    # Standalone CLI entry point for a one-off DTI prediction.
    cli = argparse.ArgumentParser(description="DTI Prediction")
    cli.add_argument("--drug", required=True, help="Drug SMILES")
    cli.add_argument("--target", required=True, help="Target protein sequence")
    cli.add_argument("--drug-enc", default="Morgan", help="Drug encoding method")
    cli.add_argument("--target-enc", default="CNN", help="Target encoding method")
    opts = cli.parse_args()

    predictor = get_dti_predictor(opts.drug_enc, opts.target_enc)
    result = predictor.predict(opts.drug, opts.target)

    banner = "=" * 60
    print(f"\n{banner}")
    print("DTI Prediction Result")
    print(banner)
    print(f"Drug: {result.drug_smiles}")
    print(f"Target: {result.target_sequence}")
    print(f"Binding Affinity (pKd): {result.binding_affinity:.3f}")
    print(f"Confidence: {result.confidence:.3f}")
    print(f"Model: {result.model_name}")
    print(f"{banner}\n")
bioflow/api/requirements.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # BioFlow API Requirements
2
+ # ========================
3
+
4
+ # Core
5
+ fastapi>=0.109.0
6
+ uvicorn[standard]>=0.27.0
7
+ pydantic>=2.5.0
8
+
9
+ # Async
10
+ httpx>=0.26.0
11
+ aiofiles>=23.2.0
12
+
13
+ # Data
14
+ numpy>=1.24.0
15
+ pandas>=2.0.0
16
+
17
+ # DeepPurpose (DTI prediction)
18
+ DeepPurpose>=0.1.4
19
+ torch>=2.0.0
20
+
21
+ # TDC - Therapeutics Data Commons
22
+ PyTDC>=0.4.0
23
+
24
+ # Vector DB
25
+ qdrant-client>=1.7.0
26
+
27
+ # Utilities
28
+ python-multipart>=0.0.6
29
+ python-dotenv>=1.0.0
bioflow/api/server.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow API - Main Server
3
+ ==========================
4
+ FastAPI application serving the Next.js frontend.
5
+ Endpoints for discovery, prediction, and data management.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import uuid
11
+ import logging
12
+ from datetime import datetime
13
+ from typing import Any, Dict, List, Optional
14
+ from contextlib import asynccontextmanager
15
+
16
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
17
+ from fastapi.middleware.cors import CORSMiddleware
18
+ from pydantic import BaseModel, Field
19
+
20
+ # Add project root to path
21
+ ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
22
+ sys.path.insert(0, ROOT_DIR)
23
+
24
+ logging.basicConfig(level=logging.INFO)
25
+ logger = logging.getLogger(__name__)
26
+
27
# ============================================================================
# In-Memory Job Store (replace with Redis/DB in production)
# ============================================================================
# Maps job_id -> mutable job-state dict (status, progress, current_step,
# result, error, created_at/updated_at). NOTE(review): process-local and
# unbounded — completed jobs are never evicted.
JOBS: Dict[str, Dict[str, Any]] = {}
31
+
32
+
33
+ # ============================================================================
34
+ # Pydantic Models
35
+ # ============================================================================
36
class DiscoveryRequest(BaseModel):
    """Request body for starting the async drug discovery pipeline."""
    # Free-form input: a SMILES string, FASTA sequence, or natural-language query.
    query: str = Field(..., description="SMILES, FASTA, or natural language query")
    search_type: str = Field(default="similarity", description="similarity | binding | properties")
    database: str = Field(default="all", description="Target database")
    # Result cap; pydantic enforces the 1..100 range at validation time.
    limit: int = Field(default=10, ge=1, le=100)
42
+
43
+
44
class PredictRequest(BaseModel):
    """Request body for a single drug-target interaction prediction."""
    drug_smiles: str = Field(..., description="SMILES string of drug")
    target_sequence: str = Field(..., description="Protein sequence (FASTA)")
48
+
49
+
50
class IngestRequest(BaseModel):
    """Request body for ingesting one item into the vector DB."""
    # Raw payload to be embedded (interpretation depends on `modality`).
    content: str
    modality: str = Field(default="smiles", description="smiles | protein | text")
    # Optional free-form payload stored alongside the vector.
    metadata: Optional[Dict[str, Any]] = None
55
+
56
+
57
class JobStatus(BaseModel):
    """Status snapshot of an async job (mirrors entries in JOBS)."""
    job_id: str
    status: str  # pending | running | completed | failed
    # Percentage 0-100.
    progress: int = 0
    result: Optional[Dict[str, Any]] = None
    error: Optional[str] = None
    # ISO-8601 timestamps (UTC).
    created_at: str
    updated_at: str
66
+
67
+
68
class HealthResponse(BaseModel):
    """Health check response returned by `/` and `/health`."""
    status: str
    version: str
    # ISO-8601 UTC timestamp of when the check ran.
    timestamp: str
73
+
74
+
75
+ # ============================================================================
76
+ # Lifespan (startup/shutdown)
77
+ # ============================================================================
78
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialize resources on startup, cleanup on shutdown.

    Everything before ``yield`` runs once when the server starts;
    everything after it runs once at shutdown.
    """
    logger.info("🚀 BioFlow API starting up...")
    # TODO: Initialize Qdrant connection, load models
    yield
    logger.info("🛑 BioFlow API shutting down...")
85
+
86
+
87
+ # ============================================================================
88
+ # FastAPI App
89
+ # ============================================================================
90
# Application instance; `lifespan` hooks startup/shutdown above.
app = FastAPI(
    title="BioFlow API",
    description="AI-Powered Drug Discovery Platform API",
    version="2.0.0",
    lifespan=lifespan,
)

# CORS for Next.js frontend
# Only the local dev frontends (ports 3000/3001) may call this API with
# credentials; methods and headers are unrestricted for dev convenience.
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        "http://localhost:3000",
        "http://127.0.0.1:3000",
        "http://localhost:3001",
    ],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
109
+
110
+
111
+ # ============================================================================
112
+ # Health & Info
113
+ # ============================================================================
114
@app.get("/", response_model=HealthResponse)
async def root():
    """Health check endpoint (service root)."""
    payload = {
        "status": "healthy",
        "version": "2.0.0",
        "timestamp": datetime.utcnow().isoformat(),
    }
    return HealthResponse(**payload)
122
+
123
+
124
@app.get("/health", response_model=HealthResponse)
async def health():
    """Health check endpoint (conventional /health path)."""
    payload = {
        "status": "healthy",
        "version": "2.0.0",
        "timestamp": datetime.utcnow().isoformat(),
    }
    return HealthResponse(**payload)
132
+
133
+
134
+ # ============================================================================
135
+ # Discovery Pipeline
136
+ # ============================================================================
137
def run_discovery_pipeline(job_id: str, request: DiscoveryRequest):
    """Background task driving the (mock) discovery pipeline.

    Advances the JOBS entry for `job_id` through encode -> search -> predict
    -> complete, sleeping between stages as a stand-in for real work. On any
    exception the job is marked failed and the error message recorded.

    Args:
        job_id: Key of an existing entry in the module-level JOBS store.
        request: The validated discovery request (query, search_type, ...).
    """
    import time

    job = JOBS[job_id]

    def _touch() -> None:
        # Refresh the job's last-modified timestamp after each mutation.
        job["updated_at"] = datetime.utcnow().isoformat()

    try:
        job["status"] = "running"
        _touch()

        # Mock pipeline stages: (progress %, step name).
        # TODO: replace the sleeps with actual encoding, vector search, and
        # DTI prediction respectively.
        for progress, step in ((25, "encode"), (50, "search"), (75, "predict")):
            job["progress"] = progress
            job["current_step"] = step
            _touch()
            time.sleep(1)

        job["progress"] = 100
        job["current_step"] = "complete"
        job["status"] = "completed"
        job["result"] = {
            "candidates": [
                {"name": "Candidate A", "smiles": "CCO", "score": 0.95, "mw": 342.4, "logp": 2.1},
                {"name": "Candidate B", "smiles": "CC(=O)O", "score": 0.89, "mw": 298.3, "logp": 1.8},
                {"name": "Candidate C", "smiles": "c1ccccc1", "score": 0.82, "mw": 415.5, "logp": 3.2},
            ],
            "query": request.query,
            "search_type": request.search_type,
        }
        _touch()

    except Exception as e:
        job["status"] = "failed"
        job["error"] = str(e)
        _touch()
        # logger.exception preserves the traceback (logger.error dropped it).
        logger.exception("Discovery pipeline failed: %s", e)
180
+
181
+
182
@app.post("/api/discovery")
async def start_discovery(request: DiscoveryRequest, background_tasks: BackgroundTasks):
    """Start a discovery pipeline job; returns immediately with a job_id."""
    job_id = f"disc_{uuid.uuid4().hex[:12]}"
    stamp = datetime.utcnow().isoformat()

    # Seed the job record; the background task mutates it as it progresses.
    JOBS[job_id] = dict(
        job_id=job_id,
        status="pending",
        progress=0,
        current_step="queued",
        result=None,
        error=None,
        created_at=stamp,
        updated_at=stamp,
        request=request.model_dump(),
    )

    background_tasks.add_task(run_discovery_pipeline, job_id, request)

    return {
        "success": True,
        "job_id": job_id,
        "status": "pending",
        "message": "Discovery pipeline started",
    }
208
+
209
+
210
@app.get("/api/discovery/{job_id}")
async def get_discovery_status(job_id: str):
    """Return the live status dict of a discovery job, or 404 if unknown."""
    job = JOBS.get(job_id)
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")
    return job
216
+
217
+
218
# ============================================================================
# DTI Prediction
# ============================================================================
# Import the predictor
from bioflow.api.dti_predictor import get_dti_predictor, DeepPurposePredictor

# Global predictor instance (lazy loaded)
_dti_predictor: Optional[DeepPurposePredictor] = None

def get_predictor() -> DeepPurposePredictor:
    """Get or create the module-wide DTI predictor instance.

    Lazily constructs a single DeepPurposePredictor on first use and reuses
    it for all subsequent requests. NOTE(review): not guarded by a lock —
    concurrent first requests could build the predictor twice; confirm this
    is acceptable for the deployment model.
    """
    global _dti_predictor
    if _dti_predictor is None:
        _dti_predictor = get_dti_predictor()
    return _dti_predictor
233
+
234
+
235
@app.post("/api/predict")
async def predict_dti(request: PredictRequest):
    """
    Predict drug-target interaction.
    Uses DeepPurpose under the hood.
    """
    try:
        outcome = get_predictor().predict(request.drug_smiles, request.target_sequence)

        prediction = {
            "drug_smiles": outcome.drug_smiles,
            "target_sequence": outcome.target_sequence,
            "binding_affinity": outcome.binding_affinity,
            "confidence": outcome.confidence,
            "interaction_probability": min(outcome.confidence + 0.05, 1.0),
        }
        metadata = {
            "model": outcome.model_name,
            "timestamp": datetime.utcnow().isoformat(),
        }
        metadata.update(outcome.metadata)
        return {"success": True, "prediction": prediction, "metadata": metadata}

    except Exception as e:
        logger.error(f"Prediction failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
264
+
265
+
266
+ # ============================================================================
267
+ # Data Management
268
+ # ============================================================================
269
@app.post("/api/ingest")
async def ingest_data(request: IngestRequest):
    """Ingest one item into the vector database (currently a stub)."""
    try:
        # TODO: Integrate with Qdrant via bioflow.qdrant_manager
        new_id = f"doc_{uuid.uuid4().hex[:12]}"
        response = {
            "success": True,
            "id": new_id,
            "modality": request.modality,
            "message": "Data ingested successfully",
        }
    except Exception as e:
        logger.error(f"Ingest failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
    return response
286
+
287
+
288
@app.get("/api/molecules")
async def list_molecules(limit: int = 20, offset: int = 0):
    """List molecules in the database.

    Args:
        limit: Maximum number of records to return.
        offset: Number of records to skip from the start.

    Returns:
        Dict with the requested page, the full count, and the paging echo.
    """
    # TODO: Query from Qdrant
    mock_molecules = [
        {"id": "mol_001", "smiles": "CCO", "name": "Ethanol", "mw": 46.07},
        {"id": "mol_002", "smiles": "CC(=O)O", "name": "Acetic Acid", "mw": 60.05},
        {"id": "mol_003", "smiles": "c1ccccc1", "name": "Benzene", "mw": 78.11},
    ]
    # Fix: the original ignored limit/offset entirely; apply them so the
    # endpoint paginates correctly once real data is plugged in. With the
    # defaults (limit=20, offset=0) the response is unchanged.
    page = mock_molecules[offset:offset + limit]
    return {
        "molecules": page,
        "total": len(mock_molecules),
        "limit": limit,
        "offset": offset,
    }
303
+
304
+
305
@app.get("/api/proteins")
async def list_proteins(limit: int = 20, offset: int = 0):
    """List proteins in the database.

    Args:
        limit: Maximum number of records to return.
        offset: Number of records to skip from the start.

    Returns:
        Dict with the requested page, the full count, and the paging echo.
    """
    # TODO: Query from Qdrant
    mock_proteins = [
        {"id": "prot_001", "uniprot_id": "P00533", "name": "EGFR", "length": 1210},
        {"id": "prot_002", "uniprot_id": "P04637", "name": "p53", "length": 393},
        {"id": "prot_003", "uniprot_id": "P38398", "name": "BRCA1", "length": 1863},
    ]
    # Fix: the original ignored limit/offset; apply them (same fix as
    # /api/molecules). Defaults leave the response unchanged.
    page = mock_proteins[offset:offset + limit]
    return {
        "proteins": page,
        "total": len(mock_proteins),
        "limit": limit,
        "offset": offset,
    }
320
+
321
+
322
+ # ============================================================================
323
+ # Explorer (Embeddings)
324
+ # ============================================================================
325
@app.get("/api/explorer/embeddings")
async def get_embeddings(dataset: str = "default", method: str = "umap"):
    """Get 2D projections of embeddings for visualization.

    Returns 100 mock points jittered around 4 cluster centres; output is
    deterministic (fixed seed) and identical on every call.
    """
    import random

    # TODO: Get actual embeddings from Qdrant and project
    # Fix: use a private Random instance instead of random.seed(42), which
    # re-seeded the process-wide RNG on every request and could perturb any
    # other code using the global `random` module. A seeded Random(42)
    # yields the same gauss() sequence, so the response is unchanged.
    rng = random.Random(42)

    centers = [(2, 3), (-2, -1), (4, -2), (-1, 4)]
    points = []
    for i in range(100):
        cluster = i % 4
        cx, cy = centers[cluster]
        points.append({
            "id": f"mol_{i:03d}",
            "x": cx + rng.gauss(0, 0.8),
            "y": cy + rng.gauss(0, 0.8),
            "cluster": cluster,
            "label": f"Molecule {i}",
        })

    return {
        "points": points,
        "method": method,
        "dataset": dataset,
        "n_clusters": 4,
    }
352
+
353
+
354
# ============================================================================
# Run with: uvicorn bioflow.api.server:app --reload --port 8000
# ============================================================================
if __name__ == "__main__":
    import uvicorn
    # Dev convenience entry point; for production prefer the uvicorn CLI
    # invocation shown in the comment above (with workers/reload as needed).
    uvicorn.run(app, host="0.0.0.0", port=8000)
bioflow/app.py ADDED
@@ -0,0 +1,570 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow Explorer - Streamlit Interface
3
+ =======================================
4
+
5
+ Interactive web interface for testing and exploring the BioFlow
6
+ multimodal biological intelligence system.
7
+
8
+ Run with: streamlit run bioflow/app.py
9
+ """
10
+
11
+ import streamlit as st
12
+ import numpy as np
13
+ import pandas as pd
14
+ from typing import List, Dict, Any
15
+ import json
16
+ import os
17
+ import sys
18
+
19
+ # Add project root to path
20
+ ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
21
+ sys.path.insert(0, ROOT_DIR)
22
+
23
# Page config
# Must be the first Streamlit call in the script (Streamlit requirement).
st.set_page_config(
    page_title="BioFlow Explorer",
    page_icon="🧬",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS
# Injected once per rerun; defines the gradient header, card styles, and
# per-modality colors used by the page renderers below.
st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: bold;
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        margin-bottom: 1rem;
    }
    .metric-card {
        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 0.5rem 0;
    }
    .result-card {
        border: 1px solid #ddd;
        border-radius: 0.5rem;
        padding: 1rem;
        margin: 0.5rem 0;
        background: white;
    }
    .modality-text { color: #3b82f6; }
    .modality-molecule { color: #10b981; }
    .modality-protein { color: #f59e0b; }
</style>
""", unsafe_allow_html=True)
60
+
61
+
62
@st.cache_resource
def init_bioflow(use_mock: bool = True):
    """Build and cache the BioFlow component bundle (one per use_mock value)."""
    try:
        from bioflow.obm_wrapper import OBMWrapper
        from bioflow.qdrant_manager import QdrantManager
        from bioflow.pipeline import BioFlowPipeline, MinerAgent, ValidatorAgent

        encoder = OBMWrapper(use_mock=use_mock)
        store = QdrantManager(encoder, qdrant_path=None)  # In-memory
        store.create_collection("bioflow_demo", recreate=True)

        flow = BioFlowPipeline(encoder, store)
        for agent_cls in (MinerAgent, ValidatorAgent):
            flow.register_agent(agent_cls(encoder, store, "bioflow_demo"))

        return {
            "obm": encoder,
            "qdrant": store,
            "pipeline": flow,
            "ready": True,
        }
    except Exception as e:
        st.error(f"Failed to initialize: {e}")
        return {"ready": False, "error": str(e)}
+
88
+
89
def render_sidebar():
    """Render the sidebar with controls.

    Returns:
        (mode, use_mock): the selected page label and the mock-mode flag.
    """
    st.sidebar.markdown("## 🧬 BioFlow Explorer")
    st.sidebar.markdown("---")

    # Mode selection — the label strings are matched by substring in main().
    mode = st.sidebar.selectbox(
        "Mode",
        ["🔍 Search & Explore", "📥 Data Ingestion", "🧪 Cross-Modal Analysis",
         "📊 Visualization", "🔬 Pipeline Demo", "📚 Documentation"]
    )

    st.sidebar.markdown("---")

    # Settings
    with st.sidebar.expander("⚙️ Settings"):
        use_mock = st.checkbox("Use Mock Mode (no GPU needed)", value=True)
        # Display-only (disabled); value is not returned to the caller.
        vector_dim = st.number_input("Vector Dimension", value=768, disabled=True)

    st.sidebar.markdown("---")
    # Header only — the metrics themselves are appended by main().
    st.sidebar.markdown("### Quick Stats")

    return mode, use_mock
+
113
+
114
def render_search_page(components):
    """Render the search and explore page.

    Expects `components` to contain "obm" (encoder) and "qdrant" (store),
    as produced by init_bioflow().
    """
    st.markdown('<p class="main-header">🔍 Search & Explore</p>', unsafe_allow_html=True)

    col1, col2 = st.columns([2, 1])

    with col1:
        query = st.text_area(
            "Enter your query",
            placeholder="e.g., 'KRAS inhibitor for cancer treatment' or a SMILES string like 'CCO'",
            height=100
        )

        query_modality = st.selectbox(
            "Query Modality",
            ["text", "smiles", "protein"],
            help="Select the type of your input"
        )

    with col2:
        target_modality = st.selectbox(
            "Search for",
            ["All", "text", "smiles", "protein"],
            help="Filter results by modality"
        )

        top_k = st.slider("Number of results", 1, 20, 5)

    if st.button("🔍 Search", type="primary"):
        if not query:
            st.warning("Please enter a query")
            return

        with st.spinner("Encoding and searching..."):
            obm = components["obm"]
            qdrant = components["qdrant"]

            # Encode query
            embedding = obm.encode(query, query_modality)

            # Display query embedding info
            with st.expander("📊 Query Embedding Details"):
                st.json({
                    "modality": embedding.modality.value,
                    "dimension": embedding.dimension,
                    "content_hash": embedding.content_hash,
                    "vector_sample": embedding.vector[:5].tolist()
                })

            # Search ("All" disables the modality filter)
            filter_mod = None if target_modality == "All" else target_modality
            results = qdrant.search(
                query=query,
                query_modality=query_modality,
                limit=top_k,
                filter_modality=filter_mod
            )

            if results:
                st.markdown("### 📋 Search Results")
                for i, r in enumerate(results):
                    with st.container():
                        col1, col2, col3 = st.columns([1, 4, 1])
                        with col1:
                            st.metric("Rank", i + 1)
                        with col2:
                            # CSS class maps to the per-modality colors defined
                            # in the page-level stylesheet.
                            modality_class = f"modality-{r.modality}"
                            st.markdown(f"**<span class='{modality_class}'>[{r.modality.upper()}]</span>** {r.content[:100]}...", unsafe_allow_html=True)
                        with col3:
                            st.metric("Score", f"{r.score:.3f}")
                        st.divider()
            else:
                st.info("No results found. Try ingesting some data first!")
+
188
+
189
def render_ingestion_page(components):
    """Render the data ingestion page (single entry, batch upload, samples)."""
    st.markdown('<p class="main-header">📥 Data Ingestion</p>', unsafe_allow_html=True)

    tab1, tab2, tab3 = st.tabs(["📝 Single Entry", "📄 Batch Upload", "🧪 Sample Data"])

    with tab1:
        st.markdown("### Add Single Entry")

        col1, col2 = st.columns(2)
        with col1:
            content = st.text_area("Content", placeholder="Enter text, SMILES, or protein sequence")
            modality = st.selectbox("Type", ["text", "smiles", "protein"])

        with col2:
            source = st.text_input("Source", placeholder="e.g., PubMed:12345")
            tags = st.text_input("Tags (comma-separated)", placeholder="e.g., cancer, kinase")

        if st.button("➕ Add Entry"):
            if content:
                qdrant = components["qdrant"]
                item = {
                    "content": content,
                    "modality": modality,
                    "source": source,
                    # Split on commas, dropping empty fragments.
                    "tags": [t.strip() for t in tags.split(",") if t.strip()]
                }
                stats = qdrant.ingest([item])
                st.success(f"Added successfully! Stats: {stats}")
            else:
                st.warning("Please enter content")

    with tab2:
        st.markdown("### Batch Upload")

        uploaded_file = st.file_uploader("Upload JSON or CSV", type=["json", "csv"])

        if uploaded_file:
            try:
                # JSON is taken as-is; CSV rows become one dict per record.
                if uploaded_file.name.endswith('.json'):
                    data = json.load(uploaded_file)
                else:
                    df = pd.read_csv(uploaded_file)
                    data = df.to_dict('records')

                st.write(f"Found {len(data)} entries")
                st.dataframe(pd.DataFrame(data).head())

                if st.button("📤 Upload All"):
                    qdrant = components["qdrant"]
                    stats = qdrant.ingest(data)
                    st.success(f"Ingestion complete! {stats}")
            except Exception as e:
                st.error(f"Error parsing file: {e}")

    with tab3:
        st.markdown("### Load Sample Data")
        st.markdown("Load pre-defined sample data to test the system.")

        # Small demo corpus mixing the three modalities (NSAIDs + KRAS).
        sample_data = [
            {"content": "Aspirin is used to reduce fever and relieve mild to moderate pain", "modality": "text", "source": "sample", "tags": ["pain", "fever"]},
            {"content": "CC(=O)OC1=CC=CC=C1C(=O)O", "modality": "smiles", "source": "ChEMBL", "tags": ["aspirin", "nsaid"]},
            {"content": "Ibuprofen is a nonsteroidal anti-inflammatory drug used for treating pain", "modality": "text", "source": "sample", "tags": ["pain", "nsaid"]},
            {"content": "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", "modality": "smiles", "source": "ChEMBL", "tags": ["ibuprofen", "nsaid"]},
            {"content": "KRAS mutations are found in many cancers and are difficult to target", "modality": "text", "source": "PubMed", "tags": ["cancer", "KRAS"]},
            {"content": "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQGVDDAFYTLVREIRKHKEKMSKDGKKKKKKSKTKCVIM", "modality": "protein", "source": "UniProt:P01116", "tags": ["KRAS", "GTPase"]},
            {"content": "Sotorasib is a first-in-class KRAS G12C inhibitor", "modality": "text", "source": "PubMed", "tags": ["KRAS", "inhibitor", "cancer"]},
            {"content": "C[C@@H]1CC(=O)N(C2=C1C=CC(=C2)NC(=O)C3=CC=C(C=C3)N4CCN(CC4)C)C5=NC=CC(=N5)C6CCCCC6", "modality": "smiles", "source": "ChEMBL", "tags": ["sotorasib", "KRAS", "inhibitor"]},
        ]

        if st.button("🧪 Load Sample Data"):
            qdrant = components["qdrant"]
            stats = qdrant.ingest(sample_data)
            st.success(f"Loaded {len(sample_data)} sample entries! {stats}")
            st.balloons()
+ st.balloons()
264
+
265
+
266
def render_crossmodal_page(components):
    """Render the cross-modal analysis page (query vs. list of targets)."""
    st.markdown('<p class="main-header">🧪 Cross-Modal Analysis</p>', unsafe_allow_html=True)

    st.markdown("""
    Explore how different modalities relate to each other in the shared embedding space.
    This is the core capability of BioFlow - connecting text, molecules, and proteins.
    """)

    col1, col2 = st.columns(2)

    with col1:
        st.markdown("### Query")
        query = st.text_area("Enter query", height=100)
        query_mod = st.selectbox("Query type", ["text", "smiles", "protein"], key="q_mod")

    with col2:
        st.markdown("### Targets")
        targets = st.text_area("Enter targets (one per line)", height=100)
        target_mod = st.selectbox("Target type", ["text", "smiles", "protein"], key="t_mod")

    if st.button("🔄 Compute Cross-Modal Similarity"):
        if query and targets:
            obm = components["obm"]
            # One target per non-empty line.
            target_list = [t.strip() for t in targets.strip().split("\n") if t.strip()]

            results = obm.cross_modal_similarity(
                query=query,
                query_modality=query_mod,
                targets=target_list,
                target_modality=target_mod
            )

            st.markdown("### Results (sorted by similarity)")

            # `results` is assumed to be (content, similarity) pairs —
            # TODO confirm against OBMWrapper.cross_modal_similarity.
            df = pd.DataFrame(results, columns=["Content", "Similarity"])
            df["Rank"] = range(1, len(df) + 1)
            df = df[["Rank", "Content", "Similarity"]]

            st.dataframe(df, use_container_width=True)

            # Visualize
            import plotly.express as px
            fig = px.bar(df, x="Content", y="Similarity", title="Cross-Modal Similarities")
            st.plotly_chart(fig, use_container_width=True)
+
312
+
313
def render_visualization_page(components):
    """Render visualization page (embedding scatter, similarity heatmap, 2D molecules)."""
    st.markdown('<p class="main-header">📊 Visualization</p>', unsafe_allow_html=True)

    tab1, tab2, tab3 = st.tabs(["🌐 Embedding Space", "📈 Similarity Matrix", "🧬 Molecules"])

    with tab1:
        st.markdown("### Embedding Space Visualization")

        # Get all points from collection
        qdrant = components["qdrant"]
        info = qdrant.get_collection_info()

        if info.get("points_count", 0) == 0:
            st.warning("No data in collection. Go to Data Ingestion to add some!")
            return

        st.metric("Points in collection", info.get("points_count", 0))

        if st.button("🎨 Generate Embedding Plot"):
            # This would require fetching all vectors - simplified for demo
            st.info("Embedding visualization requires fetching all vectors. In production, use sampling.")

            # Demo with random data — NOT real embeddings; placeholder scatter only.
            n_points = min(info.get("points_count", 20), 50)
            fake_embeddings = np.random.randn(n_points, 2)

            import plotly.express as px
            fig = px.scatter(
                x=fake_embeddings[:, 0],
                y=fake_embeddings[:, 1],
                title="Embedding Space (Demo - PCA projection)"
            )
            st.plotly_chart(fig, use_container_width=True)

    with tab2:
        st.markdown("### Compute Similarity Matrix")

        items = st.text_area("Enter items (one per line)", height=150)
        modality = st.selectbox("Modality", ["text", "smiles", "protein"], key="sim_mod")

        if st.button("🔢 Compute Matrix"):
            if items:
                obm = components["obm"]
                item_list = [i.strip() for i in items.strip().split("\n") if i.strip()]

                # Dispatch to the modality-specific encoder.
                if modality == "text":
                    embeddings = obm.encode_text(item_list)
                elif modality == "smiles":
                    embeddings = obm.encode_smiles(item_list)
                else:
                    embeddings = obm.encode_protein(item_list)

                vectors = np.array([e.vector for e in embeddings])

                # Compute similarity: cosine via L2 normalization (clip avoids
                # division by zero for all-zero vectors).
                norms = np.linalg.norm(vectors, axis=1, keepdims=True)
                normalized = vectors / np.clip(norms, 1e-9, None)
                similarity = np.dot(normalized, normalized.T)

                import plotly.figure_factory as ff
                # Truncate labels so the heatmap axes stay readable.
                labels = [i[:20] for i in item_list]
                fig = ff.create_annotated_heatmap(
                    similarity,
                    x=labels,
                    y=labels,
                    colorscale='RdBu'
                )
                st.plotly_chart(fig, use_container_width=True)

    with tab3:
        st.markdown("### Molecule Visualization")

        smiles = st.text_input("Enter SMILES", placeholder="CC(=O)OC1=CC=CC=C1C(=O)O")

        if smiles:
            try:
                # RDKit is optional; fall through to a hint if not installed.
                from rdkit import Chem
                from rdkit.Chem import Draw

                mol = Chem.MolFromSmiles(smiles)
                if mol:
                    img = Draw.MolToImage(mol, size=(400, 300))
                    st.image(img, caption=f"Molecule: {smiles}")
                else:
                    st.error("Invalid SMILES")
            except ImportError:
                st.warning("RDKit not installed. Install with: pip install rdkit")
+ st.warning("RDKit not installed. Install with: pip install rdkit")
401
+
402
+
403
def render_pipeline_page(components):
    """Render pipeline demo page (runs the full discovery workflow)."""
    st.markdown('<p class="main-header">🔬 Pipeline Demo</p>', unsafe_allow_html=True)

    st.markdown("""
    Run a complete discovery workflow that:
    1. Searches for related literature
    2. Finds similar molecules
    3. Validates candidates
    4. Analyzes result diversity
    """)

    query = st.text_input("Enter discovery query", placeholder="e.g., KRAS inhibitor for lung cancer")

    col1, col2 = st.columns(2)
    with col1:
        query_mod = st.selectbox("Query modality", ["text", "smiles", "protein"])
    with col2:
        target_mod = st.selectbox("Target modality", ["smiles", "text", "protein"])

    if st.button("🚀 Run Discovery Pipeline", type="primary"):
        if query:
            pipeline = components["pipeline"]

            with st.spinner("Running pipeline..."):
                results = pipeline.run_discovery_workflow(
                    query=query,
                    query_modality=query_mod,
                    target_modality=target_mod
                )

            st.markdown("## 📊 Pipeline Results")

            # Each stage below reads results["stages"][<stage>] defensively,
            # falling back to an empty value when the stage produced nothing.

            # Literature
            with st.expander("📚 Related Literature", expanded=True):
                lit = results.get("stages", {}).get("literature", [])
                if lit:
                    for item in lit:
                        st.markdown(f"- **Score: {item['score']:.3f}** - {item['content'][:100]}...")
                else:
                    st.info("No literature found")

            # Molecules
            with st.expander("🧪 Similar Molecules", expanded=True):
                mols = results.get("stages", {}).get("molecules", [])
                if mols:
                    df = pd.DataFrame(mols)
                    st.dataframe(df)
                else:
                    st.info("No molecules found")

            # Validation
            with st.expander("✅ Validation Results"):
                val = results.get("stages", {}).get("validation", [])
                if val:
                    st.json(val)
                else:
                    st.info("No validation performed")

            # Diversity
            with st.expander("📈 Diversity Analysis"):
                div = results.get("stages", {}).get("diversity", {})
                if div:
                    col1, col2, col3 = st.columns(3)
                    col1.metric("Mean Similarity", f"{div.get('mean_similarity', 0):.3f}")
                    col2.metric("Diversity Score", f"{div.get('diversity_score', 0):.3f}")
                    col3.metric("Modalities", len(div.get('modality_distribution', {})))
                    st.json(div)
+
472
+
473
def render_docs_page():
    """Render documentation page (static markdown only; takes no components)."""
    st.markdown('<p class="main-header">📚 Documentation</p>', unsafe_allow_html=True)

    st.markdown("""
    ## BioFlow + OpenBioMed Integration

    ### 🎯 Overview

    BioFlow is a multimodal biological intelligence framework that leverages OpenBioMed (OBM)
    for encoding biological data and Qdrant for vector storage and retrieval.

    ### 🧩 Components

    | Component | Description |
    |-----------|-------------|
    | **OBMWrapper** | Encodes text, molecules (SMILES), and proteins into a shared vector space |
    | **QdrantManager** | Manages vector storage, indexing, and similarity search |
    | **BioFlowPipeline** | Orchestrates agents in discovery workflows |
    | **Visualizer** | Creates plots for embeddings, similarities, and molecules |

    ### 🔌 API Examples

    ```python
    from bioflow import OBMWrapper, QdrantManager, BioFlowPipeline

    # Initialize
    obm = OBMWrapper(device="cuda")
    qdrant = QdrantManager(obm, qdrant_path="./data/qdrant")

    # Encode different modalities
    text_vec = obm.encode_text("KRAS inhibitor for cancer")
    mol_vec = obm.encode_smiles("CCO")
    prot_vec = obm.encode_protein("MTEYKLVVV...")

    # Cross-modal search
    results = qdrant.cross_modal_search(
        query="anti-inflammatory drug",
        query_modality="text",
        target_modality="smiles",
        limit=10
    )
    ```

    ### 🌟 Key Features

    1. **Unified Embedding Space**: All modalities map to the same vector dimension
    2. **Cross-Modal Search**: Find molecules from text queries and vice versa
    3. **Pipeline Orchestration**: Chain agents for complex discovery workflows
    4. **Mock Mode**: Test without GPU using deterministic random embeddings

    ### 📁 File Structure

    ```
    bioflow/
    ├── __init__.py          # Package exports
    ├── obm_wrapper.py       # OBM encoding interface
    ├── qdrant_manager.py    # Qdrant operations
    ├── pipeline.py          # Workflow orchestration
    ├── visualizer.py        # Visualization utilities
    └── app.py               # Streamlit interface
    ```
    """)
+
537
+
538
def main():
    """Main application entry point."""
    mode, use_mock = render_sidebar()

    # Initialize (cached) components; bail out early if setup failed.
    components = init_bioflow(use_mock=use_mock)
    if not components.get("ready"):
        st.error("System not ready. Check configuration.")
        return

    # Display collection stats in sidebar
    stats = components["qdrant"].get_collection_info()
    st.sidebar.metric("📊 Vectors", stats.get("points_count", 0))
    st.sidebar.metric("📐 Dimension", stats.get("vector_size", 768))

    # Route on the first keyword found in the selected mode label
    # (same precedence order as the original if/elif chain).
    routes = (
        ("Search", render_search_page),
        ("Ingestion", render_ingestion_page),
        ("Cross-Modal", render_crossmodal_page),
        ("Visualization", render_visualization_page),
        ("Pipeline", render_pipeline_page),
    )
    for keyword, page in routes:
        if keyword in mode:
            page(components)
            return
    if "Documentation" in mode:
        render_docs_page()
567
+
568
+
569
# Script entry point: `streamlit run bioflow/app.py` executes this module.
if __name__ == "__main__":
    main()
bioflow/core/__init__.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow Core
3
+ =============
4
+
5
+ Core abstractions and orchestration for the BioFlow platform.
6
+
7
+ Public API:
8
+ - Modality: Enum of supported data types
9
+ - BioEncoder, BioPredictor, BioGenerator, BioRetriever: Abstract interfaces
10
+ - EmbeddingResult, PredictionResult, RetrievalResult: Data containers
11
+ - ToolRegistry: Central tool management
12
+ - BioFlowOrchestrator: Pipeline execution engine
13
+ - WorkflowConfig, NodeConfig: Configuration classes
14
+ """
15
+
16
+ from bioflow.core.base import (
17
+ Modality,
18
+ BioEncoder,
19
+ BioPredictor,
20
+ BioGenerator,
21
+ BioRetriever,
22
+ BioTool,
23
+ EmbeddingResult,
24
+ PredictionResult,
25
+ RetrievalResult,
26
+ )
27
+
28
+ from bioflow.core.registry import ToolRegistry
29
+
30
+ from bioflow.core.orchestrator import (
31
+ BioFlowOrchestrator,
32
+ ExecutionContext,
33
+ PipelineResult,
34
+ )
35
+
36
+ from bioflow.core.config import (
37
+ NodeType,
38
+ NodeConfig,
39
+ WorkflowConfig,
40
+ EncoderConfig,
41
+ VectorDBConfig,
42
+ BioFlowConfig,
43
+ )
44
+
45
+ from bioflow.core.nodes import (
46
+ EncodeNode,
47
+ RetrieveNode,
48
+ PredictNode,
49
+ IngestNode,
50
+ FilterNode,
51
+ TraceabilityNode,
52
+ )
53
+
54
+ __all__ = [
55
+ # Enums
56
+ "Modality",
57
+ "NodeType",
58
+ # Abstract interfaces
59
+ "BioEncoder",
60
+ "BioPredictor",
61
+ "BioGenerator",
62
+ "BioRetriever",
63
+ "BioTool",
64
+ # Data containers
65
+ "EmbeddingResult",
66
+ "PredictionResult",
67
+ "RetrievalResult",
68
+ # Registry
69
+ "ToolRegistry",
70
+ # Orchestrator
71
+ "BioFlowOrchestrator",
72
+ "ExecutionContext",
73
+ "PipelineResult",
74
+ # Config
75
+ "NodeConfig",
76
+ "WorkflowConfig",
77
+ "EncoderConfig",
78
+ "VectorDBConfig",
79
+ "BioFlowConfig",
80
+ # Nodes
81
+ "EncodeNode",
82
+ "RetrieveNode",
83
+ "PredictNode",
84
+ "IngestNode",
85
+ "FilterNode",
86
+ "TraceabilityNode",
87
+ ]
bioflow/core/base.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow Core Abstractions
3
+ ==========================
4
+
5
+ Defines the fundamental interfaces for all tools in the BioFlow platform.
6
+ All encoders, predictors, generators, and retrievers must implement these.
7
+
8
+ Open-Source Models Supported:
9
+ - Text: PubMedBERT, SciBERT, Specter
10
+ - Molecules: ChemBERTa, RDKit FP
11
+ - Proteins: ESM-2, ProtBERT
12
+ - Images: CLIP, BioMedCLIP
13
+ """
14
+
15
+ from abc import ABC, abstractmethod
16
+ from typing import Any, Dict, List, Optional, Union
17
+ from dataclasses import dataclass, field
18
+ from enum import Enum
19
+
20
+
21
class Modality(Enum):
    """Supported data modalities in BioFlow.

    Member values are the lowercase strings used as canonical modality
    identifiers in configs, payloads, and node parameters.
    """
    TEXT = "text"            # free text (e.g. biomedical literature)
    SMILES = "smiles"        # small-molecule SMILES strings
    PROTEIN = "protein"      # amino-acid sequences
    IMAGE = "image"          # image inputs
    GENOMIC = "genomic"      # genomic data
    STRUCTURE = "structure"  # structural data
29
+
30
+
31
@dataclass
class EmbeddingResult:
    """Result of an encoding operation."""
    vector: List[float]   # the embedding values
    modality: Modality    # modality of the encoded input
    dimension: int        # declared vector dimension (not validated against len(vector))
    metadata: Dict[str, Any] = field(default_factory=dict)  # encoder-specific extras

    def __len__(self):
        # The result's length is the length of its vector.
        return len(self.vector)
41
+
42
+
43
@dataclass
class PredictionResult:
    """Result of a prediction operation (e.g. a drug-target interaction score)."""
    score: float                        # primary prediction score
    label: Optional[str] = None         # optional class label
    confidence: Optional[float] = None  # optional confidence value
    metadata: Dict[str, Any] = field(default_factory=dict)  # predictor-specific extras
50
+
51
+
52
@dataclass
class RetrievalResult:
    """Single hit from a retrieval/search operation."""
    id: str              # identifier of the stored item
    score: float         # similarity score reported by the backend
    content: Any         # raw stored content (text, SMILES, sequence, ...)
    modality: Modality   # modality of the stored content
    payload: Dict[str, Any] = field(default_factory=dict)  # metadata stored with the vector
60
+
61
+
62
class BioEncoder(ABC):
    """
    Contract for tools that turn biological data into fixed-size vectors.

    Known implementations:
    - OBMEncoder: Multimodal (text, SMILES, protein)
    - ESM2Encoder: Protein sequences
    - ChemBERTaEncoder: SMILES molecules
    - PubMedBERTEncoder: Biomedical text
    - CLIPEncoder: Images

    Example:
        >>> encoder = ESM2Encoder(device="cuda")
        >>> result = encoder.encode("MKTVRQERLKSIVRILERSKEPVSG", Modality.PROTEIN)
        >>> print(len(result.vector))  # 1280
    """

    @abstractmethod
    def encode(self, content: Any, modality: Modality) -> EmbeddingResult:
        """
        Encode a single piece of content into a vector representation.

        Args:
            content: Raw input (text, SMILES string, protein sequence, etc.)
            modality: Type of the input data

        Returns:
            EmbeddingResult carrying the vector plus metadata.
        """
        ...

    @property
    @abstractmethod
    def dimension(self) -> int:
        """Dimensionality of the vectors this encoder produces."""
        ...

    @property
    def supported_modalities(self) -> List[Modality]:
        """Modalities this encoder accepts; subclasses should override."""
        return [Modality.TEXT]  # Override in subclasses

    def batch_encode(self, contents: List[Any], modality: Modality) -> List[EmbeddingResult]:
        """Encode many items; the default simply loops over encode().

        Subclasses may override for optimized batch processing.
        """
        return [self.encode(item, modality) for item in contents]
107
+
108
+
109
class BioPredictor(ABC):
    """
    Contract for models that score properties, affinities, or interactions.

    Known implementations:
    - DeepPurposePredictor: DTI prediction
    - ToxicityPredictor: ADMET properties
    - BindingAffinityPredictor: Kd/Ki estimation

    Example:
        >>> predictor = DeepPurposePredictor()
        >>> result = predictor.predict(drug="CCO", target="MKTVRQ...")
        >>> print(result.score)  # 0.85
    """

    @abstractmethod
    def predict(self, drug: str, target: str) -> PredictionResult:
        """
        Score the interaction/property for one drug-target pair.

        Args:
            drug: SMILES string of drug molecule
            target: Protein sequence or identifier

        Returns:
            PredictionResult carrying the score and metadata.
        """
        ...

    def batch_predict(self, pairs: List[tuple]) -> List[PredictionResult]:
        """Score many (drug, target) pairs; the default loops over predict()."""
        return [self.predict(drug, target) for drug, target in pairs]
141
+
142
+
143
class BioGenerator(ABC):
    """
    Interface for tools that generate new biological candidates.

    Implementations:
    - MoleculeGenerator: SMILES generation
    - ProteinGenerator: Sequence design
    - VariantGenerator: Mutation suggestions

    Example:
        >>> generator = MoleculeGenerator()
        >>> candidates = generator.generate(
        ...     seed="CCO",
        ...     constraints={"mw_max": 500, "logp_max": 5}
        ... )
    """

    @abstractmethod
    def generate(self, seed: Any, constraints: Dict[str, Any]) -> List[Any]:
        """
        Generate new candidates based on seed and constraints.

        Args:
            seed: Starting point (molecule, sequence, etc.)
            constraints: Dictionary of constraints (e.g., MW, toxicity);
                interpretation is implementation-specific.

        Returns:
            List of generated candidates
        """
        pass
173
+
174
+
175
class BioRetriever(ABC):
    """
    Interface for vector database retrieval operations.

    Implementations:
    - QdrantRetriever: Qdrant vector search
    - FAISSRetriever: FAISS similarity search

    Example:
        >>> retriever = QdrantRetriever(collection="molecules")
        >>> results = retriever.search(query_vector, limit=10)
    """

    @abstractmethod
    def search(
        self,
        query: Union[List[float], str],
        limit: int = 10,
        filters: Optional[Dict[str, Any]] = None
    ) -> List[RetrievalResult]:
        """
        Search for similar items in vector database.

        Args:
            query: Query vector, or raw content the implementation should
                encode before searching.
            limit: Maximum number of results
            filters: Metadata filters to apply (None = no filtering)

        Returns:
            List of RetrievalResult sorted by similarity
        """
        pass

    @abstractmethod
    def ingest(
        self,
        content: Any,
        modality: Modality,
        payload: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Ingest content into the vector database.

        Args:
            content: Raw content to encode and store
            modality: Type of content
            payload: Additional metadata to store alongside the vector

        Returns:
            ID of the inserted item
        """
        pass
227
+
228
+
229
class BioTool(ABC):
    """
    Generic wrapper for miscellaneous tools.

    Known implementations:
    - RDKitTool: Molecular operations
    - VisualizationTool: Plotting and visualization
    - FilterTool: Candidate filtering
    """

    @abstractmethod
    def execute(self, *args, **kwargs) -> Any:
        """Run the tool with the given arguments."""
        ...

    @property
    def name(self) -> str:
        """Tool name; defaults to the concrete class name."""
        return type(self).__name__
bioflow/core/config.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow Configuration Schema
3
+ =============================
4
+
5
+ Dataclasses and schemas for workflow configuration.
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from typing import Dict, List, Any, Optional
10
+ from enum import Enum
11
+
12
+
13
class NodeType(Enum):
    """Types of nodes in a BioFlow pipeline.

    Values are the lowercase strings accepted in workflow configs
    (NodeConfig.__post_init__ coerces a plain string to this enum).
    """
    ENCODE = "encode"      # Vectorize input using encoder
    RETRIEVE = "retrieve"  # Search vector DB for neighbors
    PREDICT = "predict"    # Run prediction model
    GENERATE = "generate"  # Generate new candidates
    FILTER = "filter"      # Filter/rank candidates
    CUSTOM = "custom"      # User-defined function
21
+
22
+
23
@dataclass
class NodeConfig:
    """Configuration for a single pipeline node.

    ``type`` may be supplied as a NodeType or as its string value; the
    string form is coerced to the enum in __post_init__ so that configs
    loaded from YAML/JSON can use plain strings.
    """
    id: str           # unique node identifier within the workflow
    type: NodeType    # node kind (a str is coerced below)
    tool: str  # Name of registered tool
    inputs: List[str] = field(default_factory=list)  # Node IDs or "input"
    params: Dict[str, Any] = field(default_factory=dict)  # node-specific parameters

    def __post_init__(self):
        # Accept the string form of the node type (raises ValueError for
        # unknown values, surfacing config typos early).
        if isinstance(self.type, str):
            self.type = NodeType(self.type)
35
+
36
+
37
@dataclass
class WorkflowConfig:
    """Configuration for an entire workflow."""
    name: str
    description: str = ""
    nodes: List[NodeConfig] = field(default_factory=list)
    output_node: str = ""  # ID of final node
    metadata: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "WorkflowConfig":
        """Build a WorkflowConfig from a plain dictionary (e.g. loaded YAML).

        Dict entries under "nodes" are converted to NodeConfig; anything
        else (already-built NodeConfig objects) is kept as-is.
        """
        parsed_nodes = []
        for entry in data.get("nodes", []):
            parsed_nodes.append(NodeConfig(**entry) if isinstance(entry, dict) else entry)
        return cls(
            name=data.get("name", "unnamed"),
            description=data.get("description", ""),
            nodes=parsed_nodes,
            output_node=data.get("output_node", ""),
            metadata=data.get("metadata", {})
        )
60
+
61
+
62
@dataclass
class EncoderConfig:
    """Configuration for an encoder."""
    name: str                         # registry name for this encoder
    model_type: str                   # e.g. "esm2", "pubmedbert", "chemberta"
    model_path: Optional[str] = None  # checkpoint/path; None = implementation default
    device: str = "cpu"               # device string (e.g. "cpu", "cuda")
    dimension: int = 768              # embedding dimension the encoder emits
    modalities: List[str] = field(default_factory=list)  # supported modality names
    extra: Dict[str, Any] = field(default_factory=dict)  # encoder-specific options
72
+
73
+
74
@dataclass
class VectorDBConfig:
    """Configuration for vector database."""
    provider: str = "qdrant"  # qdrant, faiss, etc.
    url: Optional[str] = None   # remote server URL (presumably exclusive with `path` — confirm against manager)
    path: Optional[str] = None  # local storage path
    default_collection: str = "bioflow_memory"  # collection used when none is specified
    distance_metric: str = "cosine"  # similarity metric name
82
+
83
+
84
@dataclass
class BioFlowConfig:
    """Master configuration for entire BioFlow system."""
    project_name: str = "BioFlow"
    encoders: Dict[str, EncoderConfig] = field(default_factory=dict)    # encoder name -> config
    vector_db: VectorDBConfig = field(default_factory=VectorDBConfig)   # vector DB settings
    workflows: Dict[str, WorkflowConfig] = field(default_factory=dict)  # workflow name -> config
    default_encoder: str = "default"  # key into `encoders` used when unspecified
    log_level: str = "INFO"           # logging level name
bioflow/core/nodes.py ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow Workflow Nodes
3
+ =======================
4
+
5
+ Typed node implementations for the BioFlow orchestrator.
6
+ Each node wraps a specific operation in the discovery pipeline.
7
+ """
8
+
9
+ import logging
10
+ from typing import List, Dict, Any, Optional, Union
11
+ from dataclasses import dataclass, field
12
+ from datetime import datetime
13
+ from abc import ABC, abstractmethod
14
+
15
+ from bioflow.core import (
16
+ Modality,
17
+ BioEncoder,
18
+ BioPredictor,
19
+ BioRetriever,
20
+ EmbeddingResult,
21
+ PredictionResult,
22
+ RetrievalResult,
23
+ )
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
@dataclass
class NodeResult:
    """Container for the output of a single node execution."""
    node_id: str      # id of the node that produced this result
    node_type: str    # kind of node ("encode", "retrieve", ...)
    data: Any         # node output payload
    metadata: Dict[str, Any] = field(default_factory=dict)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    def __repr__(self):
        # Sized outputs report their length; scalars count as one item.
        count = len(self.data) if hasattr(self.data, '__len__') else 1
        return f"NodeResult({self.node_type}: {count} items)"
39
+
40
+
41
class BaseNode(ABC):
    """Base class for all workflow nodes.

    A node is identified by ``node_id`` and implements ``execute`` to
    transform its input into a NodeResult; ``node_type`` labels the node
    kind in results.
    """

    def __init__(self, node_id: str):
        self.node_id = node_id  # recorded in every NodeResult for provenance

    @property
    @abstractmethod
    def node_type(self) -> str:
        """Short string naming this node kind (e.g. "encode")."""
        pass

    @abstractmethod
    def execute(self, input_data: Any, context: Dict[str, Any] = None) -> NodeResult:
        """Run the node on ``input_data`` with an optional per-run ``context``."""
        pass
55
+
56
+
57
class EncodeNode(BaseNode):
    """
    Turns raw content into vector embeddings.

    Input: raw content (text, SMILES, protein sequence) or a list of items
    Output: a single EmbeddingResult, or a list of them for batch input

    When ``auto_detect`` is set and the encoder exposes ``encode_auto``,
    the encoder picks the modality itself; otherwise the node's configured
    modality is used.
    """

    def __init__(
        self,
        node_id: str,
        encoder: BioEncoder,
        modality: Modality = Modality.TEXT,
        auto_detect: bool = False
    ):
        super().__init__(node_id)
        self.encoder = encoder
        self.modality = modality
        self.auto_detect = auto_detect

    @property
    def node_type(self) -> str:
        return "encode"

    def execute(self, input_data: Any, context: Dict[str, Any] = None) -> NodeResult:
        """Encode ``input_data`` (single item or batch) into embeddings."""
        context = context or {}

        use_auto = self.auto_detect and hasattr(self.encoder, 'encode_auto')

        if isinstance(input_data, list):
            # Batch path: auto-detection encodes item by item; otherwise use
            # the encoder's (possibly optimized) batch API.
            if use_auto:
                data = [self.encoder.encode_auto(item) for item in input_data]
            else:
                data = self.encoder.batch_encode(input_data, self.modality)
        else:
            data = (
                self.encoder.encode_auto(input_data)
                if use_auto
                else self.encoder.encode(input_data, self.modality)
            )

        return NodeResult(
            node_id=self.node_id,
            node_type=self.node_type,
            data=data,
            metadata={"modality": self.modality.value, "auto_detect": self.auto_detect}
        )
105
+
106
+
107
class RetrieveNode(BaseNode):
    """
    Retrieves similar items from vector database.

    Input: Query (string or embedding)
    Output: List of RetrievalResults

    NOTE(review): this node calls ``retriever.search`` with ``collection``
    and ``modality`` keyword arguments that the BioRetriever ABC does not
    declare — it assumes a concrete retriever with an extended search
    signature (e.g. the Qdrant manager). Confirm against the retriever used.
    """

    def __init__(
        self,
        node_id: str,
        retriever: BioRetriever,
        collection: str = None,
        limit: int = 10,
        modality: Modality = Modality.TEXT,
        filters: Dict[str, Any] = None
    ):
        super().__init__(node_id)
        self.retriever = retriever
        self.collection = collection  # target collection; None = retriever default
        self.limit = limit            # default max results (context may override)
        self.modality = modality      # modality hint forwarded to search()
        self.filters = filters or {}  # base metadata filters (merged with context filters)

    @property
    def node_type(self) -> str:
        return "retrieve"

    def execute(self, input_data: Any, context: Dict[str, Any] = None) -> NodeResult:
        """Retrieve similar items.

        Per-run ``context`` may override ``limit`` and contribute extra
        ``filters``; context filters win on key collisions.
        """
        context = context or {}

        # Override from context if provided
        limit = context.get("limit", self.limit)
        filters = {**self.filters, **context.get("filters", {})}

        # Handle EmbeddingResult input: unwrap to the raw vector.
        if isinstance(input_data, EmbeddingResult):
            query = input_data.vector
        else:
            query = input_data

        results = self.retriever.search(
            query=query,
            limit=limit,
            filters=filters if filters else None,
            collection=self.collection,
            modality=self.modality
        )

        return NodeResult(
            node_id=self.node_id,
            node_type=self.node_type,
            data=results,
            metadata={
                "count": len(results),
                "collection": self.collection,
                "filters": filters
            }
        )
167
+
168
+
169
class PredictNode(BaseNode):
    """
    Runs predictions on drug-target pairs.

    Input: List of candidates (from retrieval) or direct (drug, target) pairs
    Output: List of PredictionResults with scores

    The protein target comes from the per-run context ("target") or, failing
    that, from the node's configured ``target_sequence``. In batch mode,
    predictions below ``threshold`` are dropped and failures are skipped;
    results are returned sorted by score, descending.
    """

    def __init__(
        self,
        node_id: str,
        predictor: BioPredictor,
        target_sequence: str = None,
        drug_field: str = "content",
        threshold: float = 0.0
    ):
        super().__init__(node_id)
        self.predictor = predictor
        self.target_sequence = target_sequence  # default target; context["target"] overrides
        self.drug_field = drug_field            # dict key that holds the drug SMILES
        self.threshold = threshold              # minimum score kept in batch mode

    @property
    def node_type(self) -> str:
        return "predict"

    def execute(self, input_data: Any, context: Dict[str, Any] = None) -> NodeResult:
        """Run predictions.

        Raises:
            ValueError: If no target is available from context or config.
        """
        context = context or {}
        target = context.get("target", self.target_sequence)

        if not target:
            raise ValueError("Target sequence is required for prediction")

        predictions = []

        # Handle different input types
        if isinstance(input_data, list):
            for item in input_data:
                # Extract drug from RetrievalResult or dict
                if isinstance(item, RetrievalResult):
                    drug = item.content
                    source_id = item.id
                elif isinstance(item, dict):
                    drug = item.get(self.drug_field, item.get("smiles", ""))
                    source_id = item.get("id", "unknown")
                else:
                    drug = str(item)
                    source_id = "unknown"

                try:
                    result = self.predictor.predict(drug, target)
                    if result.score >= self.threshold:
                        predictions.append({
                            "drug": drug,
                            "source_id": source_id,
                            "prediction": result,
                            "score": result.score
                        })
                except Exception as e:
                    # Best-effort batch: a failed candidate is logged and
                    # skipped rather than aborting the whole batch.
                    logger.warning(f"Prediction failed for {drug[:20]}...: {e}")
        else:
            # Single-item input: no source_id, and the threshold is not applied.
            result = self.predictor.predict(str(input_data), target)
            predictions.append({
                "drug": str(input_data),
                "prediction": result,
                "score": result.score
            })

        # Sort by score
        predictions.sort(key=lambda x: x["score"], reverse=True)

        return NodeResult(
            node_id=self.node_id,
            node_type=self.node_type,
            data=predictions,
            metadata={
                "count": len(predictions),
                "threshold": self.threshold,
                "target_length": len(target) if target else 0
            }
        )
251
+
252
+
253
class IngestNode(BaseNode):
    """
    Writes items into the vector database.

    Input: a single item or a list of items (dicts or raw strings)
    Output: the IDs assigned to the ingested points

    For dict items the stored content is taken from ``content_field``
    (falling back to "smiles", then "sequence"); the remaining keys become
    the stored payload.
    """

    def __init__(
        self,
        node_id: str,
        retriever: BioRetriever,
        collection: str = None,
        modality: Modality = Modality.TEXT,
        content_field: str = "content"
    ):
        super().__init__(node_id)
        self.retriever = retriever
        self.collection = collection
        self.modality = modality
        self.content_field = content_field

    @property
    def node_type(self) -> str:
        return "ingest"

    def execute(self, input_data: Any, context: Dict[str, Any] = None) -> NodeResult:
        """Encode and store the input; returns the new point IDs."""
        context = context or {}
        inserted = []

        if isinstance(input_data, list):
            for entry in input_data:
                if isinstance(entry, dict):
                    text = entry.get(self.content_field, entry.get("smiles", entry.get("sequence", "")))
                    meta = {k: v for k, v in entry.items() if k != self.content_field}
                else:
                    text, meta = str(entry), {}
                inserted.append(self.retriever.ingest(
                    content=text,
                    modality=self.modality,
                    payload=meta,
                    collection=self.collection
                ))
        else:
            # Single item: any payload comes from the run context.
            inserted.append(self.retriever.ingest(
                content=str(input_data),
                modality=self.modality,
                payload=context.get("payload", {}),
                collection=self.collection
            ))

        return NodeResult(
            node_id=self.node_id,
            node_type=self.node_type,
            data=inserted,
            metadata={"count": len(inserted), "collection": self.collection}
        )
315
+
316
+
317
class FilterNode(BaseNode):
    """
    Filters and ranks a list of scored items.

    Input: list of items (dicts or objects carrying a score)
    Output: items with score >= threshold, sorted descending, optionally
    truncated to ``top_k``.
    """

    def __init__(
        self,
        node_id: str,
        score_field: str = "score",
        threshold: float = 0.5,
        top_k: int = None,
        diversity: float = 0.0  # For MMR-style diversification
    ):
        super().__init__(node_id)
        self.score_field = score_field
        self.threshold = threshold
        self.top_k = top_k
        # NOTE(review): `diversity` is stored but never applied in execute();
        # MMR-style diversification remains unimplemented.
        self.diversity = diversity

    @property
    def node_type(self) -> str:
        return "filter"

    def _get_score(self, item: Any) -> float:
        """Pull the ranking score out of a dict or attribute-bearing object."""
        if isinstance(item, dict):
            return item.get(self.score_field, 0)
        if hasattr(item, self.score_field):
            return getattr(item, self.score_field)
        if hasattr(item, 'score'):
            return item.score
        return 0

    def execute(self, input_data: Any, context: Dict[str, Any] = None) -> NodeResult:
        """Apply the threshold, sort by score (descending), truncate to top_k."""
        context = context or {}

        items = input_data if isinstance(input_data, list) else [input_data]

        # Keep only items meeting the threshold, best first.
        kept = sorted(
            (item for item in items if self._get_score(item) >= self.threshold),
            key=self._get_score,
            reverse=True,
        )

        if self.top_k:
            kept = kept[:self.top_k]

        return NodeResult(
            node_id=self.node_id,
            node_type=self.node_type,
            data=kept,
            metadata={
                "input_count": len(items),
                "output_count": len(kept),
                "threshold": self.threshold
            }
        )
380
+
381
+
382
class TraceabilityNode(BaseNode):
    """
    Adds evidence linking and provenance to results.

    Input: Results with source IDs
    Output: Results enriched with evidence links

    Each dict/RetrievalResult gains an ``evidence_links`` mapping
    (source name -> URL) and a ``has_evidence`` flag. Links are derived
    both from known lowercase keys in the item's payload and from the
    prefix of its source ID. Items of other types pass through unchanged.
    """

    def __init__(
        self,
        node_id: str,
        source_mapping: Dict[str, str] = None  # Maps ID prefixes to URL templates with an {id} placeholder
    ):
        super().__init__(node_id)
        # Defaults cover common public biomedical databases.
        self.source_mapping = source_mapping or {
            "PMID": "https://pubmed.ncbi.nlm.nih.gov/{id}",
            "UniProt": "https://www.uniprot.org/uniprot/{id}",
            "ChEMBL": "https://www.ebi.ac.uk/chembl/compound_report_card/{id}",
            "PubChem": "https://pubchem.ncbi.nlm.nih.gov/compound/{id}",
        }

    @property
    def node_type(self) -> str:
        return "trace"

    def _generate_evidence_link(self, source_id: str, payload: Dict[str, Any]) -> Dict[str, str]:
        """Generate evidence links from source ID and payload.

        Looks for lowercase payload keys ("pmid", "pmid_id", ...), then
        falls back to matching the source_id's prefix.
        """
        links = {}

        # Check for known ID types in payload
        for key, url_template in self.source_mapping.items():
            if key.lower() in payload:
                links[key] = url_template.format(id=payload[key.lower()])
            elif f"{key.lower()}_id" in payload:
                links[key] = url_template.format(id=payload[f"{key.lower()}_id"])

        # Check source_id prefix.
        # Fix: strip only the *leading* prefix via slicing. The previous
        # str.replace(prefix, "") removed every occurrence of the prefix
        # anywhere in the ID, corrupting IDs that contain it again later.
        for prefix, url_template in self.source_mapping.items():
            if source_id.startswith(prefix):
                id_part = source_id[len(prefix):].lstrip("_:-")
                links[prefix] = url_template.format(id=id_part)

        return links

    def execute(self, input_data: Any, context: Dict[str, Any] = None) -> NodeResult:
        """Add evidence links to results.

        Dicts are shallow-copied and enriched; RetrievalResults are
        converted to enriched dicts; other items pass through untouched.
        """
        context = context or {}

        if not isinstance(input_data, list):
            input_data = [input_data]

        enriched = []
        for item in input_data:
            if isinstance(item, dict):
                source_id = item.get("source_id", item.get("id", ""))
                payload = item.get("payload", item)
                evidence = self._generate_evidence_link(source_id, payload)

                enriched_item = {
                    **item,
                    "evidence_links": evidence,
                    "has_evidence": len(evidence) > 0
                }
                enriched.append(enriched_item)
            elif isinstance(item, RetrievalResult):
                evidence = self._generate_evidence_link(item.id, item.payload)
                enriched.append({
                    "id": item.id,
                    "content": item.content,
                    "score": item.score,
                    "modality": item.modality.value,
                    "payload": item.payload,
                    "evidence_links": evidence,
                    "has_evidence": len(evidence) > 0
                })
            else:
                enriched.append(item)

        return NodeResult(
            node_id=self.node_id,
            node_type=self.node_type,
            data=enriched,
            metadata={"with_evidence": sum(1 for e in enriched if isinstance(e, dict) and e.get("has_evidence", False))}
        )
bioflow/core/orchestrator.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow Orchestrator
3
+ =====================
4
+
5
+ Stateful pipeline engine that manages the flow of data through
6
+ registered tools, forming a Directed Acyclic Graph (DAG) of operations.
7
+ """
8
+
9
+ import logging
10
+ from typing import Dict, List, Any, Optional, Callable
11
+ from dataclasses import dataclass, field
12
+ from datetime import datetime
13
+ from collections import defaultdict
14
+
15
+ from typing import Optional as OptionalType
16
+ from bioflow.core.base import BioEncoder, BioPredictor, BioGenerator, Modality
17
+ from bioflow.core.config import NodeConfig, WorkflowConfig, NodeType
18
+ from bioflow.core.registry import ToolRegistry
19
+
20
+ # Re-import Optional with a different name to avoid conflicts
21
+ from typing import Optional
22
+
23
+ logging.basicConfig(level=logging.INFO)
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
@dataclass
class ExecutionContext:
    """Mutable state threaded through a single pipeline run."""
    workflow_id: str                                        # which workflow is running
    start_time: datetime = field(default_factory=datetime.now)
    node_outputs: Dict[str, Any] = field(default_factory=dict)  # node_id -> output
    metadata: Dict[str, Any] = field(default_factory=dict)
    errors: List[str] = field(default_factory=list)         # accumulated error messages

    def get_input(self, node_id: str) -> Any:
        """Return the stored output of ``node_id`` (None if absent)."""
        return self.node_outputs.get(node_id)

    def set_output(self, node_id: str, value: Any):
        """Record ``value`` as the output of ``node_id``."""
        self.node_outputs[node_id] = value
43
+
44
+
45
@dataclass
class PipelineResult:
    """Final result of workflow execution."""
    success: bool              # True when the run completed without fatal error
    output: Any                # pipeline output (presumably the final node's output — confirm in the run method)
    context: ExecutionContext  # full execution context (per-node outputs, errors)
    duration_ms: float         # wall-clock duration in milliseconds

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; node outputs are stringified and truncated to 100 chars."""
        return {
            "success": self.success,
            "output": self.output,
            "duration_ms": self.duration_ms,
            "errors": self.context.errors,
            "node_outputs": {k: str(v)[:100] for k, v in self.context.node_outputs.items()}
        }
61
+
62
+
63
+ class BioFlowOrchestrator:
64
+ """
65
+ Main orchestration engine for BioFlow pipelines.
66
+
67
+ Responsibilities:
68
+ - Parse workflow configurations
69
+ - Build execution DAG from node dependencies
70
+ - Execute nodes in topological order
71
+ - Manage state between nodes
72
+ - Handle errors and retries
73
+
74
+ Example:
75
+ >>> orchestrator = BioFlowOrchestrator()
76
+ >>> orchestrator.register_workflow(workflow_config)
77
+ >>> result = orchestrator.run("my_workflow", input_data)
78
+ """
79
+
80
    def __init__(self, registry: Optional[ToolRegistry] = None):
        """
        Initialize orchestrator.

        Args:
            registry: Tool registry instance. Uses global if None.

        NOTE(review): when ``registry`` is None this stores the ToolRegistry
        *class* itself, not an instance — subsequent calls such as
        ``self.registry.get_encoder(...)`` therefore assume those lookups
        work at class level; confirm against bioflow.core.registry.
        """
        self.registry = registry if registry is not None else ToolRegistry
        self.workflows: Dict[str, WorkflowConfig] = {}
        self.custom_handlers: Dict[str, Callable] = {}
        self._retriever = None  # Qdrant manager reference (set via set_retriever)
91
+
92
    def set_retriever(self, retriever):
        """Set the vector DB retriever (QdrantManager) used by RETRIEVE nodes."""
        self._retriever = retriever
95
+
96
    def register_workflow(self, config: WorkflowConfig) -> None:
        """Register a workflow configuration, keyed by its name (overwrites any existing entry)."""
        self.workflows[config.name] = config
        logger.info(f"Registered workflow: {config.name} ({len(config.nodes)} nodes)")
100
+
101
    def register_custom_handler(self, name: str, handler: Callable) -> None:
        """Register a custom node handler function under ``name`` (overwrites any existing handler)."""
        self.custom_handlers[name] = handler
        logger.info(f"Registered custom handler: {name}")
105
+
106
+ def _build_execution_order(self, config: WorkflowConfig) -> List[NodeConfig]:
107
+ """
108
+ Build topological execution order from node dependencies.
109
+
110
+ Returns nodes sorted so dependencies are executed first.
111
+ """
112
+ # Build adjacency list
113
+ in_degree = defaultdict(int)
114
+ dependents = defaultdict(list)
115
+ node_map = {node.id: node for node in config.nodes}
116
+
117
+ for node in config.nodes:
118
+ for dep in node.inputs:
119
+ if dep != "input" and dep in node_map:
120
+ dependents[dep].append(node.id)
121
+ in_degree[node.id] += 1
122
+
123
+ # Kahn's algorithm for topological sort
124
+ queue = [n.id for n in config.nodes if in_degree[n.id] == 0]
125
+ order = []
126
+
127
+ while queue:
128
+ node_id = queue.pop(0)
129
+ order.append(node_map[node_id])
130
+ for dependent in dependents[node_id]:
131
+ in_degree[dependent] -= 1
132
+ if in_degree[dependent] == 0:
133
+ queue.append(dependent)
134
+
135
+ if len(order) != len(config.nodes):
136
+ raise ValueError("Cycle detected in workflow DAG")
137
+
138
+ return order
139
+
140
+ def _execute_node(
141
+ self,
142
+ node: NodeConfig,
143
+ context: ExecutionContext,
144
+ initial_input: Any
145
+ ) -> Any:
146
+ """Execute a single node and return its output."""
147
+
148
+ # Gather inputs
149
+ inputs = []
150
+ for inp in node.inputs:
151
+ if inp == "input":
152
+ inputs.append(initial_input)
153
+ else:
154
+ inputs.append(context.get_input(inp))
155
+
156
+ # Single input case
157
+ node_input = inputs[0] if len(inputs) == 1 else inputs
158
+
159
+ logger.debug(f"Executing node: {node.id} (type={node.type.value})")
160
+
161
+ try:
162
+ if node.type == NodeType.ENCODE:
163
+ encoder = self.registry.get_encoder(node.tool)
164
+ modality = Modality(node.params.get("modality", "text"))
165
+ return encoder.encode(node_input, modality)
166
+
167
+ elif node.type == NodeType.PREDICT:
168
+ predictor = self.registry.get_predictor(node.tool)
169
+ drug: str = str(node.params.get("drug") or node_input)
170
+ target: str = str(node.params.get("target") or node.params.get("target_input") or "")
171
+ return predictor.predict(drug, target)
172
+
173
+ elif node.type == NodeType.RETRIEVE:
174
+ if self._retriever is None:
175
+ raise ValueError("No retriever configured. Call set_retriever() first.")
176
+ limit = node.params.get("limit", 5)
177
+ modality = node.params.get("modality", "text")
178
+ return self._retriever.search(
179
+ query=node_input,
180
+ query_modality=modality,
181
+ limit=limit
182
+ )
183
+
184
+ elif node.type == NodeType.GENERATE:
185
+ generator = self.registry.get_generator(node.tool)
186
+ constraints = node.params.get("constraints", {})
187
+ return generator.generate(node_input, constraints)
188
+
189
+ elif node.type == NodeType.FILTER:
190
+ # Built-in filter: expects list, applies threshold
191
+ threshold = node.params.get("threshold", 0.5)
192
+ key = node.params.get("key", "score")
193
+ if isinstance(node_input, list):
194
+ return [x for x in node_input if getattr(x, key, x.get(key, 0)) >= threshold]
195
+ return node_input
196
+
197
+ elif node.type == NodeType.CUSTOM:
198
+ if node.tool not in self.custom_handlers:
199
+ raise ValueError(f"Custom handler '{node.tool}' not registered")
200
+ handler = self.custom_handlers[node.tool]
201
+ return handler(node_input, **node.params)
202
+
203
+ else:
204
+ raise ValueError(f"Unknown node type: {node.type}")
205
+
206
+ except Exception as e:
207
+ context.errors.append(f"Node {node.id}: {str(e)}")
208
+ logger.error(f"Error in node {node.id}: {e}")
209
+ raise
210
+
211
+ def run(
212
+ self,
213
+ workflow_name: str,
214
+ input_data: Any,
215
+ metadata: Optional[Dict[str, Any]] = None
216
+ ) -> PipelineResult:
217
+ """
218
+ Execute a registered workflow.
219
+
220
+ Args:
221
+ workflow_name: Name of registered workflow
222
+ input_data: Initial input to the pipeline
223
+ metadata: Optional metadata to include in context
224
+
225
+ Returns:
226
+ PipelineResult with output and execution details
227
+ """
228
+ if workflow_name not in self.workflows:
229
+ raise ValueError(f"Workflow '{workflow_name}' not found")
230
+
231
+ config = self.workflows[workflow_name]
232
+ context = ExecutionContext(
233
+ workflow_id=workflow_name,
234
+ metadata=metadata or {}
235
+ )
236
+
237
+ start = datetime.now()
238
+
239
+ try:
240
+ # Get execution order
241
+ execution_order = self._build_execution_order(config)
242
+
243
+ # Execute each node
244
+ for node in execution_order:
245
+ output = self._execute_node(node, context, input_data)
246
+ context.set_output(node.id, output)
247
+
248
+ # Get final output
249
+ final_output = context.get_input(config.output_node) if config.output_node else output
250
+
251
+ duration = (datetime.now() - start).total_seconds() * 1000
252
+
253
+ return PipelineResult(
254
+ success=True,
255
+ output=final_output,
256
+ context=context,
257
+ duration_ms=duration
258
+ )
259
+
260
+ except Exception as e:
261
+ duration = (datetime.now() - start).total_seconds() * 1000
262
+ logger.error(f"Workflow {workflow_name} failed: {e}")
263
+
264
+ return PipelineResult(
265
+ success=False,
266
+ output=None,
267
+ context=context,
268
+ duration_ms=duration
269
+ )
270
+
271
+ def run_from_dict(
272
+ self,
273
+ workflow_dict: Dict[str, Any],
274
+ input_data: Any
275
+ ) -> PipelineResult:
276
+ """
277
+ Execute a workflow from a dictionary (e.g., loaded YAML).
278
+
279
+ Useful for ad-hoc workflows without pre-registration.
280
+ """
281
+ config = WorkflowConfig.from_dict(workflow_dict)
282
+ self.register_workflow(config)
283
+ return self.run(config.name, input_data)
284
+
285
+ def list_workflows(self) -> List[str]:
286
+ """List all registered workflows."""
287
+ return list(self.workflows.keys())
288
+
289
+ def describe_workflow(self, name: str) -> Dict[str, Any]:
290
+ """Get details about a workflow."""
291
+ if name not in self.workflows:
292
+ raise ValueError(f"Workflow '{name}' not found")
293
+
294
+ config = self.workflows[name]
295
+ return {
296
+ "name": config.name,
297
+ "description": config.description,
298
+ "nodes": [
299
+ {"id": n.id, "type": n.type.value, "tool": n.tool}
300
+ for n in config.nodes
301
+ ],
302
+ "output_node": config.output_node
303
+ }
bioflow/core/registry.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow Tool Registry
3
+ ======================
4
+
5
+ Central registry for all biological tools in the BioFlow platform.
6
+ Supports encoders, predictors, generators, and misc tools.
7
+ """
8
+
9
+ from typing import Dict, Type, Any, Optional, List
10
+ import logging
11
+
12
+ from bioflow.core.base import BioEncoder, BioPredictor, BioGenerator, BioTool
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class ToolRegistry:
    """
    Central registry for all biological tools in the BioFlow platform.

    All state lives on the class itself (classmethods + class attributes),
    so the registry acts as a process-wide singleton.

    Features:
    - Register/unregister tools by name
    - Get tools with fallback to default
    - List all registered tools
    - Auto-discovery of tools from plugins directory

    Usage:
        >>> ToolRegistry.register_encoder("esm2", ESM2Encoder())
        >>> encoder = ToolRegistry.get_encoder("esm2")
    """

    _encoders: Dict[str, BioEncoder] = {}
    _predictors: Dict[str, BioPredictor] = {}
    _generators: Dict[str, BioGenerator] = {}
    _misc_tools: Dict[str, BioTool] = {}
    _default_encoder: Optional[str] = None
    _default_predictor: Optional[str] = None

    # ==================== ENCODERS ====================

    @classmethod
    def register_encoder(cls, name: str, encoder: BioEncoder, set_default: bool = False):
        """Register an encoder; the first one registered becomes the default."""
        cls._encoders[name] = encoder
        if cls._default_encoder is None or set_default:
            cls._default_encoder = name
        logger.info(f"Registered encoder: {name} (dim={encoder.dimension})")

    @classmethod
    def unregister_encoder(cls, name: str):
        """Remove an encoder; if it was the default, promote any remaining one."""
        cls._encoders.pop(name, None)
        if cls._default_encoder == name:
            cls._default_encoder = next(iter(cls._encoders), None)

    @classmethod
    def get_encoder(cls, name: str = None) -> BioEncoder:
        """Return the encoder registered as *name*, or the default when falsy."""
        chosen = name or cls._default_encoder
        if chosen in cls._encoders:
            return cls._encoders[chosen]
        available = list(cls._encoders.keys())
        raise ValueError(f"Encoder '{chosen}' not found. Available: {available}")

    # ==================== PREDICTORS ====================

    @classmethod
    def register_predictor(cls, name: str, predictor: BioPredictor, set_default: bool = False):
        """Register a predictor; the first one registered becomes the default."""
        cls._predictors[name] = predictor
        if cls._default_predictor is None or set_default:
            cls._default_predictor = name
        logger.info(f"Registered predictor: {name}")

    @classmethod
    def unregister_predictor(cls, name: str):
        """Remove a predictor; if it was the default, promote any remaining one."""
        cls._predictors.pop(name, None)
        if cls._default_predictor == name:
            cls._default_predictor = next(iter(cls._predictors), None)

    @classmethod
    def get_predictor(cls, name: str = None) -> BioPredictor:
        """Return the predictor registered as *name*, or the default when falsy."""
        chosen = name or cls._default_predictor
        if chosen in cls._predictors:
            return cls._predictors[chosen]
        available = list(cls._predictors.keys())
        raise ValueError(f"Predictor '{chosen}' not found. Available: {available}")

    # ==================== GENERATORS ====================

    @classmethod
    def register_generator(cls, name: str, generator: BioGenerator):
        """Register a generator (no default-generator concept)."""
        cls._generators[name] = generator
        logger.info(f"Registered generator: {name}")

    @classmethod
    def get_generator(cls, name: str) -> BioGenerator:
        """Return the generator registered as *name*."""
        if name in cls._generators:
            return cls._generators[name]
        available = list(cls._generators.keys())
        raise ValueError(f"Generator '{name}' not found. Available: {available}")

    # ==================== MISC TOOLS ====================

    @classmethod
    def register_tool(cls, name: str, tool: BioTool):
        """Register a miscellaneous tool."""
        cls._misc_tools[name] = tool
        logger.info(f"Registered tool: {name}")

    @classmethod
    def get_tool(cls, name: str) -> BioTool:
        """Return the miscellaneous tool registered as *name*."""
        if name in cls._misc_tools:
            return cls._misc_tools[name]
        available = list(cls._misc_tools.keys())
        raise ValueError(f"Tool '{name}' not found. Available: {available}")

    # ==================== UTILITIES ====================

    @classmethod
    def list_tools(cls) -> Dict[str, List[str]]:
        """Map each tool category to the names registered under it."""
        return {
            "encoders": [*cls._encoders],
            "predictors": [*cls._predictors],
            "generators": [*cls._generators],
            "tools": [*cls._misc_tools],
        }

    @classmethod
    def clear(cls):
        """Drop every registered tool and reset defaults (useful for testing)."""
        for bucket in (cls._encoders, cls._predictors, cls._generators, cls._misc_tools):
            bucket.clear()
        cls._default_encoder = None
        cls._default_predictor = None

    @classmethod
    def summary(cls) -> str:
        """Human-readable, one-line-per-category summary of the registry."""
        lines = ["BioFlow Tool Registry:"]
        for category, names in cls.list_tools().items():
            joined = ', '.join(names) if names else '(none)'
            lines.append(f" {category}: {joined}")
        return "\n".join(lines)
bioflow/demo.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow Demo Script - Test all capabilities
3
+ =============================================
4
+
5
+ This script demonstrates all major features of the BioFlow system.
6
+ Run this to verify your installation and see the system in action.
7
+
8
+ Usage:
9
+ python bioflow/demo.py
10
+ """
11
+
12
+ import os
13
+ import sys
14
+ import numpy as np
15
+ from pprint import pprint
16
+
17
+ # Add project root to path
18
+ ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
19
+ sys.path.insert(0, ROOT_DIR)
20
+
21
def print_header(title: str):
    """Print *title* between two 60-character '=' rules, padded by blank lines."""
    bar = "=" * 60
    print(f"\n{bar}\n {title}\n{bar}\n")
26
+
27
+
28
def demo_obm_encoding():
    """
    Demonstrate OBM encoding capabilities.

    Encodes text, SMILES and a protein fragment in mock mode (deterministic
    random vectors, no GPU), then shows cross-modal similarity.

    Returns:
        The mock-mode OBMWrapper instance, reused by the later demos.
    """
    print_header("🧬 OBM Multimodal Encoding")

    from bioflow.obm_wrapper import OBMWrapper, ModalityType

    # Initialize with mock mode (no GPU needed)
    obm = OBMWrapper(use_mock=True)
    print(f"✅ OBM initialized in Mock mode")
    print(f" Vector dimension: {obm.vector_dim}")
    print(f" Device: {obm.device}")

    # Encode text
    print("\n📝 Encoding Text:")
    texts = [
        "KRAS is a protein involved in cell signaling",
        "Aspirin is used to reduce inflammation"
    ]
    text_embeddings = obm.encode_text(texts)
    for emb in text_embeddings:
        print(f" [{emb.modality.value}] dim={emb.dimension}, hash={emb.content_hash}")
        print(f" Content: {emb.content[:50]}...")

    # Encode SMILES
    print("\n🧪 Encoding SMILES:")
    smiles_list = [
        "CC(=O)OC1=CC=CC=C1C(=O)O",  # Aspirin
        "CCO",  # Ethanol
        "c1ccccc1"  # Benzene
    ]
    mol_embeddings = obm.encode_smiles(smiles_list)
    for emb in mol_embeddings:
        print(f" [{emb.modality.value}] {emb.content} → dim={emb.dimension}")

    # Encode proteins
    print("\n🔬 Encoding Proteins:")
    proteins = [
        "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGET",  # KRAS fragment
    ]
    prot_embeddings = obm.encode_protein(proteins)
    for emb in prot_embeddings:
        print(f" [{emb.modality.value}] {emb.content[:30]}... → dim={emb.dimension}")

    # Cross-modal similarity: rank the SMILES list against a text query.
    print("\n🔄 Cross-Modal Similarity (Text → Molecules):")
    similarities = obm.cross_modal_similarity(
        query="anti-inflammatory drug",
        query_modality="text",
        targets=smiles_list,
        target_modality="smiles"
    )
    for content, score in similarities:
        print(f" {score:.4f} | {content}")

    return obm
83
+
84
+
85
def demo_qdrant_manager(obm):
    """
    Demonstrate Qdrant vector storage.

    Creates an in-memory collection, ingests a small multimodal sample set
    (text / SMILES / protein), then runs plain, cross-modal and diversity
    queries against it.

    Args:
        obm: An initialized OBMWrapper used for embedding the data.

    Returns:
        The populated QdrantManager, reused by the pipeline demo.
    """
    print_header("📦 Qdrant Vector Storage")

    from bioflow.qdrant_manager import QdrantManager

    # Initialize with in-memory storage
    qdrant = QdrantManager(obm, default_collection="demo_collection")
    print(f"✅ Qdrant Manager initialized (in-memory)")

    # Create collection (recreate=True drops any previous contents)
    qdrant.create_collection(recreate=True)
    print(f" Collection created: demo_collection")

    # Ingest sample data
    print("\n📥 Ingesting Sample Data:")
    sample_data = [
        {"content": "Aspirin is used to reduce fever and relieve mild to moderate pain", "modality": "text", "source": "PubMed:001", "tags": ["pain", "fever"]},
        {"content": "CC(=O)OC1=CC=CC=C1C(=O)O", "modality": "smiles", "source": "ChEMBL", "tags": ["aspirin", "nsaid"]},
        {"content": "Ibuprofen is a nonsteroidal anti-inflammatory drug", "modality": "text", "source": "PubMed:002", "tags": ["nsaid"]},
        {"content": "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", "modality": "smiles", "source": "ChEMBL", "tags": ["ibuprofen"]},
        {"content": "KRAS mutations are found in many cancers", "modality": "text", "source": "PubMed:003", "tags": ["cancer", "KRAS"]},
        {"content": "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGET", "modality": "protein", "source": "UniProt:P01116", "tags": ["KRAS"]},
    ]

    stats = qdrant.ingest(sample_data)
    print(f" Ingestion stats: {stats}")

    # Collection info
    info = qdrant.get_collection_info()
    print(f"\n📊 Collection Info:")
    for k, v in info.items():
        print(f" {k}: {v}")

    # Search (same-modality text query)
    print("\n🔍 Searching for 'anti-inflammatory':")
    results = qdrant.search(
        query="anti-inflammatory medicine",
        query_modality="text",
        limit=3
    )
    for r in results:
        print(f" {r.score:.4f} | [{r.modality}] {r.content[:50]}...")

    # Cross-modal search: text query restricted to SMILES results
    print("\n🔄 Cross-Modal Search (Text → Molecules):")
    results = qdrant.cross_modal_search(
        query="pain relief medication",
        query_modality="text",
        target_modality="smiles",
        limit=3
    )
    for r in results:
        print(f" {r.score:.4f} | {r.content}")

    # Diversity analysis over the query's k nearest neighbors
    print("\n📈 Neighbors Diversity Analysis:")
    diversity = qdrant.get_neighbors_diversity(
        query="cancer treatment",
        query_modality="text",
        k=5
    )
    for k, v in diversity.items():
        if isinstance(v, float):
            print(f" {k}: {v:.4f}")
        else:
            print(f" {k}: {v}")

    return qdrant
154
+
155
+
156
def demo_pipeline(obm, qdrant):
    """
    Demonstrate the pipeline system.

    Registers a miner and a validator agent on a BioFlowPipeline and runs a
    text→SMILES discovery workflow against the demo collection, printing the
    literature, molecule and diversity stages of the result.

    Args:
        obm: Initialized OBMWrapper (encoder).
        qdrant: QdrantManager populated by demo_qdrant_manager().
    """
    print_header("🔬 BioFlow Pipeline")

    from bioflow.pipeline import BioFlowPipeline, MinerAgent, ValidatorAgent

    # Initialize pipeline
    pipeline = BioFlowPipeline(obm, qdrant)
    print("✅ Pipeline initialized")

    # Register agents (both operate on the demo collection)
    miner = MinerAgent(obm, qdrant, "demo_collection")
    validator = ValidatorAgent(obm, qdrant, "demo_collection")

    pipeline.register_agent(miner)
    pipeline.register_agent(validator)
    print(f" Registered agents: {list(pipeline.agents.keys())}")

    # Run discovery workflow
    print("\n🚀 Running Discovery Workflow:")
    print(" Query: 'anti-inflammatory drug for pain'")

    results = pipeline.run_discovery_workflow(
        query="anti-inflammatory drug for pain",
        query_modality="text",
        target_modality="smiles"
    )

    # Only the top 3 entries of each stage are printed below.
    print("\n 📚 Literature Results:")
    for item in results["stages"].get("literature", [])[:3]:
        print(f" {item['score']:.4f} | {item['content'][:40]}...")

    print("\n 🧪 Molecule Results:")
    for item in results["stages"].get("molecules", [])[:3]:
        print(f" {item['score']:.4f} | {item['content']}")

    print("\n 📊 Diversity:")
    div = results["stages"].get("diversity", {})
    print(f" Mean similarity: {div.get('mean_similarity', 0):.4f}")
    print(f" Diversity score: {div.get('diversity_score', 0):.4f}")
196
+
197
+
198
def demo_visualization():
    """
    Demonstrate visualization capabilities.

    Runs a PCA reduction on random sample embeddings and lists the available
    plotting helpers. Degrades gracefully (prints install hint) when optional
    plotting dependencies are missing.
    """
    print_header("📊 Visualization Capabilities")

    try:
        from bioflow.visualizer import EmbeddingVisualizer, ResultsVisualizer
        print("✅ Visualization module loaded")

        # Generate sample embeddings (random; just to exercise the reducer)
        n_samples = 20
        embeddings = np.random.randn(n_samples, 768)
        labels = [f"Sample {i}" for i in range(n_samples)]
        colors = ["text"] * 7 + ["smiles"] * 7 + ["protein"] * 6

        # Dimensionality reduction
        print("\n🔻 Dimensionality Reduction:")
        reduced = EmbeddingVisualizer.reduce_dimensions(embeddings, method="pca", n_components=2)
        print(f" Original shape: {embeddings.shape}")
        print(f" Reduced shape: {reduced.shape}")

        # Note about plots
        print("\n📈 Plotting Functions Available:")
        print(" - plot_embeddings_2d(embeddings, labels, colors)")
        print(" - plot_embeddings_3d(embeddings, labels)")
        print(" - plot_similarity_matrix(embeddings, labels)")
        print(" - create_dashboard(results, embeddings)")
        print("\n Run the Streamlit app to see interactive visualizations!")

    except ImportError as e:
        # Optional deps (plotly, scikit-learn) may be absent; demo continues.
        print(f"⚠️ Some visualization dependencies missing: {e}")
        print(" Install with: pip install plotly scikit-learn")
229
+
230
+
231
def main():
    """
    Run all demos in order (encoding → storage → pipeline → visualization)
    and print follow-up instructions. Runs entirely in mock mode.
    """
    print("\n" + "🧬" * 20)
    print(" BIOFLOW + OBM DEMO")
    print("🧬" * 20)

    print("\nThis demo runs in MOCK mode (no GPU/model required).")
    print("Embeddings are deterministic random vectors for testing.\n")

    # Run demos; each stage feeds its result into the next.
    obm = demo_obm_encoding()
    qdrant = demo_qdrant_manager(obm)
    demo_pipeline(obm, qdrant)
    demo_visualization()

    print_header("✅ Demo Complete!")
    print("Next steps:")
    print(" 1. Run the Streamlit interface:")
    print(" streamlit run bioflow/app.py")
    print("")
    print(" 2. For real embeddings, set use_mock=False and ensure:")
    print(" - BioMedGPT checkpoints are downloaded")
    print(" - GPU is available")
    print("")
    print(" 3. Read the documentation:")
    print(" docs/BIOFLOW_OBM_REPORT.md")
    print("")
258
+
259
+
260
# Script entry point: run every demo in sequence.
if __name__ == "__main__":
    main()
bioflow/obm_wrapper.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OBM Wrapper - Unified Multimodal Encoding Interface
3
+ =====================================================
4
+
5
+ This module provides a clean, high-level API for encoding biological data
6
+ (text, molecules, proteins) into a unified vector space using open-source models.
7
+ """
8
+
9
+ import os
10
+ import sys
11
+ import torch
12
+ import numpy as np
13
+ import logging
14
+ from typing import List, Union, Dict, Any, Optional, Tuple
15
+ from dataclasses import dataclass
16
+ from enum import Enum
17
+ import hashlib
18
+
19
+ # Add project root to path
20
+ ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
21
+ sys.path.insert(0, ROOT_DIR)
22
+
23
+ logging.basicConfig(level=logging.INFO)
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
class ModalityType(Enum):
    """Supported data modalities."""
    TEXT = "text"          # free text (abstracts, descriptions, notes)
    MOLECULE = "molecule"  # molecule embeddings (produced from SMILES input)
    SMILES = "smiles"      # raw SMILES strings; routed to the same encoder as MOLECULE
    PROTEIN = "protein"    # amino-acid sequences
    CELL = "cell"          # declared but not handled by OBMWrapper.encode()
34
+
35
+
36
@dataclass
class EmbeddingResult:
    """An embedding vector plus the metadata needed to store and trace it."""
    vector: np.ndarray      # the embedding itself
    modality: ModalityType  # which encoder produced it
    content: str            # (possibly truncated) source content
    content_hash: str       # short hash of the full content, for dedup
    dimension: int          # length of `vector`

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to plain Python types (vector → list) for JSON/storage."""
        payload = {
            "vector": self.vector.tolist(),
            "modality": self.modality.value,
            "content": self.content,
            "content_hash": self.content_hash,
            "dimension": self.dimension,
        }
        return payload
53
+
54
+
55
class OBMWrapper:
    """
    Unified wrapper for OpenBioMed multimodal encoding.

    This class provides a clean API for encoding biological data into
    a shared embedding space, enabling cross-modal similarity search.

    NOTE(review): ``_init_model`` is currently a placeholder that leaves
    ``_model`` as None, so the non-mock code paths below will fail with an
    AttributeError until a real model is wired in; mock mode is the only
    fully functional mode in this revision.

    Attributes:
        device: Computing device ('cuda' or 'cpu')
        model: Underlying open-source model
        vector_dim: Dimension of output embeddings
    """

    def __init__(
        self,
        device: Optional[str] = None,
        config_path: Optional[str] = None,
        checkpoint_path: Optional[str] = None,
        use_mock: bool = False
    ):
        """
        Initialize the OBM wrapper.

        Args:
            device: 'cuda' or 'cpu'. Auto-detects if None.
            config_path: Path to open-source model config YAML.
            checkpoint_path: Path to model weights.
            use_mock: If True, uses mock embeddings (for testing without GPU).
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.use_mock = use_mock
        self._model = None
        self._vector_dim = 768  # Default, updated after model load

        if config_path is None:
            config_path = os.path.join(ROOT_DIR, "configs/model/opensource_model.yaml")

        self.config_path = config_path
        self.checkpoint_path = checkpoint_path

        if not use_mock:
            self._init_model()
        else:
            logger.info("Using MOCK mode - embeddings are random vectors for testing")
            self._vector_dim = 768

    def _init_model(self):
        """
        Initialize the open-source model.

        Currently a stub: the model is never actually loaded and ``_model``
        stays None. On any failure the wrapper falls back to mock mode.
        """
        try:
            # Placeholder for initializing open-source model
            pass

            self._model = None

            self._vector_dim = 768

            logger.info(f"OBM initialized. Device: {self.device}, Vector dim: {self._vector_dim}")

        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            logger.warning("Falling back to MOCK mode")
            self.use_mock = True
            self._vector_dim = 768

    @property
    def vector_dim(self) -> int:
        """Return the embedding dimension."""
        return self._vector_dim

    @property
    def is_ready(self) -> bool:
        """Check if model is loaded and ready (mock mode is always ready)."""
        return self._model is not None or self.use_mock

    def _compute_hash(self, content: str) -> str:
        """Compute content hash for deduplication (md5, first 16 hex chars; non-cryptographic use)."""
        return hashlib.md5(content.encode()).hexdigest()[:16]

    def _mock_embed(self, content: str, modality: ModalityType) -> np.ndarray:
        """
        Generate deterministic mock embedding based on content hash.

        The same content always maps to the same unit vector; ``modality``
        is accepted for interface symmetry but does not affect the output.
        """
        seed = int(self._compute_hash(content), 16) % (2**32)
        rng = np.random.RandomState(seed)
        vec = rng.randn(self._vector_dim).astype(np.float32)
        # Normalize to unit length so cosine similarity equals dot product.
        vec = vec / np.linalg.norm(vec)
        return vec

    @torch.no_grad()
    def encode_text(self, text: Union[str, List[str]]) -> List[EmbeddingResult]:
        """
        Encode text (abstracts, descriptions, notes) into embeddings.

        Args:
            text: Single string or list of strings.

        Returns:
            List of EmbeddingResult objects (content truncated to 200 chars).
        """
        if isinstance(text, str):
            text = [text]

        results = []

        if self.use_mock:
            for t in text:
                vec = self._mock_embed(t, ModalityType.TEXT)
                results.append(EmbeddingResult(
                    vector=vec,
                    modality=ModalityType.TEXT,
                    content=t[:200],  # Truncate for storage
                    content_hash=self._compute_hash(t),
                    dimension=self._vector_dim
                ))
        else:
            # NOTE(review): requires a loaded model with llm_tokenizer/llm;
            # _model is None in this revision, so this path currently fails.
            tokenizer = self._model.llm_tokenizer
            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            outputs = self._model.llm(**inputs, output_hidden_states=True)
            hidden = outputs.hidden_states[-1]

            # Attention-masked mean pooling over token positions.
            mask = inputs['attention_mask'].unsqueeze(-1).float()
            pooled = (hidden * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
            vectors = pooled.cpu().numpy()

            for i, t in enumerate(text):
                results.append(EmbeddingResult(
                    vector=vectors[i],
                    modality=ModalityType.TEXT,
                    content=t[:200],
                    content_hash=self._compute_hash(t),
                    dimension=self._vector_dim
                ))

        return results

    @torch.no_grad()
    def encode_smiles(self, smiles: Union[str, List[str]]) -> List[EmbeddingResult]:
        """
        Encode SMILES molecular representations into embeddings.

        Args:
            smiles: Single SMILES string or list of SMILES.

        Returns:
            List of EmbeddingResult objects (modality is MOLECULE).
        """
        if isinstance(smiles, str):
            smiles = [smiles]

        results = []

        if self.use_mock:
            for s in smiles:
                vec = self._mock_embed(s, ModalityType.MOLECULE)
                results.append(EmbeddingResult(
                    vector=vec,
                    modality=ModalityType.MOLECULE,
                    content=s,
                    content_hash=self._compute_hash(s),
                    dimension=self._vector_dim
                ))
        else:
            # NOTE(review): depends on open_biomed + torch_scatter and a
            # loaded model; unreachable while _model is None.
            from open_biomed.data import Molecule
            from torch_scatter import scatter_mean

            molecules = [Molecule.from_smiles(s) for s in smiles]
            mol_feats = [self._model.featurizer.molecule_featurizer(m) for m in molecules]
            collated = self._model.collator.molecule_collator(mol_feats).to(self.device)

            # Mean-pool per-node features into one vector per molecule.
            node_feats = self._model.mol_structure_encoder(collated)
            proj_feats = self._model.proj_mol(node_feats)
            vectors = scatter_mean(proj_feats, collated.batch, dim=0).cpu().numpy()

            for i, s in enumerate(smiles):
                results.append(EmbeddingResult(
                    vector=vectors[i],
                    modality=ModalityType.MOLECULE,
                    content=s,
                    content_hash=self._compute_hash(s),
                    dimension=self._vector_dim
                ))

        return results

    @torch.no_grad()
    def encode_protein(self, sequences: Union[str, List[str]]) -> List[EmbeddingResult]:
        """
        Encode protein sequences (FASTA format) into embeddings.

        Args:
            sequences: Single sequence or list of sequences.

        Returns:
            List of EmbeddingResult objects (content truncated past 100 chars).
        """
        if isinstance(sequences, str):
            sequences = [sequences]

        results = []

        if self.use_mock:
            for seq in sequences:
                vec = self._mock_embed(seq, ModalityType.PROTEIN)
                results.append(EmbeddingResult(
                    vector=vec,
                    modality=ModalityType.PROTEIN,
                    content=seq[:100] + "..." if len(seq) > 100 else seq,
                    content_hash=self._compute_hash(seq),
                    dimension=self._vector_dim
                ))
        else:
            # NOTE(review): requires prot_tokenizer / prot_structure_encoder
            # on a loaded model; unreachable while _model is None.
            tokenizer = self._model.prot_tokenizer
            inputs = tokenizer(sequences, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            outputs = self._model.prot_structure_encoder(**inputs)
            hidden = outputs.last_hidden_state
            proj = self._model.proj_prot(hidden)

            # Attention-masked mean pooling over residues.
            mask = inputs['attention_mask'].unsqueeze(-1).float()
            pooled = (proj * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
            vectors = pooled.cpu().numpy()

            for i, seq in enumerate(sequences):
                results.append(EmbeddingResult(
                    vector=vectors[i],
                    modality=ModalityType.PROTEIN,
                    content=seq[:100] + "..." if len(seq) > 100 else seq,
                    content_hash=self._compute_hash(seq),
                    dimension=self._vector_dim
                ))

        return results

    def encode(self, content: str, modality: Union[str, ModalityType]) -> EmbeddingResult:
        """
        Universal encoding function.

        Args:
            content: The content to encode.
            modality: Type of content ('text', 'smiles', 'molecule', 'protein').

        Returns:
            Single EmbeddingResult.

        Raises:
            ValueError: For an unknown modality string or for CELL, which has
                no encoder here.
        """
        if isinstance(modality, str):
            modality = ModalityType(modality.lower())

        if modality in [ModalityType.TEXT]:
            return self.encode_text(content)[0]
        elif modality in [ModalityType.MOLECULE, ModalityType.SMILES]:
            return self.encode_smiles(content)[0]
        elif modality == ModalityType.PROTEIN:
            return self.encode_protein(content)[0]
        else:
            raise ValueError(f"Unsupported modality: {modality}")

    def compute_similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
        """Compute cosine similarity between two embeddings (0.0 for zero vectors)."""
        norm1 = np.linalg.norm(embedding1)
        norm2 = np.linalg.norm(embedding2)
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return float(np.dot(embedding1, embedding2) / (norm1 * norm2))

    def cross_modal_similarity(
        self,
        query: str,
        query_modality: str,
        targets: List[str],
        target_modality: str
    ) -> List[Tuple[str, float]]:
        """
        Compute cross-modal similarities.

        Args:
            query: Query content.
            query_modality: Modality of query.
            targets: List of target contents.
            target_modality: Modality of targets. An unrecognized value
                yields an empty result (no error raised).

        Returns:
            List of (target_content, similarity_score) tuples, sorted by
            similarity, highest first. Note: the returned content may be the
            truncated form stored on the embedding, not the full target.
        """
        query_emb = self.encode(query, query_modality)
        target_embs = []

        if target_modality.lower() in ['text']:
            target_embs = self.encode_text(targets)
        elif target_modality.lower() in ['smiles', 'molecule']:
            target_embs = self.encode_smiles(targets)
        elif target_modality.lower() == 'protein':
            target_embs = self.encode_protein(targets)

        results = []
        for emb in target_embs:
            sim = self.compute_similarity(query_emb.vector, emb.vector)
            results.append((emb.content, sim))

        return sorted(results, key=lambda x: x[1], reverse=True)
bioflow/pipeline.py ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow Pipeline - Workflow Orchestration
3
+ ==========================================
4
+
5
+ This module provides the pipeline orchestration for BioFlow,
6
+ connecting agents, memory (Qdrant), and OBM encoders.
7
+ """
8
+
9
+ import logging
10
+ from typing import List, Dict, Any, Optional, Callable
11
+ from dataclasses import dataclass, field
12
+ from enum import Enum
13
+ from datetime import datetime
14
+ import json
15
+
16
+ from bioflow.obm_wrapper import OBMWrapper
17
+ from bioflow.qdrant_manager import QdrantManager, SearchResult
18
+
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class AgentType(Enum):
    """Roles an agent can play in the BioFlow system."""

    GENERATOR = "generator"   # Generates new molecules/variants
    VALIDATOR = "validator"   # Validates properties (toxicity, etc.)
    MINER = "miner"           # Mines literature for evidence
    RANKER = "ranker"         # Ranks candidates
    CUSTOM = "custom"         # User-defined agent type
30
+
31
+
32
@dataclass
class AgentMessage:
    """Message passed between agents.

    ``timestamp`` defaults to the creation time in ISO-8601 format.
    """
    # Name of the agent that produced this message.
    sender: str
    # Arbitrary payload (search hits, validation dict, ranked list, ...).
    content: Any
    # Agent-specific extras; merged into the pipeline context downstream.
    metadata: Dict[str, Any] = field(default_factory=dict)
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
39
+
40
+
41
@dataclass
class PipelineResult:
    """Result of a pipeline execution."""
    # False when an agent raised and the run was aborted early.
    success: bool
    # One entry per executed agent, in workflow order.
    outputs: List[Any]
    # The AgentMessage emitted by each executed agent.
    messages: List[AgentMessage]
    # Run statistics; on failure contains 'failed_at' and 'error'.
    stats: Dict[str, Any]
48
+
49
+
50
class BaseAgent:
    """Base class for all BioFlow agents.

    Stores the shared OBM encoder and Qdrant manager handles; subclasses
    override :meth:`process`.
    """

    def __init__(
        self,
        name: str,
        agent_type: AgentType,
        obm: OBMWrapper,
        qdrant: QdrantManager
    ):
        # 'name' doubles as the registry key in BioFlowPipeline.register_agent.
        self.name = name
        self.agent_type = agent_type
        self.obm = obm
        self.qdrant = qdrant

    def process(self, input_data: Any, context: Dict[str, Any] = None) -> AgentMessage:
        """Process input and return output message.

        Args:
            input_data: Agent-specific input (query string, SMILES, list, ...).
            context: Optional per-call options; semantics are subclass-defined.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError
68
+
69
+
70
class MinerAgent(BaseAgent):
    """
    Literature mining agent.

    Retrieves relevant scientific articles/abstracts based on query.
    """

    def __init__(self, obm: OBMWrapper, qdrant: QdrantManager, collection: str = None):
        super().__init__("LiteratureMiner", AgentType.MINER, obm, qdrant)
        self.collection = collection

    def process(
        self,
        input_data: str,
        context: Dict[str, Any] = None
    ) -> AgentMessage:
        """
        Search the vector store for text documents relevant to the query.

        Args:
            input_data: Query text, SMILES, or protein sequence.
            context: Optional dict with 'modality' (default "text") and
                'limit' (default 5).
        """
        opts = context or {}
        query_modality = opts.get("modality", "text")
        max_hits = opts.get("limit", 5)

        hits = self.qdrant.search(
            query=input_data,
            query_modality=query_modality,
            collection=self.collection,
            limit=max_hits,
            filter_modality="text"
        )

        best_score = hits[0].score if hits else 0
        return AgentMessage(
            sender=self.name,
            content=[hit.payload for hit in hits],
            metadata={
                "query": input_data,
                "modality": query_modality,
                "result_count": len(hits),
                "top_score": best_score
            }
        )
115
+
116
+
117
class ValidatorAgent(BaseAgent):
    """
    Validation agent.

    Checks molecules against known toxicity, drug-likeness, etc.
    """

    def __init__(self, obm: OBMWrapper, qdrant: QdrantManager, collection: str = None):
        super().__init__("Validator", AgentType.VALIDATOR, obm, qdrant)
        self.collection = collection

    def process(
        self,
        input_data: str,
        context: Dict[str, Any] = None
    ) -> AgentMessage:
        """
        Validate a molecule by comparing it against known molecules.

        Args:
            input_data: SMILES string to validate.
            context: Optional context (currently unused).
        """
        context = context or {}

        # Nearest known molecules in embedding space.
        neighbors = self.qdrant.search(
            query=input_data,
            query_modality="smiles",
            collection=self.collection,
            limit=10,
            filter_modality="smiles"
        )

        report = {
            "has_similar_known": len(neighbors) > 0,
            "max_similarity": neighbors[0].score if neighbors else 0,
            "similar_molecules": [
                {
                    "smiles": hit.content,
                    "score": hit.score,
                    "tags": hit.payload.get("tags", [])
                }
                for hit in neighbors[:3]
            ]
        }

        # A candidate inherits risk flags from neighbors whose tags mention
        # known liabilities.
        risk_tags = ["toxic", "mutagenic", "carcinogenic"]
        flagged = [
            {"molecule": hit.content, "tag": tag}
            for hit in neighbors
            for tag in hit.payload.get("tags", [])
            if any(risk in tag.lower() for risk in risk_tags)
        ]

        report["flagged_risks"] = flagged
        report["passed"] = len(flagged) == 0

        return AgentMessage(
            sender=self.name,
            content=report,
            metadata={"input_smiles": input_data}
        )
182
+
183
+
184
class RankerAgent(BaseAgent):
    """
    Ranking agent.

    Orders candidates by the sum of their per-criterion scores.
    """

    def __init__(self, obm: OBMWrapper, qdrant: QdrantManager):
        super().__init__("Ranker", AgentType.RANKER, obm, qdrant)

    def process(
        self,
        input_data: List[Dict[str, Any]],
        context: Dict[str, Any] = None
    ) -> AgentMessage:
        """
        Rank a list of candidates.

        Args:
            input_data: Candidate dicts; each may carry a 'scores' mapping
                whose values are summed to produce the ranking key.
        """
        def total_score(candidate):
            # Candidates without a 'scores' mapping rank as 0.
            return sum(candidate.get("scores", {}).values())

        ordered = sorted(input_data, key=total_score, reverse=True)

        return AgentMessage(
            sender=self.name,
            content=ordered,
            metadata={"original_count": len(input_data)}
        )
217
+
218
+
219
class BioFlowPipeline:
    """
    Main pipeline orchestrator for BioFlow.

    Agents are registered by name, ordered via :meth:`set_workflow`, and
    executed sequentially by :meth:`run`: each agent's output becomes the
    next agent's input, and its message metadata is merged into the running
    context.
    """

    def __init__(
        self,
        obm: OBMWrapper,
        qdrant: QdrantManager
    ):
        self.obm = obm
        self.qdrant = qdrant
        self.agents: Dict[str, BaseAgent] = {}
        self.workflow: List[str] = []
        self.messages: List[AgentMessage] = []

    def register_agent(self, agent: BaseAgent) -> None:
        """Register an agent with the pipeline, keyed by its name."""
        self.agents[agent.name] = agent
        logger.info(f"Registered agent: {agent.name} ({agent.agent_type.value})")

    def set_workflow(self, agent_names: List[str]) -> None:
        """
        Set the workflow order.

        Args:
            agent_names: List of agent names in execution order.

        Raises:
            ValueError: If any name has not been registered.
        """
        for name in agent_names:
            if name not in self.agents:
                raise ValueError(f"Unknown agent: {name}")
        self.workflow = agent_names

    def run(
        self,
        initial_input: Any,
        initial_context: Dict[str, Any] = None
    ) -> PipelineResult:
        """
        Execute the pipeline.

        Args:
            initial_input: Starting input data for the first agent.
            initial_context: Initial context for the first agent. It is
                copied internally, so the caller's dict is never mutated.

        Returns:
            PipelineResult with all outputs and messages. If an agent raises,
            execution stops and success is False with the failing agent
            recorded in stats.
        """
        self.messages = []
        current_input = initial_input
        # Copy the caller's dict: previously the metadata merge below mutated
        # initial_context in place, leaking agent state back to the caller.
        current_context = dict(initial_context) if initial_context else {}
        outputs = []

        for agent_name in self.workflow:
            agent = self.agents[agent_name]
            logger.info(f"Executing agent: {agent_name}")

            try:
                message = agent.process(current_input, current_context)
                self.messages.append(message)
                outputs.append(message.content)

                # Chain: this agent's output feeds the next agent, and its
                # metadata enriches the shared context.
                current_input = message.content
                current_context.update(message.metadata)

            except Exception as e:
                logger.error(f"Agent {agent_name} failed: {e}")
                return PipelineResult(
                    success=False,
                    outputs=outputs,
                    messages=self.messages,
                    stats={"failed_at": agent_name, "error": str(e)}
                )

        return PipelineResult(
            success=True,
            outputs=outputs,
            messages=self.messages,
            stats={
                "agents_executed": len(self.workflow),
                "total_messages": len(self.messages)
            }
        )

    def run_discovery_workflow(
        self,
        query: str,
        query_modality: str = "text",
        target_modality: str = "smiles"
    ) -> Dict[str, Any]:
        """
        Run a complete discovery workflow.

        1. Search for related literature
        2. Find similar molecules (cross-modal)
        3. Validate top candidates (if a Validator agent is registered)
        4. Analyze result diversity

        Returns:
            Dict with the query parameters and a 'stages' dict keyed by
            'literature', 'molecules', optionally 'validation', 'diversity'.
        """
        results = {
            "query": query,
            "query_modality": query_modality,
            "target_modality": target_modality,
            "stages": {}
        }

        # Stage 1: Literature search
        literature = self.qdrant.search(
            query=query,
            query_modality=query_modality,
            limit=5,
            filter_modality="text"
        )
        results["stages"]["literature"] = [
            {"content": r.content, "score": r.score}
            for r in literature
        ]

        # Stage 2: Cross-modal molecule search
        molecules = self.qdrant.cross_modal_search(
            query=query,
            query_modality=query_modality,
            target_modality=target_modality,
            limit=10
        )
        results["stages"]["molecules"] = [
            {"content": r.content, "score": r.score, "payload": r.payload}
            for r in molecules
        ]

        # Stage 3: Validate top candidates (only when a Validator is registered)
        if "Validator" in self.agents and molecules:
            validated = []
            for mol in molecules[:3]:
                val_msg = self.agents["Validator"].process(mol.content)
                validated.append({
                    "smiles": mol.content,
                    "validation": val_msg.content
                })
            results["stages"]["validation"] = validated

        # Stage 4: Diversity analysis
        diversity = self.qdrant.get_neighbors_diversity(
            query=query,
            query_modality=query_modality,
            k=10
        )
        results["stages"]["diversity"] = diversity

        return results
bioflow/plugins/__init__.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow Plugins
3
+ ================
4
+
5
+ Tool implementations for the BioFlow platform.
6
+
7
+ Encoders:
8
+ - OBMEncoder: Unified multimodal encoder (text, molecules, proteins)
9
+ - TextEncoder: PubMedBERT / SciBERT for biomedical text
10
+ - MoleculeEncoder: ChemBERTa for SMILES
11
+ - ProteinEncoder: ESM-2 for protein sequences
12
+
13
+ Retrievers:
14
+ - QdrantRetriever: Vector database search with Qdrant
15
+
16
+ Predictors:
17
+ - DeepPurposePredictor: Drug-Target Interaction prediction
18
+ """
19
+
20
+ # Encoders
21
+ from bioflow.plugins.obm_encoder import OBMEncoder
22
+ from bioflow.plugins.encoders import TextEncoder, MoleculeEncoder, ProteinEncoder
23
+
24
+ # Retriever
25
+ from bioflow.plugins.qdrant_retriever import QdrantRetriever
26
+
27
+ # Predictor
28
+ from bioflow.plugins.deeppurpose_predictor import DeepPurposePredictor
29
+
30
+ __all__ = [
31
+ # Encoders
32
+ "OBMEncoder",
33
+ "TextEncoder",
34
+ "MoleculeEncoder",
35
+ "ProteinEncoder",
36
+ # Retriever
37
+ "QdrantRetriever",
38
+ # Predictor
39
+ "DeepPurposePredictor",
40
+ ]
41
+
42
+
43
def register_all(registry=None):
    """
    Register all plugins with the tool registry.

    Args:
        registry: ToolRegistry instance (uses global if None)
    """
    from bioflow.core import ToolRegistry

    if not registry:
        registry = ToolRegistry

    # Encoders are lazy-loaded, so nothing is instantiated here;
    # they register themselves on first use.
    print("Plugins available for registration:")
    print(" Encoders: OBMEncoder, TextEncoder, MoleculeEncoder, ProteinEncoder")
    print(" Retrievers: QdrantRetriever")
    print(" Predictors: DeepPurposePredictor")
bioflow/plugins/deeppurpose_predictor.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DeepPurpose Predictor - DTI Prediction
3
+ ========================================
4
+
5
+ Implements BioPredictor interface for drug-target interaction prediction.
6
+
7
+ Note: DeepPurpose is an open-source toolkit for DTI/DDI prediction.
8
+ If DeepPurpose is not available, falls back to a simple baseline.
9
+ """
10
+
11
+ import logging
12
+ from typing import List, Dict, Any, Optional, Tuple
13
+ import warnings
14
+
15
+ from bioflow.core import BioPredictor, PredictionResult
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Lazy import
20
+ _deeppurpose = None
21
+ _deeppurpose_available = None
22
+
23
+
24
def _check_deeppurpose():
    """Probe for DeepPurpose once; cache availability and modules in globals."""
    global _deeppurpose, _deeppurpose_available

    # Only attempt the import the first time; afterwards the cached flag wins.
    if _deeppurpose_available is None:
        try:
            from DeepPurpose import DTI as DeepPurposeDTI
            from DeepPurpose import utils as DeepPurposeUtils
        except ImportError:
            _deeppurpose_available = False
            logger.warning(
                "DeepPurpose not available. Using fallback predictor. "
                "Install with: pip install DeepPurpose"
            )
        else:
            _deeppurpose = {
                "DTI": DeepPurposeDTI,
                "utils": DeepPurposeUtils
            }
            _deeppurpose_available = True
            logger.info("DeepPurpose is available")

    return _deeppurpose_available, _deeppurpose
45
+
46
+
47
class DeepPurposePredictor(BioPredictor):
    """
    Drug-Target Interaction predictor using DeepPurpose.

    Predicts binding affinity between a drug (SMILES) and target (protein
    sequence). Falls back to a cheap length-based heuristic when DeepPurpose
    is not installed.

    Example:
        >>> predictor = DeepPurposePredictor()
        >>> result = predictor.predict(
        ...     drug="CC(=O)Oc1ccccc1C(=O)O",  # Aspirin
        ...     target="MKTVRQERLKSIVRILERSKEPVSG..."  # Target protein
        ... )
        >>> print(result.score)  # Predicted binding affinity

    Models (when DeepPurpose is available):
        - Transformer + CNN (default)
        - MPNN + CNN
        - Morgan + AAC (baseline)
    """

    AVAILABLE_MODELS = [
        "Transformer_CNN",
        "MPNN_CNN",
        "Morgan_CNN",
        "Morgan_AAC",
    ]

    def __init__(
        self,
        model_type: str = "Transformer_CNN",
        pretrained: str = None,
        device: str = "cpu"
    ):
        """
        Initialize DeepPurposePredictor.

        Args:
            model_type: Model architecture as "<drug-enc>_<target-enc>"
                (e.g. "Transformer_CNN"); see AVAILABLE_MODELS.
            pretrained: Path to a pretrained DeepPurpose model (optional).
            device: torch device string.
        """
        self.model_type = model_type
        self.pretrained = pretrained
        self.device = device

        available, dp = _check_deeppurpose()
        self._use_deeppurpose = available
        self._model = None

        if available and pretrained:
            self._load_pretrained(pretrained)

    def _load_pretrained(self, path: str):
        """Load a pretrained DeepPurpose model; log (don't raise) on failure."""
        available, dp = _check_deeppurpose()
        if not available:
            return

        try:
            self._model = dp["DTI"].load_pretrained_model(path)
            logger.info(f"Loaded pretrained model from {path}")
        except Exception as e:
            logger.error(f"Failed to load pretrained model: {e}")

    def _fallback_predict(self, drug: str, target: str) -> Tuple[float, float]:
        """
        Fallback prediction when DeepPurpose is not available.

        Uses simple heuristics based on molecular properties.
        This is NOT accurate - just a placeholder.

        Returns:
            (score, confidence) with score in [0, 1] and a fixed low confidence.
        """
        # Simple heuristics based on sequence/molecule properties
        drug_score = min(len(drug) / 50.0, 1.0)  # Longer SMILES = higher complexity
        target_score = min(len(target) / 500.0, 1.0)  # Longer protein = more binding sites

        # Deterministic per-pair noise. Use a local Random instance instead of
        # random.seed() so we never clobber the process-wide global RNG state;
        # the generated value is identical to the previous seeded-global code.
        import random
        rng = random.Random(hash(drug + target) % 2**32)
        base_score = (drug_score + target_score) / 2
        noise = rng.uniform(-0.1, 0.1)

        score = max(0, min(1, base_score + noise))
        confidence = 0.3  # Low confidence for fallback

        return score, confidence

    def predict(self, drug: str, target: str) -> PredictionResult:
        """
        Predict drug-target interaction.

        Args:
            drug: SMILES string of drug molecule
            target: Protein sequence

        Returns:
            PredictionResult with binding affinity score
        """
        if self._use_deeppurpose:
            return self._predict_deeppurpose(drug, target)

        score, confidence = self._fallback_predict(drug, target)
        return PredictionResult(
            score=score,
            confidence=confidence,
            label="binding" if score > 0.5 else "non-binding",
            metadata={
                "method": "fallback_heuristic",
                "warning": "DeepPurpose not available, using simple heuristics"
            }
        )

    def _predict_deeppurpose(self, drug: str, target: str) -> PredictionResult:
        """Predict using DeepPurpose; degrade to the heuristic on any error."""
        available, dp = _check_deeppurpose()

        try:
            # Encode drug and target.
            # NOTE(review): assumes DeepPurpose's utils module exposes
            # drug_encoding/target_encoding helpers - confirm against the
            # installed DeepPurpose version; failures land in the except below.
            drug_encoding = dp["utils"].drug_encoding(drug, self.model_type.split("_")[0])
            target_encoding = dp["utils"].target_encoding(target, self.model_type.split("_")[1])

            # Predict
            if self._model:
                y_pred = self._model.predict(drug_encoding, target_encoding)
            else:
                warnings.warn("No pretrained model loaded, predictions may be unreliable")
                y_pred = [0.5]  # Default

            score = float(y_pred[0]) if hasattr(y_pred, '__iter__') else float(y_pred)

            return PredictionResult(
                score=score,
                confidence=0.8,
                label="binding" if score > 0.5 else "non-binding",
                metadata={
                    "method": "deeppurpose",
                    "model_type": self.model_type,
                    "drug_smiles": drug[:50],
                    "target_length": len(target)
                }
            )

        except Exception as e:
            logger.error(f"DeepPurpose prediction failed: {e}")
            # Fallback
            score, confidence = self._fallback_predict(drug, target)
            return PredictionResult(
                score=score,
                confidence=confidence,
                label="binding" if score > 0.5 else "non-binding",
                metadata={"method": "fallback", "error": str(e)}
            )

    def batch_predict(self, pairs: List[Tuple[str, str]]) -> List[PredictionResult]:
        """
        Batch predict drug-target interactions.

        Args:
            pairs: List of (drug_smiles, target_sequence) tuples

        Returns:
            List of PredictionResults, one per pair, in order.
        """
        return [self.predict(drug, target) for drug, target in pairs]

    def get_model_info(self) -> Dict[str, Any]:
        """Return a dict describing the configured model and backend state."""
        return {
            "model_type": self.model_type,
            "use_deeppurpose": self._use_deeppurpose,
            "pretrained": self.pretrained,
            "device": self.device,
            "available_models": self.AVAILABLE_MODELS
        }
bioflow/plugins/encoders/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow Encoders
3
+ =================
4
+
5
+ Open-source encoder implementations for different modalities.
6
+
7
+ Available Encoders:
8
+ - TextEncoder: PubMedBERT / SciBERT for biomedical text
9
+ - MoleculeEncoder: ChemBERTa for SMILES molecules
10
+ - ProteinEncoder: ESM-2 for protein sequences
11
+ """
12
+
13
+ from bioflow.plugins.encoders.text_encoder import TextEncoder
14
+ from bioflow.plugins.encoders.molecule_encoder import MoleculeEncoder
15
+ from bioflow.plugins.encoders.protein_encoder import ProteinEncoder
16
+
17
+ __all__ = ["TextEncoder", "MoleculeEncoder", "ProteinEncoder"]
bioflow/plugins/encoders/molecule_encoder.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Molecule Encoder - ChemBERTa / RDKit
3
+ =====================================
4
+
5
+ Encodes SMILES molecules into vectors.
6
+
7
+ Models:
8
+ - seyonec/ChemBERTa-zinc-base-v1 (default)
9
+ - DeepChem/ChemBERTa-77M-MTR
10
+ - RDKit fingerprints (fallback, no GPU needed)
11
+ """
12
+
13
+ import logging
14
+ from typing import List, Optional
15
+ from enum import Enum
16
+
17
+ from bioflow.core import BioEncoder, Modality, EmbeddingResult
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Lazy imports
22
+ _transformers = None
23
+ _torch = None
24
+ _rdkit = None
25
+
26
+
27
def _load_transformers():
    """Lazily import transformers/torch, caching the modules in globals."""
    global _transformers, _torch
    if _transformers is None:
        try:
            import torch
            import transformers
        except ImportError:
            raise ImportError(
                "transformers and torch are required. "
                "Install with: pip install transformers torch"
            )
        _transformers = transformers
        _torch = torch
    return _transformers, _torch
41
+
42
+
43
def _load_rdkit():
    """Lazily import RDKit, caching (Chem, AllChem) in a module global."""
    global _rdkit
    if _rdkit is None:
        try:
            from rdkit import Chem
            from rdkit.Chem import AllChem
        except ImportError:
            raise ImportError(
                "RDKit is required for fingerprint encoding. "
                "Install with: pip install rdkit"
            )
        _rdkit = (Chem, AllChem)
    return _rdkit
56
+
57
+
58
class MoleculeEncoderBackend(Enum):
    """Available molecule-encoding backends."""

    CHEMBERTA = "chemberta"        # transformer embeddings
    RDKIT_MORGAN = "rdkit_morgan"  # Morgan bit-vector fingerprints
    RDKIT_MACCS = "rdkit_maccs"    # 167-bit MACCS keys
62
+
63
+
64
class MoleculeEncoder(BioEncoder):
    """
    Encoder for SMILES molecules using ChemBERTa or RDKit fingerprints.

    The backend is chosen at construction time: "chemberta" uses a
    HuggingFace transformer (loaded lazily, GPU if available), while the
    RDKit backends compute classic fingerprints on CPU with no model load.

    Example:
        >>> encoder = MoleculeEncoder(backend="chemberta")
        >>> result = encoder.encode("CCO", Modality.SMILES)  # Ethanol
        >>> print(len(result.vector))  # 768

        >>> encoder = MoleculeEncoder(backend="rdkit_morgan")
        >>> result = encoder.encode("CCO", Modality.SMILES)
        >>> print(len(result.vector))  # 2048
    """

    SUPPORTED_MODELS = {
        "chemberta": "seyonec/ChemBERTa-zinc-base-v1",
        "chemberta-77m": "DeepChem/ChemBERTa-77M-MTR",
    }

    def __init__(
        self,
        backend: str = "chemberta",
        model_name: str = None,
        device: str = None,
        fp_size: int = 2048,  # For RDKit fingerprints
        fp_radius: int = 2,  # For Morgan fingerprints
    ):
        """
        Initialize MoleculeEncoder.

        Args:
            backend: "chemberta", "rdkit_morgan", or "rdkit_maccs"
            model_name: HuggingFace model path (for chemberta)
            device: torch device; auto-detects CUDA when None
            fp_size: Fingerprint size (for rdkit_morgan)
            fp_radius: Morgan fingerprint radius

        Raises:
            ValueError: If `backend` is not a MoleculeEncoderBackend value.
            ImportError: If the chosen backend's dependencies are missing.
        """
        # Raises ValueError for unknown backend strings.
        self.backend = MoleculeEncoderBackend(backend.lower())
        self.fp_size = fp_size
        self.fp_radius = fp_radius

        if self.backend == MoleculeEncoderBackend.CHEMBERTA:
            transformers, torch = _load_transformers()

            self.model_path = model_name or self.SUPPORTED_MODELS["chemberta"]

            if device is None:
                device = "cuda" if torch.cuda.is_available() else "cpu"
            self.device = device

            logger.info(f"Loading MoleculeEncoder: {self.model_path} on {self.device}")
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_path)
            self.model = transformers.AutoModel.from_pretrained(self.model_path)
            self.model.to(self.device)
            self.model.eval()

            # Output dimension follows the transformer's hidden size.
            self._dimension = self.model.config.hidden_size
        else:
            # RDKit fingerprints: no neural model, CPU only.
            _load_rdkit()
            self.device = "cpu"
            self.model = None
            self.tokenizer = None

            if self.backend == MoleculeEncoderBackend.RDKIT_MORGAN:
                self._dimension = fp_size
            else:  # MACCS keys are a fixed 167-bit vector
                self._dimension = 167

        logger.info(f"MoleculeEncoder ready (backend={backend}, dim={self._dimension})")

    @property
    def dimension(self) -> int:
        # Embedding dimensionality for the configured backend.
        return self._dimension

    @property
    def supported_modalities(self) -> List[Modality]:
        return [Modality.SMILES]

    def _encode_chemberta(self, smiles: str) -> List[float]:
        """Encode one SMILES with ChemBERTa (attention-masked mean pooling)."""
        transformers, torch = _load_transformers()

        inputs = self.tokenizer(
            smiles,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs)
            # Mean pooling over non-padding tokens.
            attention_mask = inputs["attention_mask"].unsqueeze(-1)
            hidden_states = outputs.last_hidden_state
            embedding = (hidden_states * attention_mask).sum(1) / attention_mask.sum(1)

        return embedding.squeeze().cpu().numpy().tolist()

    def _encode_rdkit(self, smiles: str) -> List[float]:
        """Encode one SMILES as a Morgan or MACCS bit-vector fingerprint.

        Raises:
            ValueError: If RDKit cannot parse the SMILES string.
        """
        Chem, AllChem = _load_rdkit()

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Invalid SMILES: {smiles}")

        if self.backend == MoleculeEncoderBackend.RDKIT_MORGAN:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, self.fp_radius, nBits=self.fp_size)
        else:  # MACCS
            from rdkit.Chem import MACCSkeys
            fp = MACCSkeys.GenMACCSKeys(mol)

        # Bit vector -> list of 0/1 ints (valid floats for the interface).
        return list(fp)

    def encode(self, content: str, modality: Modality = Modality.SMILES) -> EmbeddingResult:
        """Encode SMILES into a vector.

        Raises:
            ValueError: If `modality` is not SMILES, or the SMILES is invalid.
        """
        if modality != Modality.SMILES:
            raise ValueError(f"MoleculeEncoder only supports SMILES modality, got {modality}")

        if self.backend == MoleculeEncoderBackend.CHEMBERTA:
            vector = self._encode_chemberta(content)
        else:
            vector = self._encode_rdkit(content)

        return EmbeddingResult(
            vector=vector,
            modality=modality,
            dimension=self._dimension,
            metadata={"backend": self.backend.value, "smiles": content}
        )

    def batch_encode(self, contents: List[str], modality: Modality = Modality.SMILES) -> List[EmbeddingResult]:
        """Batch encode SMILES.

        ChemBERTa runs a single padded forward pass for the whole batch;
        the RDKit backends simply encode items one by one.
        """
        if self.backend == MoleculeEncoderBackend.CHEMBERTA:
            transformers, torch = _load_transformers()

            inputs = self.tokenizer(
                contents,
                return_tensors="pt",
                max_length=512,
                truncation=True,
                padding=True
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)
                attention_mask = inputs["attention_mask"].unsqueeze(-1)
                hidden_states = outputs.last_hidden_state
                embeddings = (hidden_states * attention_mask).sum(1) / attention_mask.sum(1)

            results = []
            for i, emb in enumerate(embeddings):
                results.append(EmbeddingResult(
                    vector=emb.cpu().numpy().tolist(),
                    modality=modality,
                    dimension=self._dimension,
                    metadata={"backend": self.backend.value, "smiles": contents[i]}
                ))
            return results
        else:
            return [self.encode(s, modality) for s in contents]
bioflow/plugins/encoders/protein_encoder.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Protein Encoder - ESM-2 / ProtBERT
3
+ ===================================
4
+
5
+ Encodes protein sequences into vectors.
6
+
7
+ Models:
8
+ - facebook/esm2_t33_650M_UR50D (default, 1280-dim)
9
+ - facebook/esm2_t12_35M_UR50D (smaller, 480-dim)
10
+ - Rostlab/prot_bert (1024-dim)
11
+ """
12
+
13
+ import logging
14
+ from typing import List, Optional
15
+
16
+ from bioflow.core import BioEncoder, Modality, EmbeddingResult
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ # Lazy imports
21
+ _transformers = None
22
+ _torch = None
23
+
24
+
25
def _load_transformers():
    """Lazily import transformers/torch, caching the modules in globals."""
    global _transformers, _torch
    if _transformers is None:
        try:
            import torch
            import transformers
        except ImportError:
            raise ImportError(
                "transformers and torch are required. "
                "Install with: pip install transformers torch"
            )
        _transformers = transformers
        _torch = torch
    return _transformers, _torch
39
+
40
+
41
class ProteinEncoder(BioEncoder):
    """
    Encoder for protein sequences using ESM-2 or ProtBERT.

    The HuggingFace model is resolved from a short key (see SUPPORTED_MODELS)
    or used verbatim as a model path; loaded once at construction.

    Example:
        >>> encoder = ProteinEncoder(model_name="esm2_t12")
        >>> result = encoder.encode("MKTVRQERLKSIVRILERSKEPVSG", Modality.PROTEIN)
        >>> print(len(result.vector))  # 480
    """

    SUPPORTED_MODELS = {
        "esm2_t33": "facebook/esm2_t33_650M_UR50D",  # 1280-dim, 650M params
        "esm2_t30": "facebook/esm2_t30_150M_UR50D",  # 640-dim, 150M params
        "esm2_t12": "facebook/esm2_t12_35M_UR50D",  # 480-dim, 35M params (fast)
        "esm2_t6": "facebook/esm2_t6_8M_UR50D",  # 320-dim, 8M params (fastest)
        "protbert": "Rostlab/prot_bert",  # 1024-dim
        "protbert_bfd": "Rostlab/prot_bert_bfd",  # 1024-dim, larger
    }

    def __init__(
        self,
        model_name: str = "esm2_t12",
        device: str = None,
        max_length: int = 1024,
        pooling: str = "mean"
    ):
        """
        Initialize ProteinEncoder.

        Args:
            model_name: Model key from SUPPORTED_MODELS or a HuggingFace path
            device: torch device; auto-detects CUDA when None
            max_length: Max tokenized sequence length (longer is truncated)
            pooling: Pooling strategy ("mean" or "cls"; anything else
                falls through to mean)
        """
        transformers, torch = _load_transformers()

        # Resolve short key -> HF path; unknown keys are used as-is.
        self.model_path = self.SUPPORTED_MODELS.get(model_name.lower(), model_name)
        self.max_length = max_length
        self.pooling = pooling

        # Set device
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device

        # Load model
        logger.info(f"Loading ProteinEncoder: {self.model_path} on {self.device}")
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_path)
        self.model = transformers.AutoModel.from_pretrained(self.model_path)
        self.model.to(self.device)
        self.model.eval()

        # Output dimension follows the model's hidden size.
        self._dimension = self.model.config.hidden_size
        logger.info(f"ProteinEncoder ready (dim={self._dimension})")

    @property
    def dimension(self) -> int:
        # Embedding dimensionality of the loaded model.
        return self._dimension

    @property
    def supported_modalities(self) -> List[Modality]:
        return [Modality.PROTEIN]

    def _preprocess_sequence(self, sequence: str) -> str:
        """Normalize a raw sequence for the tokenizer."""
        # Remove whitespace; amino-acid codes are conventionally upper-case.
        sequence = sequence.strip().upper()

        # ProtBERT tokenizers expect space-separated residues
        # ("M K T ..."); ESM-2 takes the raw string.
        if "prot_bert" in self.model_path.lower():
            sequence = " ".join(list(sequence))

        return sequence

    def encode(self, content: str, modality: Modality = Modality.PROTEIN) -> EmbeddingResult:
        """Encode a protein sequence into a vector.

        Raises:
            ValueError: If `modality` is not PROTEIN.
        """
        if modality != Modality.PROTEIN:
            raise ValueError(f"ProteinEncoder only supports PROTEIN modality, got {modality}")

        transformers, torch = _load_transformers()

        # Preprocess
        sequence = self._preprocess_sequence(content)

        # Tokenize
        inputs = self.tokenizer(
            sequence,
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding=True
        ).to(self.device)

        # Encode
        with torch.no_grad():
            outputs = self.model(**inputs)
            hidden_states = outputs.last_hidden_state

            if self.pooling == "cls":
                # First token's hidden state represents the sequence.
                embedding = hidden_states[:, 0, :]
            else:  # mean over non-padding tokens
                attention_mask = inputs["attention_mask"].unsqueeze(-1)
                embedding = (hidden_states * attention_mask).sum(1) / attention_mask.sum(1)

        vector = embedding.squeeze().cpu().numpy().tolist()

        return EmbeddingResult(
            vector=vector,
            modality=modality,
            dimension=self._dimension,
            metadata={
                "model": self.model_path,
                "sequence_length": len(content)
            }
        )

    def batch_encode(self, contents: List[str], modality: Modality = Modality.PROTEIN) -> List[EmbeddingResult]:
        """Batch encode protein sequences in one padded forward pass.

        Note: always uses attention-masked mean pooling, regardless of
        the `pooling` setting used by encode().
        """
        transformers, torch = _load_transformers()

        sequences = [self._preprocess_sequence(s) for s in contents]

        inputs = self.tokenizer(
            sequences,
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding=True
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs)
            hidden_states = outputs.last_hidden_state
            attention_mask = inputs["attention_mask"].unsqueeze(-1)
            embeddings = (hidden_states * attention_mask).sum(1) / attention_mask.sum(1)

        results = []
        for i, emb in enumerate(embeddings):
            results.append(EmbeddingResult(
                vector=emb.cpu().numpy().tolist(),
                modality=modality,
                dimension=self._dimension,
                metadata={"model": self.model_path, "sequence_length": len(contents[i])}
            ))

        return results
bioflow/plugins/encoders/text_encoder.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text Encoder - PubMedBERT / SciBERT
3
+ ====================================
4
+
5
+ Encodes biomedical text (abstracts, clinical notes) into vectors.
6
+
7
+ Models:
8
+ - microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext (default)
9
+ - allenai/scibert_scivocab_uncased
10
+ - allenai/specter
11
+ """
12
+
13
+ import logging
14
+ from typing import List, Optional
15
+ import numpy as np
16
+
17
+ from bioflow.core import BioEncoder, Modality, EmbeddingResult
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Lazy imports for optional dependencies
22
+ _transformers = None
23
+ _torch = None
24
+
25
+
26
def _load_transformers():
    """Import transformers/torch on first use and cache the modules.

    Returns:
        Tuple of (transformers, torch) modules.

    Raises:
        ImportError: If either optional dependency is missing.
    """
    global _transformers, _torch
    if _transformers is not None:
        return _transformers, _torch
    try:
        import transformers
        import torch
    except ImportError:
        raise ImportError(
            "transformers and torch are required for TextEncoder. "
            "Install with: pip install transformers torch"
        )
    _transformers = transformers
    _torch = torch
    return _transformers, _torch
40
+
41
+
42
class TextEncoder(BioEncoder):
    """
    Encoder for biomedical text using PubMedBERT or similar models.

    Example:
        >>> encoder = TextEncoder(model_name="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
        >>> result = encoder.encode("EGFR mutations in lung cancer", Modality.TEXT)
        >>> print(len(result.vector))  # 768
    """

    # Short aliases mapping to full HuggingFace checkpoint paths.
    SUPPORTED_MODELS = {
        "pubmedbert": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
        "scibert": "allenai/scibert_scivocab_uncased",
        "specter": "allenai/specter",
        "biobert": "dmis-lab/biobert-base-cased-v1.2",
    }

    def __init__(
        self,
        model_name: str = "pubmedbert",
        device: Optional[str] = None,
        max_length: int = 512,
        pooling: str = "mean"  # mean, cls, max
    ):
        """
        Initialize TextEncoder.

        Args:
            model_name: Model key (see SUPPORTED_MODELS) or HuggingFace model path
            device: torch device (auto-detected if None)
            max_length: Maximum token length for truncation
            pooling: Pooling strategy for embeddings ("mean", "cls", or "max")
        """
        transformers, torch = _load_transformers()

        # Resolve alias to full HuggingFace path; unknown keys pass through
        # unchanged so arbitrary model paths still work.
        self.model_path = self.SUPPORTED_MODELS.get(model_name.lower(), model_name)
        self.max_length = max_length
        self.pooling = pooling

        # Auto-detect device when not specified.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device

        # Load model and tokenizer; eval() disables dropout for deterministic output.
        logger.info(f"Loading TextEncoder: {self.model_path} on {self.device}")
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_path)
        self.model = transformers.AutoModel.from_pretrained(self.model_path)
        self.model.to(self.device)
        self.model.eval()

        self._dimension = self.model.config.hidden_size
        logger.info(f"TextEncoder ready (dim={self._dimension})")

    @property
    def dimension(self) -> int:
        """Embedding dimension (hidden size of the underlying model)."""
        return self._dimension

    @property
    def supported_modalities(self) -> List[Modality]:
        """This encoder handles text only."""
        return [Modality.TEXT]

    def _pool(self, hidden_states, attention_mask):
        """Reduce per-token hidden states to one vector per input.

        Args:
            hidden_states: Model output of shape (batch, tokens, hidden).
            attention_mask: Mask of shape (batch, tokens); 1 for real tokens.

        Raises:
            ValueError: If self.pooling is not "cls", "mean", or "max".
        """
        if self.pooling == "cls":
            return hidden_states[:, 0, :]
        elif self.pooling == "mean":
            # Mask out padding so it doesn't dilute the average.
            mask = attention_mask.unsqueeze(-1)
            return (hidden_states * mask).sum(1) / mask.sum(1)
        elif self.pooling == "max":
            return hidden_states.max(dim=1).values
        else:
            raise ValueError(f"Unknown pooling: {self.pooling}")

    def encode(self, content: str, modality: Modality = Modality.TEXT) -> EmbeddingResult:
        """Encode a single text into a vector."""
        if modality != Modality.TEXT:
            raise ValueError(f"TextEncoder only supports TEXT modality, got {modality}")

        transformers, torch = _load_transformers()

        # Tokenize
        inputs = self.tokenizer(
            content,
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding=True
        ).to(self.device)

        # Encode without gradient tracking (inference only)
        with torch.no_grad():
            outputs = self.model(**inputs)
            embedding = self._pool(outputs.last_hidden_state, inputs["attention_mask"])

        vector = embedding.squeeze().cpu().numpy().tolist()

        return EmbeddingResult(
            vector=vector,
            modality=modality,
            dimension=self._dimension,
            metadata={"model": self.model_path, "pooling": self.pooling}
        )

    def batch_encode(self, contents: List[str], modality: Modality = Modality.TEXT) -> List[EmbeddingResult]:
        """Batch encode multiple texts.

        Honors the configured pooling strategy consistently with encode().
        Previously any non-"mean" setting (including "max") silently fell
        back to CLS pooling here, and unknown strategies never raised.
        """
        transformers, torch = _load_transformers()

        inputs = self.tokenizer(
            contents,
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding=True
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs)
            embeddings = self._pool(outputs.last_hidden_state, inputs["attention_mask"])

        results = []
        for i, emb in enumerate(embeddings):
            results.append(EmbeddingResult(
                vector=emb.cpu().numpy().tolist(),
                modality=modality,
                dimension=self._dimension,
                metadata={"model": self.model_path, "index": i}
            ))

        return results
bioflow/plugins/obm_encoder.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OBM Encoder - Unified Multimodal Encoder
3
+ ==========================================
4
+
5
+ The OBM (Open BioMed) Encoder is the central multimodal embedding engine
6
+ that unifies text, molecules, and proteins into a common vector space.
7
+
8
+ This is the "heart" of the BioFlow platform - it enables cross-modal
9
+ similarity search (e.g., find proteins similar to a text description).
10
+
11
+ Architecture:
12
+ ┌─────────────────────────────────────────────┐
13
+ │ OBMEncoder │
14
+ │ ┌─────────┐ ┌──────────┐ ┌─────────────┐ │
15
+ │ │ Text │ │ Molecule │ │ Protein │ │
16
+ │ │ Encoder │ │ Encoder │ │ Encoder │ │
17
+ │ │(PubMed) │ │(ChemBERTa│ │ (ESM-2) │ │
18
+ │ └────┬────┘ └────┬─────┘ └──────┬──────┘ │
19
+ │ │ │ │ │
20
+ │ └───────────┼──────────────┘ │
21
+ │ ▼ │
22
+ │ Unified Embedding │
23
+ │ (768-dim) │
24
+ └─────────────────────────────────────────────┘
25
+ """
26
+
27
+ import logging
28
+ from typing import List, Dict, Any, Optional, Union
29
+ import numpy as np
30
+
31
+ from bioflow.core import BioEncoder, Modality, EmbeddingResult
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
class OBMEncoder(BioEncoder):
    """
    Unified Multimodal Encoder for BioFlow.

    Combines specialized encoders for each modality and optionally
    projects them into a shared embedding space.

    Example:
        >>> obm = OBMEncoder()
        >>>
        >>> # Encode different modalities
        >>> text_emb = obm.encode("EGFR inhibitor for lung cancer", Modality.TEXT)
        >>> mol_emb = obm.encode("CC(=O)Oc1ccccc1C(=O)O", Modality.SMILES)  # Aspirin
        >>> prot_emb = obm.encode("MKTVRQERLKSIVRILERSKEPVSG", Modality.PROTEIN)
        >>>
        >>> # All embeddings have the same dimension
        >>> assert len(text_emb.vector) == len(mol_emb.vector) == len(prot_emb.vector)

    Attributes:
        text_encoder: Encoder for biomedical text
        molecule_encoder: Encoder for SMILES molecules
        protein_encoder: Encoder for protein sequences
        output_dim: Dimension of output embeddings (after projection)
    """

    def __init__(
        self,
        text_model: str = "pubmedbert",
        molecule_model: str = "chemberta",
        protein_model: str = "esm2_t12",
        device: Optional[str] = None,
        output_dim: int = 768,
        lazy_load: bool = True
    ):
        """
        Initialize OBMEncoder.

        Args:
            text_model: Model for text encoding
            molecule_model: Model for molecule encoding
            protein_model: Model for protein encoding
            device: torch device (auto-detected if None)
            output_dim: Target dimension for all embeddings
            lazy_load: If True, load encoders on first use
        """
        self.text_model = text_model
        self.molecule_model = molecule_model
        self.protein_model = protein_model
        self.device = device
        self._output_dim = output_dim
        self.lazy_load = lazy_load

        # Encoders (lazy loaded on first use unless lazy_load=False)
        self._text_encoder = None
        self._molecule_encoder = None
        self._protein_encoder = None

        # Projection matrices (for dimension alignment)
        self._projections: Dict[Modality, Any] = {}

        if not lazy_load:
            self._load_all_encoders()

        logger.info(f"OBMEncoder initialized (lazy_load={lazy_load}, output_dim={output_dim})")

    def _load_all_encoders(self):
        """Eagerly load all three modality encoders."""
        self._get_text_encoder()
        self._get_molecule_encoder()
        self._get_protein_encoder()

    def _get_text_encoder(self):
        """Get or create text encoder (imported lazily to keep startup light)."""
        if self._text_encoder is None:
            from bioflow.plugins.encoders.text_encoder import TextEncoder
            self._text_encoder = TextEncoder(
                model_name=self.text_model,
                device=self.device
            )
        return self._text_encoder

    def _get_molecule_encoder(self):
        """Get or create molecule encoder.

        An "rdkit*" molecule_model selects the RDKit backend; anything else
        is treated as a ChemBERTa-style HuggingFace model name.
        """
        if self._molecule_encoder is None:
            from bioflow.plugins.encoders.molecule_encoder import MoleculeEncoder
            self._molecule_encoder = MoleculeEncoder(
                backend=self.molecule_model if self.molecule_model.startswith("rdkit") else "chemberta",
                model_name=None if self.molecule_model.startswith("rdkit") else self.molecule_model,
                device=self.device
            )
        return self._molecule_encoder

    def _get_protein_encoder(self):
        """Get or create protein encoder."""
        if self._protein_encoder is None:
            from bioflow.plugins.encoders.protein_encoder import ProteinEncoder
            self._protein_encoder = ProteinEncoder(
                model_name=self.protein_model,
                device=self.device
            )
        return self._protein_encoder

    def _get_encoder_for_modality(self, modality: Modality) -> BioEncoder:
        """Get the appropriate encoder for a modality.

        Raises:
            ValueError: For modalities other than TEXT, SMILES, PROTEIN.
        """
        if modality == Modality.TEXT:
            return self._get_text_encoder()
        elif modality == Modality.SMILES:
            return self._get_molecule_encoder()
        elif modality == Modality.PROTEIN:
            return self._get_protein_encoder()
        else:
            raise ValueError(f"Unsupported modality: {modality}")

    def _project_embedding(self, vector: List[float], source_dim: int) -> List[float]:
        """
        Project embedding to output dimension.

        For simplicity, uses truncation/padding. In production,
        you would train a projection layer.
        """
        if source_dim == self._output_dim:
            return vector
        elif source_dim > self._output_dim:
            # Truncate (or use PCA in production)
            return vector[:self._output_dim]
        else:
            # Pad with zeros (or use learned projection)
            return vector + [0.0] * (self._output_dim - source_dim)

    @property
    def dimension(self) -> int:
        """Unified output dimension for all modalities."""
        return self._output_dim

    @property
    def supported_modalities(self) -> List[Modality]:
        return [Modality.TEXT, Modality.SMILES, Modality.PROTEIN]

    def encode(self, content: Any, modality: Modality) -> EmbeddingResult:
        """
        Encode content from any supported modality.

        Args:
            content: Raw input (text, SMILES, or protein sequence)
            modality: Type of the input

        Returns:
            EmbeddingResult with unified dimension
        """
        # Get appropriate encoder
        encoder = self._get_encoder_for_modality(modality)

        # Encode
        result = encoder.encode(content, modality)

        # Project to unified dimension
        projected_vector = self._project_embedding(result.vector, encoder.dimension)

        return EmbeddingResult(
            vector=projected_vector,
            modality=modality,
            dimension=self._output_dim,
            metadata={
                **result.metadata,
                "source_encoder": encoder.__class__.__name__,
                "source_dim": encoder.dimension,
                "projected": encoder.dimension != self._output_dim
            }
        )

    def encode_auto(self, content: str) -> EmbeddingResult:
        """
        Auto-detect modality and encode.

        Uses heuristics to determine if input is:
        - Protein: Contains only amino acid letters (ACDEFGHIKLMNPQRSTVWY)
        - SMILES: Contains typical SMILES characters (=, #, @, etc.)
        - Text: Everything else
        """
        content = content.strip()

        # Guard: an empty string would crash the SMILES heuristic below
        # (content[0]); encode it as plain text instead.
        if not content:
            return self.encode(content, Modality.TEXT)

        # Check for protein (only amino acid letters, reasonably long)
        amino_acids = set("ACDEFGHIKLMNPQRSTVWY")
        if content.isupper() and set(content).issubset(amino_acids) and len(content) > 10:
            return self.encode(content, Modality.PROTEIN)

        # Check for SMILES (structural characters, or short unspaced token)
        smiles_chars = set("=#@[]()+-.")
        if any(c in content for c in smiles_chars) or (
            len(content) < 100 and " " not in content and content[0].isupper()
        ):
            try:
                # Validate as SMILES
                return self.encode(content, Modality.SMILES)
            except Exception:
                # Fall through to text if SMILES encoding fails. A bare
                # `except:` here would also swallow KeyboardInterrupt/SystemExit.
                pass

        # Default to text
        return self.encode(content, Modality.TEXT)

    def batch_encode(
        self,
        contents: List[Any],
        modality: Modality
    ) -> List[EmbeddingResult]:
        """Batch encode multiple items of the same modality."""
        encoder = self._get_encoder_for_modality(modality)
        results = encoder.batch_encode(contents, modality)

        # Project all to unified dimension
        projected_results = []
        for result in results:
            projected_vector = self._project_embedding(result.vector, encoder.dimension)
            projected_results.append(EmbeddingResult(
                vector=projected_vector,
                modality=modality,
                dimension=self._output_dim,
                metadata={**result.metadata, "source_dim": encoder.dimension}
            ))

        return projected_results

    def similarity(self, emb1: EmbeddingResult, emb2: EmbeddingResult) -> float:
        """
        Compute cosine similarity between two embeddings.

        Useful for cross-modal similarity (e.g., text-molecule).
        Returns 0.0 when either vector has zero norm (previously this
        produced NaN from a 0/0 division).
        """
        v1 = np.array(emb1.vector)
        v2 = np.array(emb2.vector)

        denom = np.linalg.norm(v1) * np.linalg.norm(v2)
        if denom == 0.0:
            return 0.0
        return float(np.dot(v1, v2) / denom)

    def get_encoder_info(self) -> Dict[str, Any]:
        """Get information about currently loaded encoders (lazy ones not yet
        used are omitted)."""
        info = {
            "output_dim": self._output_dim,
            "device": self.device,
            "encoders": {}
        }

        if self._text_encoder:
            info["encoders"]["text"] = {
                "model": self._text_encoder.model_path,
                "dim": self._text_encoder.dimension
            }

        if self._molecule_encoder:
            info["encoders"]["molecule"] = {
                "backend": self._molecule_encoder.backend.value,
                "dim": self._molecule_encoder.dimension
            }

        if self._protein_encoder:
            info["encoders"]["protein"] = {
                "model": self._protein_encoder.model_path,
                "dim": self._protein_encoder.dimension
            }

        return info
bioflow/plugins/obm_plugin.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OBM Plugin - Deprecated
3
+ ========================
4
+
5
+ This module is deprecated. Use OBMEncoder from bioflow.plugins.obm_encoder instead.
6
+ """
7
+
8
+ # Redirect to new implementation
9
+ from bioflow.plugins.obm_encoder import OBMEncoder
10
+
11
+ # Alias for backward compatibility
12
+ OBMPlugin = OBMEncoder
13
+
14
+ __all__ = ["OBMEncoder", "OBMPlugin"]
15
+ if modality == Modality.TEXT:
16
+ return self._encode_text(content)
17
+ elif modality == Modality.SMILES:
18
+ return self._encode_smiles(content)
19
+ elif modality == Modality.PROTEIN:
20
+ return self._encode_protein(content)
21
+ return []
22
+
23
+ @property
24
+ def dimension(self) -> int:
25
+ return 4096 # Placeholder for model dimension
26
+
27
+ def _encode_text(self, text: str):
28
+ # Placeholder for text encoding using open-source model
29
+ pass
30
+
31
+ def _encode_smiles(self, smiles: str):
32
+ # Placeholder for SMILES encoding using open-source model
33
+ pass
34
+
35
+ def _encode_protein(self, protein: str):
36
+ # Placeholder for protein encoding using open-source model
37
+ pass
38
+
39
+ # Auto-register the tool so the orchestrator can find it
40
+ ToolRegistry.register_encoder("obm", OBMPlugin())
bioflow/plugins/qdrant_retriever.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Qdrant Retriever - Vector Database Integration
3
+ ================================================
4
+
5
+ Implements BioRetriever interface for Qdrant vector database.
6
+ Provides semantic search and ingestion for the BioFlow platform.
7
+ """
8
+
9
+ import logging
10
+ from typing import List, Dict, Any, Optional, Union
11
+ import uuid
12
+
13
+ from bioflow.core import BioRetriever, BioEncoder, Modality, RetrievalResult
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Lazy import
18
+ _qdrant_client = None
19
+
20
+
21
def _load_qdrant():
    """Import qdrant-client lazily and cache the symbols we need.

    Returns:
        Dict mapping symbol names to qdrant-client classes.

    Raises:
        ImportError: If qdrant-client is not installed.
    """
    global _qdrant_client
    if _qdrant_client is not None:
        return _qdrant_client
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import (
            PointStruct,
            VectorParams,
            Distance,
            Filter,
            FieldCondition,
            MatchValue,
        )
    except ImportError:
        raise ImportError(
            "qdrant-client is required. Install with: pip install qdrant-client"
        )
    _qdrant_client = {
        "QdrantClient": QdrantClient,
        "PointStruct": PointStruct,
        "VectorParams": VectorParams,
        "Distance": Distance,
        "Filter": Filter,
        "FieldCondition": FieldCondition,
        "MatchValue": MatchValue,
    }
    return _qdrant_client
48
+
49
+
50
class QdrantRetriever(BioRetriever):
    """
    Vector database retriever using Qdrant.

    Supports:
    - Semantic search with embedding vectors
    - Payload filtering (by modality, species, etc.)
    - Batch ingestion of data

    Example:
        >>> from bioflow.plugins.obm_encoder import OBMEncoder
        >>>
        >>> encoder = OBMEncoder()
        >>> retriever = QdrantRetriever(encoder=encoder, collection="molecules")
        >>>
        >>> # Ingest data
        >>> retriever.ingest("CCO", Modality.SMILES, {"name": "Ethanol"})
        >>>
        >>> # Search
        >>> results = retriever.search("alcohol compounds", limit=5)
    """

    def __init__(
        self,
        encoder: BioEncoder,
        collection: str = "bioflow_memory",
        url: str = None,
        path: str = None,
        distance: str = "cosine"
    ):
        """
        Initialize QdrantRetriever.

        Args:
            encoder: BioEncoder instance for vectorization
            collection: Default collection name
            url: Qdrant server URL (for remote)
            path: Local path for persistent storage
            distance: Distance metric (cosine, euclid, dot)
        """
        qdrant = _load_qdrant()

        self.encoder = encoder
        self.collection = collection
        self.distance = distance

        # Initialize client; preference: remote > local persistent > in-memory.
        if url:
            self.client = qdrant["QdrantClient"](url=url)
            logger.info(f"Connected to Qdrant server at {url}")
        elif path:
            self.client = qdrant["QdrantClient"](path=path)
            logger.info(f"Using local Qdrant at {path}")
        else:
            self.client = qdrant["QdrantClient"](":memory:")
            logger.info("Using in-memory Qdrant (data will be lost on exit)")

        # Create collection if not exists
        self._ensure_collection()

    def _ensure_collection(self, name: str = None):
        """Create the collection if it does not already exist, sized to the
        encoder's embedding dimension."""
        qdrant = _load_qdrant()
        name = name or self.collection

        collections = [c.name for c in self.client.get_collections().collections]

        if name not in collections:
            distance_map = {
                "cosine": qdrant["Distance"].COSINE,
                "euclid": qdrant["Distance"].EUCLID,
                "dot": qdrant["Distance"].DOT,
            }

            self.client.create_collection(
                collection_name=name,
                vectors_config=qdrant["VectorParams"](
                    size=self.encoder.dimension,
                    # Unknown distance strings silently fall back to cosine.
                    distance=distance_map.get(self.distance, qdrant["Distance"].COSINE)
                )
            )
            logger.info(f"Created collection: {name} (dim={self.encoder.dimension})")

    def search(
        self,
        query: Union[List[float], str],
        limit: int = 10,
        filters: Optional[Dict[str, Any]] = None,
        collection: str = None,
        modality: Modality = Modality.TEXT,
        **kwargs
    ) -> List[RetrievalResult]:
        """
        Search for similar items.

        Args:
            query: Query vector or raw content to encode
            limit: Maximum results
            filters: Payload filters (e.g., {"species": "human"})
            collection: Collection to search (uses default if None)
            modality: Modality of query (if string)

        Returns:
            List of RetrievalResult sorted by similarity
        """
        qdrant = _load_qdrant()
        collection = collection or self.collection

        # Encode query if string
        if isinstance(query, str):
            result = self.encoder.encode(query, modality)
            query_vector = result.vector
        else:
            query_vector = query

        # Build filter (all conditions must match)
        qdrant_filter = None
        if filters:
            conditions = []
            for key, value in filters.items():
                conditions.append(
                    qdrant["FieldCondition"](
                        key=key,
                        match=qdrant["MatchValue"](value=value)
                    )
                )
            qdrant_filter = qdrant["Filter"](must=conditions)

        # Search (use query method for newer qdrant-client versions)
        try:
            # New API (qdrant-client >= 1.6)
            results = self.client.query_points(
                collection_name=collection,
                query=query_vector,
                limit=limit,
                query_filter=qdrant_filter
            ).points
        except AttributeError:
            # Fallback to old API
            results = self.client.search(
                collection_name=collection,
                query_vector=query_vector,
                limit=limit,
                query_filter=qdrant_filter
            )

        # Convert to RetrievalResult. Qdrant may return points with a None
        # payload (e.g. with_payload=False upstream), which previously
        # crashed on r.payload.get(...).
        hits = []
        for r in results:
            payload = r.payload or {}
            hits.append(RetrievalResult(
                id=str(r.id),
                score=r.score,
                content=payload.get("content", ""),
                modality=Modality(payload.get("modality", "text")),
                payload=payload
            ))
        return hits

    def ingest(
        self,
        content: Any,
        modality: Modality,
        payload: Optional[Dict[str, Any]] = None,
        collection: str = None,
        id: str = None
    ) -> str:
        """
        Ingest content into the vector database.

        Args:
            content: Raw content to encode
            modality: Type of content
            payload: Additional metadata
            collection: Target collection
            id: Custom ID (auto-generated if None).  NOTE: shadows the `id`
                builtin; kept for backward compatibility with callers using
                the keyword.

        Returns:
            ID of inserted item
        """
        qdrant = _load_qdrant()
        collection = collection or self.collection

        # Encode content
        result = self.encoder.encode(content, modality)

        # Generate ID
        point_id = id or str(uuid.uuid4())

        # Build payload; user metadata can override content/modality keys.
        full_payload = {
            "content": content,
            "modality": modality.value,
            **(payload or {})
        }

        # Insert (upsert overwrites an existing point with the same ID)
        self.client.upsert(
            collection_name=collection,
            points=[
                qdrant["PointStruct"](
                    id=point_id,
                    vector=result.vector,
                    payload=full_payload
                )
            ]
        )

        logger.debug(f"Ingested {modality.value}: {point_id}")
        return point_id

    def batch_ingest(
        self,
        items: List[Dict[str, Any]],
        collection: str = None
    ) -> List[str]:
        """
        Batch ingest multiple items.

        Args:
            items: List of {"content": ..., "modality": ..., "payload": ...}
            collection: Target collection

        Returns:
            List of inserted IDs
        """
        qdrant = _load_qdrant()
        collection = collection or self.collection

        points = []
        ids = []

        for item in items:
            content = item["content"]
            modality = Modality(item.get("modality", "text"))
            payload = item.get("payload", {})

            result = self.encoder.encode(content, modality)
            point_id = str(uuid.uuid4())

            points.append(
                qdrant["PointStruct"](
                    id=point_id,
                    vector=result.vector,
                    payload={"content": content, "modality": modality.value, **payload}
                )
            )
            ids.append(point_id)

        self.client.upsert(collection_name=collection, points=points)
        logger.info(f"Batch ingested {len(ids)} items to {collection}")

        return ids

    def count(self, collection: str = None) -> int:
        """Get count of items in collection."""
        collection = collection or self.collection
        return self.client.count(collection_name=collection).count

    def delete_collection(self, collection: str = None):
        """Delete a collection."""
        collection = collection or self.collection
        self.client.delete_collection(collection_name=collection)
        logger.info(f"Deleted collection: {collection}")
bioflow/qdrant_manager.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Qdrant Manager - Vector Database Integration
3
+ ==============================================
4
+
5
+ This module provides high-level management for Qdrant collections,
6
+ including ingestion, search, and retrieval operations for BioFlow.
7
+ """
8
+
9
+ import logging
10
+ from typing import List, Dict, Any, Optional, Union
11
+ from dataclasses import dataclass, field
12
+ from enum import Enum
13
+ import uuid
14
+
15
+ try:
16
+ from qdrant_client import QdrantClient
17
+ from qdrant_client.models import (
18
+ PointStruct,
19
+ VectorParams,
20
+ Distance,
21
+ Filter,
22
+ FieldCondition,
23
+ MatchValue,
24
+ SearchRequest,
25
+ UpdateStatus
26
+ )
27
+ QDRANT_AVAILABLE = True
28
+ except ImportError:
29
+ QDRANT_AVAILABLE = False
30
+
31
+ from bioflow.obm_wrapper import OBMWrapper, EmbeddingResult, ModalityType
32
+
33
+ logging.basicConfig(level=logging.INFO)
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
@dataclass
class SearchResult:
    """Container for search results."""
    # Point ID as stored in Qdrant.
    id: str
    # Similarity score returned by the search backend.
    score: float
    # Raw stored content (text, SMILES string, or sequence — as ingested).
    content: str
    # Modality label of the content (string, e.g. "text"; see QdrantManager.ingest).
    modality: str
    # Full stored payload including any user-supplied metadata.
    payload: Dict[str, Any] = field(default_factory=dict)
45
+
46
+
47
+ class QdrantManager:
48
+ """
49
+ High-level manager for Qdrant vector database operations.
50
+
51
+ Provides methods for:
52
+ - Collection management (create, delete, info)
53
+ - Data ingestion with automatic embedding
54
+ - Cross-modal similarity search
55
+ - Filtered retrieval
56
+ """
57
+
58
+ def __init__(
59
+ self,
60
+ obm: OBMWrapper,
61
+ qdrant_url: str = None,
62
+ qdrant_path: str = None,
63
+ default_collection: str = "bioflow_memory"
64
+ ):
65
+ """
66
+ Initialize QdrantManager.
67
+
68
+ Args:
69
+ obm: Initialized OBMWrapper instance.
70
+ qdrant_url: URL for remote Qdrant server.
71
+ qdrant_path: Path for local persistent storage.
72
+ default_collection: Default collection name.
73
+ """
74
+ if not QDRANT_AVAILABLE:
75
+ raise ImportError("qdrant-client is required. Install with: pip install qdrant-client")
76
+
77
+ self.obm = obm
78
+ self.default_collection = default_collection
79
+
80
+ if qdrant_url:
81
+ self.client = QdrantClient(url=qdrant_url)
82
+ logger.info(f"Connected to Qdrant server at {qdrant_url}")
83
+ elif qdrant_path:
84
+ self.client = QdrantClient(path=qdrant_path)
85
+ logger.info(f"Using local Qdrant at {qdrant_path}")
86
+ else:
87
+ self.client = QdrantClient(":memory:")
88
+ logger.info("Using in-memory Qdrant (data will be lost on exit)")
89
+
90
    def create_collection(
        self,
        name: str = None,
        recreate: bool = False
    ) -> bool:
        """
        Create a new collection.

        Args:
            name: Collection name (uses default if None).
            recreate: If True, deletes existing collection first.

        Returns:
            True if created successfully; False if creation failed
            (most commonly because the collection already exists).
        """
        name = name or self.default_collection

        if recreate:
            try:
                self.client.delete_collection(name)
            except Exception:
                # Best-effort delete: ignore failures (e.g. collection absent).
                pass

        try:
            self.client.create_collection(
                collection_name=name,
                vectors_config=VectorParams(
                    # Vector size comes from the OBM wrapper's embedding dim.
                    size=self.obm.vector_dim,
                    distance=Distance.COSINE
                )
            )
            logger.info(f"Created collection '{name}' with dim={self.obm.vector_dim}")
            return True
        except Exception as e:
            # Creation failed — likely a pre-existing collection; report
            # via return value instead of raising.
            logger.warning(f"Collection might exist: {e}")
            return False
126
+
127
+ def collection_exists(self, name: str = None) -> bool:
128
+ """Check if collection exists."""
129
+ name = name or self.default_collection
130
+ try:
131
+ collections = self.client.get_collections().collections
132
+ return any(c.name == name for c in collections)
133
+ except Exception:
134
+ return False
135
+
136
    def get_collection_info(self, name: str = None) -> Dict[str, Any]:
        """Get collection statistics.

        Returns a dict with keys "name", "points_count", "status",
        "vector_size", or {"error": ...} if the lookup fails. The attribute
        probing below tolerates several qdrant-client versions whose info
        objects differ in shape.
        """
        name = name or self.default_collection
        try:
            info = self.client.get_collection(name)
            # Handle different qdrant-client versions: newer expose
            # points_count, older vectors_count.
            points_count = getattr(info, 'points_count', None) or getattr(info, 'vectors_count', 0)
            status = getattr(info.status, 'value', 'unknown') if hasattr(info, 'status') and info.status else 'unknown'

            # Try to get vector size from config; fall back to the OBM
            # wrapper's dimension if the config shape is unrecognized.
            vector_size = self.obm.vector_dim
            if hasattr(info, 'config') and info.config:
                if hasattr(info.config, 'params') and hasattr(info.config.params, 'vectors'):
                    vectors_config = info.config.params.vectors
                    if hasattr(vectors_config, 'size'):
                        # Single unnamed vector config.
                        vector_size = vectors_config.size
                    elif isinstance(vectors_config, dict) and '' in vectors_config:
                        # Named-vectors dict form with the default ('') entry.
                        vector_size = vectors_config[''].size

            return {
                "name": name,
                "points_count": points_count,
                "status": status,
                "vector_size": vector_size
            }
        except Exception as e:
            # Surface the failure in-band rather than raising.
            return {"error": str(e)}
163
+
164
def ingest(
    self,
    items: List[Dict[str, Any]],
    collection: str = None,
    batch_size: int = 100
) -> Dict[str, int]:
    """
    Ingest multiple items with automatic embedding.

    Args:
        items: List of dicts with 'content', 'modality', and optional metadata.
        collection: Target collection name (uses default if None).
        batch_size: Number of points per upsert batch.

    Returns:
        Statistics dict with 'success', 'failed' and 'skipped' counts.
    """
    collection = collection or self.default_collection

    if not self.collection_exists(collection):
        self.create_collection(collection)

    stats = {"success": 0, "failed": 0, "skipped": 0}
    points = []

    for item in items:
        content = item.get("content")
        modality = item.get("modality", item.get("type", "text"))

        # Items without content cannot be embedded.
        if not content:
            stats["skipped"] += 1
            continue

        try:
            embedding = self.obm.encode(content, modality)

            # Keep all metadata, plus the content itself, in the payload.
            payload = {k: v for k, v in item.items() if k != "content"}
            payload["content"] = content
            payload["modality"] = modality
            payload["content_hash"] = embedding.content_hash

            point_id = item.get("id", str(uuid.uuid4()))

            # Qdrant point IDs must be integers (or UUIDs). Derive a
            # *stable* integer from string IDs via uuid5 rather than the
            # builtin hash(): string hashing is salted per process, so
            # hash() would give the same item a different ID on every run
            # and re-ingestion would duplicate points instead of upserting.
            if isinstance(point_id, int):
                stable_id = point_id
            else:
                stable_id = uuid.uuid5(uuid.NAMESPACE_OID, str(point_id)).int % (10**8)

            points.append(PointStruct(
                id=stable_id,
                vector=embedding.vector.tolist(),
                payload=payload
            ))
            stats["success"] += 1

            # Flush a full batch to keep memory bounded on large ingests.
            if len(points) >= batch_size:
                self.client.upsert(collection_name=collection, points=points)
                points = []

        except Exception as e:
            logger.error(f"Failed to embed: {e}")
            stats["failed"] += 1

    # Upload any remaining partial batch.
    if points:
        self.client.upsert(collection_name=collection, points=points)

    logger.info(f"Ingestion complete: {stats}")
    return stats
229
+
230
def search(
    self,
    query: str,
    query_modality: str = "text",
    collection: str = None,
    limit: int = 10,
    filter_modality: str = None,
    filters: Dict[str, Any] = None
) -> List[SearchResult]:
    """
    Search for items similar to the query.

    Args:
        query: Query content (text, SMILES, or protein sequence).
        query_modality: Modality of the query.
        collection: Collection to search (default collection if None).
        limit: Maximum results to return.
        filter_modality: Only return results of this modality.
        filters: Additional exact-match payload filters.

    Returns:
        List of SearchResult objects ordered by similarity.
    """
    collection = collection or self.default_collection

    # Embed the query into the shared OBM vector space.
    embedding = self.obm.encode(query, query_modality)

    # Assemble exact-match payload constraints, if any were requested.
    constraints = []
    if filter_modality:
        constraints.append(
            FieldCondition(key="modality", match=MatchValue(value=filter_modality))
        )
    for key, value in (filters or {}).items():
        constraints.append(
            FieldCondition(key=key, match=MatchValue(value=value))
        )
    qdrant_filter = Filter(must=constraints) if constraints else None

    # Search using query_points (new API)
    response = self.client.query_points(
        collection_name=collection,
        query=embedding.vector.tolist(),
        limit=limit,
        query_filter=qdrant_filter
    )

    hits = []
    for point in response.points:
        payload = point.payload or {}
        hits.append(SearchResult(
            id=str(point.id),
            score=point.score,
            content=payload.get("content", ""),
            modality=payload.get("modality", "unknown"),
            payload=payload
        ))
    return hits
294
+
295
def cross_modal_search(
    self,
    query: str,
    query_modality: str,
    target_modality: str,
    collection: str = None,
    limit: int = 10
) -> List[SearchResult]:
    """
    Search across modalities, e.g. a text query returning molecules.

    This is a convenience wrapper: a cross-modal lookup is just a plain
    search whose results are restricted to the target modality.

    Args:
        query: Query content.
        query_modality: Modality of the query ('text', 'smiles', 'protein').
        target_modality: Modality of the desired results.
        collection: Collection to search (default collection if None).
        limit: Maximum results.

    Returns:
        SearchResult objects drawn only from the target modality.
    """
    return self.search(
        query=query,
        query_modality=query_modality,
        collection=collection,
        limit=limit,
        filter_modality=target_modality
    )
323
+
324
def get_neighbors_diversity(
    self,
    query: str,
    query_modality: str,
    collection: str = None,
    k: int = 10
) -> Dict[str, Any]:
    """
    Analyze the diversity of the query's top-k neighborhood.

    Returns summary statistics of the embedding neighborhood: mean/std/
    min/max of similarity scores, a per-modality count, and a diversity
    score (1 - std of scores; 0.0 when only one neighbor is found).
    """
    import numpy as np

    neighbors = self.search(query, query_modality, collection, limit=k)
    if not neighbors:
        return {"error": "No results found"}

    scores_arr = np.array([hit.score for hit in neighbors])

    # Tally how many neighbors fall into each modality.
    modality_counts = {}
    for hit in neighbors:
        modality_counts[hit.modality] = modality_counts.get(hit.modality, 0) + 1

    # Tight score clusters (low std) count as high diversity here.
    diversity = 1.0 - np.std(scores_arr) if len(scores_arr) > 1 else 0.0

    return {
        "k": k,
        "mean_similarity": float(np.mean(scores_arr)),
        "std_similarity": float(np.std(scores_arr)),
        "min_similarity": float(np.min(scores_arr)),
        "max_similarity": float(np.max(scores_arr)),
        "modality_distribution": modality_counts,
        "diversity_score": float(diversity)
    }
bioflow/ui/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow UI Package
3
+ ===================
4
+
5
+ Modern Streamlit-based interface for the BioFlow platform.
6
+
7
+ Pages:
8
+ - Home: Dashboard with key metrics and quick actions
9
+ - Discovery: Drug discovery pipeline interface
10
+ - Explorer: Vector space visualization
11
+ - Data: Data ingestion and management
12
+ - Settings: Configuration and preferences
13
+ """
14
+
15
+ __version__ = "2.0.0"
bioflow/ui/app.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow - AI-Powered Drug Discovery Platform
3
+ ==============================================
4
+ Main application entry point.
5
+ """
6
+
7
+ import streamlit as st
8
+ import sys
9
+ import os
10
+
11
+ # Setup path for imports
12
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
13
+
14
+ from bioflow.ui.config import get_css
15
+ from bioflow.ui.components import side_nav
16
+ from bioflow.ui.pages import home, discovery, explorer, data, settings
17
+
18
+
19
def main():
    """Application entry point: configure the page and route to the active view."""

    # Page config must be the first Streamlit call of the script run.
    st.set_page_config(
        page_title="BioFlow",
        page_icon="🧬",
        layout="wide",
        initial_sidebar_state="collapsed",
    )

    # Inject the shared stylesheet.
    st.markdown(get_css(), unsafe_allow_html=True)

    # Default landing page on first load.
    if "current_page" not in st.session_state:
        st.session_state.current_page = "home"

    # Two-column shell: navigation rail on the left, page content on the right.
    nav_col, content_col = st.columns([1, 3.6], gap="large")

    with nav_col:
        choice = side_nav(active_page=st.session_state.current_page)

    # A navigation change triggers an immediate rerun with the new page.
    if choice != st.session_state.current_page:
        st.session_state.current_page = choice
        st.rerun()

    with content_col:
        renderers = {
            "home": home.render,
            "discovery": discovery.render,
            "explorer": explorer.render,
            "data": data.render,
            "settings": settings.render,
        }
        renderers.get(st.session_state.current_page, home.render)()


if __name__ == "__main__":
    main()
bioflow/ui/components.py ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow UI - Components Library
3
+ ================================
4
+ Reusable, modern UI components for Streamlit.
5
+ """
6
+
7
+ import streamlit as st
8
+ from typing import List, Dict, Any, Optional, Callable
9
+ import plotly.express as px
10
+ import plotly.graph_objects as go
11
+
12
+ # Import colors
13
+ import sys
14
+ import os
15
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
16
+ from bioflow.ui.config import COLORS
17
+
18
+
19
+ # === Navigation ===
20
+
21
def side_nav(active_page: str = "home") -> str:
    """Left vertical navigation list. Returns the selected page key.

    The brand header is rendered as static HTML; the actual selection is a
    st.radio with its label hidden, so Streamlit owns the widget state
    (key "nav_radio").

    Args:
        active_page: Page key to show as the current selection.

    Returns:
        The key ('home', 'discovery', ...) of the page the user picked.
    """

    # (key, icon, visible label) in display order; keys must match the
    # router map in app.py.
    nav_items = [
        ("home", "🏠", "Home"),
        ("discovery", "🔬", "Discovery"),
        ("explorer", "🧬", "Explorer"),
        ("data", "📊", "Data"),
        ("settings", "⚙️", "Settings"),
    ]

    st.markdown(
        f"""
        <div class="nav-rail">
            <div class="nav-brand">
                <div class="nav-logo">🧬</div>
                <div class="nav-title">Bio<span>Flow</span></div>
            </div>
            <div class="nav-section">Navigation</div>
        </div>
        """,
        unsafe_allow_html=True,
    )

    # Caption shown for each radio option: "icon label".
    label_map = {key: f"{icon} {label}" for key, icon, label in nav_items}
    options = [item[0] for item in nav_items]

    selected = st.radio(
        "Navigation",
        options=options,
        index=options.index(active_page),
        format_func=lambda x: label_map.get(x, x),
        key="nav_radio",
        label_visibility="collapsed",
    )

    return selected
58
+
59
+
60
+ # === Page Structure ===
61
+
62
def page_header(title: str, subtitle: str = "", icon: str = ""):
    """Render a page header with a title and optional icon/subtitle.

    Args:
        title: Main heading text.
        subtitle: Muted one-line description shown under the title.
        icon: Emoji/glyph rendered before the title.
    """
    # Icon and subtitle spans are emitted only when provided (nested
    # conditional f-strings collapse to '' otherwise).
    header_html = f"""
    <div style="margin-bottom: 2rem;">
        <h1 style="display: flex; align-items: center; gap: 0.75rem; margin: 0;">
            {f'<span style="font-size: 2rem;">{icon}</span>' if icon else ''}
            {title}
        </h1>
        {f'<p style="margin-top: 0.5rem; font-size: 1rem; color: {COLORS.text_muted};">{subtitle}</p>' if subtitle else ''}
    </div>
    """
    st.markdown(header_html, unsafe_allow_html=True)
74
+
75
+
76
def section_header(title: str, icon: str = "", link_text: str = "", link_action: Optional[Callable] = None):
    """Render a section header with an optional right-aligned action button.

    Args:
        title: Section title (also used in the button's widget key, so
            titles should be unique per page to avoid key collisions).
        icon: Optional glyph shown before the title.
        link_text: If set, a button with this label is rendered.
        link_action: Zero-arg callback invoked when the button is clicked.
    """
    col1, col2 = st.columns([4, 1])

    with col1:
        st.markdown(f"""
        <div class="section-title">
            {f'<span>{icon}</span>' if icon else ''}
            {title}
        </div>
        """, unsafe_allow_html=True)

    with col2:
        if link_text:
            if st.button(link_text, key=f"section_{title}", use_container_width=True):
                if link_action:
                    link_action()
94
+
95
def divider():
    """Render a horizontal visual divider (styled via the .divider CSS class)."""
    st.markdown('<div class="divider"></div>', unsafe_allow_html=True)
98
+
99
+
100
def spacer(height: str = "1rem"):
    """Insert vertical whitespace of the given CSS height (e.g. '0.75rem')."""
    st.markdown(f'<div style="height: {height};"></div>', unsafe_allow_html=True)
103
+
104
+
105
+ # === Metrics ===
106
+
107
def metric_card(
    value: str,
    label: str,
    icon: str = "📊",
    change: Optional[str] = None,
    change_type: str = "up",
    color: str = COLORS.primary
):
    """Render a single metric card with icon and optional trend indicator.

    Args:
        value: Headline value (already formatted).
        label: Caption under the value.
        icon: Glyph inside the tinted icon tile.
        change: Optional trend text (e.g. "12%"); omitted when None.
        change_type: "up" (green) or "down" (red) trend styling.
        color: CSS color for the icon tile; rgb(...) strings are rewritten
            to a 15%-alpha rgba background, hex colors get a "22" alpha
            suffix appended.
    """
    bg_color = color.replace(")", ", 0.15)").replace("rgb", "rgba") if "rgb" in color else f"{color}22"
    change_html = ""
    if change:
        arrow = "↑" if change_type == "up" else "↓"
        change_html = f'<div class="metric-change {change_type}">{arrow} {change}</div>'

    st.markdown(f"""
    <div class="metric">
        <div class="metric-icon" style="background: {bg_color}; color: {color};">
            {icon}
        </div>
        <div class="metric-value">{value}</div>
        <div class="metric-label">{label}</div>
        {change_html}
    </div>
    """, unsafe_allow_html=True)
132
+
133
+
134
def metric_row(metrics: List[Dict[str, Any]]):
    """Render a horizontal row of metric cards, one column per metric.

    Args:
        metrics: One kwargs-dict per card, forwarded verbatim to metric_card.
    """
    columns = st.columns(len(metrics))
    for idx, spec in enumerate(metrics):
        with columns[idx]:
            metric_card(**spec)
140
+
141
+
142
+ # === Quick Actions ===
143
+
144
def quick_action(icon: str, title: str, description: str, key: str) -> bool:
    """Render one quick-action button.

    Args:
        icon: Glyph prepended to the label.
        title: Button label text.
        description: Tooltip shown on hover.
        key: Unique widget key.

    Returns:
        True if the button was clicked during this script run.
    """
    return st.button(
        f"{icon} {title}",
        key=key,
        use_container_width=True,
        help=description
    )
153
+
154
+
155
def quick_actions_grid(actions: List[Dict[str, Any]], columns: int = 4) -> Optional[str]:
    """Render a grid of quick-action cards with a "Select" button each.

    Args:
        actions: Dicts with 'icon', 'title', 'key' and optional 'description'.
        columns: Number of grid columns; actions wrap row by row.

    Returns:
        The 'key' of the clicked action, or None if nothing was clicked
        this run. (At most one button can register a click per rerun.)
    """
    cols = st.columns(columns)
    clicked_key = None

    for i, action in enumerate(actions):
        with cols[i % columns]:
            st.markdown(f"""
            <div class="quick-action">
                <span class="quick-action-icon">{action['icon']}</span>
                <div class="quick-action-title">{action['title']}</div>
                <div class="quick-action-desc">{action.get('description', '')}</div>
            </div>
            """, unsafe_allow_html=True)

            # The HTML card above is display-only; this button carries the
            # actual click behavior, keyed by the action's own key.
            if st.button("Select", key=action['key'], use_container_width=True):
                clicked_key = action['key']

    return clicked_key
174
+
175
+
176
+ # === Pipeline Progress ===
177
+
178
def pipeline_progress(steps: List[Dict[str, Any]]):
    """Render a horizontal pipeline of steps with progress styling.

    Each step dict may carry:
        status: 'done' | 'active' | 'pending' (defaults to 'pending')
        icon:   glyph shown while the step is active
        name:   caption shown under the dot

    Done steps show a check mark, the active step shows its icon, and
    pending steps show their 1-based position number.
    """
    html = '<div class="pipeline">'

    for i, step in enumerate(steps):
        status = step.get('status', 'pending')
        icon = step.get('icon', str(i + 1))
        name = step.get('name', f'Step {i + 1}')

        # Pick the glyph for the dot based on step status.
        if status == 'done':
            display = '✓'
        elif status == 'active':
            display = icon
        else:
            display = str(i + 1)

        html += f'''
        <div class="step">
            <div class="step-dot {status}">{display}</div>
            <span class="step-name">{name}</span>
        </div>
        '''

        # Add connecting line (except after last step); the connector is
        # highlighted only when the step *before* it is done.
        if i < len(steps) - 1:
            line_status = 'done' if status == 'done' else ''
            html += f'<div class="step-line {line_status}"></div>'

    html += '</div>'
    st.markdown(html, unsafe_allow_html=True)
209
+
210
+
211
+ # === Results ===
212
+
213
def result_card(
    title: str,
    score: float,
    properties: Dict[str, str] = None,
    badges: List[str] = None,
    key: str = ""
) -> bool:
    """Render a result card with a color-coded score, properties and badges.

    Args:
        title: Card headline.
        score: Similarity/confidence in [0, 1]; >=0.8 renders green,
            >=0.5 amber, otherwise red.
        properties: Optional label -> value pairs shown under the title.
        badges: Optional list of badge labels.
        key: Widget key for the "View Details" button; when empty, no
            button is rendered and the function returns False.

    Returns:
        True if the "View Details" button was clicked this run.
    """

    # Map the numeric score onto a traffic-light CSS class.
    if score >= 0.8:
        score_class = "score-high"
    elif score >= 0.5:
        score_class = "score-med"
    else:
        score_class = "score-low"

    # Properties HTML (only rendered when properties were supplied).
    props_html = ""
    if properties:
        props_html = '<div style="display: flex; gap: 1rem; margin-top: 0.75rem; flex-wrap: wrap;">'
        for k, v in properties.items():
            props_html += f'''
            <div style="font-size: 0.8125rem;">
                <span style="color: {COLORS.text_muted};">{k}:</span>
                <span style="color: {COLORS.text_secondary}; margin-left: 0.25rem;">{v}</span>
            </div>
            '''
        props_html += '</div>'

    # Badges HTML (only rendered when badges were supplied).
    badges_html = ""
    if badges:
        badges_html = '<div style="display: flex; gap: 0.5rem; margin-top: 0.75rem;">'
        for b in badges:
            badges_html += f'<span class="badge badge-primary">{b}</span>'
        badges_html += '</div>'

    st.markdown(f"""
    <div class="result">
        <div style="display: flex; justify-content: space-between; align-items: flex-start;">
            <div style="font-weight: 600; color: {COLORS.text_primary};">{title}</div>
            <div class="{score_class}" style="font-size: 1.25rem; font-weight: 700;">{score:.1%}</div>
        </div>
        {props_html}
        {badges_html}
    </div>
    """, unsafe_allow_html=True)

    return st.button("View Details", key=key, use_container_width=True) if key else False
263
+
264
+
265
def results_list(results: List[Dict[str, Any]], empty_message: str = "No results found"):
    """Render a vertical list of result cards, or an empty-state placeholder.

    Args:
        results: Dicts with optional 'title', 'score', 'properties', 'badges'.
        empty_message: Description shown when the list is empty.
    """
    if not results:
        empty_state(icon="🔍", title="No Results", description=empty_message)
        return

    for idx, item in enumerate(results):
        result_card(
            title=item.get('title', f'Result {idx + 1}'),
            score=item.get('score', 0),
            properties=item.get('properties'),
            badges=item.get('badges'),
            key=f"result_{idx}"
        )
        spacer("0.75rem")
280
+
281
+
282
+ # === Charts ===
283
+
284
def bar_chart(data: Dict[str, float], title: str = "", height: int = 300):
    """Render a theme-styled Plotly bar chart.

    Args:
        data: Category label -> bar value mapping (insertion order kept).
        title: Optional chart title.
        height: Chart height in pixels.
    """
    fig = go.Figure(data=[
        go.Bar(
            x=list(data.keys()),
            y=list(data.values()),
            marker_color=COLORS.primary,
            marker_line_width=0,
        )
    ])

    # Transparent backgrounds so the chart blends with the dark app theme.
    fig.update_layout(
        title=title,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(family="Inter", color=COLORS.text_secondary),
        height=height,
        margin=dict(l=40, r=20, t=40, b=40),
        xaxis=dict(
            showgrid=False,
            showline=True,
            linecolor=COLORS.border,
        ),
        yaxis=dict(
            showgrid=True,
            gridcolor=COLORS.border,
            showline=False,
        ),
    )

    st.plotly_chart(fig, use_container_width=True)
315
+
316
+
317
def scatter_chart(x: List, y: List, labels: List = None, title: str = "", height: int = 400):
    """Render a theme-styled Plotly scatter plot.

    Args:
        x: X coordinates.
        y: Y coordinates (same length as x).
        labels: Optional per-point hover labels; enables a custom hover
            template when provided.
        title: Optional chart title.
        height: Chart height in pixels.
    """
    fig = go.Figure(data=[
        go.Scatter(
            x=x,
            y=y,
            mode='markers',
            marker=dict(
                size=10,
                color=COLORS.primary,
                opacity=0.7,
            ),
            text=labels,
            hovertemplate='<b>%{text}</b><br>X: %{x}<br>Y: %{y}<extra></extra>' if labels else None,
        )
    ])

    # Transparent backgrounds so the chart blends with the dark app theme.
    fig.update_layout(
        title=title,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(family="Inter", color=COLORS.text_secondary),
        height=height,
        margin=dict(l=40, r=20, t=40, b=40),
        xaxis=dict(
            showgrid=True,
            gridcolor=COLORS.border,
            showline=True,
            linecolor=COLORS.border,
        ),
        yaxis=dict(
            showgrid=True,
            gridcolor=COLORS.border,
            showline=True,
            linecolor=COLORS.border,
        ),
    )

    st.plotly_chart(fig, use_container_width=True)
356
+
357
+
358
def heatmap(data: List[List[float]], x_labels: List[str], y_labels: List[str], title: str = "", height: int = 400):
    """Render a theme-styled Plotly heatmap.

    Args:
        data: 2D matrix of cell values (rows correspond to y_labels).
        x_labels: Column labels.
        y_labels: Row labels.
        title: Optional chart title.
        height: Chart height in pixels.
    """
    fig = go.Figure(data=[
        go.Heatmap(
            z=data,
            x=x_labels,
            y=y_labels,
            # Three-stop colorscale: muted background -> primary -> cyan.
            colorscale=[
                [0, COLORS.bg_hover],
                [0.5, COLORS.primary],
                [1, COLORS.cyan],
            ],
        )
    ])

    fig.update_layout(
        title=title,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(family="Inter", color=COLORS.text_secondary),
        height=height,
        margin=dict(l=80, r=20, t=40, b=60),
    )

    st.plotly_chart(fig, use_container_width=True)
383
+
384
+
385
+ # === Data Display ===
386
+
387
def data_table(data: List[Dict], columns: List[str] = None):
    """Show records in an index-free Streamlit dataframe.

    Args:
        data: List of row dicts.
        columns: Optional subset/ordering of columns to display.
    """
    import pandas as pd

    frame = pd.DataFrame(data)
    if columns:
        # Restrict and reorder to the requested columns.
        frame = frame[columns]
    st.dataframe(frame, use_container_width=True, hide_index=True)
394
+
395
+
396
+ # === States ===
397
+
398
def empty_state(icon: str = "📭", title: str = "No Data", description: str = ""):
    """Render an empty-state placeholder (icon, title, description)."""
    st.markdown(f"""
    <div class="empty">
        <div class="empty-icon">{icon}</div>
        <div class="empty-title">{title}</div>
        <div class="empty-desc">{description}</div>
    </div>
    """, unsafe_allow_html=True)
407
+
408
+
409
def loading_state(message: str = "Loading..."):
    """Render a loading placeholder with a CSS spinner and message."""
    st.markdown(f"""
    <div class="loading">
        <div class="spinner"></div>
        <div class="loading-text">{message}</div>
    </div>
    """, unsafe_allow_html=True)
417
+
418
+
419
+ # === Molecule Display ===
420
+
421
def molecule_2d(smiles: str, size: int = 200):
    """Display a 2D molecule depiction rendered from a SMILES string.

    Uses RDKit to draw the molecule and embeds the PNG inline as a
    base64 data URI. If RDKit is not installed, falls back to showing
    the raw SMILES string; an unparsable SMILES shows a warning.

    Args:
        smiles: SMILES string to depict.
        size: Image width/height in pixels (square).
    """
    try:
        # RDKit is optional; imported lazily so the UI loads without it.
        from rdkit import Chem
        from rdkit.Chem import Draw
        import base64
        from io import BytesIO

        mol = Chem.MolFromSmiles(smiles)
        if mol:
            img = Draw.MolToImage(mol, size=(size, size))
            buffered = BytesIO()
            img.save(buffered, format="PNG")
            img_str = base64.b64encode(buffered.getvalue()).decode()

            st.markdown(f"""
            <div class="mol-container">
                <img src="data:image/png;base64,{img_str}" alt="Molecule" style="max-width: 100%; height: auto;">
            </div>
            """, unsafe_allow_html=True)
        else:
            st.warning("Invalid SMILES")
    except ImportError:
        st.info(f"SMILES: `{smiles}`")
445
+
446
+
447
+ # === Evidence & Links ===
448
+
449
def evidence_row(items: List[Dict[str, str]]):
    """Render a row of evidence/source links opening in new tabs.

    Args:
        items: Dicts with optional 'icon', 'label' and 'url' keys.
    """
    html = '<div style="display: flex; gap: 0.5rem; flex-wrap: wrap; margin-top: 0.75rem;">'
    for item in items:
        icon = item.get('icon', '📄')
        label = item.get('label', 'Source')
        url = item.get('url', '#')
        html += f'''
        <a href="{url}" target="_blank" class="evidence">
            <span>{icon}</span>
            <span>{label}</span>
        </a>
        '''
    html += '</div>'
    st.markdown(html, unsafe_allow_html=True)
464
+
465
+
466
+ # === Badges ===
467
+
468
def badge(text: str, variant: str = "primary"):
    """Render an inline badge; variant maps to a .badge-<variant> CSS class."""
    st.markdown(f'<span class="badge badge-{variant}">{text}</span>', unsafe_allow_html=True)
471
+
472
+
473
def badge_row(badges: List[Dict[str, str]]):
    """Render a wrapping row of badges.

    Args:
        badges: Dicts with 'text' and optional 'variant' (default 'primary').
    """
    html = '<div style="display: flex; gap: 0.5rem; flex-wrap: wrap;">'
    for b in badges:
        text = b.get('text', '')
        variant = b.get('variant', 'primary')
        html += f'<span class="badge badge-{variant}">{text}</span>'
    html += '</div>'
    st.markdown(html, unsafe_allow_html=True)
bioflow/ui/config.py ADDED
@@ -0,0 +1,583 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow UI - Modern Design System
3
+ ==================================
4
+ Clean, minimal, and highly usable interface.
5
+ """
6
+
7
+ from dataclasses import dataclass
8
+
9
+
10
@dataclass
class Colors:
    """Color palette - Modern dark theme.

    Single source of truth for every color used by the UI. Values are CSS
    color strings (hex or rgba) so they can be interpolated directly into
    inline styles and the generated stylesheet.
    """
    # Primary brand violet, its hover shade, and a translucent variant
    primary: str = "#8B5CF6"
    primary_hover: str = "#A78BFA"
    primary_muted: str = "rgba(139, 92, 246, 0.15)"

    # Accents
    cyan: str = "#22D3EE"
    emerald: str = "#34D399"
    amber: str = "#FBBF24"
    rose: str = "#FB7185"

    # Backgrounds, darkest app shell to progressively lighter surfaces
    bg_app: str = "#0C0E14"
    bg_surface: str = "#14161E"
    bg_elevated: str = "#1C1F2B"
    bg_hover: str = "#252836"

    # Text
    text_primary: str = "#F8FAFC"
    text_secondary: str = "#A1A7BB"
    text_muted: str = "#6B7280"

    # Borders
    border: str = "#2A2D3A"
    border_hover: str = "#3F4354"

    # Status
    success: str = "#10B981"
    warning: str = "#F59E0B"
    error: str = "#EF4444"
    info: str = "#3B82F6"


# Shared palette instance imported throughout the UI package.
COLORS = Colors()
47
+
48
+
49
+ def get_css() -> str:
50
+ """Minimalist, professional CSS using string concatenation to avoid f-string issues."""
51
+
52
+ css = """
53
+ <style>
54
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap');
55
+
56
+ :root {
57
+ --primary: """ + COLORS.primary + """;
58
+ --bg-app: """ + COLORS.bg_app + """;
59
+ --bg-surface: """ + COLORS.bg_surface + """;
60
+ --text: """ + COLORS.text_primary + """;
61
+ --text-muted: """ + COLORS.text_muted + """;
62
+ --border: """ + COLORS.border + """;
63
+ --radius: 12px;
64
+ --transition: 150ms ease;
65
+ }
66
+
67
+ .stApp {
68
+ background: """ + COLORS.bg_app + """;
69
+ font-family: 'Inter', sans-serif;
70
+ }
71
+
72
+ #MainMenu, footer, header { visibility: hidden; }
73
+ .stDeployButton { display: none; }
74
+
75
+ ::-webkit-scrollbar { width: 6px; height: 6px; }
76
+ ::-webkit-scrollbar-track { background: transparent; }
77
+ ::-webkit-scrollbar-thumb { background: """ + COLORS.border + """; border-radius: 3px; }
78
+ ::-webkit-scrollbar-thumb:hover { background: """ + COLORS.border_hover + """; }
79
+
80
+ section[data-testid="stSidebar"] { display: none !important; }
81
+
82
+ h1, h2, h3 {
83
+ font-weight: 600;
84
+ color: """ + COLORS.text_primary + """;
85
+ letter-spacing: -0.025em;
86
+ }
87
+
88
+ h1 { font-size: 1.875rem; margin-bottom: 0.5rem; }
89
+ h2 { font-size: 1.5rem; }
90
+ h3 { font-size: 1.125rem; }
91
+
92
+ p { color: """ + COLORS.text_secondary + """; line-height: 1.6; }
93
+
94
+ .card {
95
+ background: """ + COLORS.bg_surface + """;
96
+ border: 1px solid """ + COLORS.border + """;
97
+ border-radius: var(--radius);
98
+ padding: 1.25rem;
99
+ }
100
+
101
+ .metric {
102
+ background: """ + COLORS.bg_surface + """;
103
+ border: 1px solid """ + COLORS.border + """;
104
+ border-radius: var(--radius);
105
+ padding: 1.25rem;
106
+ transition: border-color var(--transition);
107
+ }
108
+
109
+ .metric:hover { border-color: """ + COLORS.primary + """; }
110
+
111
+ .metric-icon {
112
+ width: 44px;
113
+ height: 44px;
114
+ border-radius: 10px;
115
+ display: flex;
116
+ align-items: center;
117
+ justify-content: center;
118
+ font-size: 1.375rem;
119
+ margin-bottom: 1rem;
120
+ }
121
+
122
+ .metric-value {
123
+ font-size: 2rem;
124
+ font-weight: 700;
125
+ color: """ + COLORS.text_primary + """;
126
+ line-height: 1;
127
+ }
128
+
129
+ .metric-label {
130
+ font-size: 0.875rem;
131
+ color: """ + COLORS.text_muted + """;
132
+ margin-top: 0.375rem;
133
+ }
134
+
135
+ .metric-change {
136
+ display: inline-flex;
137
+ align-items: center;
138
+ font-size: 0.75rem;
139
+ font-weight: 500;
140
+ padding: 0.25rem 0.5rem;
141
+ border-radius: 6px;
142
+ margin-top: 0.5rem;
143
+ }
144
+
145
+ .metric-change.up { background: rgba(16, 185, 129, 0.15); color: """ + COLORS.success + """; }
146
+ .metric-change.down { background: rgba(239, 68, 68, 0.15); color: """ + COLORS.error + """; }
147
+
148
+ .stButton > button {
149
+ font-family: 'Inter', sans-serif;
150
+ font-weight: 500;
151
+ font-size: 0.875rem;
152
+ border-radius: 8px;
153
+ padding: 0.625rem 1.25rem;
154
+ transition: all var(--transition);
155
+ border: none;
156
+ }
157
+
158
+ .stTextInput input,
159
+ .stTextArea textarea,
160
+ .stSelectbox > div > div {
161
+ background: """ + COLORS.bg_app + """ !important;
162
+ border: 1px solid """ + COLORS.border + """ !important;
163
+ border-radius: 10px !important;
164
+ color: """ + COLORS.text_primary + """ !important;
165
+ font-family: 'Inter', sans-serif !important;
166
+ }
167
+
168
+ .stTextInput input:focus,
169
+ .stTextArea textarea:focus {
170
+ border-color: """ + COLORS.primary + """ !important;
171
+ box-shadow: 0 0 0 3px """ + COLORS.primary_muted + """ !important;
172
+ }
173
+
174
+ .stTabs [data-baseweb="tab-list"] {
175
+ gap: 0;
176
+ background: """ + COLORS.bg_surface + """;
177
+ border-radius: 10px;
178
+ padding: 4px;
179
+ border: 1px solid """ + COLORS.border + """;
180
+ }
181
+
182
+ .stTabs [data-baseweb="tab"] {
183
+ height: auto;
184
+ padding: 0.625rem 1.25rem;
185
+ border-radius: 8px;
186
+ font-weight: 500;
187
+ font-size: 0.875rem;
188
+ color: """ + COLORS.text_muted + """;
189
+ background: transparent;
190
+ }
191
+
192
+ .stTabs [aria-selected="true"] {
193
+ background: """ + COLORS.primary + """ !important;
194
+ color: white !important;
195
+ }
196
+
197
+ .stTabs [data-baseweb="tab-highlight"],
198
+ .stTabs [data-baseweb="tab-border"] { display: none; }
199
+
200
+ .pipeline {
201
+ display: flex;
202
+ align-items: center;
203
+ background: """ + COLORS.bg_surface + """;
204
+ border: 1px solid """ + COLORS.border + """;
205
+ border-radius: var(--radius);
206
+ padding: 1.5rem;
207
+ gap: 0;
208
+ }
209
+
210
+ .step {
211
+ display: flex;
212
+ flex-direction: column;
213
+ align-items: center;
214
+ gap: 0.5rem;
215
+ flex: 1;
216
+ }
217
+
218
+ .step-dot {
219
+ width: 44px;
220
+ height: 44px;
221
+ border-radius: 50%;
222
+ display: flex;
223
+ align-items: center;
224
+ justify-content: center;
225
+ font-size: 1.125rem;
226
+ font-weight: 600;
227
+ transition: all var(--transition);
228
+ }
229
+
230
+ .step-dot.pending {
231
+ background: """ + COLORS.bg_hover + """;
232
+ color: """ + COLORS.text_muted + """;
233
+ border: 2px dashed """ + COLORS.border_hover + """;
234
+ }
235
+
236
+ .step-dot.active {
237
+ background: """ + COLORS.primary + """;
238
+ color: white;
239
+ box-shadow: 0 0 24px rgba(139, 92, 246, 0.5);
240
+ }
241
+
242
+ .step-dot.done {
243
+ background: """ + COLORS.emerald + """;
244
+ color: white;
245
+ }
246
+
247
+ .step-name {
248
+ font-size: 0.75rem;
249
+ font-weight: 500;
250
+ color: """ + COLORS.text_muted + """;
251
+ }
252
+
253
+ .step-line {
254
+ flex: 0.6;
255
+ height: 2px;
256
+ background: """ + COLORS.border + """;
257
+ }
258
+
259
+ .step-line.done { background: """ + COLORS.emerald + """; }
260
+
261
+ .result {
262
+ background: """ + COLORS.bg_surface + """;
263
+ border: 1px solid """ + COLORS.border + """;
264
+ border-radius: var(--radius);
265
+ padding: 1.25rem;
266
+ transition: all var(--transition);
267
+ cursor: pointer;
268
+ }
269
+
270
+ .result:hover {
271
+ border-color: """ + COLORS.primary + """;
272
+ transform: translateY(-2px);
273
+ box-shadow: 0 8px 24px rgba(0, 0, 0, 0.2);
274
+ }
275
+
276
+ .score-high { color: """ + COLORS.emerald + """; }
277
+ .score-med { color: """ + COLORS.amber + """; }
278
+ .score-low { color: """ + COLORS.rose + """; }
279
+
280
+ .badge {
281
+ display: inline-flex;
282
+ align-items: center;
283
+ padding: 0.25rem 0.625rem;
284
+ border-radius: 6px;
285
+ font-size: 0.6875rem;
286
+ font-weight: 600;
287
+ text-transform: uppercase;
288
+ }
289
+
290
+ .badge-primary { background: """ + COLORS.primary_muted + """; color: """ + COLORS.primary + """; }
291
+ .badge-success { background: rgba(16, 185, 129, 0.15); color: """ + COLORS.success + """; }
292
+ .badge-warning { background: rgba(245, 158, 11, 0.15); color: """ + COLORS.warning + """; }
293
+ .badge-error { background: rgba(239, 68, 68, 0.15); color: """ + COLORS.error + """; }
294
+
295
+ .quick-action {
296
+ background: """ + COLORS.bg_surface + """;
297
+ border: 1px solid """ + COLORS.border + """;
298
+ border-radius: var(--radius);
299
+ padding: 1.5rem;
300
+ text-align: center;
301
+ cursor: pointer;
302
+ transition: all var(--transition);
303
+ }
304
+
305
+ .quick-action:hover {
306
+ border-color: """ + COLORS.primary + """;
307
+ transform: translateY(-4px);
308
+ box-shadow: 0 12px 32px rgba(0, 0, 0, 0.25);
309
+ }
310
+
311
+ .quick-action-icon {
312
+ font-size: 2.5rem;
313
+ margin-bottom: 0.75rem;
314
+ display: block;
315
+ }
316
+
317
+ .quick-action-title {
318
+ font-size: 0.9375rem;
319
+ font-weight: 600;
320
+ color: """ + COLORS.text_primary + """;
321
+ }
322
+
323
+ .quick-action-desc {
324
+ font-size: 0.8125rem;
325
+ color: """ + COLORS.text_muted + """;
326
+ margin-top: 0.25rem;
327
+ }
328
+
329
+ .section-header {
330
+ display: flex;
331
+ align-items: center;
332
+ justify-content: space-between;
333
+ margin-bottom: 1rem;
334
+ }
335
+
336
+ .section-title {
337
+ font-size: 1rem;
338
+ font-weight: 600;
339
+ color: """ + COLORS.text_primary + """;
340
+ display: flex;
341
+ align-items: center;
342
+ gap: 0.5rem;
343
+ }
344
+
345
+ .section-link {
346
+ font-size: 0.8125rem;
347
+ color: """ + COLORS.primary + """;
348
+ cursor: pointer;
349
+ }
350
+
351
+ .section-link:hover { text-decoration: underline; }
352
+
353
+ .empty {
354
+ display: flex;
355
+ flex-direction: column;
356
+ align-items: center;
357
+ justify-content: center;
358
+ padding: 4rem 2rem;
359
+ text-align: center;
360
+ }
361
+
362
+ .empty-icon { font-size: 3.5rem; margin-bottom: 1rem; opacity: 0.4; }
363
+ .empty-title { font-size: 1.125rem; font-weight: 600; color: """ + COLORS.text_primary + """; }
364
+ .empty-desc { font-size: 0.9375rem; color: """ + COLORS.text_muted + """; max-width: 320px; margin-top: 0.5rem; }
365
+
366
+ .loading {
367
+ display: flex;
368
+ flex-direction: column;
369
+ align-items: center;
370
+ padding: 3rem;
371
+ }
372
+
373
+ .spinner {
374
+ width: 40px;
375
+ height: 40px;
376
+ border: 3px solid """ + COLORS.border + """;
377
+ border-top-color: """ + COLORS.primary + """;
378
+ border-radius: 50%;
379
+ animation: spin 0.8s linear infinite;
380
+ }
381
+
382
+ @keyframes spin { to { transform: rotate(360deg); } }
383
+
384
+ .loading-text {
385
+ margin-top: 1rem;
386
+ color: """ + COLORS.text_muted + """;
387
+ font-size: 0.875rem;
388
+ }
389
+
390
+ .stProgress > div > div > div {
391
+ background: linear-gradient(90deg, """ + COLORS.primary + """ 0%, """ + COLORS.cyan + """ 100%);
392
+ border-radius: 4px;
393
+ }
394
+
395
+ .stProgress > div > div {
396
+ background: """ + COLORS.bg_hover + """;
397
+ border-radius: 4px;
398
+ }
399
+
400
+ .divider {
401
+ height: 1px;
402
+ background: """ + COLORS.border + """;
403
+ margin: 1.5rem 0;
404
+ }
405
+
406
+ .mol-container {
407
+ background: white;
408
+ border-radius: 10px;
409
+ padding: 0.75rem;
410
+ display: flex;
411
+ align-items: center;
412
+ justify-content: center;
413
+ }
414
+
415
+ .evidence {
416
+ display: inline-flex;
417
+ align-items: center;
418
+ gap: 0.375rem;
419
+ padding: 0.5rem 0.75rem;
420
+ background: """ + COLORS.bg_app + """;
421
+ border: 1px solid """ + COLORS.border + """;
422
+ border-radius: 8px;
423
+ font-size: 0.8125rem;
424
+ color: """ + COLORS.text_secondary + """;
425
+ transition: all var(--transition);
426
+ text-decoration: none;
427
+ }
428
+
429
+ .evidence:hover {
430
+ border-color: """ + COLORS.primary + """;
431
+ color: """ + COLORS.primary + """;
432
+ }
433
+
434
+ .stAlert { border-radius: 10px; border: none; }
435
+
436
+ .stDataFrame {
437
+ border-radius: var(--radius);
438
+ overflow: hidden;
439
+ border: 1px solid """ + COLORS.border + """;
440
+ }
441
+
442
+ .block-container {
443
+ padding-top: 1.25rem;
444
+ }
445
+
446
+ .nav-rail {
447
+ position: sticky;
448
+ top: 1rem;
449
+ display: flex;
450
+ flex-direction: column;
451
+ gap: 0.75rem;
452
+ padding: 1rem;
453
+ background: """ + COLORS.bg_surface + """;
454
+ border: 1px solid """ + COLORS.border + """;
455
+ border-radius: 16px;
456
+ margin-bottom: 1rem;
457
+ }
458
+
459
+ .nav-brand {
460
+ display: flex;
461
+ align-items: center;
462
+ gap: 0.75rem;
463
+ padding-bottom: 0.5rem;
464
+ border-bottom: 1px solid """ + COLORS.border + """;
465
+ }
466
+
467
+ .nav-logo { font-size: 1.5rem; }
468
+
469
+ .nav-title {
470
+ font-size: 1.1rem;
471
+ font-weight: 700;
472
+ color: """ + COLORS.text_primary + """;
473
+ }
474
+
475
+ .nav-title span {
476
+ background: linear-gradient(135deg, """ + COLORS.primary + """ 0%, """ + COLORS.cyan + """ 100%);
477
+ -webkit-background-clip: text;
478
+ -webkit-text-fill-color: transparent;
479
+ }
480
+
481
+ .nav-section {
482
+ font-size: 0.75rem;
483
+ text-transform: uppercase;
484
+ letter-spacing: 0.08em;
485
+ color: """ + COLORS.text_muted + """;
486
+ }
487
+
488
+ div[data-testid="stRadio"] {
489
+ background: """ + COLORS.bg_surface + """;
490
+ border: 1px solid """ + COLORS.border + """;
491
+ border-radius: 16px;
492
+ padding: 0.75rem;
493
+ }
494
+
495
+ div[data-testid="stRadio"] div[role="radiogroup"] {
496
+ display: flex;
497
+ flex-direction: column;
498
+ gap: 0.5rem;
499
+ margin-top: 0.25rem;
500
+ }
501
+
502
+ div[data-testid="stRadio"] input {
503
+ display: none !important;
504
+ }
505
+
506
+ div[data-testid="stRadio"] label {
507
+ background: """ + COLORS.bg_app + """;
508
+ border: 1px solid """ + COLORS.border + """;
509
+ border-radius: 12px;
510
+ padding: 0.65rem 0.9rem;
511
+ font-weight: 500;
512
+ color: """ + COLORS.text_secondary + """;
513
+ transition: all var(--transition);
514
+ margin: 0 !important;
515
+ }
516
+
517
+ div[data-testid="stRadio"] label:hover {
518
+ border-color: """ + COLORS.primary + """;
519
+ color: """ + COLORS.text_primary + """;
520
+ }
521
+
522
+ div[data-testid="stRadio"] label:has(input:checked) {
523
+ background: """ + COLORS.primary + """;
524
+ border-color: """ + COLORS.primary + """;
525
+ color: white;
526
+ box-shadow: 0 8px 20px rgba(139, 92, 246, 0.25);
527
+ }
528
+
529
+ .hero {
530
+ position: relative;
531
+ background: linear-gradient(135deg, rgba(139, 92, 246, 0.12) 0%, rgba(34, 211, 238, 0.08) 100%);
532
+ border: 1px solid """ + COLORS.border + """;
533
+ border-radius: 20px;
534
+ padding: 2.75rem;
535
+ overflow: hidden;
536
+ }
537
+
538
+ .hero-badge {
539
+ display: inline-flex;
540
+ align-items: center;
541
+ gap: 0.5rem;
542
+ padding: 0.35rem 0.75rem;
543
+ border-radius: 999px;
544
+ background: """ + COLORS.primary_muted + """;
545
+ color: """ + COLORS.primary + """;
546
+ font-size: 0.75rem;
547
+ font-weight: 600;
548
+ text-transform: uppercase;
549
+ letter-spacing: 0.08em;
550
+ }
551
+
552
+ .hero-title {
553
+ font-size: 2.25rem;
554
+ font-weight: 700;
555
+ color: """ + COLORS.text_primary + """;
556
+ margin-top: 1rem;
557
+ line-height: 1.1;
558
+ }
559
+
560
+ .hero-subtitle {
561
+ font-size: 1rem;
562
+ color: """ + COLORS.text_muted + """;
563
+ margin-top: 0.75rem;
564
+ max-width: 560px;
565
+ }
566
+
567
+ .hero-actions {
568
+ display: flex;
569
+ gap: 0.75rem;
570
+ margin-top: 1.5rem;
571
+ flex-wrap: wrap;
572
+ }
573
+
574
+ .hero-card {
575
+ background: """ + COLORS.bg_surface + """;
576
+ border: 1px solid """ + COLORS.border + """;
577
+ border-radius: 16px;
578
+ padding: 1.5rem;
579
+ }
580
+ </style>
581
+ """
582
+
583
+ return css
bioflow/ui/pages/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
"""Page exports."""

# Eagerly import every page module so the router can do
# `from bioflow.ui.pages import home` (etc.) and dispatch by name.
from bioflow.ui.pages import home, discovery, explorer, data, settings

# Explicit public API of the pages package.
__all__ = ["home", "discovery", "explorer", "data", "settings"]
bioflow/ui/pages/data.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
BioFlow - Data Page
===================
Data management and upload.
"""

import streamlit as st
import sys
import os
import pandas as pd
import numpy as np

# Make the repository root importable when this page module is loaded directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))

from bioflow.ui.components import (
    page_header, section_header, divider, spacer,
    metric_card, data_table, empty_state
)
from bioflow.ui.config import COLORS


def render():
    """Render the data-management page.

    Three tabs: a dataset listing, an upload form, and a list of
    processing operations. The metrics and dataset entries shown here
    are hard-coded demo values — presumably to be replaced by a real
    dataset registry lookup (TODO confirm with the data backend).
    """

    page_header("Data Management", "Upload, manage, and organize your datasets", "📊")

    # Stats row — static placeholder metrics.
    cols = st.columns(4)

    with cols[0]:
        metric_card("5", "Datasets", "📁", color=COLORS.primary)
    with cols[1]:
        metric_card("24.5K", "Molecules", "🧪", color=COLORS.cyan)
    with cols[2]:
        metric_card("1.2K", "Proteins", "🧬", color=COLORS.emerald)
    with cols[3]:
        metric_card("156 MB", "Storage Used", "💾", color=COLORS.amber)

    spacer("2rem")

    # Tabs
    tabs = st.tabs(["📁 Datasets", "📤 Upload", "🔧 Processing"])

    with tabs[0]:
        section_header("Your Datasets", "📁")

        # Dataset list (sample data).
        datasets = [
            {"name": "DrugBank Compounds", "type": "Molecules", "count": "12,450", "size": "45.2 MB", "updated": "2024-01-15"},
            {"name": "ChEMBL Kinase Inhibitors", "type": "Molecules", "count": "8,234", "size": "32.1 MB", "updated": "2024-01-10"},
            {"name": "Custom Protein Targets", "type": "Proteins", "count": "1,245", "size": "78.5 MB", "updated": "2024-01-08"},
        ]

        for ds in datasets:
            # One styled card per dataset; HTML is rendered raw, so only
            # trusted (hard-coded) values may be interpolated here.
            st.markdown(f"""
            <div class="card" style="margin-bottom: 0.75rem;">
                <div style="display: flex; justify-content: space-between; align-items: center;">
                    <div>
                        <div style="font-weight: 600; color: {COLORS.text_primary};">{ds["name"]}</div>
                        <div style="display: flex; gap: 1.5rem; margin-top: 0.5rem;">
                            <span style="font-size: 0.8125rem; color: {COLORS.text_muted};">
                                <span style="color: {COLORS.primary};">●</span> {ds["type"]}
                            </span>
                            <span style="font-size: 0.8125rem; color: {COLORS.text_muted};">{ds["count"]} items</span>
                            <span style="font-size: 0.8125rem; color: {COLORS.text_muted};">{ds["size"]}</span>
                            <span style="font-size: 0.8125rem; color: {COLORS.text_muted};">Updated: {ds["updated"]}</span>
                        </div>
                    </div>
                    <div style="display: flex; gap: 0.5rem;">
                        <span class="badge badge-primary">{ds["type"]}</span>
                    </div>
                </div>
            </div>
            """, unsafe_allow_html=True)

            # Action buttons — keys are derived from the dataset name, which
            # must stay unique within this list for Streamlit widget identity.
            btn_cols = st.columns([1, 1, 1, 4])
            with btn_cols[0]:
                st.button("View", key=f"view_{ds['name']}", use_container_width=True)
            with btn_cols[1]:
                st.button("Export", key=f"export_{ds['name']}", use_container_width=True)
            with btn_cols[2]:
                st.button("Delete", key=f"delete_{ds['name']}", use_container_width=True)

            spacer("0.5rem")

    with tabs[1]:
        section_header("Upload New Data", "📤")

        # Decorative drop-zone; the actual upload is handled by the
        # st.file_uploader widget below it.
        st.markdown(f"""
        <div style="
            border: 2px dashed {COLORS.border};
            border-radius: 16px;
            padding: 3rem;
            text-align: center;
            background: {COLORS.bg_surface};
        ">
            <div style="font-size: 3rem; margin-bottom: 1rem;">📁</div>
            <div style="font-size: 1.125rem; font-weight: 600; color: {COLORS.text_primary};">
                Drag & drop files here
            </div>
            <div style="font-size: 0.875rem; color: {COLORS.text_muted}; margin-top: 0.5rem;">
                or click to browse
            </div>
            <div style="font-size: 0.75rem; color: {COLORS.text_muted}; margin-top: 1rem;">
                Supports: CSV, SDF, FASTA, PDB, JSON
            </div>
        </div>
        """, unsafe_allow_html=True)

        uploaded_file = st.file_uploader(
            "Upload file",
            type=["csv", "sdf", "fasta", "pdb", "json"],
            label_visibility="collapsed"
        )

        if uploaded_file:
            st.success(f"✓ File uploaded: {uploaded_file.name}")

            col1, col2 = st.columns(2)
            with col1:
                # splitext strips only the final extension, so multi-dot
                # names survive intact ("my.data.csv" -> "my.data");
                # a bare split('.')[0] would truncate them to "my".
                dataset_name = st.text_input("Dataset Name", value=os.path.splitext(uploaded_file.name)[0])
            with col2:
                data_type = st.selectbox("Data Type", ["Molecules", "Proteins", "Text"])

            if st.button("Process & Import", type="primary", use_container_width=True):
                with st.spinner("Processing..."):
                    # Simulated processing delay; real import logic TBD.
                    import time
                    time.sleep(2)
                st.success("✓ Dataset imported successfully!")

    with tabs[2]:
        section_header("Data Processing", "🔧")

        st.markdown(f"""
        <div class="card">
            <div style="font-weight: 600; color: {COLORS.text_primary}; margin-bottom: 0.75rem;">
                Available Operations
            </div>
        </div>
        """, unsafe_allow_html=True)

        operations = [
            {"icon": "🧹", "name": "Clean & Validate", "desc": "Remove duplicates, fix invalid structures"},
            {"icon": "🔢", "name": "Compute Descriptors", "desc": "Calculate molecular properties and fingerprints"},
            {"icon": "🧠", "name": "Generate Embeddings", "desc": "Create vector representations using AI models"},
            {"icon": "🔗", "name": "Merge Datasets", "desc": "Combine multiple datasets with deduplication"},
        ]

        for op in operations:
            st.markdown(f"""
            <div class="quick-action" style="margin-bottom: 0.75rem; text-align: left;">
                <div style="display: flex; align-items: center; gap: 1rem;">
                    <span style="font-size: 1.5rem;">{op["icon"]}</span>
                    <div>
                        <div style="font-weight: 600; color: {COLORS.text_primary};">{op["name"]}</div>
                        <div style="font-size: 0.8125rem; color: {COLORS.text_muted};">{op["desc"]}</div>
                    </div>
                </div>
            </div>
            """, unsafe_allow_html=True)
            st.button(f"Run {op['name']}", key=f"op_{op['name']}", use_container_width=True)
bioflow/ui/pages/discovery.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
BioFlow - Discovery Page
========================
Drug discovery pipeline interface.
"""

import streamlit as st
import sys
import os

# Make the repository root importable when this page module is loaded directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))

from bioflow.ui.components import (
    page_header, section_header, divider, spacer,
    pipeline_progress, bar_chart, empty_state, loading_state
)
from bioflow.ui.config import COLORS


def render():
    """Render the discovery page.

    Layout: a query input row, a five-step pipeline status indicator
    driven by ``st.session_state.discovery_step`` (0..4), and a results
    area that shows demo candidates once a search has been run.
    """

    page_header("Drug Discovery", "Search for drug candidates with AI-powered analysis", "🔬")

    # Query Input Section — card is purely decorative; widgets follow.
    st.markdown(f"""
    <div class="card" style="margin-bottom: 1.5rem;">
        <div style="font-size: 0.875rem; font-weight: 600; color: {COLORS.text_primary}; margin-bottom: 0.75rem;">
            Search Query
        </div>
    </div>
    """, unsafe_allow_html=True)

    col1, col2 = st.columns([3, 1])

    with col1:
        query = st.text_area(
            "Query",
            placeholder="Enter a natural language query, SMILES string, or FASTA sequence...",
            height=100,
            label_visibility="collapsed"
        )

    with col2:
        # NOTE(review): these two selections are not read anywhere below —
        # presumably intended to parameterize the search; confirm.
        st.selectbox("Search Type", ["Similarity", "Binding Affinity", "Properties"], label_visibility="collapsed")
        st.selectbox("Database", ["All", "DrugBank", "ChEMBL", "ZINC"], label_visibility="collapsed")
        search_clicked = st.button("🔍 Search", type="primary", use_container_width=True)

    spacer("1.5rem")

    # Pipeline Progress
    section_header("Pipeline Status", "🔄")

    # discovery_step persists across reruns; 0 = idle, 4 = results ready.
    if "discovery_step" not in st.session_state:
        st.session_state.discovery_step = 0

    # Each step's status is derived from discovery_step:
    # earlier than current -> done, equal -> active, later -> pending.
    steps = [
        {"name": "Input", "status": "done" if st.session_state.discovery_step > 0 else "active"},
        {"name": "Encode", "status": "done" if st.session_state.discovery_step > 1 else ("active" if st.session_state.discovery_step == 1 else "pending")},
        {"name": "Search", "status": "done" if st.session_state.discovery_step > 2 else ("active" if st.session_state.discovery_step == 2 else "pending")},
        {"name": "Predict", "status": "done" if st.session_state.discovery_step > 3 else ("active" if st.session_state.discovery_step == 3 else "pending")},
        {"name": "Results", "status": "active" if st.session_state.discovery_step == 4 else "pending"},
    ]

    pipeline_progress(steps)

    spacer("2rem")
    divider()
    spacer("2rem")

    # Results Section
    section_header("Results", "🎯")

    # A click with a non-empty query jumps straight to the final step
    # (no intermediate animation in this demo flow).
    if search_clicked and query:
        st.session_state.discovery_step = 4
        st.session_state.discovery_query = query

    if st.session_state.discovery_step >= 4:
        # Show results
        tabs = st.tabs(["Top Candidates", "Property Analysis", "Evidence"])

        with tabs[0]:
            # Results list (hard-coded demo candidates).
            results = [
                {"name": "Candidate A", "score": 0.95, "mw": "342.4", "logp": "2.1", "hbd": "2"},
                {"name": "Candidate B", "score": 0.89, "mw": "298.3", "logp": "1.8", "hbd": "3"},
                {"name": "Candidate C", "score": 0.82, "mw": "415.5", "logp": "3.2", "hbd": "1"},
                {"name": "Candidate D", "score": 0.76, "mw": "267.3", "logp": "1.5", "hbd": "2"},
                {"name": "Candidate E", "score": 0.71, "mw": "389.4", "logp": "2.8", "hbd": "2"},
            ]

            for r in results:
                # Traffic-light coloring: >=0.8 green, >=0.5 amber, else rose.
                score_color = COLORS.emerald if r["score"] >= 0.8 else (COLORS.amber if r["score"] >= 0.5 else COLORS.rose)
                st.markdown(f"""
                <div class="result">
                    <div style="display: flex; justify-content: space-between; align-items: flex-start;">
                        <div>
                            <div style="font-weight: 600; color: {COLORS.text_primary};">{r["name"]}</div>
                            <div style="display: flex; gap: 1rem; margin-top: 0.5rem;">
                                <span style="font-size: 0.8125rem; color: {COLORS.text_muted};">MW: {r["mw"]}</span>
                                <span style="font-size: 0.8125rem; color: {COLORS.text_muted};">LogP: {r["logp"]}</span>
                                <span style="font-size: 0.8125rem; color: {COLORS.text_muted};">HBD: {r["hbd"]}</span>
                            </div>
                        </div>
                        <div style="font-size: 1.5rem; font-weight: 700; color: {score_color};">{r["score"]:.0%}</div>
                    </div>
                </div>
                """, unsafe_allow_html=True)
                spacer("0.75rem")

        with tabs[1]:
            # Property distribution (demo histograms).
            col1, col2 = st.columns(2)

            with col1:
                bar_chart(
                    {"<200": 5, "200-300": 12, "300-400": 8, "400-500": 3, ">500": 2},
                    title="Molecular Weight Distribution",
                    height=250
                )

            with col2:
                bar_chart(
                    {"<1": 4, "1-2": 10, "2-3": 8, "3-4": 5, ">4": 3},
                    title="LogP Distribution",
                    height=250
                )

        with tabs[2]:
            # Evidence — literature cards (demo entries).
            st.markdown(f"""
            <div class="card">
                <div style="font-weight: 600; color: {COLORS.text_primary}; margin-bottom: 1rem;">
                    Related Literature
                </div>
            </div>
            """, unsafe_allow_html=True)

            papers = [
                {"title": "Novel therapeutic targets for cancer treatment", "year": "2024", "journal": "Nature Medicine"},
                {"title": "Molecular docking studies of kinase inhibitors", "year": "2023", "journal": "J. Med. Chem."},
                {"title": "AI-driven drug discovery approaches", "year": "2024", "journal": "Drug Discovery Today"},
            ]

            for p in papers:
                st.markdown(f"""
                <div style="
                    padding: 1rem;
                    border: 1px solid {COLORS.border};
                    border-radius: 8px;
                    margin-bottom: 0.75rem;
                ">
                    <div style="font-weight: 500; color: {COLORS.text_primary};">{p["title"]}</div>
                    <div style="font-size: 0.8125rem; color: {COLORS.text_muted}; margin-top: 0.25rem;">
                        {p["journal"]} • {p["year"]}
                    </div>
                </div>
                """, unsafe_allow_html=True)

    else:
        empty_state(
            "🔍",
            "No Results Yet",
            "Enter a query and click Search to find drug candidates"
        )
bioflow/ui/pages/explorer.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
BioFlow - Explorer Page
=======================
Data exploration and visualization.
"""

import streamlit as st
import sys
import os
import numpy as np

# Make the repository root importable when this page module is loaded directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))

from bioflow.ui.components import (
    page_header, section_header, divider, spacer,
    scatter_chart, heatmap, metric_card, empty_state
)
from bioflow.ui.config import COLORS


def render():
    """Render the explorer page.

    Shows a synthetic 2-D embedding scatter (four Gaussian clusters),
    summary metric cards, a symmetric demo similarity heatmap, and
    export buttons. All data is generated locally with fixed RNG seeds
    so the view is reproducible across reruns.
    """

    page_header("Data Explorer", "Visualize molecular embeddings and relationships", "🧬")

    # Controls
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        dataset = st.selectbox("Dataset", ["DrugBank", "ChEMBL", "ZINC", "Custom"])
    with col2:
        viz_type = st.selectbox("Visualization", ["UMAP", "t-SNE", "PCA"])
    with col3:
        # NOTE(review): color_by is not yet wired into the scatter below.
        color_by = st.selectbox("Color by", ["Activity", "MW", "LogP", "Cluster"])
    with col4:
        st.write("")  # Spacing to vertically align the button with the selectboxes
        st.write("")
        if st.button("🔄 Refresh", use_container_width=True):
            st.rerun()

    spacer("1.5rem")

    # Main visualization area
    col_viz, col_details = st.columns([2, 1])

    with col_viz:
        section_header("Embedding Space", "🗺️")

        # Generate sample data — fixed seed keeps the layout stable.
        np.random.seed(42)
        n_points = 200

        # Create four clusters of n_points // 4 each (totals n_points).
        cluster1_x = np.random.normal(2, 0.8, n_points // 4)
        cluster1_y = np.random.normal(3, 0.8, n_points // 4)

        cluster2_x = np.random.normal(-2, 1, n_points // 4)
        cluster2_y = np.random.normal(-1, 1, n_points // 4)

        cluster3_x = np.random.normal(4, 0.6, n_points // 4)
        cluster3_y = np.random.normal(-2, 0.6, n_points // 4)

        cluster4_x = np.random.normal(-1, 0.9, n_points // 4)
        cluster4_y = np.random.normal(4, 0.9, n_points // 4)

        x = list(cluster1_x) + list(cluster2_x) + list(cluster3_x) + list(cluster4_x)
        y = list(cluster1_y) + list(cluster2_y) + list(cluster3_y) + list(cluster4_y)
        labels = [f"Mol_{i}" for i in range(n_points)]

        scatter_chart(x, y, labels, title=f"{viz_type} Projection - {dataset}", height=450)

    with col_details:
        section_header("Statistics", "📊")

        # Static demo metrics.
        metric_card("12,450", "Total Molecules", "🧪", color=COLORS.primary)
        spacer("0.75rem")
        metric_card("4", "Clusters Found", "🎯", color=COLORS.cyan)
        spacer("0.75rem")
        metric_card("0.89", "Silhouette Score", "📈", color=COLORS.emerald)
        spacer("0.75rem")
        metric_card("85%", "Coverage", "✓", color=COLORS.amber)

    spacer("2rem")
    divider()
    spacer("2rem")

    # Similarity Heatmap
    section_header("Similarity Matrix", "🔥")

    # Sample similarity matrix — symmetrized random values with a unit
    # diagonal (every cluster is maximally similar to itself).
    np.random.seed(123)
    labels_short = ["Cluster A", "Cluster B", "Cluster C", "Cluster D", "Cluster E"]
    similarity = np.random.uniform(0.3, 1.0, (5, 5))
    similarity = (similarity + similarity.T) / 2  # Make symmetric
    np.fill_diagonal(similarity, 1.0)

    heatmap(
        similarity.tolist(),
        labels_short,
        labels_short,
        title="Inter-cluster Similarity",
        height=350
    )

    spacer("2rem")

    # Export options — buttons are currently placeholders with no handlers.
    st.markdown(f"""
    <div class="card">
        <div style="display: flex; justify-content: space-between; align-items: center;">
            <div>
                <div style="font-weight: 600; color: {COLORS.text_primary};">Export Data</div>
                <div style="font-size: 0.8125rem; color: {COLORS.text_muted}; margin-top: 0.25rem;">
                    Download embeddings, clusters, or full dataset
                </div>
            </div>
        </div>
    </div>
    """, unsafe_allow_html=True)

    exp_cols = st.columns(3)
    with exp_cols[0]:
        st.button("📥 Embeddings (CSV)", use_container_width=True)
    with exp_cols[1]:
        st.button("📥 Clusters (JSON)", use_container_width=True)
    with exp_cols[2]:
        st.button("📥 Full Dataset", use_container_width=True)
bioflow/ui/pages/home.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
BioFlow - Home Page
====================
Clean dashboard with key metrics and quick actions.
"""

import streamlit as st
import sys
import os

# Make the repository root importable when this page module is loaded directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))

from bioflow.ui.components import (
    section_header, divider, spacer,
    metric_card, pipeline_progress, bar_chart
)
from bioflow.ui.config import COLORS


def render():
    """Render the home dashboard.

    Sections: hero banner with navigation buttons, four metric cards,
    quick-action tiles, recent discoveries, and a pipeline-activity
    column. Navigation works by writing the target page name into
    ``st.session_state.current_page`` and triggering ``st.rerun()``,
    which the top-level router picks up. All displayed numbers are
    static demo values.
    """

    # Hero Section (Tailark-inspired)
    hero_col, hero_side = st.columns([3, 1.4])

    with hero_col:
        st.markdown(f"""
        <div class="hero">
            <div class="hero-badge">New • BioFlow 2.0</div>
            <div class="hero-title">AI-Powered Drug Discovery</div>
            <div class="hero-subtitle">
                Run discovery pipelines, predict binding, and surface evidence in one streamlined workspace.
            </div>
            <div class="hero-actions">
                <span class="badge badge-primary">Model-aware search</span>
                <span class="badge badge-success">Evidence-linked</span>
                <span class="badge badge-warning">Fast iteration</span>
            </div>
        </div>
        """, unsafe_allow_html=True)

        spacer("0.75rem")
        btn1, btn2 = st.columns(2)
        with btn1:
            if st.button("Start Discovery", type="primary", use_container_width=True):
                st.session_state.current_page = "discovery"
                st.rerun()
        with btn2:
            if st.button("Explore Data", use_container_width=True):
                st.session_state.current_page = "explorer"
                st.rerun()

    with hero_side:
        # Side card with today's headline figure and category badges.
        st.markdown(f"""
        <div class="hero-card">
            <div style="font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.08em; color: {COLORS.text_muted};">
                Today
            </div>
            <div style="font-size: 1.75rem; font-weight: 700; color: {COLORS.text_primary}; margin-top: 0.5rem;">
                156 Discoveries
            </div>
            <div style="font-size: 0.875rem; color: {COLORS.text_muted}; margin-top: 0.5rem;">
                +12% vs last week
            </div>
            <div class="divider" style="margin: 1rem 0;"></div>
            <div style="display: flex; flex-direction: column; gap: 0.5rem;">
                <span class="badge badge-primary">Discovery</span>
                <span class="badge badge-success">Prediction</span>
                <span class="badge badge-warning">Evidence</span>
            </div>
        </div>
        """, unsafe_allow_html=True)

    # Metrics Row — static demo values.
    cols = st.columns(4)

    with cols[0]:
        metric_card("12.5M", "Molecules", "🧪", "+2.3%", "up", COLORS.primary)
    with cols[1]:
        metric_card("847K", "Proteins", "🧬", "+1.8%", "up", COLORS.cyan)
    with cols[2]:
        metric_card("1.2M", "Papers", "📚", "+5.2%", "up", COLORS.emerald)
    with cols[3]:
        metric_card("156", "Discoveries", "✨", "+12%", "up", COLORS.amber)

    spacer("2rem")

    # Quick Actions — each tile is decorative HTML paired with a real
    # Streamlit button that performs the navigation.
    section_header("Quick Actions", "⚡")

    action_cols = st.columns(4)

    with action_cols[0]:
        st.markdown(f"""
        <div class="quick-action">
            <span class="quick-action-icon">🔍</span>
            <div class="quick-action-title">New Discovery</div>
            <div class="quick-action-desc">Start a pipeline</div>
        </div>
        """, unsafe_allow_html=True)
        if st.button("Start", key="qa_discovery", use_container_width=True):
            st.session_state.current_page = "discovery"
            st.rerun()

    with action_cols[1]:
        st.markdown(f"""
        <div class="quick-action">
            <span class="quick-action-icon">📊</span>
            <div class="quick-action-title">Explore Data</div>
            <div class="quick-action-desc">Visualize embeddings</div>
        </div>
        """, unsafe_allow_html=True)
        if st.button("Explore", key="qa_explorer", use_container_width=True):
            st.session_state.current_page = "explorer"
            st.rerun()

    with action_cols[2]:
        st.markdown(f"""
        <div class="quick-action">
            <span class="quick-action-icon">📁</span>
            <div class="quick-action-title">Upload Data</div>
            <div class="quick-action-desc">Add molecules</div>
        </div>
        """, unsafe_allow_html=True)
        if st.button("Upload", key="qa_data", use_container_width=True):
            st.session_state.current_page = "data"
            st.rerun()

    with action_cols[3]:
        st.markdown(f"""
        <div class="quick-action">
            <span class="quick-action-icon">⚙️</span>
            <div class="quick-action-title">Settings</div>
            <div class="quick-action-desc">Configure models</div>
        </div>
        """, unsafe_allow_html=True)
        if st.button("Configure", key="qa_settings", use_container_width=True):
            st.session_state.current_page = "settings"
            st.rerun()

    spacer("2rem")
    divider()
    spacer("2rem")

    # Two Column Layout
    col1, col2 = st.columns([3, 2])

    with col1:
        section_header("Recent Discoveries", "🎯")

        # Sample results
        results = [
            {"name": "Aspirin analog", "score": 0.94, "mw": "180.16"},
            {"name": "Novel kinase inhibitor", "score": 0.87, "mw": "331.39"},
            {"name": "EGFR binder candidate", "score": 0.72, "mw": "311.38"},
        ]

        for r in results:
            # Traffic-light coloring: >=0.8 green, >=0.5 amber, else rose.
            score_color = COLORS.emerald if r["score"] >= 0.8 else (COLORS.amber if r["score"] >= 0.5 else COLORS.rose)
            st.markdown(f"""
            <div class="result">
                <div style="display: flex; justify-content: space-between; align-items: center;">
                    <div style="font-weight: 600; color: {COLORS.text_primary};">{r["name"]}</div>
                    <div style="font-size: 1.25rem; font-weight: 700; color: {score_color};">{r["score"]:.0%}</div>
                </div>
                <div style="font-size: 0.8125rem; color: {COLORS.text_muted}; margin-top: 0.5rem;">
                    MW: {r["mw"]}
                </div>
            </div>
            """, unsafe_allow_html=True)
            spacer("0.75rem")

    with col2:
        section_header("Pipeline Activity", "📈")

        bar_chart(
            {"Mon": 23, "Tue": 31, "Wed": 28, "Thu": 45, "Fri": 38, "Sat": 12, "Sun": 8},
            title="",
            height=250
        )

        spacer("1rem")
        section_header("Active Pipeline", "🔄")

        pipeline_progress([
            {"name": "Encode", "status": "done"},
            {"name": "Search", "status": "active"},
            {"name": "Predict", "status": "pending"},
            {"name": "Verify", "status": "pending"},
        ])

    spacer("2rem")

    # Tip
    st.markdown(f"""
    <div style="
        background: {COLORS.bg_surface};
        border: 1px solid {COLORS.border};
        border-radius: 12px;
        padding: 1.25rem;
        display: flex;
        align-items: center;
        gap: 1rem;
    ">
        <span style="font-size: 1.5rem;">💡</span>
        <div>
            <div style="font-size: 0.9375rem; color: {COLORS.text_primary}; font-weight: 500;">Pro Tip</div>
            <div style="font-size: 0.8125rem; color: {COLORS.text_muted};">
                Use natural language like "Find molecules similar to aspirin that can cross the blood-brain barrier"
            </div>
        </div>
    </div>
    """, unsafe_allow_html=True)
bioflow/ui/pages/settings.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow - Settings Page
3
+ ========================
4
+ Configuration and preferences.
5
+ """
6
+
7
+ import streamlit as st
8
+ import sys
9
+ import os
10
+
11
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))
12
+
13
+ from bioflow.ui.components import page_header, section_header, divider, spacer
14
+ from bioflow.ui.config import COLORS
15
+
16
+
17
def render():
    """Render the settings page: model, database, API-key and appearance tabs."""

    page_header("Settings", "Configure models, databases, and preferences", "⚙️")

    # One named tab handle per configuration area.
    models_tab, database_tab, keys_tab, appearance_tab = st.tabs(
        ["🧠 Models", "🗄️ Database", "🔌 API Keys", "🎨 Appearance"]
    )

    with models_tab:
        section_header("Model Configuration", "🧠")

        st.markdown(f"""
        <div class="card" style="margin-bottom: 1rem;">
            <div style="font-weight: 600; color: {COLORS.text_primary}; margin-bottom: 0.5rem;">
                Embedding Models
            </div>
            <div style="font-size: 0.8125rem; color: {COLORS.text_muted};">
                Configure models used for molecular and protein embeddings
            </div>
        </div>
        """, unsafe_allow_html=True)

        left, right = st.columns(2)

        with left:
            st.selectbox(
                "Molecule Encoder",
                ["MolCLR (Recommended)", "ChemBERTa", "GraphMVP", "MolBERT"],
                help="Model for generating molecular embeddings"
            )

            st.selectbox(
                "Protein Encoder",
                ["ESM-2 (Recommended)", "ProtTrans", "UniRep", "SeqVec"],
                help="Model for generating protein embeddings"
            )

        with right:
            st.selectbox(
                "Binding Predictor",
                ["DrugBAN (Recommended)", "DeepDTA", "GraphDTA", "Custom"],
                help="Model for predicting drug-target binding"
            )

            st.selectbox(
                "Property Predictor",
                ["ADMET-AI (Recommended)", "ChemProp", "Custom"],
                help="Model for ADMET property prediction"
            )

        spacer("1rem")

        st.markdown(f"""
        <div class="card" style="margin-bottom: 1rem;">
            <div style="font-weight: 600; color: {COLORS.text_primary}; margin-bottom: 0.5rem;">
                LLM Settings
            </div>
            <div style="font-size: 0.8125rem; color: {COLORS.text_muted};">
                Configure language models for evidence retrieval and reasoning
            </div>
        </div>
        """, unsafe_allow_html=True)

        left, right = st.columns(2)

        with left:
            st.selectbox(
                "LLM Provider",
                ["OpenAI", "Anthropic", "Local (Ollama)", "Azure OpenAI"]
            )

        with right:
            st.selectbox(
                "Model",
                ["GPT-4o", "GPT-4-turbo", "Claude 3.5 Sonnet", "Llama 3.1 70B"]
            )

        st.slider("Temperature", 0.0, 1.0, 0.7, 0.1)
        st.number_input("Max Tokens", 100, 4096, 2048, 100)

    with database_tab:
        section_header("Database Configuration", "🗄️")

        st.markdown(f"""
        <div class="card" style="margin-bottom: 1rem;">
            <div style="font-weight: 600; color: {COLORS.text_primary}; margin-bottom: 0.5rem;">
                Vector Database
            </div>
            <div style="font-size: 0.8125rem; color: {COLORS.text_muted};">
                Configure the vector store for similarity search
            </div>
        </div>
        """, unsafe_allow_html=True)

        left, right = st.columns(2)

        with left:
            st.selectbox("Vector Store", ["Qdrant (Recommended)", "Milvus", "Pinecone", "Weaviate", "ChromaDB"])
            st.text_input("Host", value="localhost")

        with right:
            st.number_input("Port", 1, 65535, 6333)
            st.text_input("Collection Name", value="bioflow_embeddings")

        spacer("1rem")

        st.markdown(f"""
        <div class="card" style="margin-bottom: 1rem;">
            <div style="font-weight: 600; color: {COLORS.text_primary}; margin-bottom: 0.5rem;">
                Knowledge Sources
            </div>
            <div style="font-size: 0.8125rem; color: {COLORS.text_muted};">
                External databases for evidence retrieval
            </div>
        </div>
        """, unsafe_allow_html=True)

        left, right = st.columns(2)

        with left:
            st.checkbox("PubMed", value=True)
            st.checkbox("DrugBank", value=True)
            st.checkbox("ChEMBL", value=True)

        with right:
            st.checkbox("UniProt", value=True)
            st.checkbox("KEGG", value=False)
            st.checkbox("Reactome", value=False)

    with keys_tab:
        section_header("API Keys", "🔌")

        st.warning("⚠️ API keys are stored locally and never sent to external servers.")

        st.text_input("OpenAI API Key", type="password", placeholder="sk-...")
        st.text_input("Anthropic API Key", type="password", placeholder="sk-ant-...")
        st.text_input("PubMed API Key", type="password", placeholder="Optional - for higher rate limits")
        st.text_input("ChEMBL API Key", type="password", placeholder="Optional")

        spacer("1rem")

        if st.button("💾 Save API Keys", type="primary"):
            st.success("✓ API keys saved securely")

    with appearance_tab:
        section_header("Appearance", "🎨")

        st.selectbox("Theme", ["Dark (Default)", "Light", "System"])
        st.selectbox("Accent Color", ["Purple", "Blue", "Green", "Cyan", "Pink"])
        st.checkbox("Enable animations", value=True)
        st.checkbox("Compact mode", value=False)
        st.slider("Font size", 12, 18, 14)

    spacer("2rem")
    divider()
    spacer("1rem")

    # Save / reset action row (third column is just spacing).
    save_col, reset_col, _spacer_col = st.columns([1, 1, 2])

    with save_col:
        if st.button("💾 Save Settings", type="primary", use_container_width=True):
            st.success("✓ Settings saved successfully!")

    with reset_col:
        if st.button("🔄 Reset to Defaults", use_container_width=True):
            st.info("Settings reset to defaults")

    spacer("2rem")

    # Version footer.
    st.markdown(f"""
    <div style="text-align: center; padding: 1rem; color: {COLORS.text_muted}; font-size: 0.75rem;">
        BioFlow v0.1.0 • Built with OpenBioMed
    </div>
    """, unsafe_allow_html=True)
bioflow/ui/requirements.txt ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # BioFlow UI Dependencies
2
+ # =======================
3
+
4
+ # Core Streamlit
5
+ streamlit>=1.29.0
6
+
7
+ # Visualization
8
+ plotly>=5.18.0
9
+ altair>=5.2.0
10
+
11
+ # Data handling
12
+ pandas>=2.0.0
13
+ numpy>=1.24.0
14
+
15
+ # Molecular visualization (optional)
16
+ rdkit>=2023.9.1
17
+ py3Dmol>=2.0.0
18
+
19
+ # Machine Learning (optional, for real encoders)
20
+ # torch>=2.0.0
21
+ # transformers>=4.35.0
22
+
23
+ # Vector database
24
+ qdrant-client>=1.7.0
25
+
26
+ # Image processing
27
+ Pillow>=10.0.0
28
+
29
+ # Utilities
30
+ python-dotenv>=1.0.0
31
+ pyyaml>=6.0.1
bioflow/visualizer.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow Visualizer - Embedding and Structure Visualization
3
+ ===========================================================
4
+
5
+ This module provides visualization utilities for embeddings,
6
+ molecular structures, and search results.
7
+ """
8
+
9
+ import numpy as np
10
+ from typing import List, Dict, Any, Optional, Tuple
11
+ import logging
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # Optional imports for visualization
16
+ try:
17
+ import plotly.express as px
18
+ import plotly.graph_objects as go
19
+ from plotly.subplots import make_subplots
20
+ PLOTLY_AVAILABLE = True
21
+ except ImportError:
22
+ PLOTLY_AVAILABLE = False
23
+
24
+ try:
25
+ from sklearn.decomposition import PCA
26
+ from sklearn.manifold import TSNE
27
+ SKLEARN_AVAILABLE = True
28
+ except ImportError:
29
+ SKLEARN_AVAILABLE = False
30
+
31
+
32
class EmbeddingVisualizer:
    """Visualize high-dimensional embeddings in 2D/3D.

    All methods are static. Plotting requires plotly and dimensionality
    reduction requires scikit-learn; both are optional dependencies guarded
    by the module-level PLOTLY_AVAILABLE / SKLEARN_AVAILABLE flags.
    """

    @staticmethod
    def reduce_dimensions(
        embeddings: np.ndarray,
        method: str = "pca",
        n_components: int = 2,
        **kwargs
    ) -> np.ndarray:
        """
        Reduce embedding dimensions for visualization.

        Args:
            embeddings: Array of shape (n_samples, n_features).
            method: 'pca' or 'tsne'.
            n_components: Target dimensions (2 or 3).
            **kwargs: Forwarded to the sklearn reducer constructor.

        Returns:
            Reduced embeddings of shape (n_samples, n_components).

        Raises:
            ImportError: If scikit-learn is not installed.
            ValueError: If method is not 'pca' or 'tsne'.
        """
        if not SKLEARN_AVAILABLE:
            raise ImportError("sklearn required for dimensionality reduction")

        if method == "pca":
            reducer = PCA(n_components=n_components, **kwargs)
        elif method == "tsne":
            reducer = TSNE(n_components=n_components, **kwargs)
        else:
            raise ValueError(f"Unknown method: {method}")

        return reducer.fit_transform(embeddings)

    @staticmethod
    def plot_embeddings_2d(
        embeddings: np.ndarray,
        labels: List[str] = None,
        colors: List[str] = None,
        title: str = "Embedding Space",
        hover_data: List[Dict] = None
    ):
        """
        Create a 2D scatter plot of embeddings.

        Embeddings with more than 2 dimensions are first projected with PCA.
        If `colors` is given, points are grouped into one trace per distinct
        color value (shown as separate legend entries).

        Returns:
            Plotly figure object.

        Raises:
            ImportError: If plotly is not installed.
        """
        if not PLOTLY_AVAILABLE:
            raise ImportError("plotly required for visualization")

        if embeddings.shape[1] > 2:
            coords = EmbeddingVisualizer.reduce_dimensions(embeddings, "pca", 2)
        else:
            coords = embeddings

        fig = go.Figure()

        if colors:
            # One trace per distinct color so each group gets a legend entry.
            unique_colors = list(set(colors))
            for color in unique_colors:
                mask = [c == color for c in colors]
                x = [coords[i, 0] for i in range(len(coords)) if mask[i]]
                y = [coords[i, 1] for i in range(len(coords)) if mask[i]]
                text = [labels[i] if labels else f"Point {i}" for i in range(len(coords)) if mask[i]]

                fig.add_trace(go.Scatter(
                    x=x, y=y,
                    mode='markers',
                    name=color,
                    text=text,
                    hoverinfo='text'
                ))
        else:
            fig.add_trace(go.Scatter(
                x=coords[:, 0],
                y=coords[:, 1],
                mode='markers',
                text=labels or [f"Point {i}" for i in range(len(coords))],
                hoverinfo='text'
            ))

        fig.update_layout(
            title=title,
            xaxis_title="Dimension 1",
            yaxis_title="Dimension 2",
            hovermode='closest'
        )

        return fig

    @staticmethod
    def plot_embeddings_3d(
        embeddings: np.ndarray,
        labels: List[str] = None,
        colors: List[str] = None,
        title: str = "3D Embedding Space"
    ):
        """
        Create a 3D scatter plot of embeddings.

        Embeddings with more than 3 dimensions are first projected with PCA.

        Args:
            embeddings: Array of shape (n_samples, n_features).
            labels: Optional hover labels, one per point.
            colors: Optional per-point marker colors.
            title: Figure title.

        Returns:
            Plotly figure object.

        Raises:
            ImportError: If plotly is not installed.
        """
        if not PLOTLY_AVAILABLE:
            raise ImportError("plotly required for visualization")

        if embeddings.shape[1] > 3:
            coords = EmbeddingVisualizer.reduce_dimensions(embeddings, "pca", 3)
        else:
            coords = embeddings

        fig = go.Figure(data=[go.Scatter3d(
            x=coords[:, 0],
            y=coords[:, 1],
            z=coords[:, 2],
            mode='markers',
            text=labels or [f"Point {i}" for i in range(len(coords))],
            marker=dict(
                size=5,
                # Bug fix: `colors` was previously accepted but discarded —
                # marker color was set to None whenever colors were supplied.
                # Now an explicit colors list is honored; otherwise fall back
                # to an index gradient over the Viridis colorscale.
                color=colors if colors else list(range(len(coords))),
                colorscale='Viridis',
                opacity=0.8
            )
        )])

        fig.update_layout(
            title=title,
            scene=dict(
                xaxis_title='Dim 1',
                yaxis_title='Dim 2',
                zaxis_title='Dim 3'
            )
        )

        return fig

    @staticmethod
    def plot_similarity_matrix(
        embeddings: np.ndarray,
        labels: List[str] = None,
        title: str = "Similarity Matrix"
    ):
        """
        Plot the pairwise cosine-similarity matrix as a heatmap.

        Rows are L2-normalized (with a 1e-9 floor to avoid division by
        zero) before the dot product, so values lie in [-1, 1].

        Returns:
            Plotly figure object.

        Raises:
            ImportError: If plotly is not installed.
        """
        if not PLOTLY_AVAILABLE:
            raise ImportError("plotly required for visualization")

        # Cosine similarity: normalize rows, then dot with the transpose.
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        normalized = embeddings / np.clip(norms, 1e-9, None)
        similarity = np.dot(normalized, normalized.T)

        labels = labels or [f"Item {i}" for i in range(len(embeddings))]

        fig = go.Figure(data=go.Heatmap(
            z=similarity,
            x=labels,
            y=labels,
            colorscale='RdBu',
            zmid=0
        ))

        fig.update_layout(
            title=title,
            xaxis_title="Items",
            yaxis_title="Items"
        )

        return fig
196
+
197
+
198
class MoleculeVisualizer:
    """Visualize molecular structures (RDKit-backed, optional dependency)."""

    @staticmethod
    def smiles_to_svg(smiles: str, size: Tuple[int, int] = (300, 200)) -> str:
        """
        Render a SMILES string as an SVG image.

        Args:
            smiles: SMILES string.
            size: (width, height) tuple.

        Returns:
            SVG markup; a placeholder SVG when RDKit is missing or the
            SMILES cannot be parsed.
        """
        try:
            from rdkit import Chem
            from rdkit.Chem import Draw
        except ImportError:
            return "<svg><text>RDKit not available</text></svg>"

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return "<svg><text>Invalid SMILES</text></svg>"

        width, height = size
        drawer = Draw.MolDraw2DSVG(width, height)
        drawer.DrawMolecule(mol)
        drawer.FinishDrawing()
        return drawer.GetDrawingText()

    @staticmethod
    def plot_molecule_grid(
        smiles_list: List[str],
        labels: List[str] = None,
        mols_per_row: int = 4,
        size: Tuple[int, int] = (200, 200)
    ):
        """
        Create a grid of molecule images.

        Args:
            smiles_list: SMILES strings to draw.
            labels: Per-molecule legends; defaults to the SMILES themselves.
            mols_per_row: Grid width in molecules.
            size: (width, height) of each sub-image.

        Returns:
            PIL Image, or None when RDKit is unavailable.
        """
        try:
            from rdkit import Chem
            from rdkit.Chem import Draw
        except ImportError:
            logger.warning("RDKit not available for molecule visualization")
            return None

        parsed = [Chem.MolFromSmiles(s) for s in smiles_list]
        return Draw.MolsToGridImage(
            parsed,
            molsPerRow=mols_per_row,
            subImgSize=size,
            legends=labels or smiles_list,
        )
258
+
259
+
260
class ResultsVisualizer:
    """Visualize search and pipeline results (plotting requires plotly)."""

    @staticmethod
    def _modality_counts(modalities: List[str]) -> Dict[str, int]:
        """Count modality occurrences, preserving first-seen order.

        Replaces the previous list(set(...)) + list.count pattern, which
        was O(n^2) and produced a nondeterministic label order in charts.
        """
        counts: Dict[str, int] = {}
        for m in modalities:
            counts[m] = counts.get(m, 0) + 1
        return counts

    @staticmethod
    def plot_search_scores(
        results: List[Dict[str, Any]],
        title: str = "Search Results Scores"
    ):
        """
        Plot a bar chart of search-result scores.

        Bars are colored by modality (blue=text, green=molecule, red=other)
        and annotated with "<modality>: <score>".

        Raises:
            ImportError: If plotly is not installed.
        """
        if not PLOTLY_AVAILABLE:
            raise ImportError("plotly required")

        labels = [r.get("content", "")[:30] for r in results]
        scores = [r.get("score", 0) for r in results]
        modalities = [r.get("modality", "unknown") for r in results]

        fig = go.Figure(data=[go.Bar(
            x=labels,
            y=scores,
            marker_color=[
                'blue' if m == 'text' else 'green' if m in ['smiles', 'molecule'] else 'red'
                for m in modalities
            ],
            text=[f"{m}: {s:.3f}" for m, s in zip(modalities, scores)],
            textposition='outside'
        )])

        fig.update_layout(
            title=title,
            xaxis_title="Content",
            yaxis_title="Similarity Score",
            xaxis_tickangle=-45
        )

        return fig

    @staticmethod
    def plot_modality_distribution(
        items: List[Dict[str, Any]],
        title: str = "Modality Distribution"
    ):
        """
        Plot a donut chart of the modality distribution.

        Raises:
            ImportError: If plotly is not installed.
        """
        if not PLOTLY_AVAILABLE:
            raise ImportError("plotly required")

        # Deterministic, O(n) counting (was list(set) + count, O(n^2)).
        counts = ResultsVisualizer._modality_counts(
            [item.get("modality", "unknown") for item in items]
        )

        fig = go.Figure(data=[go.Pie(
            labels=list(counts),
            values=list(counts.values()),
            hole=0.3
        )])

        fig.update_layout(title=title)
        return fig

    @staticmethod
    def create_dashboard(
        search_results: List[Dict],
        embeddings: np.ndarray = None,
        labels: List[str] = None
    ):
        """
        Create a 2x2 dashboard: scores, modality pie, embedding scatter,
        similarity heatmap. The embedding panels are included only when
        `embeddings` is provided (heatmap needs at least two rows).

        Raises:
            ImportError: If plotly is not installed.
        """
        if not PLOTLY_AVAILABLE:
            raise ImportError("plotly required")

        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=(
                "Search Scores",
                "Modality Distribution",
                "Embedding Space",
                "Similarity Heatmap"
            ),
            specs=[
                [{"type": "bar"}, {"type": "pie"}],
                [{"type": "scatter"}, {"type": "heatmap"}]
            ]
        )

        # Panel 1: search scores
        scores = [r.get("score", 0) for r in search_results]
        fig.add_trace(
            go.Bar(y=scores, name="Scores"),
            row=1, col=1
        )

        # Panel 2: modality distribution (deterministic order, O(n))
        counts = ResultsVisualizer._modality_counts(
            [r.get("modality", "unknown") for r in search_results]
        )
        fig.add_trace(
            go.Pie(labels=list(counts), values=list(counts.values()), name="Modalities"),
            row=1, col=2
        )

        # Panel 3: embedding scatter (if provided)
        if embeddings is not None and len(embeddings) > 0:
            if embeddings.shape[1] > 2:
                coords = EmbeddingVisualizer.reduce_dimensions(embeddings, "pca", 2)
            else:
                coords = embeddings
            fig.add_trace(
                go.Scatter(
                    x=coords[:, 0],
                    y=coords[:, 1],
                    mode='markers',
                    text=labels,
                    name="Embeddings"
                ),
                row=2, col=1
            )

        # Panel 4: cosine-similarity heatmap (needs >= 2 embeddings)
        if embeddings is not None and len(embeddings) > 1:
            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
            normalized = embeddings / np.clip(norms, 1e-9, None)
            similarity = np.dot(normalized, normalized.T)
            fig.add_trace(
                go.Heatmap(z=similarity, colorscale='RdBu', name="Similarity"),
                row=2, col=2
            )

        fig.update_layout(height=800, title_text="BioFlow Dashboard")
        return fig
bioflow/workflows/__init__.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow Workflows
3
+ ==================
4
+
5
+ Pre-built pipelines for common discovery tasks.
6
+
7
+ Pipelines:
8
+ - DiscoveryPipeline: Drug discovery with DTI prediction
9
+ - LiteratureMiningPipeline: Scientific literature search
10
+ - ProteinDesignPipeline: Protein homolog search
11
+
12
+ Ingestion Utilities:
13
+ - load_json_data, load_csv_data
14
+ - parse_smiles_file, parse_fasta_file
15
+ - generate_sample_* for testing
16
+ """
17
+
18
+ from bioflow.workflows.discovery import (
19
+ DiscoveryPipeline,
20
+ DiscoveryResult,
21
+ LiteratureMiningPipeline,
22
+ ProteinDesignPipeline,
23
+ )
24
+
25
+ from bioflow.workflows.ingestion import (
26
+ load_json_data,
27
+ load_csv_data,
28
+ parse_smiles_file,
29
+ parse_fasta_file,
30
+ generate_sample_molecules,
31
+ generate_sample_proteins,
32
+ generate_sample_abstracts,
33
+ )
34
+
35
+ __all__ = [
36
+ # Pipelines
37
+ "DiscoveryPipeline",
38
+ "DiscoveryResult",
39
+ "LiteratureMiningPipeline",
40
+ "ProteinDesignPipeline",
41
+ # Ingestion
42
+ "load_json_data",
43
+ "load_csv_data",
44
+ "parse_smiles_file",
45
+ "parse_fasta_file",
46
+ "generate_sample_molecules",
47
+ "generate_sample_proteins",
48
+ "generate_sample_abstracts",
49
+ ]
bioflow/workflows/discovery.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioFlow Discovery Pipeline
3
+ ===========================
4
+
5
+ High-level API for common discovery workflows.
6
+ Connects encoders, retrievers, and predictors into seamless pipelines.
7
+ """
8
+
9
+ import logging
10
+ from typing import List, Dict, Any, Optional, Union
11
+ from dataclasses import dataclass, field
12
+ from datetime import datetime
13
+
14
+ from bioflow.core import (
15
+ Modality,
16
+ ToolRegistry,
17
+ BioFlowOrchestrator,
18
+ WorkflowConfig,
19
+ NodeConfig,
20
+ NodeType,
21
+ RetrievalResult,
22
+ )
23
+ from bioflow.core.nodes import (
24
+ NodeResult,
25
+ EncodeNode,
26
+ RetrieveNode,
27
+ PredictNode,
28
+ IngestNode,
29
+ FilterNode,
30
+ TraceabilityNode,
31
+ )
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ @dataclass
37
+ class DiscoveryResult:
38
+ """Complete result from a discovery pipeline."""
39
+ query: str
40
+ query_modality: str
41
+ candidates: List[Dict[str, Any]]
42
+ predictions: List[Dict[str, Any]]
43
+ top_hits: List[Dict[str, Any]]
44
+ execution_time_ms: float
45
+ metadata: Dict[str, Any] = field(default_factory=dict)
46
+
47
+ def to_dict(self) -> Dict[str, Any]:
48
+ return {
49
+ "query": self.query,
50
+ "query_modality": self.query_modality,
51
+ "num_candidates": len(self.candidates),
52
+ "num_predictions": len(self.predictions),
53
+ "top_hits": self.top_hits[:5],
54
+ "execution_time_ms": self.execution_time_ms,
55
+ }
56
+
57
+ def __repr__(self):
58
+ return f"DiscoveryResult(hits={len(self.top_hits)}, time={self.execution_time_ms:.0f}ms)"
59
+
60
+
61
class DiscoveryPipeline:
    """
    High-level discovery pipeline for drug-target interactions.

    Workflow:
        1. Encode query (text/molecule/protein) → vector
        2. Search vector DB for similar compounds/sequences
        3. Predict binding affinity for each candidate
        4. Filter and rank by predicted score
        5. Add evidence links for traceability

    Example:
        >>> from bioflow.plugins import OBMEncoder, QdrantRetriever, DeepPurposePredictor
        >>>
        >>> pipeline = DiscoveryPipeline(
        ...     encoder=OBMEncoder(),
        ...     retriever=QdrantRetriever(...),
        ...     predictor=DeepPurposePredictor()
        ... )
        >>>
        >>> results = pipeline.discover(
        ...     query="EGFR inhibitor with low toxicity",
        ...     target_sequence="MRKH...",
        ...     limit=20
        ... )
    """

    def __init__(
        self,
        encoder,
        retriever,
        predictor,
        collection: str = "molecules"
    ):
        """
        Initialize the discovery pipeline.

        Args:
            encoder: BioEncoder instance (e.g., OBMEncoder)
            retriever: BioRetriever instance (e.g., QdrantRetriever)
            predictor: BioPredictor instance (e.g., DeepPurposePredictor)
            collection: Default collection for retrieval
        """
        self.encoder = encoder
        self.retriever = retriever
        self.predictor = predictor
        self.collection = collection

        # Pre-build one node per pipeline stage; per-call parameters are
        # assigned onto the nodes right before execution.
        self._encode_node = EncodeNode("encode", encoder, auto_detect=True)
        self._retrieve_node = RetrieveNode("retrieve", retriever, collection=collection)
        self._predict_node = PredictNode("predict", predictor)
        self._filter_node = FilterNode("filter", threshold=0.3, top_k=10)
        self._trace_node = TraceabilityNode("trace")

        logger.info("DiscoveryPipeline initialized")

    def discover(
        self,
        query: str,
        target_sequence: str,
        modality: Modality = None,
        limit: int = 20,
        filters: Dict[str, Any] = None,
        threshold: float = 0.3,
        top_k: int = 10
    ) -> DiscoveryResult:
        """
        Run the full discovery pipeline.

        Args:
            query: Search query (text, SMILES, or protein)
            target_sequence: Target protein sequence for DTI prediction
            modality: Input modality (defaults to TEXT; the encode node
                may still auto-detect the actual type)
            limit: Number of candidates to retrieve
            filters: Metadata filters for retrieval
            threshold: Minimum prediction score
            top_k: Number of top hits to return

        Returns:
            DiscoveryResult with ranked candidates
        """
        t0 = datetime.now()

        # Default modality; the encode node was built with auto_detect=True
        # so it may refine this when encoding.
        if modality is None:
            modality = Modality.TEXT

        # --- Stage 1: encode the query into a vector ---
        logger.info(f"Encoding query: {query[:50]}...")
        self._encode_node.modality = modality
        encoded = self._encode_node.execute(query)

        # --- Stage 2: similarity search for candidates ---
        logger.info(f"Retrieving up to {limit} candidates...")
        self._retrieve_node.limit = limit
        self._retrieve_node.filters = filters or {}
        retrieved = self._retrieve_node.execute(
            encoded.data,
            context={"modality": modality}
        )
        candidates = retrieved.data

        if not candidates:
            # Nothing to predict on — return an empty result immediately.
            logger.warning("No candidates found")
            return DiscoveryResult(
                query=query,
                query_modality=modality.value,
                candidates=[],
                predictions=[],
                top_hits=[],
                execution_time_ms=(datetime.now() - t0).total_seconds() * 1000
            )

        # --- Stage 3: DTI prediction against the target ---
        logger.info(f"Predicting binding for {len(candidates)} candidates...")
        self._predict_node.target_sequence = target_sequence
        self._predict_node.threshold = threshold
        predictions = self._predict_node.execute(candidates).data

        # --- Stage 4: threshold filter + top-k ranking ---
        logger.info("Filtering and ranking...")
        self._filter_node.threshold = threshold
        self._filter_node.top_k = top_k
        top_hits = self._filter_node.execute(predictions).data

        # --- Stage 5: attach evidence links ---
        logger.info("Adding evidence links...")
        enriched_hits = self._trace_node.execute(top_hits).data

        elapsed_ms = (datetime.now() - t0).total_seconds() * 1000

        return DiscoveryResult(
            query=query,
            query_modality=modality.value,
            candidates=[{"id": c.id, "content": c.content, "score": c.score} for c in candidates],
            predictions=predictions,
            top_hits=enriched_hits,
            execution_time_ms=elapsed_ms,
            metadata={
                "collection": self.collection,
                "limit": limit,
                "threshold": threshold,
                "target_length": len(target_sequence)
            }
        )

    def ingest(
        self,
        data: List[Dict[str, Any]],
        modality: Modality = Modality.SMILES,
        content_field: str = "smiles"
    ) -> List[str]:
        """
        Ingest data into the vector database.

        Args:
            data: List of items with content and metadata
            modality: Type of content
            content_field: Field name containing the content

        Returns:
            List of ingested IDs
        """
        node = IngestNode(
            "ingest",
            self.retriever,
            collection=self.collection,
            modality=modality,
            content_field=content_field
        )

        result = node.execute(data)
        logger.info(f"Ingested {len(result.data)} items into {self.collection}")

        return result.data

    def search(
        self,
        query: str,
        modality: Modality = None,
        limit: int = 10,
        filters: Dict[str, Any] = None
    ) -> List[RetrievalResult]:
        """
        Simple similarity search without binding prediction.

        Args:
            query: Search query
            modality: Input modality (auto-detected when None)
            limit: Maximum results
            filters: Metadata filters

        Returns:
            List of similar items
        """
        # Encode; enable auto-detection only when the caller left the
        # modality unspecified.
        self._encode_node.modality = modality or Modality.TEXT
        self._encode_node.auto_detect = modality is None
        encoded = self._encode_node.execute(query)

        # Retrieve matching items.
        self._retrieve_node.limit = limit
        self._retrieve_node.filters = filters or {}
        return self._retrieve_node.execute(encoded.data).data
276
+
277
+
278
class LiteratureMiningPipeline:
    """
    Pipeline for searching and analyzing scientific literature.

    Workflow:
        1. Encode query (text/molecule/protein)
        2. Search literature database
        3. Extract relevant evidence
        4. Rank by relevance and diversity
    """

    def __init__(
        self,
        encoder,
        retriever,
        collection: str = "pubmed_abstracts"
    ):
        """Store the encoder/retriever and build the three pipeline nodes."""
        self.encoder = encoder
        self.retriever = retriever
        self.collection = collection

        # Stage nodes: encode → retrieve → trace (evidence links).
        self._encode_node = EncodeNode("encode", encoder, auto_detect=True)
        self._retrieve_node = RetrieveNode("retrieve", retriever, collection=collection)
        self._trace_node = TraceabilityNode("trace")

    def search(
        self,
        query: str,
        modality: Modality = Modality.TEXT,
        limit: int = 20,
        filters: Dict[str, Any] = None
    ) -> List[Dict[str, Any]]:
        """
        Search literature for relevant papers.

        Args:
            query: Search query
            modality: Query type
            limit: Maximum results
            filters: Filters (e.g., year, species)

        Returns:
            List of papers with evidence links
        """
        # Encode the query into a vector.
        self._encode_node.modality = modality
        encoded = self._encode_node.execute(query)

        # Similarity search over the literature collection.
        self._retrieve_node.limit = limit
        self._retrieve_node.filters = filters or {}
        retrieved = self._retrieve_node.execute(encoded.data)

        # Attach evidence links to each result before returning.
        return self._trace_node.execute(retrieved.data).data
335
+
336
+
337
class ProteinDesignPipeline:
    """
    Pipeline for protein/antibody design workflows.

    Workflow:
        1. Encode seed protein
        2. Find similar sequences in database
        3. Analyze conservation and mutations
        4. Suggest design candidates
    """

    def __init__(
        self,
        encoder,
        retriever,
        collection: str = "proteins"
    ):
        """Store the encoder, retriever, and target collection name."""
        self.encoder = encoder
        self.retriever = retriever
        self.collection = collection

    def find_homologs(
        self,
        sequence: str,
        limit: int = 50,
        species_filter: str = None
    ) -> List[Dict[str, Any]]:
        """
        Find homologous proteins by embedding similarity.

        Args:
            sequence: Query protein sequence
            limit: Maximum results
            species_filter: Filter by species

        Returns:
            List of homologous proteins with metadata
        """
        # Embed the seed sequence in protein space.
        embedding = self.encoder.encode(sequence, Modality.PROTEIN)

        # Optional species restriction on the vector search.
        filters = {"species": species_filter} if species_filter else {}

        hits = self.retriever.search(
            query=embedding.vector,
            limit=limit,
            filters=filters if filters else None,
            collection=self.collection,
            modality=Modality.PROTEIN
        )

        # Flatten each hit into a plain dict, merging its payload fields.
        homologs = []
        for hit in hits:
            record = {
                "id": hit.id,
                "sequence": hit.content,
                "score": hit.score,
            }
            record.update(hit.payload)
            homologs.append(record)
        return homologs
bioflow/workflows/drug_discovery.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# =============================================================================
# BioFlow Workflow Example: Drug Discovery Pipeline
# =============================================================================
# This YAML defines a simple drug discovery workflow that:
# 1. Encodes a query molecule
# 2. Retrieves similar compounds from memory
# 3. Predicts binding affinity for top candidates
# =============================================================================

name: drug_discovery_basic
description: >
  Basic drug discovery pipeline: encode query -> retrieve analogs -> predict DTI

# Define the nodes in execution order.
# Each node consumes the outputs of the nodes named in `inputs`; the special
# input name `input` refers to the pipeline's own input.
nodes:
  - id: encode_query
    type: encode
    tool: obm                # Uses OBM multimodal encoder
    inputs: [input]          # Takes pipeline input
    params:
      modality: smiles

  - id: find_analogs
    type: retrieve
    tool: qdrant             # Uses Qdrant vector search
    inputs: [encode_query]   # Takes encoded vector
    params:
      limit: 10
      modality: smiles
      collection: molecules

  - id: predict_binding
    type: predict
    tool: deeppurpose        # Uses DeepPurpose DTI predictor
    inputs: [find_analogs]   # Takes retrieved molecules
    params:
      # Binding target given as a raw amino-acid sequence (sample kinase fragment).
      target: "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"

  - id: filter_hits
    type: filter
    tool: builtin            # Built-in filter
    inputs: [predict_binding]
    params:
      threshold: 0.7         # Keep only predictions whose `score` passes 0.7
      key: score

# Final output comes from this node
output_node: filter_hits

# Optional metadata
metadata:
  version: "1.0"
  author: "BioFlow Team"
  tags: [drug-discovery, dti, molecules]
bioflow/workflows/ingestion.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Ingestion Utilities
3
+ =========================
4
+
5
+ Helpers for ingesting data from common biological sources:
6
+ - PubMed abstracts
7
+ - UniProt proteins
8
+ - ChEMBL molecules
9
+ - Custom CSV/JSON files
10
+ """
11
+
12
+ import logging
13
+ from typing import List, Dict, Any, Optional, Generator
14
+ from pathlib import Path
15
+ import json
16
+
17
+ from bioflow.core import Modality
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def load_json_data(
    path: str,
    content_field: str = "content",
    modality_field: Optional[str] = None,
    limit: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Load data from JSON file.

    Accepts either a top-level list, or a dict wrapping the list under a
    "data" or "items" key; a bare dict is treated as a single item.

    Args:
        path: Path to JSON file
        content_field: Field containing main content (kept for API
            compatibility; not used to transform items here)
        modality_field: Field indicating modality (optional; currently unused)
        limit: Maximum items to load (0 yields an empty list)

    Returns:
        List of data items
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Unwrap common container shapes: {"data": [...]} or {"items": [...]}.
    if isinstance(data, dict):
        data = data.get("data", data.get("items", [data]))

    # `is not None` (rather than truthiness) so an explicit limit=0 is honored.
    if limit is not None:
        data = data[:limit]

    logger.info(f"Loaded {len(data)} items from {path}")
    return data
51
+
52
+
53
def load_csv_data(
    path: str,
    content_field: str = "content",
    delimiter: str = ",",
    limit: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Load data from CSV file.

    Args:
        path: Path to CSV file
        content_field: Column containing main content (kept for API
            compatibility; rows are returned whole, this column is not used)
        delimiter: CSV delimiter
        limit: Maximum items to load (0 yields an empty list)

    Returns:
        List of data items as dictionaries (one per row, keyed by header)
    """
    import csv

    data = []
    # newline='' is required by the csv module for correct newline handling.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f, delimiter=delimiter)
        for i, row in enumerate(reader):
            # `is not None` (rather than truthiness) so limit=0 is honored.
            if limit is not None and i >= limit:
                break
            data.append(dict(row))

    logger.info(f"Loaded {len(data)} items from {path}")
    return data
83
+
84
+
85
def parse_smiles_file(
    path: str,
    name_field: int = 1,
    smiles_field: int = 0,
    has_header: bool = True,
    limit: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Parse SMILES file (commonly .smi format).

    Lines are whitespace-split; blank lines are skipped. A one-token line is
    treated as a bare SMILES string with an auto-generated name.

    Args:
        path: Path to SMILES file
        name_field: Column index for molecule name
        smiles_field: Column index for SMILES string
        has_header: Whether file has header row
        limit: Maximum items to load (0 yields an empty list)

    Returns:
        List of molecule dictionaries with "smiles", "name", "modality" keys
    """
    data = []
    with open(path, 'r') as f:
        for i, line in enumerate(f):
            if has_header and i == 0:
                continue
            # `is not None` so an explicit limit=0 loads nothing.
            if limit is not None and len(data) >= limit:
                break

            parts = line.strip().split()
            # Guard the SMILES index so a short row with a custom
            # smiles_field can't raise IndexError.
            if len(parts) >= 2 and len(parts) > smiles_field:
                data.append({
                    "smiles": parts[smiles_field],
                    # Synthesize a name when the name column is absent.
                    "name": parts[name_field] if len(parts) > name_field else f"mol_{i}",
                    "modality": "smiles"
                })
            elif len(parts) == 1:
                # Single-column file: the lone token is the SMILES.
                data.append({
                    "smiles": parts[0],
                    "name": f"mol_{i}",
                    "modality": "smiles"
                })

    logger.info(f"Loaded {len(data)} molecules from {path}")
    return data
129
+
130
+
131
def parse_fasta_file(
    path: str,
    limit: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Parse FASTA file for protein sequences.

    Args:
        path: Path to FASTA file
        limit: Maximum sequences to load (0 yields an empty list)

    Returns:
        List of protein dictionaries with "sequence", "header",
        "uniprot_id" and "modality" keys
    """
    data = []
    current_header = None
    current_sequence = []

    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                # A new record begins: flush the previous one first.
                if current_header and current_sequence:
                    # Check the limit BEFORE appending so limit=0 loads
                    # nothing (for limit>0 the result is identical to
                    # append-then-break: exactly `limit` records).
                    if limit is not None and len(data) >= limit:
                        break
                    seq = ''.join(current_sequence)
                    data.append({
                        "sequence": seq,
                        "header": current_header,
                        "uniprot_id": _extract_uniprot_id(current_header),
                        "modality": "protein"
                    })

                current_header = line[1:]  # Remove leading '>'
                current_sequence = []
            else:
                current_sequence.append(line)

    # Don't forget the last sequence in the file.
    if current_header and current_sequence and (limit is None or len(data) < limit):
        seq = ''.join(current_sequence)
        data.append({
            "sequence": seq,
            "header": current_header,
            "uniprot_id": _extract_uniprot_id(current_header),
            "modality": "protein"
        })

    logger.info(f"Loaded {len(data)} proteins from {path}")
    return data
183
+
184
+
185
+ def _extract_uniprot_id(header: str) -> str:
186
+ """Extract UniProt ID from FASTA header."""
187
+ # Common formats: sp|P12345|NAME or tr|P12345|NAME
188
+ if '|' in header:
189
+ parts = header.split('|')
190
+ if len(parts) >= 2:
191
+ return parts[1]
192
+ # Just take first word
193
+ return header.split()[0]
194
+
195
+
196
def generate_sample_molecules() -> List[Dict[str, Any]]:
    """
    Generate sample molecule data for testing.

    Returns:
        List of common drug molecules
    """
    # (SMILES, name, identifier key, identifier value) for well-known compounds.
    entries = [
        ("CC(=O)Oc1ccccc1C(=O)O", "Aspirin", "drugbank_id", "DB00945"),
        ("CN1C=NC2=C1C(=O)N(C(=O)N2C)C", "Caffeine", "pubchem_id", "2519"),
        ("CC(C)Cc1ccc(cc1)C(C)C(=O)O", "Ibuprofen", "drugbank_id", "DB01050"),
        ("CC(=O)Nc1ccc(cc1)O", "Acetaminophen", "drugbank_id", "DB00316"),
        ("CCO", "Ethanol", "pubchem_id", "702"),
        ("c1ccccc1", "Benzene", "pubchem_id", "241"),
        ("CC(C)NCC(O)c1ccc(O)c(O)c1", "Isoprenaline", "drugbank_id", "DB01064"),
        ("Clc1ccc2c(c1)C(=NCC2)c3ccccc3", "Diazepam", "drugbank_id", "DB00829"),
    ]
    return [
        {"smiles": smi, "name": name, id_key: id_val, "modality": "smiles"}
        for smi, name, id_key, id_val in entries
    ]
213
+
214
+
215
def generate_sample_proteins() -> List[Dict[str, Any]]:
    """
    Generate sample protein data for testing.

    Returns:
        List of common proteins
    """
    # (sequence, display name, UniProt accession); all entries are human.
    samples = [
        ("MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",
         "Sample kinase fragment", "P00533"),
        ("MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSH",
         "Hemoglobin alpha", "P69905"),
        ("MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNCNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRCALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL",
         "EGFR fragment", "P00533"),
    ]
    return [
        {
            "sequence": seq,
            "name": name,
            "uniprot_id": accession,
            "species": "human",
            "modality": "protein",
        }
        for seq, name, accession in samples
    ]
245
+
246
+
247
def generate_sample_abstracts() -> List[Dict[str, Any]]:
    """
    Generate sample PubMed-style abstracts for testing.

    Returns:
        List of sample abstracts
    """
    # (pmid, title, year, abstract text) for a few illustrative papers.
    records = [
        ("12345678", "EGFR inhibitors in lung cancer", 2023,
         "EGFR mutations are common in non-small cell lung cancer and predict response to tyrosine kinase inhibitors. Gefitinib and erlotinib have shown significant efficacy in patients with EGFR-mutant tumors."),
        ("23456789", "Deep learning for DTI prediction", 2024,
         "Drug-target interaction prediction using deep learning has emerged as a powerful approach for drug discovery. Neural networks can learn complex patterns from molecular structures and protein sequences."),
        ("34567890", "Mechanism of aspirin", 2022,
         "Aspirin inhibits cyclooxygenase enzymes (COX-1 and COX-2), reducing prostaglandin synthesis. This mechanism underlies its anti-inflammatory and analgesic effects."),
    ]
    return [
        {
            "content": text,
            "pmid": pmid,
            "title": title,
            "year": year,
            "modality": "text",
        }
        for pmid, title, year, text in records
    ]
bioflow/workflows/literature_mining.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# =============================================================================
# BioFlow Workflow Example: Literature Mining Pipeline
# =============================================================================
# This workflow searches scientific literature for relevant papers
# based on a text, molecule, or protein query.
# =============================================================================

name: literature_mining
description: >
  Cross-modal literature search: encode any modality -> retrieve papers -> rank by relevance

# Nodes run in order; each consumes the outputs of the nodes named in
# `inputs`, and the special name `input` is the pipeline's own input.
nodes:
  - id: encode_query
    type: encode
    tool: obm
    inputs: [input]
    params:
      modality: text           # Can be: text, smiles, protein

  - id: search_papers
    type: retrieve
    tool: qdrant
    inputs: [encode_query]
    params:
      limit: 20
      collection: pubmed_abstracts
      modality: text

  - id: rank_results
    type: custom
    tool: mmr_rerank           # Maximum Marginal Relevance re-ranking
    inputs: [search_papers]
    params:
      diversity: 0.3           # Relevance/diversity trade-off for MMR
      top_k: 10

output_node: rank_results

metadata:
  version: "1.0"
  tags: [literature, search, text-mining]
checkpoints/.placeholder ADDED
File without changes