DouDou committed on
Commit
2b084d3
·
verified ·
1 Parent(s): a7c0211

Upload data2/step22/emb_qwen_md.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data2/step22/emb_qwen_md.py +275 -0
data2/step22/emb_qwen_md.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Requires vllm>=0.8.5
2
+ import torch
3
+ import vllm
4
+ from vllm import LLM
5
+ from pathlib import Path
6
+ import os
7
+ import json
8
+
9
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the instruction-prefixed query string passed to the embedding model.

    Args:
        task_description: One-sentence description of the retrieval task.
        query: The query text to embed.

    Returns:
        ``"Instruct: <task_description>\\nQuery:<query>"`` -- note that there
        is no space after ``Query:``; the format is reproduced exactly.
    """
    return f'Instruct: {task_description}\nQuery:{query}'
11
+
12
+
13
# Domain keywords that are joined into a single query string further down.
# NOTE(review): "Spatial Transcriptomics"/"Spatial transcriptomics" and
# "Metabolic Flux Analysis"/"Metabolic flux analysis" each appear twice with
# different casing -- kept as-is because removing them would change the
# query text; confirm whether the duplication is intentional.
keywords = [
    "Quantum mechanics",
    "Gene editing",
    "Folding",
    "System biology",
    "Antibody",
    "Heterogeneity",
    "Ligand",
    "Drug repurpose",
    "Kinetics",
    "Next-generation sequencing",
    "Pharmacogenetics",
    "Phase-field technique",
    "Human",
    "Potential",
    "Hartree-Fock",
    "Flow matching",
    "Lipid",
    "Biomedical",
    "Antigen",
    "Stochastic modeling",
    "Coupled cluster",
    "Quantum biology",
    "Spatial biology",
    "Antagonist",
    "Free energy perturbation",
    "Cycle",
    "Pharmacology",
    "Redox",
    "Physiology",
    "Protein-Protein Interactions",
    "Single-cell",
    "Screening",
    "Hydrophobic",
    "First-principles based DFT",
    "Molecular biology",
    "Mechanism",
    "Reproduction number",
    "Spatial Transcriptomics",
    "Ion",
    "Computational Materials",
    "Absorption",
    "Pharmacometrics",
    "GAN",
    "Compartmental model",
    "Diagnostics",
    "Lead discovery",
    "QAPR",
    "Rosettafold",
    "Autoregressive",
    "Pharmacokinetics",
    "Biotechnology",
    "Hydrophilic",
    "3D",
    "Protein",
    "QM/MM",
    "Activation",
    "AMR",
    "Networks",
    "Genotype",
    "Gene regulatory networks",
    "Biologics",
    "Phenotype",
    "Nowcasting",
    "DFT",
    "AlphaFold",
    "Pandemic",
    "Immunology",
    "Pathology",
    "Chemical space",
    "Transformer",
    "Homeostasis",
    "Score",
    "High-throughput",
    "Cheminformatics",
    "Hit-to-lead",
    "Sequencing",
    "Enzyme",
    "Antimicrobial resistance modeling",
    "Allosteric",
    "Inhibition",
    "Computational Biochemistry",
    "Bioinformatics",
    "Transcriptomics",
    "Diffusion",
    "Anomaly detection",
    "Multi-omics",
    "Biology",
    "Pathway",
    "Metabolomics",
    "Synthetic biology",
    "Microbial",
    "Proteomics",
    "Pharmaceutics",
    "Organoid",
    "Network pharmacology",
    "Imaging",
    "Generative adversarial networks",
    "Microbiology",
    "Organ-on-a-chip",
    "De novo",
    "Substrate",
    "Personalized",
    "Drug",
    "Transcription",
    "RNA",
    "Explainable AI",
    "Generate",
    "Docking",
    "Pathogens",
    "Bio foundation model",
    "Reinforcement learning",
    "Mechanism of action",
    "Generative",
    "Metabolic",
    "Metabolic Flux Analysis",
    "Computational Chemistry",
    "Vaccine",
    "Biophysics",
    "Integration",
    "Biochemistry",
    "Physiologically based pharmacokinetics model",
    "Medicine",
    "Crystal",
    "Conjugate",
    "Variational autoencoders",
    "In Silico",
    "Protein-protein",
    "CRISPR",
    "Spatial transcriptomics",
    "Gene",
    "Translation",
    "Glycomics",
    "Lead optimization",
    "Pharmacodynamics",
    "Ab initio",
    "System immunology",
    "Pseudotime analysis",
    "Generative AI",
    "RNN",
    "Regulatory networks",
    "PBPK model",
    "Beta-blocker",
    "Lipidomics",
    "Reaction",
    "Bio",
    "Genesis",
    "Evolution",
    "Computational Biology",
    "VAE",
    "Pharmacogenomics",
    "Assay",
    "Sensors",
    "Conformation",
    "Finite element method",
    "Human atlas",
    "Translational medicine",
    "Neurology",
    "Genomics",
    "Cell biology",
    "Porous",
    "Biomarker",
    "Bioengineering",
    "Allele",
    "Recurrent neural networks",
    "Carbohydrate",
    "Metamaterial",
    "Virtual human",
    "DNA",
    "Omics",
    "Agonist",
    "Receptor",
    "Cofactor",
    "Metabolic flux analysis",
    "Cell atlas",
    "Signaling",
    "Electronic structure",
    "Monte Carlo",
    "Genomic surveillance",
    "Agent-based model",
    "Biosensors",
    "2D",
    "QSAR",
    "Codon",
    "Coenzyme",
    "Nucleic acids",
    "Dynamics",
    "Ensemble",
    "Spectrometry",
    "Multi-scale modeling",
    "ADMET",
    "Marker",
    "Toxicology",
    "Profiling",
    "Design",
    "Viral",
    "Chemistry",
    "Epigenetics",
    "Homo-Lumo",
    "Modeling",
    "Prediction",
    "Quantum Chemistry",
    "Half-life",
    "Material",
    "Disease",
    "Phylodynamic model",
    "Metagenomics",
    "Digital twin",
    "Cancer biology",
    "Discovery",
    "Bioavailability",
    "Digital PCR",
]
225
+
226
# Each query must come with a one-sentence instruction that describes the task.
# NOTE(review): this instruction text describes a generic web-search task,
# while the query below is the full keyword list joined by spaces -- confirm
# that this wording is the intended instruction for scoring README summaries.
task = 'Given a web search query, retrieve relevant passages that answer the query'

# A single query: every keyword concatenated with single spaces, wrapped in
# the instruction prefix produced by get_detailed_instruct().
queries = [
    get_detailed_instruct(task, ' '.join(keywords)),
]
232
+
233
def get_md_contents(dir):
    """Collect the ``readme_summary`` value from every dataset subdirectory.

    Immediate subdirectories of ``dir`` are visited in sorted name order.
    A subdirectory contributes one entry only if it contains a
    ``readme_summary.json`` file; subdirectories without that file are
    skipped, so the returned list can be shorter than the number of
    subdirectories.

    Args:
        dir: Path of the directory whose immediate subdirectories are scanned.

    Returns:
        A list with the ``'readme_summary'`` value of each JSON file found,
        in sorted-subdirectory order.

    Raises:
        KeyError: If a ``readme_summary.json`` has no ``'readme_summary'`` key.
    """
    contents = []
    # Sorting the raw listing and then filtering yields the same order as
    # filtering first; the order matters because callers pair results with
    # subdirectories by position.
    for subdir in sorted(os.listdir(dir)):
        if not os.path.isdir(os.path.join(dir, subdir)):
            continue
        json_path = os.path.join(dir, subdir, 'readme_summary.json')
        if os.path.exists(json_path):
            with open(json_path, 'r', encoding='utf-8') as f:
                contents.append(json.load(f)['readme_summary'])
    return contents
242
+
243
+
244
# Single definition of the dataset location, used both for reading the
# summaries and for writing the scores back.
dataset_dir = '/home/weifengsun/tangou1/step2/step22/dataset'

md_contents = get_md_contents(dataset_dir)

# No need to add instruction for retrieval documents
documents = md_contents
input_texts = queries + documents

# NOTE(review): the model is loaded at module top level with no
# `if __name__ == "__main__":` guard -- confirm this is safe for the vLLM
# multiprocessing mode used in your environment.
model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")

outputs = model.embed(input_texts)
embeddings = torch.tensor([o.outputs.embedding for o in outputs])
# Row 0 is the single query embedding; the remaining rows are the documents.
# The product is therefore a 1-D tensor holding one score per document.
scores = (embeddings[0] @ embeddings[1:].T)

# Re-derive which subdirectories contributed a document, using the same
# rule as get_md_contents() (sorted subdirectories that contain
# readme_summary.json), so scores can be paired with them by position.
subdirs = sorted([d for d in os.listdir(dataset_dir) if os.path.isdir(os.path.join(dataset_dir, d))])
valid_subdirs = [d for d in subdirs if os.path.exists(os.path.join(dataset_dir, d, 'readme_summary.json'))]

score_list = scores.tolist()

# The pairing below is purely positional. If the directory changed between
# the two listings the scores would be attached to the wrong summaries, so
# fail loudly before modifying any file.
if len(valid_subdirs) != len(score_list):
    raise RuntimeError(
        f'Found {len(valid_subdirs)} summary files but computed '
        f'{len(score_list)} scores; refusing to write misaligned scores.'
    )

for subdir, score in zip(valid_subdirs, score_list):
    json_path = os.path.join(dataset_dir, subdir, 'readme_summary.json')
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    data['score'] = score
    # Rewrite the same file in place with the added 'score' field.
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)