import json
import os
from pathlib import Path

import torch
import vllm
from vllm import LLM


def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the instruction-prefixed query text for the embedding model.

    Args:
        task_description: Sentence describing the retrieval task.
        query: Raw query text.

    Returns:
        The string ``'Instruct: <task_description>\\nQuery:<query>'``.
        Note that nothing (not even a space) is inserted between
        ``Query:`` and the query text.
    """
    return f'Instruct: {task_description}\nQuery:{query}'


# Topic keywords. Further down in this file they are joined with single
# spaces to form the one retrieval query that every document is scored
# against.
#
# NOTE(review): "Spatial Transcriptomics"/"Spatial transcriptomics" and
# "Metabolic Flux Analysis"/"Metabolic flux analysis" each appear twice with
# different casing, and "QAPR" may be a typo (QSAR is also listed; QSPR?).
# They are left untouched on purpose: any edit to this list changes the query
# text and therefore every score computed from it. Confirm before cleaning up.
keywords = [
    "Quantum mechanics",
    "Gene editing",
    "Folding",
    "System biology",
    "Antibody",
    "Heterogeneity",
    "Ligand",
    "Drug repurpose",
    "Kinetics",
    "Next-generation sequencing",
    "Pharmacogenetics",
    "Phase-field technique",
    "Human",
    "Potential",
    "Hartree-Fock",
    "Flow matching",
    "Lipid",
    "Biomedical",
    "Antigen",
    "Stochastic modeling",
    "Coupled cluster",
    "Quantum biology",
    "Spatial biology",
    "Antagonist",
    "Free energy perturbation",
    "Cycle",
    "Pharmacology",
    "Redox",
    "Physiology",
    "Protein-Protein Interactions",
    "Single-cell",
    "Screening",
    "Hydrophobic",
    "First-principles based DFT",
    "Molecular biology",
    "Mechanism",
    "Reproduction number",
    "Spatial Transcriptomics",
    "Ion",
    "Computational Materials",
    "Absorption",
    "Pharmacometrics",
    "GAN",
    "Compartmental model",
    "Diagnostics",
    "Lead discovery",
    "QAPR",
    "Rosettafold",
    "Autoregressive",
    "Pharmacokinetics",
    "Biotechnology",
    "Hydrophilic",
    "3D",
    "Protein",
    "QM/MM",
    "Activation",
    "AMR",
    "Networks",
    "Genotype",
    "Gene regulatory networks",
    "Biologics",
    "Phenotype",
    "Nowcasting",
    "DFT",
    "AlphaFold",
    "Pandemic",
    "Immunology",
    "Pathology",
    "Chemical space",
    "Transformer",
    "Homeostasis",
    "Score",
    "High-throughput",
    "Cheminformatics",
    "Hit-to-lead",
    "Sequencing",
    "Enzyme",
    "Antimicrobial resistance modeling",
    "Allosteric",
    "Inhibition",
    "Computational Biochemistry",
    "Bioinformatics",
    "Transcriptomics",
    "Diffusion",
    "Anomaly detection",
    "Multi-omics",
    "Biology",
    "Pathway",
    "Metabolomics",
    "Synthetic biology",
    "Microbial",
    "Proteomics",
    "Pharmaceutics",
    "Organoid",
    "Network pharmacology",
    "Imaging",
    "Generative adversarial networks",
    "Microbiology",
    "Organ-on-a-chip",
    "De novo",
    "Substrate",
    "Personalized",
    "Drug",
    "Transcription",
    "RNA",
    "Explainable AI",
    "Generate",
    "Docking",
    "Pathogens",
    "Bio foundation model",
    "Reinforcement learning",
    "Mechanism of action",
    "Generative",
    "Metabolic",
    "Metabolic Flux Analysis",
    "Computational Chemistry",
    "Vaccine",
    "Biophysics",
    "Integration",
    "Biochemistry",
    "Physiologically based pharmacokinetics model",
    "Medicine",
    "Crystal",
    "Conjugate",
    "Variational autoencoders",
    "In Silico",
    "Protein-protein",
    "CRISPR",
    "Spatial transcriptomics",
    "Gene",
    "Translation",
    "Glycomics",
    "Lead optimization",
    "Pharmacodynamics",
    "Ab initio",
    "System immunology",
    "Pseudotime analysis",
    "Generative AI",
    "RNN",
    "Regulatory networks",
    "PBPK model",
    "Beta-blocker",
    "Lipidomics",
    "Reaction",
    "Bio",
    "Genesis",
    "Evolution",
    "Computational Biology",
    "VAE",
    "Pharmacogenomics",
    "Assay",
    "Sensors",
    "Conformation",
    "Finite element method",
    "Human atlas",
    "Translational medicine",
    "Neurology",
    "Genomics",
    "Cell biology",
    "Porous",
    "Biomarker",
    "Bioengineering",
    "Allele",
    "Recurrent neural networks",
    "Carbohydrate",
    "Metamaterial",
    "Virtual human",
    "DNA",
    "Omics",
    "Agonist",
    "Receptor",
    "Cofactor",
    "Metabolic flux analysis",
    "Cell atlas",
    "Signaling",
    "Electronic structure",
    "Monte Carlo",
    "Genomic surveillance",
    "Agent-based model",
    "Biosensors",
    "2D",
    "QSAR",
    "Codon",
    "Coenzyme",
    "Nucleic acids",
    "Dynamics",
    "Ensemble",
    "Spectrometry",
    "Multi-scale modeling",
    "ADMET",
    "Marker",
    "Toxicology",
    "Profiling",
    "Design",
    "Viral",
    "Chemistry",
    "Epigenetics",
    "Homo-Lumo",
    "Modeling",
    "Prediction",
    "Quantum Chemistry",
    "Half-life",
    "Material",
    "Disease",
    "Phylodynamic model",
    "Metagenomics",
    "Digital twin",
    "Cancer biology",
    "Discovery",
    "Bioavailability",
    "Digital PCR",
]


# One-sentence task instruction that get_detailed_instruct() places in front
# of the query text.
task = 'Given a web search query, retrieve relevant passages that answer the query'

# A single query: all keywords joined by single spaces, wrapped in the
# instruction format. It stays a one-element list so it can be concatenated
# with the list of documents before embedding.
queries = [
    get_detailed_instruct(task, ' '.join(keywords)),
]


def get_md_contents(dir):
    """Collect the ``readme_summary`` value of every dataset subdirectory.

    Each immediate subdirectory of ``dir`` is visited in sorted name order.
    Subdirectories without a ``readme_summary.json`` file are skipped, as
    are plain files directly inside ``dir``.

    Args:
        dir: Path of the directory whose subdirectories are scanned. (The
            name shadows the ``dir`` builtin; it is kept so that existing
            keyword callers keep working.)

    Returns:
        A list with the value stored under the ``'readme_summary'`` key of
        each JSON file found, ordered by subdirectory name.

    Raises:
        KeyError: If a JSON file has no ``'readme_summary'`` key.
        json.JSONDecodeError: If a file is not valid JSON.
        OSError: If ``dir`` cannot be listed or a file cannot be read.
    """
    contents = []
    # Sorting the raw names first and filtering afterwards yields the same
    # order as filtering first, without building an intermediate list.
    for subdir in sorted(os.listdir(dir)):
        if not os.path.isdir(os.path.join(dir, subdir)):
            continue
        json_path = os.path.join(dir, subdir, 'readme_summary.json')
        if not os.path.exists(json_path):
            continue
        with open(json_path, 'r', encoding='utf-8') as f:
            contents.append(json.load(f)['readme_summary'])
    return contents


# Root directory whose subdirectories each may hold a readme_summary.json.
DATASET_DIR = '/home/weifengsun/tangou1/step2/step22/dataset'


def _load_summary_entries(dataset_dir):
    """Return ``(json_path, data)`` for each subdirectory's readme_summary.json.

    Entries are ordered by subdirectory name. Path and parsed content are
    kept together so a score can later be written back to exactly the file
    its text came from, without listing the directory a second time.
    """
    entries = []
    for subdir in sorted(os.listdir(dataset_dir)):
        if not os.path.isdir(os.path.join(dataset_dir, subdir)):
            continue
        json_path = os.path.join(dataset_dir, subdir, 'readme_summary.json')
        if not os.path.exists(json_path):
            continue
        with open(json_path, 'r', encoding='utf-8') as f:
            entries.append((json_path, json.load(f)))
    return entries


def _write_scores(entries, score_list):
    """Store ``score_list[i]`` under ``'score'`` in the i-th entry's JSON file.

    Raises:
        ValueError: If the number of scores differs from the number of
            entries. Checked before any file is touched, so a mismatch can
            never result in a partially or wrongly updated dataset.
    """
    if len(entries) != len(score_list):
        raise ValueError(
            f'got {len(score_list)} scores for {len(entries)} summary files'
        )
    for (json_path, data), score in zip(entries, score_list):
        data['score'] = score
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)


def main():
    """Score every README summary against the keyword query and save the score.

    The query (``queries``) and all summaries are embedded in one batch; the
    score of a summary is the dot product of its embedding with the query
    embedding. Each score is written back into the summary's JSON file under
    the ``'score'`` key.

    Raises:
        KeyError: If a JSON file has no ``'readme_summary'`` key. This is
            raised before the model is loaded and before any file is written.
    """
    entries = _load_summary_entries(DATASET_DIR)
    if not entries:
        # Nothing to score, so skip the expensive model start-up.
        return

    documents = [data['readme_summary'] for _, data in entries]
    input_texts = queries + documents

    model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")

    # Relies on embed() returning one output per input, in input order:
    # row 0 is the query and rows 1.. are the documents.
    outputs = model.embed(input_texts)
    embeddings = torch.tensor([o.outputs.embedding for o in outputs])
    scores = embeddings[0] @ embeddings[1:].T

    _write_scores(entries, scores.tolist())


# vLLM may start worker processes that import this module again; the guard
# keeps such a re-import from re-running the whole pipeline.
if __name__ == "__main__":
    main()