|
|
| |
| import torch |
| import vllm |
| from vllm import LLM |
| from transformers import AutoTokenizer |
| from pathlib import Path |
| import os |
| import jsonlines |
|
|
# Expose only the CUDA device with index 1 to this process.
# NOTE(review): this assignment runs after `torch` and `vllm` were imported;
# it presumably still takes effect because CUDA is initialised lazily --
# confirm, or move it above those imports.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

import multiprocessing as mp

# Start child processes with "spawn"; force=True overrides any start method
# that was already selected.
mp.set_start_method("spawn", force=True)
|
|
|
|
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the instruction-prefixed query string fed to the embedding model.

    Args:
        task_description: One-sentence description of the retrieval task.
        query: The query text itself.

    Returns:
        ``"Instruct: <task_description>"``, a newline, then
        ``"Query:<query>"`` (no space after ``Query:``).
    """
    template = 'Instruct: {}\nQuery:{}'
    return template.format(task_description, query)
|
|
|
|
# Domain keywords that are space-joined, in this order, into the single
# retrieval query. Entries are kept verbatim -- including pairs that differ
# only by letter case (e.g. "Spatial Transcriptomics" / "Spatial
# transcriptomics") -- because any change alters the query text.
keywords = [
    "Quantum mechanics",
    "Gene editing",
    "Folding",
    "System biology",
    "Antibody",
    "Heterogeneity",
    "Ligand",
    "Drug repurpose",
    "Kinetics",
    "Next-generation sequencing",
    "Pharmacogenetics",
    "Phase-field technique",
    "Human",
    "Potential",
    "Hartree-Fock",
    "Flow matching",
    "Lipid",
    "Biomedical",
    "Antigen",
    "Stochastic modeling",
    "Coupled cluster",
    "Quantum biology",
    "Spatial biology",
    "Antagonist",
    "Free energy perturbation",
    "Cycle",
    "Pharmacology",
    "Redox",
    "Physiology",
    "Protein-Protein Interactions",
    "Single-cell",
    "Screening",
    "Hydrophobic",
    "First-principles based DFT",
    "Molecular biology",
    "Mechanism",
    "Reproduction number",
    "Spatial Transcriptomics",
    "Ion",
    "Computational Materials",
    "Absorption",
    "Pharmacometrics",
    "GAN",
    "Compartmental model",
    "Diagnostics",
    "Lead discovery",
    "QAPR",
    "Rosettafold",
    "Autoregressive",
    "Pharmacokinetics",
    "Biotechnology",
    "Hydrophilic",
    "3D",
    "Protein",
    "QM/MM",
    "Activation",
    "AMR",
    "Networks",
    "Genotype",
    "Gene regulatory networks",
    "Biologics",
    "Phenotype",
    "Nowcasting",
    "DFT",
    "AlphaFold",
    "Pandemic",
    "Immunology",
    "Pathology",
    "Chemical space",
    "Transformer",
    "Homeostasis",
    "Score",
    "High-throughput",
    "Cheminformatics",
    "Hit-to-lead",
    "Sequencing",
    "Enzyme",
    "Antimicrobial resistance modeling",
    "Allosteric",
    "Inhibition",
    "Computational Biochemistry",
    "Bioinformatics",
    "Transcriptomics",
    "Diffusion",
    "Anomaly detection",
    "Multi-omics",
    "Biology",
    "Pathway",
    "Metabolomics",
    "Synthetic biology",
    "Microbial",
    "Proteomics",
    "Pharmaceutics",
    "Organoid",
    "Network pharmacology",
    "Imaging",
    "Generative adversarial networks",
    "Microbiology",
    "Organ-on-a-chip",
    "De novo",
    "Substrate",
    "Personalized",
    "Drug",
    "Transcription",
    "RNA",
    "Explainable AI",
    "Generate",
    "Docking",
    "Pathogens",
    "Bio foundation model",
    "Reinforcement learning",
    "Mechanism of action",
    "Generative",
    "Metabolic",
    "Metabolic Flux Analysis",
    "Computational Chemistry",
    "Vaccine",
    "Biophysics",
    "Integration",
    "Biochemistry",
    "Physiologically based pharmacokinetics model",
    "Medicine",
    "Crystal",
    "Conjugate",
    "Variational autoencoders",
    "In Silico",
    "Protein-protein",
    "CRISPR",
    "Spatial transcriptomics",
    "Gene",
    "Translation",
    "Glycomics",
    "Lead optimization",
    "Pharmacodynamics",
    "Ab initio",
    "System immunology",
    "Pseudotime analysis",
    "Generative AI",
    "RNN",
    "Regulatory networks",
    "PBPK model",
    "Beta-blocker",
    "Lipidomics",
    "Reaction",
    "Bio",
    "Genesis",
    "Evolution",
    "Computational Biology",
    "VAE",
    "Pharmacogenomics",
    "Assay",
    "Sensors",
    "Conformation",
    "Finite element method",
    "Human atlas",
    "Translational medicine",
    "Neurology",
    "Genomics",
    "Cell biology",
    "Porous",
    "Biomarker",
    "Bioengineering",
    "Allele",
    "Recurrent neural networks",
    "Carbohydrate",
    "Metamaterial",
    "Virtual human",
    "DNA",
    "Omics",
    "Agonist",
    "Receptor",
    "Cofactor",
    "Metabolic flux analysis",
    "Cell atlas",
    "Signaling",
    "Electronic structure",
    "Monte Carlo",
    "Genomic surveillance",
    "Agent-based model",
    "Biosensors",
    "2D",
    "QSAR",
    "Codon",
    "Coenzyme",
    "Nucleic acids",
    "Dynamics",
    "Ensemble",
    "Spectrometry",
    "Multi-scale modeling",
    "ADMET",
    "Marker",
    "Toxicology",
    "Profiling",
    "Design",
    "Viral",
    "Chemistry",
    "Epigenetics",
    "Homo-Lumo",
    "Modeling",
    "Prediction",
    "Quantum Chemistry",
    "Half-life",
    "Material",
    "Disease",
    "Phylodynamic model",
    "Metagenomics",
    "Digital twin",
    "Cancer biology",
    "Discovery",
    "Bioavailability",
    "Digital PCR",
]
|
|
| |
# Task description placed in the "Instruct:" part of the query.
task = 'Given a web search query, retrieve relevant passages that answer the query'

# A single query: every keyword joined by one space, wrapped in the
# instruction template.
queries = [get_detailed_instruct(task, ' '.join(keywords))]

# Embedding model served through vLLM, with tensor- and data-parallel sizes
# both set to 1.
model = LLM(
    model="Qwen/Qwen3-Embedding-0.6B",
    task="embed",
    tensor_parallel_size=1,
    data_parallel_size=1,
)
|
|
|
|
def get_functions_contents(dir):
    """Score the functions listed in each sub-directory's ``functions.jsonl``.

    For every immediate sub-directory of *dir* (visited in sorted order) that
    contains a ``functions.jsonl`` file, each record's source text is read
    from ``record['file']`` (lines ``start_line``..``end_line``, 1-based and
    inclusive, capped at 32000 characters). The texts are scored with
    ``get_scores`` and the records are written back to the same file with an
    added ``'score'`` key.

    Reading stops as soon as a record that already has a ``'score'`` key is
    met, and the file is then left untouched, so finished sub-directories are
    skipped on a re-run.

    Args:
        dir: Path of the dataset root whose sub-directories are processed.
            The name shadows the builtin but is kept so existing callers
            keep working.

    Raises:
        OSError: If a source file referenced by a record cannot be opened.
        KeyError: If a record lacks ``'file'``, ``'start_line'`` or
            ``'end_line'``.
    """
    subdirs = sorted(
        d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))
    )
    for subdir in subdirs:
        # This sub-directory is skipped explicitly.
        if subdir == 'ElectronicStructureLibrary___libxc':
            continue
        print(subdir)

        json_path = os.path.join(dir, subdir, 'functions.jsonl')
        if not os.path.exists(json_path):
            continue

        objs = []
        contents = []
        has_scored = False
        # Many records point into the same source file; read each file once
        # per sub-directory instead of once per record.
        lines_by_file = {}
        with jsonlines.open(json_path) as reader:
            for obj in reader:
                if 'score' in obj:
                    has_scored = True
                    break
                file_path = obj['file']
                if file_path not in lines_by_file:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        lines_by_file[file_path] = f.readlines()
                selected = lines_by_file[file_path][obj['start_line'] - 1:obj['end_line']]
                contents.append(''.join(selected)[:32000])
                objs.append(obj)

        # With no records there is nothing to score and the file would be
        # rewritten as empty, so skip the model call and the rewrite.
        if objs and not has_scored:
            scores = get_scores(contents)
            for obj, score in zip(objs, scores):
                obj['score'] = score
            # Write to a sibling temp file and swap it in, so an interrupted
            # run cannot leave a half-written functions.jsonl behind.
            tmp_path = json_path + '.tmp'
            with jsonlines.open(tmp_path, 'w', flush=True) as writer:
                writer.write_all(objs)
            os.replace(tmp_path, json_path)
        print("finish ", subdir)
| |
|
|
# Default token budget used by truncate_to_max_tokens.
MAX_TOKENS = 30000

# Tokenizer matching the embedding model; truncate_to_max_tokens uses it to
# cut text to a token budget.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B", trust_remote_code=True)
| |
def truncate_to_max_tokens(text, max_tokens=MAX_TOKENS):
    """Cut *text* to a token budget by an encode/decode round trip.

    The module-level ``tokenizer`` encodes *text* with truncation enabled and
    ``max_length=max_tokens``; the resulting ids are decoded back to a string
    with special tokens skipped.

    Args:
        text: The string to truncate.
        max_tokens: Maximum length passed to the tokenizer.

    Returns:
        The decoded string.
    """
    encoded = tokenizer(text, truncation=True, max_length=max_tokens, return_tensors=None)
    token_ids = encoded["input_ids"]
    return tokenizer.decode(token_ids, skip_special_tokens=True)
|
|
def get_scores(documents):
    """Score each document against the module-level keyword query.

    The entries of ``queries`` and *documents* are truncated with
    ``truncate_to_max_tokens`` and embedded together in one ``model.embed``
    call. Each score is the dot product of the first query's embedding with
    one document embedding.
    NOTE(review): this equals cosine similarity only if the model returns
    normalised embeddings -- confirm.

    Args:
        documents: List of document strings to score.

    Returns:
        A list of floats, one per document, in input order. An empty input
        returns ``[]`` without calling the model.
    """
    # Nothing to score: avoid tokenising and embedding the query for no result.
    if not documents:
        return []

    safe_queries = [truncate_to_max_tokens(q) for q in queries]
    safe_docs = [truncate_to_max_tokens(d) for d in documents]

    # Queries go first, so their embeddings occupy the leading rows.
    outputs = model.embed(safe_queries + safe_docs)
    embeddings = torch.tensor([o.outputs.embedding for o in outputs])

    # Slice documents by the actual number of queries rather than assuming
    # exactly one; otherwise extra queries would be scored as documents.
    n_queries = len(safe_queries)
    doc_embeddings = embeddings[n_queries:]
    scores = embeddings[0] @ doc_embeddings.T
    return scores.tolist()
|
|
# Root directory whose sub-directories each hold a functions.jsonl file.
DATASET_DIR = '/home/weifengsun/tangou1/step2/step22/dataset'

# Run the scoring pass only when executed as a script. The "spawn" start
# method re-imports this module in child processes, and an unguarded call
# would start the whole pass again in each of them.
if __name__ == "__main__":
    get_functions_contents(DATASET_DIR)