SciCode
/

dataset-builder

Model card Files Files and versions

xet

Community

DouDou committed on Feb 19

Commit

2b084d3

verified ·

1 Parent(s): a7c0211

Upload data2/step22/emb_qwen_md.py with huggingface_hub

Browse files

Files changed (1) hide show

data2/step22/emb_qwen_md.py +275 -0

data2/step22/emb_qwen_md.py ADDED Viewed

	@@ -0,0 +1,275 @@

+# Requires vllm>=0.8.5
+import torch
+import vllm
+from vllm import LLM
+from pathlib import Path
+import os
+import json
+def get_detailed_instruct(task_description: str, query: str) -> str:
+    return f'Instruct: {task_description}\nQuery:{query}'
+keywords = ["Quantum mechanics",  # Search vocabulary: every entry is space-joined into the single retrieval query built right after this list.
+    "Gene editing",
+    "Folding",
+    "System biology",
+    "Antibody",
+    "Heterogeneity",
+    "Ligand",
+    "Drug repurpose",
+    "Kinetics",
+    "Next-generation sequencing",
+    "Pharmacogenetics",
+    "Phase-field technique",
+    "Human",
+    "Potential",
+    "Hartree-Fock",
+    "Flow matching",
+    "Lipid",
+    "Biomedical",
+    "Antigen",
+    "Stochastic modeling",
+    "Coupled cluster",
+    "Quantum biology",
+    "Spatial biology",
+    "Antagonist",
+    "Free energy perturbation",
+    "Cycle",
+    "Pharmacology",
+    "Redox",
+    "Physiology",
+    "Protein-Protein Interactions",
+    "Single-cell",
+    "Screening",
+    "Hydrophobic",
+    "First-principles based DFT",
+    "Molecular biology",
+    "Mechanism",
+    "Reproduction number",
+    "Spatial Transcriptomics",
+    "Ion",
+    "Computational Materials",
+    "Absorption",
+    "Pharmacometrics",
+    "GAN",
+    "Compartmental model",
+    "Diagnostics",
+    "Lead discovery",
+    "QAPR",  # NOTE(review): unusual acronym, possibly meant "QSPR" ("QSAR" is listed separately) -- confirm
+    "Rosettafold",
+    "Autoregressive",
+    "Pharmacokinetics",
+    "Biotechnology",
+    "Hydrophilic",
+    "3D",
+    "Protein",
+    "QM/MM",
+    "Activation",
+    "AMR",
+    "Networks",
+    "Genotype",
+    "Gene regulatory networks",
+    "Biologics",
+    "Phenotype",
+    "Nowcasting",
+    "DFT",
+    "AlphaFold",
+    "Pandemic",
+    "Immunology",
+    "Pathology",
+    "Chemical space",
+    "Transformer",
+    "Homeostasis",
+    "Score",
+    "High-throughput",
+    "Cheminformatics",
+    "Hit-to-lead",
+    "Sequencing",
+    "Enzyme",
+    "Antimicrobial resistance modeling",
+    "Allosteric",
+    "Inhibition",
+    "Computational Biochemistry",
+    "Bioinformatics",
+    "Transcriptomics",
+    "Diffusion",
+    "Anomaly detection",
+    "Multi-omics",
+    "Biology",
+    "Pathway",
+    "Metabolomics",
+    "Synthetic biology",
+    "Microbial",
+    "Proteomics",
+    "Pharmaceutics",
+    "Organoid",
+    "Network pharmacology",
+    "Imaging",
+    "Generative adversarial networks",
+    "Microbiology",
+    "Organ-on-a-chip",
+    "De novo",
+    "Substrate",
+    "Personalized",
+    "Drug",
+    "Transcription",
+    "RNA",
+    "Explainable AI",
+    "Generate",
+    "Docking",
+    "Pathogens",
+    "Bio foundation model",
+    "Reinforcement learning",
+    "Mechanism of action",
+    "Generative",
+    "Metabolic",
+    "Metabolic Flux Analysis",
+    "Computational Chemistry",
+    "Vaccine",
+    "Biophysics",
+    "Integration",
+    "Biochemistry",
+    "Physiologically based pharmacokinetics model",
+    "Medicine",
+    "Crystal",
+    "Conjugate",
+    "Variational autoencoders",
+    "In Silico",
+    "Protein-protein",
+    "CRISPR",
+    "Spatial transcriptomics",  # NOTE(review): differs only by letter case from "Spatial Transcriptomics" earlier in this list
+    "Gene",
+    "Translation",
+    "Glycomics",
+    "Lead optimization",
+    "Pharmacodynamics",
+    "Ab initio",
+    "System immunology",
+    "Pseudotime analysis",
+    "Generative AI",
+    "RNN",
+    "Regulatory networks",
+    "PBPK model",
+    "Beta-blocker",
+    "Lipidomics",
+    "Reaction",
+    "Bio",
+    "Genesis",
+    "Evolution",
+    "Computational Biology",
+    "VAE",
+    "Pharmacogenomics",
+    "Assay",
+    "Sensors",
+    "Conformation",
+    "Finite element method",
+    "Human atlas",
+    "Translational medicine",
+    "Neurology",
+    "Genomics",
+    "Cell biology",
+    "Porous",
+    "Biomarker",
+    "Bioengineering",
+    "Allele",
+    "Recurrent neural networks",
+    "Carbohydrate",
+    "Metamaterial",
+    "Virtual human",
+    "DNA",
+    "Omics",
+    "Agonist",
+    "Receptor",
+    "Cofactor",
+    "Metabolic flux analysis",  # NOTE(review): differs only by letter case from "Metabolic Flux Analysis" earlier in this list
+    "Cell atlas",
+    "Signaling",
+    "Electronic structure",
+    "Monte Carlo",
+    "Genomic surveillance",
+    "Agent-based model",
+    "Biosensors",
+    "2D",
+    "QSAR",
+    "Codon",
+    "Coenzyme",
+    "Nucleic acids",
+    "Dynamics",
+    "Ensemble",
+    "Spectrometry",
+    "Multi-scale modeling",
+    "ADMET",
+    "Marker",
+    "Toxicology",
+    "Profiling",
+    "Design",
+    "Viral",
+    "Chemistry",
+    "Epigenetics",
+    "Homo-Lumo",
+    "Modeling",
+    "Prediction",
+    "Quantum Chemistry",
+    "Half-life",
+    "Material",
+    "Disease",
+    "Phylodynamic model",
+    "Metagenomics",
+    "Digital twin",
+    "Cancer biology",
+    "Discovery",
+    "Bioavailability",
+    "Digital PCR"
+    ]
+# Each query must come with a one-sentence instruction that describes the task
+task = 'Given a web search query, retrieve relevant passages that answer the query'
+queries = [
+    get_detailed_instruct(task, ' '.join(keywords))
+]
+def get_md_contents(dir):
+    subdirs = sorted([d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))])
+    contents = []
+    for subdir in subdirs:
+        json_path = os.path.join(dir, subdir, 'readme_summary.json')
+        if os.path.exists(json_path):
+            with open(json_path, 'r', encoding='utf-8') as f:
+                contents.append(json.load(f)['readme_summary'])
+    return contents
+md_contents = get_md_contents('/home/weifengsun/tangou1/step2/step22/dataset')
+# print(len(md_contents))
+# print(md_contents[0])
+# No need to add instruction for retrieval documents
+documents = md_contents
+input_texts = queries + documents
+model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")
+outputs = model.embed(input_texts)
+embeddings = torch.tensor([o.outputs.embedding for o in outputs])
+scores = (embeddings[0] @ embeddings[1:].T)
+# print(scores.tolist())
+# [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]]
+dataset_dir = '/home/weifengsun/tangou1/step2/step22/dataset'
+subdirs = sorted([d for d in os.listdir(dataset_dir) if os.path.isdir(os.path.join(dataset_dir, d))])
+valid_subdirs = [d for d in subdirs if os.path.exists(os.path.join(dataset_dir, d, 'readme_summary.json'))]
+score_list = scores.tolist()
+for i, subdir in enumerate(valid_subdirs):
+    json_path = os.path.join(dataset_dir, subdir, 'readme_summary.json')
+    with open(json_path, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+    data['score'] = score_list[i]
+    with open(json_path, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)