MoonTideF committed
Commit 2237443 (verified)
Parent: 33e7d46

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,139 @@
---
license: apache-2.0
language:
- en
tags:
- biology
- genomics
- llama
- fine-tuned
- plasmid
- gene-function
- genome-assembly
- gene-essentiality
pipeline_tag: text-generation
base_model: meta-llama/Meta-Llama-3.1-8B
---

# GenSyntax

GenSyntax is a fine-tuned large language model for genomic sequence analysis and inference. Built on the Llama 3.1 8B architecture, it is specifically adapted for five core genomic tasks: plasmid host identification, gene function prediction, genome assembly, gene essentiality prediction, and minimal genome derivation.

## Model Details

| Property | Value |
|---|---|
| **Base Model** | Meta-Llama-3.1-8B |
| **Architecture** | LlamaForCausalLM |
| **Parameters** | ~8B |
| **Hidden Size** | 4096 |
| **Layers** | 32 |
| **Attention Heads** | 32 (GQA: 8 KV heads) |
| **Context Length** | 131,072 tokens |
| **Precision** | bfloat16 |

## Intended Use

GenSyntax is designed for computational biology researchers who need to apply LLM-based reasoning to genomic sequences. It supports the following inference tasks:

1. **Plasmid Host Identification** — predict the bacterial host range of a plasmid from its sequence.
2. **Gene Function Prediction** — infer the functional annotation of a gene given its sequence context.
3. **Genome Assembly** — reconstruct genome sequences from contig fragments.
4. **Gene Essentiality Prediction** — classify whether a gene is essential for cell survival.
5. **Minimal Genome Derivation** — determine the minimal gene set required for a viable organism.

## Hardware Requirements

A single NVIDIA RTX 4090 (24 GB VRAM) is sufficient for inference: in bfloat16 the weights come to roughly 16 GB (8B parameters × 2 bytes, matching the ~16.1 GB `model.safetensors` file in this repository), which leaves headroom for activations and the KV cache at moderate context lengths. For faster throughput, multi-GPU setups are supported via `device_map="auto"`.

## How to Use

### Load the Model

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_path = "MoonTideF/GenSyntax"  # or local path

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
```
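
This repository also ships a chat template (`chat_template.jinja`) and default sampling settings (`generation_config.json`), so a prompt can be formatted with `apply_chat_template` and generated with the shipped defaults. The snippet below is a minimal sketch only: the message contents are placeholders, not the official GenSyntax task prompts (see the task scripts below for the real formats).

```python
# Illustrative generation sketch; the messages are placeholders, not the
# prompt formats used by the GenSyntax task scripts.
messages = [
    {"role": "system", "content": "You are a genomics assistant."},
    {"role": "user", "content": "Is the following gene likely essential? <sequence>"},
]

# Render the prompt with the repository's chat template.
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=True,   # sampling defaults from generation_config.json
    temperature=0.6,
    top_p=0.9,
)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```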

### Inference Scripts

Clone the [GenSyntax repository](https://github.com/nishiwen1214/GenSyntax) and use the provided scripts:

```bash
git clone https://github.com/nishiwen1214/GenSyntax.git
cd GenSyntax
pip install -r requirements.txt
```

#### Plasmid Host Identification

```bash
python Plasmid_host_identification.py \
    --model /path/to/GenSyntax \
    --input-json-paths test_data/gene_task1_test_1000_format.json
```

#### Gene Function Prediction

```bash
python Gene_function_prediction.py \
    --model /path/to/GenSyntax \
    --input-json-paths test_data/gene_task2_test_500_opts.json
```

#### Genome Assembly

```bash
python Genome_assembly.py \
    --model /path/to/GenSyntax \
    --input-json-paths test_data/gene_task3_test_500_contig3_format.json
```

#### Gene Essentiality Prediction

```bash
python Gene_essentiality_prediction.py \
    --model /path/to/GenSyntax \
    --input-json-paths test_data/gene_task4_test_1000_format.json
```

#### Minimal Genome Derivation

```bash
python minimal_genome_inference.py \
    --model /path/to/GenSyntax \
    --input-json-paths test_data/bacteria_chromosomes_9-mini.json
```

## Training Data

The training and evaluation datasets are available on HuggingFace:

👉 [GenSyntax Datasets on HuggingFace](https://huggingface.co/datasets/ShiwenNi/GenSyntax-data)

The dataset includes complete test sets for each task, along with training and test data for cell phenotype prediction.
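
To pull a test set programmatically, the `datasets` library can read JSON files directly from that repository. The file name below is a sketch borrowed from the `test_data` paths above and is only an assumption about the dataset repository's layout; check its dataset card for the actual structure.

```python
from datasets import load_dataset

# Assumed file name (taken from the test_data paths above); the actual
# layout of ShiwenNi/GenSyntax-data may differ.
ds = load_dataset(
    "ShiwenNi/GenSyntax-data",
    data_files="gene_task1_test_1000_format.json",
    split="train",
)
print(ds[0])
```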

## Generation Config

| Parameter | Value |
|---|---|
| `temperature` | 0.6 |
| `top_p` | 0.9 |
| `do_sample` | True |
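
These are the defaults stored in `generation_config.json`, so `model.generate()` picks them up automatically. They can also be loaded explicitly and overridden per call; a minimal sketch, reusing `model_path` and `inputs` from the examples above:

```python
from transformers import GenerationConfig

# Load the shipped defaults and override one value for a single call.
gen_config = GenerationConfig.from_pretrained(model_path)
gen_config.temperature = 0.3  # example override

outputs = model.generate(**inputs, generation_config=gen_config, max_new_tokens=128)
```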

## Citation

If you use GenSyntax in your research, please cite the corresponding paper and link to the [GitHub repository](https://github.com/nishiwen1214/GenSyntax).

## License

This model is released under the [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0).
chat_template.jinja ADDED
@@ -0,0 +1,4 @@
{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ 'System: ' + system_message + '<|end_of_text|>' + '
' }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ 'Human: ' + content + '<|end_of_text|>' + '
Assistant:' }}{% elif message['role'] == 'assistant' %}{{ content + '<|end_of_text|>' + '
' }}{% endif %}{% endfor %}
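
For reference, this template renders system turns as `System: ...`, user turns as `Human: ...`, and ends with an `Assistant:` cue for the model to complete, with `<|end_of_text|>` separating turns. A minimal sketch of the rendered output (the message contents are placeholders):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("MoonTideF/GenSyntax")
messages = [
    {"role": "system", "content": "You are a genomics assistant."},    # placeholder text
    {"role": "user", "content": "Which hosts can carry this plasmid?"},  # placeholder text
]
print(tokenizer.apply_chat_template(messages, tokenize=False))
# System: You are a genomics assistant.<|end_of_text|>
# Human: Which hosts can carry this plasmid?<|end_of_text|>
# Assistant:
```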
config.json ADDED
@@ -0,0 +1,36 @@
{
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "dtype": "bfloat16",
  "eos_token_id": 128001,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": null,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_parameters": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_theta": 500000.0,
    "rope_type": "llama3"
  },
  "tie_word_embeddings": false,
  "transformers_version": "5.7.0",
  "use_cache": true,
  "vocab_size": 128256
}
generation_config.json ADDED
@@ -0,0 +1,9 @@
{
  "_from_model_config": true,
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": 128001,
  "temperature": 0.6,
  "top_p": 0.9,
  "transformers_version": "5.7.0"
}
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a579316488a768e25e8c49d571448bd2f36d0f1e2ffa85322c8d5abf5ed19d71
size 16060556616
tokenizer.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
size 17209920
tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
{
  "backend": "tokenizers",
  "bos_token": "<|begin_of_text|>",
  "clean_up_tokenization_spaces": true,
  "eos_token": "<|end_of_text|>",
  "is_local": true,
  "local_files_only": false,
  "model_input_names": [
    "input_ids",
    "attention_mask"
  ],
  "model_max_length": 131072,
  "pad_token": "<|end_of_text|>",
  "padding_side": "right",
  "split_special_tokens": false,
  "tokenizer_class": "TokenizersBackend"
}