sochasticbackup committed
Commit 2997d61 · 1 Parent(s): e994268

initialised app
.gitignore ADDED
@@ -0,0 +1,216 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ # Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ # poetry.lock
+ # poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ # pdm.lock
+ # pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ # pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # Redis
+ *.rdb
+ *.aof
+ *.pid
+
+ # RabbitMQ
+ mnesia/
+ rabbitmq/
+ rabbitmq-data/
+
+ # ActiveMQ
+ activemq-data/
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ # .idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
+
+ # Streamlit
+ .streamlit/secrets.toml
README.md CHANGED
@@ -1,14 +1,15 @@
  ---
- title: Evo App
- emoji: 😻
- colorFrom: indigo
- colorTo: red
+ title: Evo Model Interface
+ emoji: 🧬
+ colorFrom: blue
+ colorTo: green
  sdk: gradio
- sdk_version: 5.49.1
+ sdk_version: 4.44.0
  app_file: app.py
  pinned: false
- license: mit
- short_description: 4 Tasks of Evo
+ license: apache-2.0
+ python_version: 3.11
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Check configuration
+ We'll verify that the model and space are configured correctly from a few properties in the README's YAML metadata.
app.py ADDED
@@ -0,0 +1,468 @@
+ """
+ Evo Model Web Interface
+ A simple Gradio app for testing Evo's predictive and generative capabilities.
+ """
+ import gradio as gr
+ import torch
+ import numpy as np
+ from evo import Evo
+ from evo.scoring import score_sequences
+ from evo.generation import generate
+ from typing import List, Tuple, Dict
+ import io
+
+
+ # Global model variables
+ model = None
+ tokenizer = None
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+
+ def load_model():
+     """Load Evo model once at startup."""
+     global model, tokenizer
+     if model is None:
+         print("Loading Evo model...")
+         evo_model = Evo('evo-1-8k-base')
+         model, tokenizer = evo_model.model, evo_model.tokenizer
+         model.to(device)
+         model.eval()
+         print("✓ Model loaded successfully")
+
+
+ # ============================================================================
+ # TASK 1: Function Prediction
+ # ============================================================================
+
+ def detect_sequence_type(seq: str) -> str:
+     """Detect if sequence is DNA, RNA, or protein."""
+     seq_upper = seq.upper()
+     if any(c in set('EFILPQZ') for c in seq_upper):
+         return 'protein'
+     if 'U' in seq_upper:
+         return 'RNA'
+     if all(c in set('ACGTN') for c in seq_upper):
+         return 'DNA'
+     return 'unknown'
+
+
+ def parse_fasta_text(text: str) -> List[Tuple[str, str]]:
+     """Parse FASTA format text into (id, sequence) tuples."""
+     sequences = []
+     current_id = None
+     current_seq = []
+
+     for line in text.strip().split('\n'):
+         line = line.strip()
+         if line.startswith('>'):
+             if current_id is not None:
+                 sequences.append((current_id, ''.join(current_seq)))
+             current_id = line[1:].split('|')[0].strip()
+             current_seq = []
+         else:
+             current_seq.append(line)
+
+     if current_id is not None:
+         sequences.append((current_id, ''.join(current_seq)))
+
+     return sequences
+
+
+ def predict_function(sequences_text: str, threshold: float) -> str:
+     """Predict sequence functionality."""
+     load_model()
+
+     if not sequences_text.strip():
+         return "⚠️ Please enter sequences in FASTA format or paste sequences directly."
+
+     # Parse input
+     if sequences_text.startswith('>'):
+         # FASTA format
+         seq_data = parse_fasta_text(sequences_text)
+     else:
+         # Single sequence
+         seq_data = [("sequence_1", sequences_text.strip().replace('\n', ''))]
+
+     if not seq_data:
+         return "⚠️ No valid sequences found."
+
+     # Score sequences
+     sequences = [seq for _, seq in seq_data]
+     scores = score_sequences(sequences, model, tokenizer, reduce_method='mean', device=device)
+
+     # Format results
+     results = ["# Function Prediction Results\n"]
+     results.append(f"{'Sequence ID':<20} {'Type':<10} {'Score':<12} {'Prediction':<15} {'Length':<10}")
+     results.append("-" * 70)
+
+     for (seq_id, seq), score in zip(seq_data, scores):
+         seq_type = detect_sequence_type(seq)
+         prediction = "✓ Functional" if score > threshold else "✗ Non-functional"
+         results.append(f"{seq_id:<20} {seq_type:<10} {score:<12.4f} {prediction:<15} {len(seq):<10}")
+
+     results.append("\n" + "=" * 70)
+     results.append(f"Total sequences: {len(seq_data)}")
+     results.append(f"Functional: {sum(1 for s in scores if s > threshold)}")
+     results.append(f"Non-functional: {sum(1 for s in scores if s <= threshold)}")
+     results.append(f"Average score: {np.mean(scores):.4f}")
+
+     return "\n".join(results)
+
+
+ # ============================================================================
+ # TASK 2: Gene Essentiality
+ # ============================================================================
+
+ def predict_essentiality(genes_text: str) -> str:
+     """Predict gene essentiality."""
+     load_model()
+
+     if not genes_text.strip():
+         return "⚠️ Please enter gene sequences in FASTA format."
+
+     # Parse FASTA
+     if not genes_text.startswith('>'):
+         return "⚠️ Please use FASTA format: >gene_id|organism|function\\nATGC..."
+
+     gene_data = parse_fasta_text(genes_text)
+     if not gene_data:
+         return "⚠️ No valid genes found."
+
+     # Score genes
+     sequences = [seq for _, seq in gene_data]
+     scores = score_sequences(sequences, model, tokenizer, reduce_method='mean', device=device)
+
+     # Calculate statistics
+     scores_mean = np.mean(scores)
+     scores_std = np.std(scores)
+
+     # Format results
+     results = ["# Gene Essentiality Prediction\n"]
+     results.append(f"{'Gene ID':<20} {'Z-Score':<10} {'Score':<12} {'Essentiality':<15} {'Confidence':<12}")
+     results.append("-" * 70)
+
+     essential_count = 0
+     for (gene_id, seq), score in zip(gene_data, scores):
+         z_score = (score - scores_mean) / scores_std if scores_std > 0 else 0
+
+         if z_score > 0.5:
+             essentiality = "✓ Essential"
+             confidence = "High" if z_score > 1.0 else "Medium"
+             essential_count += 1
+         elif z_score < -0.5:
+             essentiality = "✗ Non-essential"
+             confidence = "High" if z_score < -1.0 else "Medium"
+         else:
+             essentiality = "? Uncertain"
+             confidence = "Low"
+
+         results.append(f"{gene_id:<20} {z_score:<10.2f} {score:<12.4f} {essentiality:<15} {confidence:<12}")
+
+     results.append("\n" + "=" * 70)
+     results.append(f"Total genes: {len(gene_data)}")
+     results.append(f"Essential: {essential_count}")
+     results.append(f"Mean score: {scores_mean:.4f} (std: {scores_std:.4f})")
+
+     return "\n".join(results)
+
+
+ # ============================================================================
+ # TASK 3: CRISPR Generation
+ # ============================================================================
+
+ def generate_crispr(n_systems: int, cas_type: str, target_seq: str, cas_length: int) -> str:
+     """Generate CRISPR-Cas systems."""
+     load_model()
+
+     # Templates
+     cas9_start = 'ATGAACAAGAAC'
+     cas12_start = 'ATGAGCAAGCTG'
+
+     results = ["# CRISPR-Cas System Generation\n"]
+
+     cas_types = ['cas9', 'cas12'] if cas_type == 'Both' else [cas_type.lower()]
+
+     for i in range(n_systems):
+         current_cas = cas_types[i % len(cas_types)]
+         prompt = cas9_start if current_cas == 'cas9' else cas12_start
+
+         results.append(f"\n{'='*70}")
+         results.append(f"System {i+1}: {current_cas.upper()}")
+         results.append('='*70)
+
+         # Generate Cas protein
+         output_seqs, _ = generate(
+             [prompt],
+             model,
+             tokenizer,
+             n_tokens=cas_length,
+             temperature=0.8,
+             top_k=4,
+             device=device,
+             verbose=0
+         )
+         cas_protein = output_seqs[0]
+
+         # Generate gRNA spacer
+         if target_seq:
+             complement = {'A': 'U', 'T': 'A', 'G': 'C', 'C': 'G'}
+             spacer = ''.join(complement.get(b, 'N') for b in reversed(target_seq[:20]))
+         else:
+             spacer_seqs, _ = generate(['G'], model, tokenizer, n_tokens=19, temperature=0.7,
+                                       top_k=4, device=device, verbose=0)
+             spacer = spacer_seqs[0][:20].replace('T', 'U')
+
+         # PAM sequence
+         pam = 'NGG' if current_cas == 'cas9' else 'TTTN'
+
+         results.append(f"\n{current_cas.upper()} Protein ({len(cas_protein)} nt):")
+         results.append(f"{cas_protein[:80]}..." if len(cas_protein) > 80 else cas_protein)
+         results.append(f"\ngRNA Spacer: {spacer}")
+         results.append(f"PAM Sequence: {pam}")
+         if current_cas == 'cas9':
+             results.append(f"tracrRNA: AGCAUAGCAAGUUAAAAUAAGGCUAGUCCGU")
+
+     return "\n".join(results)
+
+
+ # ============================================================================
+ # TASK 4: Regulatory Design
+ # ============================================================================
+
+ def generate_spacer_simple(length: int) -> str:
+     """Generate a simple random spacer."""
+     bases = ['A', 'T', 'G', 'C']
+     return ''.join(np.random.choice(bases) for _ in range(length))
+
+
+ def design_regulatory(n_designs: int, expression_level: str) -> str:
+     """Design regulatory sequences."""
+     load_model()
+
+     # Templates
+     promoter_templates = {
+         'High': ('TTGACA', 'TATAAT'),
+         'Medium': ('TTGACT', 'TATACT'),
+         'Low': ('TTGCCA', 'TATGAT')
+     }
+
+     rbs_templates = {
+         'High': 'AGGAGGU',
+         'Medium': 'AGGAGG',
+         'Low': 'AGGA'
+     }
+
+     results = ["# Regulatory Sequences Design\n"]
+
+     levels = ['High', 'Medium', 'Low']
+
+     for i in range(n_designs):
+         if expression_level == 'Mixed':
+             level = levels[i % 3]
+         else:
+             level = expression_level
+
+         results.append(f"\n{'='*70}")
+         results.append(f"Design {i+1}: {level} Expression")
+         results.append('='*70)
+
+         # Get promoter boxes
+         box_35, box_10 = promoter_templates[level]
+
+         # Generate spacers
+         spacer_35_10 = generate_spacer_simple(17)
+         spacer_10_rbs = generate_spacer_simple(7)
+
+         # Get RBS
+         rbs = rbs_templates[level]
+
+         # Generate RBS-ATG spacer
+         spacer_rbs_atg = generate_spacer_simple(7)
+
+         # Assemble
+         promoter = box_35 + spacer_35_10 + box_10
+         full_region = promoter + spacer_10_rbs + rbs + spacer_rbs_atg + 'ATG'
+
+         gc_content = 100 * (full_region.count('G') + full_region.count('C')) / len(full_region)
+
+         results.append(f"\nComponents:")
+         results.append(f" -35 box: {box_35}")
+         results.append(f" -10 box: {box_10}")
+         results.append(f" RBS (Shine-Dalgarno): {rbs}")
+         results.append(f" Start codon: ATG")
+         results.append(f"\nFull Regulatory Region ({len(full_region)} bp, GC={gc_content:.1f}%):")
+         results.append(full_region)
+         results.append(f"\nPromoter only:")
+         results.append(promoter)
+
+     return "\n".join(results)
+
+
+ # ============================================================================
+ # Gradio Interface
+ # ============================================================================
+
+ def create_interface():
+     """Create the Gradio interface."""
+
+     with gr.Blocks(title="Evo Model Interface", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# 🧬 Evo Model Interface")
+         gr.Markdown("### Test Evo's predictive and generative capabilities")
+
+         with gr.Tabs():
+             # Task 1: Function Prediction
+             with gr.Tab("🔍 Function Prediction"):
+                 gr.Markdown("### Predict if sequences are functional")
+                 gr.Markdown("*Enter sequences in FASTA format or paste a single sequence*")
+
+                 with gr.Row():
+                     with gr.Column():
+                         func_input = gr.Textbox(
+                             label="Input Sequences",
+                             placeholder=">seq1|description\nATCGATCGATCG...\n\nOr paste a single sequence directly",
+                             lines=8
+                         )
+                         func_threshold = gr.Slider(
+                             minimum=-3.0,
+                             maximum=0.0,
+                             value=-1.5,
+                             step=0.1,
+                             label="Functionality Threshold"
+                         )
+                         func_btn = gr.Button("Predict Function", variant="primary")
+
+                     with gr.Column():
+                         func_output = gr.Textbox(
+                             label="Results",
+                             lines=15,
+                             show_copy_button=True
+                         )
+
+                 func_btn.click(
+                     fn=predict_function,
+                     inputs=[func_input, func_threshold],
+                     outputs=func_output
+                 )
+
+                 gr.Examples(
+                     examples=[
+                         [">functional_gene\nATGGCACAACCCGCGCCGAACTGGTTGACCTGAAAACCACCGCCGCACTGCGTCAGGCCAGCCAGGCGGAACAA", -1.5],
+                         [">noncoding\nGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC", -1.5],
+                     ],
+                     inputs=[func_input, func_threshold]
+                 )
+
+             # Task 2: Gene Essentiality
+             with gr.Tab("🧬 Gene Essentiality"):
+                 gr.Markdown("### Predict essential genes in bacteria/phages")
+                 gr.Markdown("*Input format: >gene_id|organism|function*")
+
+                 with gr.Row():
+                     with gr.Column():
+                         ess_input = gr.Textbox(
+                             label="Gene Sequences (FASTA)",
+                             placeholder=">dnaA|E.coli|Replication initiator\nATGTCGAAAGCCGCAT...",
+                             lines=8
+                         )
+                         ess_btn = gr.Button("Predict Essentiality", variant="primary")
+
+                     with gr.Column():
+                         ess_output = gr.Textbox(
+                             label="Results",
+                             lines=15,
+                             show_copy_button=True
+                         )
+
+                 ess_btn.click(
+                     fn=predict_essentiality,
+                     inputs=ess_input,
+                     outputs=ess_output
+                 )
+
+             # Task 3: CRISPR Generation
+             with gr.Tab("✂️ CRISPR Generation"):
+                 gr.Markdown("### Generate synthetic CRISPR-Cas systems")
+
+                 with gr.Row():
+                     with gr.Column():
+                         crispr_n = gr.Slider(
+                             minimum=1,
+                             maximum=5,
+                             value=2,
+                             step=1,
+                             label="Number of Systems"
+                         )
+                         crispr_type = gr.Radio(
+                             choices=["Cas9", "Cas12", "Both"],
+                             value="Both",
+                             label="Cas Type"
+                         )
+                         crispr_target = gr.Textbox(
+                             label="Target Sequence (optional)",
+                             placeholder="ATCGATCGATCGATCG",
+                             lines=2
+                         )
+                         crispr_length = gr.Slider(
+                             minimum=500,
+                             maximum=2000,
+                             value=1000,
+                             step=100,
+                             label="Cas Protein Length"
+                         )
+                         crispr_btn = gr.Button("Generate CRISPR Systems", variant="primary")
+
+                     with gr.Column():
+                         crispr_output = gr.Textbox(
+                             label="Generated Systems",
+                             lines=15,
+                             show_copy_button=True
+                         )
+
+                 crispr_btn.click(
+                     fn=generate_crispr,
+                     inputs=[crispr_n, crispr_type, crispr_target, crispr_length],
+                     outputs=crispr_output
+                 )
+
+             # Task 4: Regulatory Design
+             with gr.Tab("🎛️ Regulatory Design"):
+                 gr.Markdown("### Design promoter-RBS pairs for gene expression")
+
+                 with gr.Row():
+                     with gr.Column():
+                         reg_n = gr.Slider(
+                             minimum=1,
+                             maximum=10,
+                             value=3,
+                             step=1,
+                             label="Number of Designs"
+                         )
+                         reg_level = gr.Radio(
+                             choices=["High", "Medium", "Low", "Mixed"],
+                             value="Mixed",
+                             label="Expression Level"
+                         )
+                         reg_btn = gr.Button("Design Regulatory Sequences", variant="primary")
+
+                     with gr.Column():
+                         reg_output = gr.Textbox(
+                             label="Designed Sequences",
+                             lines=15,
+                             show_copy_button=True
+                         )
+
+                 reg_btn.click(
+                     fn=design_regulatory,
+                     inputs=[reg_n, reg_level],
+                     outputs=reg_output
+                 )
+
+         gr.Markdown("---")
+         gr.Markdown("💡 **Tips:** Higher scores = more functional/essential | All outputs can be copied | Model: evo-1-8k-base")
+
+     return demo
+
+
+ if __name__ == "__main__":
+     demo = create_interface()
+     demo.launch()
evo/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from .version import version as __version__
+
+ from .models import Evo
+
+ from .generation import generate
+ from .scoring import score_sequences, positional_entropies
evo/configs/evo-1-131k-base_inference.yml ADDED
@@ -0,0 +1,40 @@
+ vocab_size: 512
+ hidden_size: 4096
+ num_filters: 4096
+ max_sequence_len: 8192
+ attn_layer_idxs: [8, 16, 24]
+ hyena_layer_idxs: [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31]
+ num_layers: 32
+ short_filter_length: 3
+ num_attention_heads: 32
+ short_filter_bias: True
+ mlp_init_method: torch.nn.init.zeros_
+ mlp_output_init_method: torch.nn.init.zeros_
+ eps: 1.0e-6
+ state_size: 8
+ inner_size_multiple_of: 16 # force GLU inner_size to be a multiple of
+ smeared_gqa: False
+ make_vocab_size_divisible_by: 8
+ log_intermediate_values: False
+ proj_groups: 1 # GQA
+ hyena_filter_groups: 1
+ split_k0: True
+ model_parallel_size: 1
+ pipe_parallel_size: 1
+ tie_embeddings: True
+ inner_mlp_size: null # set to None, so it auto-fills
+ mha_out_proj_bias: True
+ qkv_proj_bias: True
+ final_norm: True
+ rng_fork: False
+ use_flash_attn: False
+ use_flash_rmsnorm: False
+ use_flash_depthwise: False
+ use_flashfft: False
+ column_split: True # only affects outputs when proj_groups > 1
+ inference_mode: True
+ tokenizer_type: CharLevelTokenizer
+ prefill_style: fft
+ mlp_activation: gelu
+ use_interpolated_rotary_pos_emb: true # turn this on for linear interpolated context extension
+ rotary_emb_scaling_factor: 16 # scaling factor for time indices in rotary embeddings
evo/configs/evo-1-8k-base_inference.yml ADDED
@@ -0,0 +1,38 @@
+ vocab_size: 512
+ hidden_size: 4096
+ num_filters: 4096
+ max_sequence_len: 8192
+ attn_layer_idxs: [8, 16, 24]
+ hyena_layer_idxs: [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31]
+ num_layers: 32
+ short_filter_length: 3
+ num_attention_heads: 32
+ short_filter_bias: True
+ mlp_init_method: torch.nn.init.zeros_
+ mlp_output_init_method: torch.nn.init.zeros_
+ eps: 1.0e-6
+ state_size: 8
+ inner_size_multiple_of: 16 # force GLU inner_size to be a multiple of
+ smeared_gqa: False
+ make_vocab_size_divisible_by: 8
+ log_intermediate_values: False
+ proj_groups: 1 # GQA
+ hyena_filter_groups: 1
+ split_k0: True
+ model_parallel_size: 1
+ pipe_parallel_size: 1
+ tie_embeddings: True
+ inner_mlp_size: null # set to None, so it auto-fills
+ mha_out_proj_bias: True
+ qkv_proj_bias: True
+ final_norm: True
+ rng_fork: False
+ use_flash_attn: False
+ use_flash_rmsnorm: False
+ use_flash_depthwise: False
+ use_flashfft: False
+ column_split: True # only affects outputs when proj_groups > 1
+ inference_mode: True
+ tokenizer_type: CharLevelTokenizer
+ prefill_style: fft
+ mlp_activation: gelu
evo/generation.py ADDED
@@ -0,0 +1,297 @@
+ import numpy as np
+ import sys
+ import torch
+ from typing import List, Tuple, Union
+
+ from stripedhyena.model import StripedHyena
+ from stripedhyena.sample import sample
+ from stripedhyena.tokenizer import CharLevelTokenizer
+
+ from .scoring import logits_to_logprobs, prepare_batch
+
+
+ class Generator:
+     '''
+     Adapted from https://github.com/togethercomputer/stripedhyena.
+
+     Modifications include:
+     - `generate()` accepts and returns the recurrent cache state, letting the user
+       keep track of it across sampling runs.
+     - Able to sample with long token prompts in which the cache is initialized with
+       recurrent teacher forcing.
+     '''
+     def __init__(
+         self,
+         model: StripedHyena,
+         tokenizer: CharLevelTokenizer,
+         top_k: int = 50,
+         top_p: float = 0.7,
+         temperature: float = 1.,
+     ):
+         self.model = model
+         self.tokenizer = tokenizer
+         self.top_k = top_k
+         self.top_p = top_p
+         self.temperature = temperature
+         self.untils = ['\n\n']
+
+     def generate(
+         self,
+         device: str,
+         input_string: str = None,
+         input_ids: torch.tensor = None,
+         num_tokens: int = 32,
+         cached_generation: bool = True,
+         force_prompt_threshold: int = 128,
+         print_generation: bool = True,
+         verbose: bool = False,
+         skip_special_tokens: bool = False,
+         stop_at_eos: bool = True,
+         max_seqlen: int = None,
+         inference_params_dict: dict = None,
+     ) -> Tuple[torch.tensor, torch.tensor, dict]:
+         """
+         A version of the generate() method that enables passing in and that returns the
+         `inference_params_dict` for replaying cached sampling from a given state.
+         """
+         if isinstance(self.tokenizer.eos, int):
+             eos_token_ids = torch.LongTensor([self.tokenizer.eos]).to(device)
+         else:
+             # is a tensor
+             eos_token_ids = self.tokenizer.tokenize(self.tokenizer.eos).to(device)
+
+         if input_ids is None:
+             input = self.tokenizer.tokenize(input_string)
+             if isinstance(input, list):
+                 input = torch.LongTensor(input).unsqueeze(0).to(device)
+             # is a tensor
+             else:
+                 input = input.unsqueeze(0).to(device)
+
+         else:
+             input = input_ids
+         x = input
+
+         if max_seqlen is not None:
+             x = x[:, -max_seqlen:]
+
+         num_tokens = int(num_tokens)
+         batch_size = x.shape[0]
+
+         prompt_length = x.shape[1]
+         prompt_forcing = prompt_length > force_prompt_threshold
+         if prompt_forcing:
+             forced_prompt_length = prompt_length - force_prompt_threshold
+             x_force = x[:, force_prompt_threshold:]
+             x = x[:, :force_prompt_threshold]
+         else:
+             forced_prompt_length = 0
+
+         generation = torch.empty(
+             x.shape[0],
+             num_tokens,
+             dtype=torch.long,
+             device=x.device,
+         )
+
+         scores = torch.empty(
+             x.shape[0],
+             num_tokens,
+             self.tokenizer.vocab_size,
+             dtype=torch.float,
+             device=x.device,
+         )
+
+         if inference_params_dict is not None:
+             cached_generation = True
+             prefilled = True
+             # Ensure that the cached data is loaded on the correct device.
+             for key, data in inference_params_dict['mha'].key_value_memory_dict.items():
+                 inference_params_dict['mha'].key_value_memory_dict[key] = data.to(x.device)
+             for key, data in inference_params_dict['hyena'].fir_state_dict.items():
+                 inference_params_dict['hyena'].fir_state_dict[key] = data.to(x.device)
+             for key, data in inference_params_dict['hyena'].state_dict.items():
+                 inference_params_dict['hyena'].state_dict[key] = data.to(x.device)
+
+         elif cached_generation:
+             inference_params_dict = self.model.initialize_inference_params()
+             inference_params_dict['mha'].max_batch_size = batch_size
+             inference_params_dict['hyena'].max_batch_size = batch_size
+             prefilled = False
+
+         if verbose:
+             mem_after_tok = torch.cuda.memory_allocated(device=x.device) / 1e9
+             print(f'Memory after tokenization: {mem_after_tok} GB')
+             print('Starting generation...')
+             if input_string is not None:
+                 print('Prompt: ' + input_string)
+             else:
+                 print(f'Prompt ids: {input_ids} {input_ids.shape}')
+
+         for i in range(forced_prompt_length + num_tokens):
+             if prefilled:
+                 post_prefill = True
+             else:
+                 post_prefill = cached_generation and i > 0
+
+             # prefill then process only the last token
+             if post_prefill:
+                 x = x[:, -1:]
+                 seqlen_offset = inference_params_dict['mha'].seqlen_offset
+
+                 if seqlen_offset == 0:
+                     seqlen_offset = input.shape[-1]
+                     inference_params_dict['hyena'].seqlen_offset = seqlen_offset
+                     inference_params_dict['mha'].seqlen_offset = seqlen_offset
+                 else:
+                     inference_params_dict['mha'].seqlen_offset += 1
+                     inference_params_dict['hyena'].seqlen_offset += 1
+
+             # do forward pass with no gradient
+             with torch.inference_mode():
+                 logits, inference_params_dict = self.model(
+                     x,
+                     inference_params_dict=inference_params_dict,
+                 )
+
+             last_logits = logits[:, -1]
+
+             if prompt_forcing and i < forced_prompt_length:
+                 new_idx = x_force[:, i]
+             else:
+                 new_idx = sample(
+                     last_logits,
+                     top_k=self.top_k,
+                     top_p=self.top_p,
+                     temperature=self.temperature,
+                 )
+
+             if stop_at_eos and (generation[0, -2:] == eos_token_ids).all():
+                 print('Stopping generation at EOS')
+
+             if print_generation and verbose and batch_size == 1:
+                 print(
+                     f'{self.tokenizer.detokenize([new_idx.item()])}',
+                     end=' ',
+                 )
+
+             if prompt_forcing:
+                 if i >= forced_prompt_length:
+                     scores[:, i - forced_prompt_length] = last_logits
+                     generation[:, i - forced_prompt_length] = new_idx
+             else:
+                 scores[:, i] = last_logits
+                 generation[:, i] = new_idx
+
+             if post_prefill:
+                 x = new_idx[:, None]
+             else:
+                 x = torch.cat([x, new_idx[:, None]], dim=-1)
+
+         if verbose:
+             y = self.tokenizer.detokenize_batch(generation[:, : i + 1])
+
+             for until in self.untils:
+                 if until in y:
+                     y = y.split(until)[0]
+                     break
+
+             print(f'\nInput: {input_string}, Output: {y}')
+
+             mem_end = torch.cuda.memory_allocated(device=x.device) / 1e9
+             print(f'Memory after generation: {mem_end} GB')
+
+         return generation[:, : i + 1], scores[:, : i + 1], inference_params_dict
+
+
+ def generate(
+     prompt_seqs: List[str],
+     model: StripedHyena,
+     tokenizer: CharLevelTokenizer,
+     n_tokens: int = 100,
+     temperature: float = 0.,
+     top_k: int = 1,
+     top_p: float = 1.,
+     batched: bool = True,
+     prepend_bos: bool = False,
+     cached_generation: bool = False,
+     force_prompt_threshold: int = 128,
+     verbose: int = 1,
+     device: str = 'cuda:0',
+     **kwargs,
+ ) -> Tuple[List[str], List[float]]:
+     """
+     Performs generation from a list of prompts.
+     If all prompts are the same length, this can do batched generation.
+     Also supports cached generation for efficient sampling.
+     """
+     model.eval()
+
+     g = Generator(
+         model,
+         tokenizer,
+         top_k=top_k,
+         top_p=top_p,
+         temperature=temperature,
+     )
+
+     uniform_lengths = all(len(s) == len(prompt_seqs[0]) for s in prompt_seqs)
+
+     if batched and uniform_lengths:
+         input_ids_list = [
+             prepare_batch(
+                 prompt_seqs,
+                 tokenizer,
+                 prepend_bos=prepend_bos,
+                 device=device,
+             )[0]
+         ]
+     else:
+         if verbose:
+             if not uniform_lengths:
+                 sys.stderr.write('Note: Prompts are of different lengths.\n')
+             sys.stderr.write('Note: Will not do batched generation.\n')
+         input_ids_list = [
+             prepare_batch(
+                 [ prompt_seq ],
+                 tokenizer,
+                 prepend_bos=prepend_bos,
+                 device=device,
+             )[0]
+             for prompt_seq in prompt_seqs
+         ]
+
+     generated_seqs, generated_scores = [], []
+     for input_ids in input_ids_list:
+         batch_size = input_ids.shape[0]
+
+         output_ids, logits, _ = g.generate(
+             input_ids=input_ids,
+             num_tokens=n_tokens,
+             cached_generation=cached_generation,
+             force_prompt_threshold=force_prompt_threshold,
+             device=device,
+             print_generation=(verbose > 1),
+             verbose=(verbose > 1),
+             stop_at_eos=False,
+         )
+         if verbose > 1:
+             print('input_ids.shape', input_ids.shape)
+             print('output_ids.shape', output_ids.shape)
+             print('logits.shape', logits.shape)
+
+         generated_seqs_batch = list(tokenizer.detokenize_batch(output_ids))
+         assert len(generated_seqs_batch) == batch_size
+         generated_seqs += generated_seqs_batch
+
+         logprobs = logits_to_logprobs(logits, output_ids)
+         logprobs = logprobs.float().cpu().numpy()
+
+         generated_scores += [ np.mean(logprobs[idx]) for idx in range(batch_size) ]
+
+     assert len(generated_seqs) == len(generated_scores) == len(prompt_seqs)
+     if verbose:
+         for seq, score, prompt in zip(generated_seqs, generated_scores, prompt_seqs):
+             print(f'Prompt: "{prompt}",\tOutput: "{seq}",\tScore: {score}')
+
+     return generated_seqs, generated_scores
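For orientation, here is a minimal sketch of how the module-level `generate()` above is intended to be called, mirroring its use in `app.py`; the prompt string and sampling parameters are illustrative choices, not part of this commit.

```python
# Sketch: sample 100 tokens from a short DNA prompt.
# Assumes a CUDA device and that the evo-1-8k-base weights can be
# downloaded from HuggingFace on first use.
from evo import Evo
from evo.generation import generate

evo_model = Evo('evo-1-8k-base', device='cuda:0')

seqs, scores = generate(
    ['ATG'],              # list of prompt sequences
    evo_model.model,
    evo_model.tokenizer,
    n_tokens=100,         # tokens to sample beyond the prompt
    temperature=0.8,
    top_k=4,
    device='cuda:0',
    verbose=0,
)
print(seqs[0])            # generated continuation
print(scores[0])          # mean per-token log-probability of the sample
```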
evo/models.py ADDED
@@ -0,0 +1,122 @@
+ import pkgutil
+ import re
+ from transformers import AutoConfig, AutoModelForCausalLM
+ import yaml
+
+ from stripedhyena.utils import dotdict
+ from stripedhyena.model import StripedHyena
+ from stripedhyena.tokenizer import CharLevelTokenizer
+
+
+ MODEL_NAMES = [
+     'evo-1.5-8k-base',
+     'evo-1-8k-base',
+     'evo-1-131k-base',
+     'evo-1-8k-crispr',
+     'evo-1-8k-transposon',
+ ]
+
+ class Evo:
+     def __init__(self, model_name: str = MODEL_NAMES[1], device: str = None):
+         """
+         Loads an Evo model checkpoint given a model name.
+         If the checkpoint does not exist, we automatically download it from HuggingFace.
+         """
+         self.device = device
+
+         # Check model name.
+
+         if model_name not in MODEL_NAMES:
+             raise ValueError(
+                 f'Invalid model name {model_name}. Should be one of: '
+                 f'{", ".join(MODEL_NAMES)}.'
+             )
+
+         # Assign config path.
+
+         if model_name == 'evo-1-8k-base' or \
+            model_name == 'evo-1-8k-crispr' or \
+            model_name == 'evo-1-8k-transposon' or \
+            model_name == 'evo-1.5-8k-base':
+             config_path = 'configs/evo-1-8k-base_inference.yml'
+         elif model_name == 'evo-1-131k-base':
+             config_path = 'configs/evo-1-131k-base_inference.yml'
+         else:
+             raise ValueError(
+                 f'Invalid model name {model_name}. Should be one of: '
+                 f'{", ".join(MODEL_NAMES)}.'
+             )
+
+         # Load model.
+
+         self.model = load_checkpoint(
+             model_name=model_name,
+             config_path=config_path,
+             device=self.device
+         )
+
+         # Load tokenizer.
+
+         self.tokenizer = CharLevelTokenizer(512)
+
+
+ HF_MODEL_NAME_MAP = {
+     'evo-1.5-8k-base': 'evo-design/evo-1.5-8k-base',
+     'evo-1-8k-base': 'togethercomputer/evo-1-8k-base',
+     'evo-1-131k-base': 'togethercomputer/evo-1-131k-base',
+     'evo-1-8k-crispr': 'LongSafari/evo-1-8k-crispr',
+     'evo-1-8k-transposon': 'LongSafari/evo-1-8k-transposon',
+ }
+
+ def load_checkpoint(
+     model_name: str = MODEL_NAMES[1],
+     config_path: str = 'evo/configs/evo-1-131k-base_inference.yml',
+     device: str = None,
+     *args, **kwargs
+ ):
+     """
+     Load checkpoint from HuggingFace and place it into SH model.
+     """
+
+     # Map model name to HuggingFace model name.
+
+     hf_model_name = HF_MODEL_NAME_MAP[model_name]
+
+     # Load model config.
+
+     model_config = AutoConfig.from_pretrained(
+         hf_model_name,
+         trust_remote_code=True,
+         revision='1.1_fix' if re.match(r'evo-1-.*-base', model_name) else 'main',
+     )
+     model_config.use_cache = True
+
+     # Load model.
+
+     model = AutoModelForCausalLM.from_pretrained(
+         hf_model_name,
+         config=model_config,
+         trust_remote_code=True,
+         revision='1.1_fix' if re.match(r'evo-1-.*-base', model_name) else 'main',
+     )
+
+     # Load model state dict & cleanup.
+
+     state_dict = model.backbone.state_dict()
+     del model
+     del model_config
+
+     # Load SH config.
+
+     config = yaml.safe_load(pkgutil.get_data(__name__, config_path))
+     global_config = dotdict(config, Loader=yaml.FullLoader)
+
+     # Load SH Model.
+
+     model = StripedHyena(global_config)
+     model.load_state_dict(state_dict, strict=True)
+     model.to_bfloat16_except_poles_residues()
+     if device is not None:
+         model = model.to(device)
+
+     return model
evo/scoring.py ADDED
@@ -0,0 +1,131 @@
+ import numpy as np
+ import torch
+ from typing import List, Tuple
+
+ from stripedhyena.model import StripedHyena
+ from stripedhyena.tokenizer import CharLevelTokenizer
+
+
+ def prepare_batch(
+     seqs: List[str],
+     tokenizer: CharLevelTokenizer,
+     prepend_bos: bool = True,
+     device: str = 'cuda:0'
+ ) -> Tuple[torch.Tensor, List[int]]:
+     """
+     Takes in a list of sequences, tokenizes them, and puts them in a tensor batch.
+     If the sequences have differing lengths, then pad up to the maximum sequence length.
+     """
+     seq_lengths = [ len(seq) for seq in seqs ]
+     max_seq_length = max(seq_lengths)
+
+     input_ids = []
+     for seq in seqs:
+         padding = [tokenizer.pad_id] * (max_seq_length - len(seq))
+         input_ids.append(
+             torch.tensor(
+                 ([tokenizer.eod_id] * int(prepend_bos)) + tokenizer.tokenize(seq) + padding,
+                 dtype=torch.long,
+             ).to(device).unsqueeze(0)
+         )
+     input_ids = torch.cat(input_ids, dim=0)
+
+     return input_ids, seq_lengths
+
+
+ def logits_to_logprobs(
+     logits: torch.Tensor,
+     input_ids: torch.Tensor,
+     trim_bos: bool = True,
+ ) -> torch.Tensor:
+     """
+     Takes in a tensor of logits of dimension (batch, length, vocab).
+     Computes the log-likelihoods using a softmax along the vocab dimension.
+     Uses the `input_ids` to index into the log-likelihoods and returns the likelihood
+     of the provided sequence at each position with dimension (batch, length).
+     """
+     softmax_logprobs = torch.log_softmax(logits, dim=-1)
+     if trim_bos:
+         softmax_logprobs = softmax_logprobs[:, :-1] # Remove last prediction.
+         input_ids = input_ids[:, 1:] # Trim BOS added by tokenizer.
+     assert(softmax_logprobs.shape[1] == input_ids.shape[1])
+
+     logprobs = torch.gather(
+         softmax_logprobs, # Gather likelihoods...
+         2, # along the vocab dimension...
+         input_ids.unsqueeze(-1) # using the token ids to index.
+     ).squeeze(-1)
+
+     return logprobs
+
+
+ def score_sequences(
+     seqs: List[str],
+     model: StripedHyena,
+     tokenizer: CharLevelTokenizer,
+     reduce_method: str = 'mean',
+     device: str = 'cuda:0',
+ ) -> List[float]:
+     """
+     Computes the model log-likelihood scores for sequences in `seqs`.
+     Uses `reduce_method` to take the mean or sum across the likelihoods at each
+     position (default: `'mean'`).
+
+     Returns a list of scalar scores corresponding to the reduced log-likelihoods for
+     each sequence.
+     """
+     input_ids, seq_lengths = prepare_batch(seqs, tokenizer, device=device, prepend_bos=True)
+     assert(len(seq_lengths) == input_ids.shape[0])
+
+     with torch.inference_mode():
+         logits, _ = model(input_ids) # (batch, length, vocab)
+
+     logprobs = logits_to_logprobs(logits, input_ids, trim_bos=True)
+     logprobs = logprobs.float().cpu().numpy()
+
+     if reduce_method == 'mean':
+         reduce_func = np.mean
+     elif reduce_method == 'sum':
+         reduce_func = np.sum
+     else:
+         raise ValueError(f'Invalid reduce_method {reduce_method}')
+
+     return [
+         reduce_func(logprobs[idx][:seq_lengths[idx]])
+         for idx in range(len(seq_lengths))
+     ]
+
+
+ def positional_entropies(
+     seqs: List[str],
+     model: StripedHyena,
+     tokenizer: CharLevelTokenizer,
+     device: str = 'cuda:0',
+ ) -> List[np.array]:
+     """
+     Computes the positional entropies for sequences in `seqs`.
+
+     Returns a list of arrays, where each array is the same length as the
+     corresponding sequence length. Each array contains the per-position entropy
+     across the vocab dimension.
+     """
+     input_ids, seq_lengths = prepare_batch(seqs, tokenizer, device=device, prepend_bos=True)
+     assert(len(seq_lengths) == input_ids.shape[0])
+
+     with torch.inference_mode():
+         logits, _ = model(input_ids) # (batch, length, vocab)
+
+     # Tokenizer prepends BOS, remember to remove last prediction.
+     softmax_logprobs = torch.log_softmax(logits, dim=-1)[:, :-1]
+
+     entropies = -torch.sum(torch.exp(softmax_logprobs) * softmax_logprobs, dim=-1)
+     entropies = entropies.float().cpu().numpy()
+
+     sequence_entropies = [
+         entropies[idx][:seq_lengths[idx]] for idx in range(len(seq_lengths))
+     ]
+     assert all(
+         len(seq) == len(entropy) for seq, entropy in zip(seqs, sequence_entropies)
+     )
+
+     return sequence_entropies
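Likewise, a short sketch of the scoring entry points above, as `app.py` uses them; the toy sequence is illustrative only.

```python
# Sketch: mean log-likelihood score and per-position entropies for one sequence.
from evo import Evo
from evo.scoring import score_sequences, positional_entropies

evo_model = Evo('evo-1-8k-base', device='cuda:0')

seqs = ['ACGTACGTACGTACGT']
scores = score_sequences(
    seqs, evo_model.model, evo_model.tokenizer,
    reduce_method='mean',   # or 'sum' for the total log-likelihood
    device='cuda:0',
)
entropies = positional_entropies(
    seqs, evo_model.model, evo_model.tokenizer, device='cuda:0',
)
print(scores[0])            # one scalar score per sequence
print(len(entropies[0]))    # one entropy value per position
```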
evo/utils.py ADDED
@@ -0,0 +1,183 @@
+ import numpy as np
+ import pandas as pd
+ from typing import Callable
+
+
+ NTs = 'ACGT'
+
+ AAs = 'ACDEFGHIKLMNPQRSTVWY'
+
+ AA_TO_CODON = {
+     '*': ['TAA', 'TAG', 'TGA'], # Stop.
+     'A': ['GCT', 'GCC', 'GCA', 'GCG'], # Ala.
+     'C': ['TGT', 'TGC'], # Cys.
+     'D': ['GAT', 'GAC'], # Asp.
+     'E': ['GAA', 'GAG'], # Glu.
+     'F': ['TTT', 'TTC'], # Phe.
+     'G': ['GGT', 'GGC', 'GGA', 'GGG'], # Gly.
+     'H': ['CAT', 'CAC'], # His.
+     'I': ['ATT', 'ATC', 'ATA'], # Ile.
+     'K': ['AAA', 'AAG'], # Lys.
+     'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'], # Leu.
+     'M': ['ATG'], # Met.
+     'N': ['AAT', 'AAC'], # Asn.
+     'P': ['CCT', 'CCC', 'CCA', 'CCG'], # Pro.
+     'Q': ['CAA', 'CAG'], # Gln.
+     'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'], # Arg.
+     'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'], # Ser.
+     'T': ['ACT', 'ACC', 'ACA', 'ACG'], # Thr.
+     'V': ['GTT', 'GTC', 'GTA', 'GTG'], # Val.
+     'W': ['TGG'], # Trp.
+     'Y': ['TAT', 'TAC'], # Tyr.
+ }
+
+ CODON_TO_AA = {
+     codon: aa
+     for aa, codon_list in AA_TO_CODON.items()
+     for codon in codon_list
+ }
+
+ AA_3_TO_1 = {
+     "Ala": "A", # Alanine
+     "Arg": "R", # Arginine
+     "Asn": "N", # Asparagine
+     "Asp": "D", # Aspartic acid
+     "Cys": "C", # Cysteine
+     "Gln": "Q", # Glutamine
+     "Glu": "E", # Glutamic acid
+     "Gly": "G", # Glycine
+     "His": "H", # Histidine
+     "Ile": "I", # Isoleucine
+     "Leu": "L", # Leucine
+     "Lys": "K", # Lysine
+     "Met": "M", # Methionine
+     "Phe": "F", # Phenylalanine
+     "Pro": "P", # Proline
+     "Ser": "S", # Serine
+     "Thr": "T", # Threonine
+     "Trp": "W", # Tryptophan
+     "Tyr": "Y", # Tyrosine
+     "Val": "V" # Valine
+ }
+
+
+ def nucleotide_deep_mutational_scan(sequence: str, ignore_wt: bool = True):
+     for idx, wt in enumerate(sequence):
+         for mt in NTs:
+             if ignore_wt and wt == mt:
+                 continue
+             yield (wt, mt, idx)
+
+
+ def parse_blast_output(output_path: str) -> pd.DataFrame:
+     """
+     Parses standard blast output with `-outfmt 6`.
+     """
+     # blast default format output fields.
+     blast_table_header = [
+         'qacc', 'sacc', 'pident', 'length', 'mismatch', 'gapopen', 'qstart',
+         'qend', 'sstart', 'send', 'evalue',
+     ]
+
+     data = []
+     with open(output_path, 'r') as f:
+         for line in f:
+             if line.startswith("#"):
+                 continue
+             if line.strip() == '':
+                 continue
+             line = line.strip().split()
+             data.append(dict(zip(blast_table_header, line)))
+
+     df = pd.DataFrame(data)
+     if len(df) == 0:
+         return df
+     df['evalue'] = df['evalue'].astype(float)
+
+     return df
+
+
+ def parse_erpin_output(output_path: str, name: str) -> pd.DataFrame:
+     """
+     Parses ERPIN output. For an example, see `eval/data/example_rho_output.txt`.
+     """
+     # ERPIN format output fields.
+     output_fields = [ 'strand', 'index', 'interval', 'score', 'evalue' ]
+
+     data = []
+     with open(output_path, 'r') as f:
+         for line in f:
+             if line.startswith(f'>{name}'):
+                 meta = dict(zip(output_fields, f.readline().rstrip().split()))
+                 sequence = f.readline().rstrip()
+                 start, end = meta['interval'].split('..')
+                 data.append([
+                     f"{name}_{meta['index']}_{meta['strand']}",
+                     sequence,
+                     int(start),
+                     int(end),
+                     '+' if meta['strand'] == 'FW' else '-',
+                     meta['score'],
+                     float(meta['evalue']),
+                 ])
+
+     return pd.DataFrame(
+         data,
+         columns=[
+             'id',
+             'seq',
+             'start',
+             'end',
+             'strand',
+             'score',
+             'evalue',
+         ],
+     )
+
+
+ def parse_hmmsearch_output(output_path: str) -> pd.DataFrame:
+     """
+     Parses standard hmmsearch output.
+     """
+     # hmmsearch format output fields.
+     hmmsearch_table_header = [
+         'target', 'target_acc', 'tlen', 'query', 'query_acc', 'qlen',
+         'evalue', 'score', 'bias', 'num', 'of', 'cevalue', 'ievalue',
+         'dscore', 'dbias', 'hmm_from', 'hmm_to', 'ali_from', 'ali_to',
+         'env_from', 'env_to', 'acc', 'desc',
+     ]
+
+     data = []
+     with open(output_path, 'r') as f:
+         for line in f:
+             if line.startswith("#"):
+                 continue
+             line = line.strip().split()
+             data.append(dict(zip(hmmsearch_table_header, line)))
+
+     return pd.DataFrame(data)
+
+
+ def permutation_test(
+     score_func: Callable[[np.array, np.array], float],
+     x1: np.array,
+     x2: np.array,
+     n_permutations: int = 100_000,
+ ) -> float:
+     """
+     Returns a permutation-based P value. Computes the null distribution by
+     shuffling the provided data and recomputing the `score_func`.
+     """
+     if n_permutations < 1:
+         raise ValueError('Number of permutations must be positive.')
+
+     x1, x2 = np.array(x1), np.array(x2)
+
+     observed_score = score_func(x1, x2)
+
+     null_distribution = np.array([
+         score_func(x1, np.random.permutation(x2))
+         for _ in range(n_permutations)
+     ])
+
+     return np.mean(null_distribution >= observed_score)
evo/version.py ADDED
@@ -0,0 +1 @@
+ version = '0.4'
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio==4.44.0
+ torch==2.1.0
+ numpy==1.24.3
+ transformers==4.36.0
+ einops==0.7.0
+ pyyaml==6.0.1
+ git+https://github.com/togethercomputer/stripedhyena.git
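With the pinned dependencies above installed, a minimal sketch of running the Space locally; this mirrors the `__main__` block of `app.py`, and the `server_name`/`server_port` arguments are standard Gradio launch options that this commit does not set.

```python
# Sketch: launch the same Gradio UI that Hugging Face starts via app_file.
from app import create_interface

demo = create_interface()
demo.launch(server_name='0.0.0.0', server_port=7860)
```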