Spaces:

Genooo12
/

ColiFormer-ui

Sleeping

App Files Files Community

Genooo12 committed on Apr 25

Commit

404d784

verified ·

1 Parent(s): cd09b92

Deploy Streamlit UI

Browse files

Files changed (48) hide show

.dockerignore +16 -0
.gitattributes +3 -0
.github/ISSUE_TEMPLATE/bug_report.md +38 -0
.github/ISSUE_TEMPLATE/feature_request.md +17 -0
.github/ISSUE_TEMPLATE/other.md +10 -0
.github/workflows/ci.yml +39 -0
.gitignore +230 -0
Benchmark 80 sequences.xlsx +3 -0
CODE_OF_CONDUCT.md +128 -0
CodonTransformer/CodonData.py +682 -0
CodonTransformer/CodonEvaluation.py +583 -0
CodonTransformer/CodonJupyter.py +311 -0
CodonTransformer/CodonPostProcessing.py +83 -0
CodonTransformer/CodonPrediction.py +1372 -0
CodonTransformer/CodonUtils.py +871 -0
CodonTransformer/__init__.py +1 -0
Dockerfile +21 -0
ENCOT_Academic_Documentation.html +2625 -0
ENCOT_Code_Showcase.html +791 -0
LICENSE +201 -0
Makefile +9 -0
README.md +495 -10
app.py +12 -0
benchmark_evaluation.py +695 -0
comprehensive_model_comparison.png +3 -0
configs/train_ecoli_alm.yaml +54 -0
configs/train_ecoli_quick.yaml +37 -0
create_model_datasets.py +42 -0
evaluate_optimizer.py +577 -0
prepare_ecoli_data.py +69 -0
pretrain.py +232 -0
pyproject.toml +62 -0
requirements.txt +29 -0
scripts/optimize_sequence.py +383 -0
scripts/preprocess_data.py +251 -0
scripts/run_benchmarks.py +235 -0
scripts/train.py +228 -0
setup.py +40 -0
src/CodonTransformer_inference_template.xlsx +0 -0
src/__init__.py +1 -0
src/banner_final.png +3 -0
src/organism2id.pkl +3 -0
streamlit_app.py +16 -0
streamlit_gui/app.py +1456 -0
streamlit_gui/demo.py +288 -0
streamlit_gui/requirements.txt +20 -0
streamlit_gui/run_gui.py +102 -0
streamlit_gui/test_gui.py +321 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,16 @@

+.git
+.github
+.venv
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+*.log
+*.ipynb
+.devcontainer
+data
+notebooks
+tests
+slurm
+Benchmark 80 sequences.xlsx
+comprehensive_model_comparison.png

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Benchmark[[:space:]]80[[:space:]]sequences.xlsx filter=lfs diff=lfs merge=lfs -text
+comprehensive_model_comparison.png filter=lfs diff=lfs merge=lfs -text
+src/banner_final.png filter=lfs diff=lfs merge=lfs -text

.github/ISSUE_TEMPLATE/bug_report.md ADDED Viewed

	@@ -0,0 +1,38 @@

+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+---
+**Describe the bug**
+A clear and concise description of what the bug is.
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Go to '...'
+2. Click on '....'
+3. Scroll down to '....'
+4. See error
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+**Desktop (please complete the following information):**
+ - OS: [e.g. iOS]
+ - Browser [e.g. chrome, safari]
+ - Version [e.g. 22]
+**Smartphone (please complete the following information):**
+ - Device: [e.g. iPhone6]
+ - OS: [e.g. iOS8.1]
+ - Browser [e.g. stock browser, safari]
+ - Version [e.g. 22]
+**Additional context**
+Add any other context about the problem here.

.github/ISSUE_TEMPLATE/feature_request.md ADDED Viewed

	@@ -0,0 +1,17 @@

+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: enhancement
+assignees: ''
+---
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+**Additional context**
+Add any other context or screenshots about the feature request here.

.github/ISSUE_TEMPLATE/other.md ADDED Viewed

	@@ -0,0 +1,10 @@

+---
+name: Other
+about: Any other issue
+title: ''
+labels: bug
+assignees: ''
+---
+**Describe your issue here**

.github/workflows/ci.yml ADDED Viewed

	@@ -0,0 +1,39 @@

+# .github/workflows/ci.yml
+name: CI
+on: [push, pull_request]
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.10'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+        pip install "coverage[toml]"
+    - name: Run tests with coverage
+      run: |
+        make test_with_coverage
+        coverage report
+        coverage xml
+    - name: Upload coverage to Codecov
+      uses: codecov/codecov-action@v4
+      with:
+        token: ${{ secrets.CODECOV_TOKEN }}
+        file: coverage.xml
+        flags: unittests
+        name: codecov-umbrella
+        fail_ci_if_error: true

.gitignore ADDED Viewed

	@@ -0,0 +1,230 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+codon_env/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# Coverage reports
+coverage.xml
+# Jupyter Notebook checkpoints
+.ipynb_checkpoints/
+# Temporary files
+*.tmp
+*.temp
+# PyTorch Lightning checkpoints
+lightning_logs/
+# PyTorch model weights
+*.pth
+*.pt
+# Large files excluded from Git
+models/ecoli-codon-optimizer/finetune.ckpt
+models/ecoli-codon-optimizer/finetune_best.ckpt
+data/ecoli_processed_genes.csv
+# Finetune-related files (keep local only)
+finetune.py
+checkpoints/
+*.safetensors
+# Benchmark and validation results
+benchmark_plots/
+cai_tai_benchmark.csv
+synthetic_validation.csv
+test_set_validation.csv
+# Large data files
+*.csv
+*.jsonl
+*.json
+*.fasta
+*.fa
+*.ckpt
+# Results and outputs
+results/
+outputs/
+logs/
+# Model files and weights
+*.bin
+*.safetensors
+# CUDA and GPU related
+*.run
+cuda_installer.pyz
+# R files
+.RData
+.Rhistory
+# OS generated files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+research/
+models/alm-enhanced-training/balanced_alm_finetune.ckpt

Benchmark 80 sequences.xlsx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f80bde88a31e80ac34b0827180b50d112f1d26bdf691c8118943e91c0e3b09e2
+size 179471

CODE_OF_CONDUCT.md ADDED Viewed

	@@ -0,0 +1,128 @@

+# Contributor Covenant Code of Conduct
+## Our Pledge
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, religion, or sexual identity
+and orientation.
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+## Our Standards
+Examples of behavior that contributes to a positive environment for our
+community include:
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the
+  overall community
+Examples of unacceptable behavior include:
+* The use of sexualized language or imagery, and sexual attention or
+  advances of any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email
+  address, without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+## Enforcement Responsibilities
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+## Scope
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+## Enforcement
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+Adibvafa.fallahpour@mail.utoronto.ca.
+All complaints will be reviewed and investigated promptly and fairly.
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+## Enforcement Guidelines
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+### 1. Correction
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+### 2. Warning
+**Community Impact**: A violation through a single incident or series
+of actions.
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or
+permanent ban.
+### 3. Temporary Ban
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+### 4. Permanent Ban
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior,  harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+**Consequence**: A permanent ban from any sort of public interaction within
+the community.
+## Attribution
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.0, available at
+https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
+Community Impact Guidelines were inspired by [Mozilla's code of conduct
+enforcement ladder](https://github.com/mozilla/diversity).
+[homepage]: https://www.contributor-covenant.org
+For answers to common questions about this code of conduct, see the FAQ at
+https://www.contributor-covenant.org/faq. Translations are available at
+https://www.contributor-covenant.org/translations.

CodonTransformer/CodonData.py ADDED Viewed

	@@ -0,0 +1,682 @@

+"""
+File: CodonData.py
+---------------------
+Includes helper functions for preprocessing NCBI or Kazusa databases and
+preparing the data for training and inference of the CodonTransformer model.
+"""
+import json
+import os
+import random
+from typing import Dict, List, Optional, Tuple, Union
+import pandas as pd
+import python_codon_tables as pct
+from Bio import SeqIO
+from Bio.Seq import Seq
+from sklearn.utils import shuffle as sk_shuffle
+from tqdm import tqdm
+from CodonTransformer.CodonUtils import (
+    AMBIGUOUS_AMINOACID_MAP,
+    AMINO2CODON_TYPE,
+    AMINO_ACIDS,
+    ORGANISM2ID,
+    START_CODONS,
+    STOP_CODONS,
+    STOP_SYMBOL,
+    STOP_SYMBOLS,
+    ProteinConfig,
+    find_pattern_in_fasta,
+    get_taxonomy_id,
+    sort_amino2codon_skeleton,
+)
+def prepare_training_data(
+    dataset: Union[str, pd.DataFrame], output_file: str, shuffle: bool = True
+) -> None:
+    """
+    Prepare a JSON dataset for training the CodonTransformer model.
+    Input dataset should have columns below:
+        - dna: str (DNA sequence)
+        - protein: str (Protein sequence)
+        - organism: Union[int, str] (ID or Name of the organism)
+    The output JSON dataset will have the following format:
+        {"idx": 0, "codons": "M_ATG R_AGG L_TTG L_CTA R_CGA __TAG", "organism": 51}
+        {"idx": 1, "codons": "M_ATG K_AAG C_TGC F_TTT F_TTC __TAA", "organism": 59}
+    Args:
+        dataset (Union[str, pd.DataFrame]): Input dataset in CSV or DataFrame format.
+        output_file (str): Path to save the output JSON dataset.
+        shuffle (bool, optional): Whether to shuffle the dataset before saving.
+            Defaults to True.
+    Returns:
+        None
+    """
+    if isinstance(dataset, str):
+        dataset = pd.read_csv(dataset)
+    required_columns = {"dna", "protein", "organism"}
+    if not required_columns.issubset(dataset.columns):
+        raise ValueError(f"Input dataset must have columns: {required_columns}")
+    # Prepare the dataset for finetuning
+    dataset["codons"] = dataset.apply(
+        lambda row: get_merged_seq(row["protein"], row["dna"], separator="_"), axis=1
+    )
+    # Replace organism str with organism id using ORGANISM2ID
+    dataset["organism"] = dataset["organism"].apply(
+        lambda org: process_organism(org, ORGANISM2ID)
+    )
+    # Save the dataset to a JSON file
+    dataframe_to_json(dataset[["codons", "organism"]], output_file, shuffle=shuffle)
+def dataframe_to_json(df: pd.DataFrame, output_file: str, shuffle: bool = True) -> None:
+    """
+    Convert pandas DataFrame to JSON file format suitable for training CodonTransformer.
+    This function takes a preprocessed DataFrame and writes it to a JSON file
+    where each line is a JSON object representing a single record.
+    Args:
+        df (pd.DataFrame): The input DataFrame with 'codons' and 'organism' columns.
+        output_file (str): Path to the output JSON file.
+        shuffle (bool, optional): Whether to shuffle the dataset before saving.
+            Defaults to True.
+    Returns:
+        None
+    Raises:
+        ValueError: If the required columns are not present in the DataFrame.
+    """
+    required_columns = {"codons", "organism"}
+    if not required_columns.issubset(df.columns):
+        raise ValueError(f"DataFrame must contain columns: {required_columns}")
+    print(f"\nStarted writing to {output_file}...")
+    # Shuffle the DataFrame if requested
+    if shuffle:
+        df = sk_shuffle(df)
+    # Write the DataFrame to a JSON file
+    with open(output_file, "w") as f:
+        for idx, row in tqdm(
+            df.iterrows(), total=len(df), desc="Writing JSON...", unit=" records"
+        ):
+            doc = {"idx": idx, "codons": row["codons"], "organism": row["organism"]}
+            f.write(json.dumps(doc) + "\n")
+    print(f"\nTotal Entries Saved: {len(df)}, JSON data saved to {output_file}")
+def process_organism(organism: Union[str, int], organism_to_id: Dict[str, int]) -> int:
+    """
+    Process and validate the organism input, converting it to a valid organism ID.
+    This function handles both string (organism name) and integer (organism ID) inputs.
+    It validates the input against a provided mapping of organism names to IDs.
+    Args:
+        organism (Union[str, int]): Input organism, either as a name (str) or ID (int).
+        organism_to_id (Dict[str, int]): Dictionary mapping organism names to their
+            corresponding IDs.
+    Returns:
+        int: The validated organism ID.
+    Raises:
+        ValueError: If the input is an invalid organism name or ID.
+        TypeError: If the input is neither a string nor an integer.
+    """
+    if isinstance(organism, str):
+        if organism not in organism_to_id:
+            raise ValueError(f"Invalid organism name: {organism}")
+        return organism_to_id[organism]
+    elif isinstance(organism, int):
+        if organism not in organism_to_id.values():
+            raise ValueError(f"Invalid organism ID: {organism}")
+        return organism
+    raise TypeError(
+        f"Organism must be a string or integer, not {type(organism).__name__}"
+    )
+def preprocess_protein_sequence(protein: str) -> str:
+    """
+    Preprocess a protein sequence by cleaning, standardizing, and handling
+    ambiguous amino acids.
+    Args:
+        protein (str): The input protein sequence.
+    Returns:
+        str: The preprocessed protein sequence.
+    Raises:
+        ValueError: If the protein sequence is invalid or if the configuration is invalid.
+    """
+    if not protein:
+        raise ValueError("Protein sequence is empty.")
+    # Clean and standardize the protein sequence
+    protein = (
+        protein.upper().strip().replace("\n", "").replace(" ", "").replace("\t", "")
+    )
+    # Handle ambiguous amino acids based on the specified behavior
+    config = ProteinConfig()
+    ambiguous_aminoacid_map_override = config.get("ambiguous_aminoacid_map_override")
+    ambiguous_aminoacid_behavior = config.get("ambiguous_aminoacid_behavior")
+    ambiguous_aminoacid_map = AMBIGUOUS_AMINOACID_MAP.copy()
+    for aminoacid, standard_aminoacids in ambiguous_aminoacid_map_override.items():
+        ambiguous_aminoacid_map[aminoacid] = standard_aminoacids
+    if ambiguous_aminoacid_behavior == "raise_error":
+        if any(aminoacid in ambiguous_aminoacid_map for aminoacid in protein):
+            raise ValueError("Ambiguous amino acids found in protein sequence.")
+    elif ambiguous_aminoacid_behavior == "standardize_deterministic":
+        protein = "".join(
+            ambiguous_aminoacid_map.get(aminoacid, [aminoacid])[0]
+            for aminoacid in protein
+        )
+    elif ambiguous_aminoacid_behavior == "standardize_random":
+        protein = "".join(
+            random.choice(ambiguous_aminoacid_map.get(aminoacid, [aminoacid]))
+            for aminoacid in protein
+        )
+    else:
+        raise ValueError(
+            f"Invalid ambiguous_aminoacid_behavior: {ambiguous_aminoacid_behavior}."
+        )
+    # Check for sequence validity
+    if any(aminoacid not in AMINO_ACIDS + STOP_SYMBOLS for aminoacid in protein):
+        raise ValueError("Invalid characters in protein sequence.")
+    if protein[-1] not in AMINO_ACIDS + STOP_SYMBOLS:
+        raise ValueError(
+            "Protein sequence must end with `*`, or `_`, or an amino acid."
+        )
+    # Replace '*' at the end of protein with STOP_SYMBOL if present
+    if protein[-1] == "*":
+        protein = protein[:-1] + STOP_SYMBOL
+    # Add stop symbol to end of protein
+    if protein[-1] != STOP_SYMBOL:
+        protein += STOP_SYMBOL
+    return protein
+def replace_ambiguous_codons(dna: str) -> str:
+    """
+    Replaces ambiguous codons in a DNA sequence with "UNK".
+    Args:
+        dna (str): The DNA sequence to process.
+    Returns:
+        str: The processed DNA sequence with ambiguous codons replaced by "UNK".
+    """
+    result = []
+    dna = dna.upper()
+    # Check codons in DNA sequence
+    for i in range(0, len(dna), 3):
+        codon = dna[i : i + 3]
+        if len(codon) == 3 and all(nucleotide in "ATCG" for nucleotide in codon):
+            result.append(codon)
+        else:
+            result.append("UNK")
+    return "".join(result)
+def preprocess_dna_sequence(dna: str) -> str:
+    """
+    Cleans and preprocesses a DNA sequence by standardizing it and replacing
+    ambiguous codons.
+    Args:
+        dna (str): The DNA sequence to preprocess.
+    Returns:
+        str: The cleaned and preprocessed DNA sequence.
+    """
+    if not dna:
+        return ""
+    # Clean and standardize the DNA sequence
+    dna = dna.upper().strip().replace("\n", "").replace(" ", "").replace("\t", "")
+    # Replace codons with ambigous nucleotides with "UNK"
+    dna = replace_ambiguous_codons(dna)
+    # Add unkown stop codon to end of DNA sequence if not present
+    if dna[-3:] not in STOP_CODONS:
+        dna += "UNK"
+    return dna
+def get_merged_seq(protein: str, dna: str = "", separator: str = "_") -> str:
+    """
+    Return the merged sequence of protein amino acids and DNA codons in the form
+    of tokens separated by space, where each token is composed of an amino acid +
+    separator + codon.
+    Args:
+        protein (str): Protein sequence.
+        dna (str): DNA sequence.
+        separator (str): Separator between amino acid and codon.
+    Returns:
+        str: Merged sequence.
+    Example:
+        >>> get_merged_seq(protein="MAV_", dna="ATGGCTGTGTAA", separator="_")
+        'M_ATG A_GCT V_GTG __TAA'
+        >>> get_merged_seq(protein="QHH_", dna="", separator="_")
+        'Q_UNK H_UNK H_UNK __UNK'
+    """
+    merged_seq = ""
+    # Prepare protein and dna sequences
+    dna = preprocess_dna_sequence(dna)
+    protein = preprocess_protein_sequence(protein)
+    # Check if the length of protein and dna sequences are equal
+    if len(dna) > 0 and len(protein) != len(dna) / 3:
+        raise ValueError(
+            'Length of protein (including stop symbol such as "_") and '
+            "the number of codons in DNA sequence (including stop codon) "
+            "must be equal."
+        )
+    # Merge protein and DNA sequences into tokens
+    for i, aminoacid in enumerate(protein):
+        merged_seq += f'{aminoacid}{separator}{dna[i * 3:i * 3 + 3] if dna else "UNK"} '
+    return merged_seq.strip()
+def is_correct_seq(dna: str, protein: str, stop_symbol: str = STOP_SYMBOL) -> bool:
+    """
+    Check if the given DNA and protein pair is correct, that is:
+        1. The length of dna is divisible by 3
+        2. There is an initiator codon in the beginning of dna
+        3. There is only one stop codon in the sequence
+        4. The only stop codon is the last codon
+    Note since in Codon Table 3, 'TGA' is interpreted as Triptophan (W),
+    there is a separate check to make sure those sequences are considered correct.
+    Args:
+        dna (str): DNA sequence.
+        protein (str): Protein sequence.
+        stop_symbol (str): Stop symbol.
+    Returns:
+        bool: True if the sequence is correct, False otherwise.
+    """
+    return (
+        len(dna) % 3 == 0  # Check if DNA length is divisible by 3
+        and dna[:3].upper() in START_CODONS  # Check for initiator codon
+        and protein[-1]
+        == stop_symbol  # Check if the last protein symbol is the stop symbol
+        and protein.count(stop_symbol) == 1  # Check if there is only one stop symbol
+        and len(set(dna))
+        == 4  # Check if DNA consists of 4 unique nucleotides (A, T, C, G)
+    )
+def get_amino_acid_sequence(
+    dna: str,
+    stop_symbol: str = "_",
+    codon_table: int = 1,
+    return_correct_seq: bool = False,
+) -> Union[str, Tuple[str, bool]]:
+    """
+    Return the translated protein sequence given a DNA sequence and codon table.
+    Args:
+        dna (str): DNA sequence.
+        stop_symbol (str): Stop symbol.
+        codon_table (int): Codon table number.
+        return_correct_seq (bool): Whether to return if the sequence is correct.
+    Returns:
+        Union[str, Tuple[str, bool]]: Protein sequence and correctness flag if
+        return_correct_seq is True, otherwise just the protein sequence.
+    """
+    dna_seq = Seq(dna).strip()
+    # Translate the DNA sequence to a protein sequence
+    protein_seq = str(
+        dna_seq.translate(
+            stop_symbol=stop_symbol,  # Symbol to use for stop codons
+            to_stop=False,  # Translate the entire sequence, including any stop codons
+            cds=False,  # Do not assume the input is a coding sequence
+            table=codon_table,  # Codon table to use for translation
+        )
+    ).strip()
+    return (
+        protein_seq
+        if not return_correct_seq
+        else (protein_seq, is_correct_seq(dna_seq, protein_seq, stop_symbol))
+    )
+def read_fasta_file(
+    input_file: str,
+    save_to_file: Optional[str] = None,
+    organism: str = "",
+    buffer_size: int = 50000,
+) -> pd.DataFrame:
+    """
+    Read a FASTA file of DNA sequences and convert it to a Pandas DataFrame.
+    Optionally, save the DataFrame to a CSV file.
+    Args:
+        input_file (str): Path to the input FASTA file.
+        save_to_file (Optional[str]): Path to save the output DataFrame. If None,
+            data is only returned.
+        organism (str): Name of the organism. If empty, it will be extracted from
+            the FASTA description.
+        buffer_size (int): Number of records to process before writing to file.
+    Returns:
+        pd.DataFrame: DataFrame containing the DNA sequences if return_dataframe
+        is True, else None.
+    Raises:
+        FileNotFoundError: If the input file does not exist.
+    """
+    if not os.path.exists(input_file):
+        raise FileNotFoundError(f"Input file not found: {input_file}")
+    buffer = []
+    columns = [
+        "dna",
+        "protein",
+        "correct_seq",
+        "organism",
+        "GeneID",
+        "description",
+        "tokenized",
+    ]
+    # Initialize DataFrame to store all data if return_dataframe is True
+    all_data = pd.DataFrame(columns=columns)
+    with open(input_file, "r") as fasta_file:
+        for record in tqdm(
+            SeqIO.parse(fasta_file, "fasta"),
+            desc=f"Processing {organism}",
+            unit=" Records",
+        ):
+            dna = str(record.seq).strip().upper()  # Ensure uppercase DNA sequence
+            # Determine the organism from the record if not provided
+            current_organism = organism or find_pattern_in_fasta(
+                "organism", record.description
+            )
+            gene_id = find_pattern_in_fasta("GeneID", record.description)
+            # Get the appropriate codon table for the organism
+            codon_table = get_codon_table(current_organism)
+            # Translate DNA to protein sequence
+            protein, correct_seq = get_amino_acid_sequence(
+                dna,
+                stop_symbol=STOP_SYMBOL,
+                codon_table=codon_table,
+                return_correct_seq=True,
+            )
+            description = record.description.split("[", 1)[0].strip()
+            tokenized = get_merged_seq(protein, dna, separator=STOP_SYMBOL)
+            # Create a data row for the current sequence
+            data_row = {
+                "dna": dna,
+                "protein": protein,
+                "correct_seq": correct_seq,
+                "organism": current_organism,
+                "GeneID": gene_id,
+                "description": description,
+                "tokenized": tokenized,
+            }
+            buffer.append(data_row)
+            # Write buffer to CSV file when buffer size is reached
+            if save_to_file and len(buffer) >= buffer_size:
+                write_buffer_to_csv(buffer, save_to_file, columns)
+                buffer = []
+            all_data = pd.concat(
+                [all_data, pd.DataFrame([data_row])], ignore_index=True
+            )
+    # Write remaining buffer to CSV file
+    if save_to_file and buffer:
+        write_buffer_to_csv(buffer, save_to_file, columns)
+    return all_data
+def write_buffer_to_csv(buffer: List[Dict], output_path: str, columns: List[str]):
+    """Helper function to write buffer to CSV file."""
+    buffer_df = pd.DataFrame(buffer, columns=columns)
+    buffer_df.to_csv(
+        output_path,
+        mode="a",
+        header=(not os.path.exists(output_path)),
+        index=True,
+    )
+def download_codon_frequencies_from_kazusa(
+    taxonomy_id: Optional[int] = None,
+    organism: Optional[str] = None,
+    taxonomy_reference: Optional[str] = None,
+    return_original_format: bool = False,
+) -> AMINO2CODON_TYPE:
+    """
+    Return the codon table of the given taxonomy ID from the Kazusa Database.
+    Args:
+        taxonomy_id (Optional[int]): Taxonomy ID.
+        organism (Optional[str]): Name of the organism.
+        taxonomy_reference (Optional[str]): Taxonomy reference.
+        return_original_format (bool): Whether to return in the original format.
+    Returns:
+        AMINO2CODON_TYPE: Codon table.
+    """
+    if taxonomy_reference:
+        taxonomy_id = get_taxonomy_id(taxonomy_reference, organism=organism)
+    kazusa_amino2codon = pct.get_codons_table(table_name=taxonomy_id)
+    if return_original_format:
+        return kazusa_amino2codon
+    # Replace "*" with STOP_SYMBOL in the codon table
+    kazusa_amino2codon[STOP_SYMBOL] = kazusa_amino2codon.pop("*")
+    # Create amino2codon dictionary
+    amino2codon = {
+        aminoacid: (list(codon2freq.keys()), list(codon2freq.values()))
+        for aminoacid, codon2freq in kazusa_amino2codon.items()
+    }
+    return sort_amino2codon_skeleton(amino2codon)
+def build_amino2codon_skeleton(organism: str) -> AMINO2CODON_TYPE:
+    """
+    Return the empty skeleton of the amino2codon dictionary, needed for
+    get_codon_frequencies.
+    Args:
+        organism (str): Name of the organism.
+    Returns:
+        AMINO2CODON_TYPE: Empty amino2codon dictionary.
+    """
+    amino2codon = {}
+    possible_codons = [f"{i}{j}{k}" for i in "ACGT" for j in "ACGT" for k in "ACGT"]
+    possible_aminoacids = get_amino_acid_sequence(
+        dna="".join(possible_codons),
+        codon_table=get_codon_table(organism),
+        return_correct_seq=False,
+    )
+    # Initialize the amino2codon skeleton with all possible codons and set their
+    # frequencies to 0
+    for i, (codon, amino) in enumerate(zip(possible_codons, possible_aminoacids)):
+        if amino not in amino2codon:
+            amino2codon[amino] = ([], [])
+        amino2codon[amino][0].append(codon)
+        amino2codon[amino][1].append(0)
+    # Sort the dictionary and each list of codon frequency alphabetically
+    amino2codon = sort_amino2codon_skeleton(amino2codon)
+    return amino2codon
+def get_codon_frequencies(
+    dna_sequences: List[str],
+    protein_sequences: Optional[List[str]] = None,
+    organism: Optional[str] = None,
+) -> AMINO2CODON_TYPE:
+    """
+    Return a dictionary mapping each codon to its respective frequency based on
+    the collection of DNA sequences and protein sequences.
+    Args:
+        dna_sequences (List[str]): List of DNA sequences.
+        protein_sequences (Optional[List[str]]): List of protein sequences.
+        organism (Optional[str]): Name of the organism.
+    Returns:
+        AMINO2CODON_TYPE: Dictionary mapping each amino acid to a tuple of codons
+        and frequencies.
+    """
+    if organism:
+        codon_table = get_codon_table(organism)
+        protein_sequences = [
+            get_amino_acid_sequence(
+                dna, codon_table=codon_table, return_correct_seq=False
+            )
+            for dna in dna_sequences
+        ]
+    amino2codon = build_amino2codon_skeleton(organism)
+    # Count the frequencies of each codon for each amino acid
+    for dna, protein in zip(dna_sequences, protein_sequences):
+        for i, amino in enumerate(protein):
+            codon = dna[i * 3 : (i + 1) * 3]
+            codon_loc = amino2codon[amino][0].index(codon)
+            amino2codon[amino][1][codon_loc] += 1
+    # Normalize codon frequencies per amino acid so they sum to 1
+    amino2codon = {
+        amino: (codons, [freq / (sum(frequencies) + 1e-100) for freq in frequencies])
+        for amino, (codons, frequencies) in amino2codon.items()
+    }
+    return amino2codon
+def get_organism_to_codon_frequencies(
+    dataset: pd.DataFrame, organisms: List[str]
+) -> Dict[str, AMINO2CODON_TYPE]:
+    """
+    Return a dictionary mapping each organism to their codon frequency distribution.
+    Args:
+        dataset (pd.DataFrame): DataFrame containing DNA sequences.
+        organisms (List[str]): List of organisms.
+    Returns:
+        Dict[str, AMINO2CODON_TYPE]: Dictionary mapping each organism to its codon
+        frequency distribution.
+    """
+    organism2frequencies = {}
+    # Calculate codon frequencies for each organism in the dataset
+    for organism in tqdm(
+        organisms, desc="Calculating Codon Frequencies: ", unit="Organism"
+    ):
+        organism_data = dataset.loc[dataset["organism"] == organism]
+        dna_sequences = organism_data["dna"].to_list()
+        protein_sequences = organism_data["protein"].to_list()
+        codon_frequencies = get_codon_frequencies(dna_sequences, protein_sequences)
+        organism2frequencies[organism] = codon_frequencies
+    return organism2frequencies
+def get_codon_table(organism: str) -> int:
+    """
+    Return the appropriate NCBI codon table for a given organism.
+    Args:
+        organism (str): Name of the organism.
+    Returns:
+        int: Codon table number.
+    """
+    # Common codon table (Table 1) for many model organisms
+    if organism in [
+        "Arabidopsis thaliana",
+        "Caenorhabditis elegans",
+        "Chlamydomonas reinhardtii",
+        "Saccharomyces cerevisiae",
+        "Danio rerio",
+        "Drosophila melanogaster",
+        "Homo sapiens",
+        "Mus musculus",
+        "Nicotiana tabacum",
+        "Solanum tuberosum",
+        "Solanum lycopersicum",
+        "Oryza sativa",
+        "Glycine max",
+        "Zea mays",
+    ]:
+        codon_table = 1
+    # Chloroplast codon table (Table 11)
+    elif organism in [
+        "Chlamydomonas reinhardtii chloroplast",
+        "Nicotiana tabacum chloroplast",
+    ]:
+        codon_table = 11
+    # Default to Table 11 for other bacteria and archaea
+    else:
+        codon_table = 11
+    return codon_table

CodonTransformer/CodonEvaluation.py ADDED Viewed

	@@ -0,0 +1,583 @@

+"""
+File: CodonEvaluation.py
+---------------------------
+Includes functions to calculate various evaluation metrics along with helper
+functions.
+"""
+from typing import Dict, List, Tuple, Optional
+import pandas as pd
+from CAI import CAI, relative_adaptiveness
+from tqdm import tqdm
+import math
+import numpy as np
+from collections import Counter
+from itertools import chain
+from statistics import mean
+import sys
+import os
+from io import StringIO
+def get_CSI_weights(sequences: List[str]) -> Dict[str, float]:
+    """
+    Calculate the Codon Similarity Index (CSI) weights for a list of DNA sequences.
+    Args:
+        sequences (List[str]): List of DNA sequences.
+    Returns:
+        dict: The CSI weights.
+    """
+    return relative_adaptiveness(sequences=sequences)
+def get_CSI_value(dna: str, weights: Dict[str, float]) -> float:
+    """
+    Calculate the Codon Similarity Index (CSI) for a DNA sequence.
+    Args:
+        dna (str): The DNA sequence.
+        weights (dict): The CSI weights from get_CSI_weights.
+    Returns:
+        float: The CSI value.
+    """
+    return CAI(dna, weights)
+def get_organism_to_CSI_weights(
+    dataset: pd.DataFrame, organisms: List[str]
+) -> Dict[str, dict]:
+    """
+    Calculate the Codon Similarity Index (CSI) weights for a list of organisms.
+    Args:
+        dataset (pd.DataFrame): Dataset containing organism and DNA sequence info.
+        organisms (List[str]): List of organism names.
+    Returns:
+        Dict[str, dict]: A dictionary mapping each organism to its CSI weights.
+    """
+    organism2weights = {}
+    # Iterate through each organism to calculate its CSI weights
+    for organism in tqdm(organisms, desc="Calculating CSI Weights: ", unit="Organism"):
+        organism_data = dataset.loc[dataset["organism"] == organism]
+        sequences = organism_data["dna"].to_list()
+        weights = get_CSI_weights(sequences)
+        organism2weights[organism] = weights
+    return organism2weights
+def get_GC_content(dna: str) -> float:
+    """
+    Calculate the GC content of a DNA sequence.
+    GC content is the percentage of nucleotides that are either G (guanine) or C (cytosine).
+    This metric is important for codon optimization as it affects expression levels and
+    synthesis efficiency in E. coli.
+    Args:
+        dna (str): The DNA sequence (uppercase or lowercase).
+    Returns:
+        float: The GC content as a percentage (0-100).
+    Example:
+        >>> get_GC_content("ATGCGATCG")
+        55.56  # 5 GC nucleotides out of 9 total
+    """
+    dna = dna.upper()
+    if not dna:
+        return 0.0
+    return (dna.count("G") + dna.count("C")) / len(dna) * 100
+def get_cfd(
+    dna: str,
+    codon_frequencies: Dict[str, Tuple[List[str], List[float]]],
+    threshold: float = 0.3,
+) -> float:
+    """
+    Calculate the codon frequency distribution (CFD) metric for a DNA sequence.
+    Args:
+        dna (str): The DNA sequence.
+        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
+            frequency distribution per amino acid.
+        threshold (float): Frequency threshold for counting rare codons.
+    Returns:
+        float: The CFD metric as a percentage.
+    """
+    # Get a dictionary mapping each codon to its normalized frequency
+    codon2frequency = {
+        codon: freq / max(frequencies)
+        for amino, (codons, frequencies) in codon_frequencies.items()
+        for codon, freq in zip(codons, frequencies)
+    }
+    cfd = 0
+    # Iterate through the DNA sequence in steps of 3 to process each codon
+    for i in range(0, len(dna), 3):
+        codon = dna[i : i + 3]
+        codon_frequency = codon2frequency[codon]
+        if codon_frequency < threshold:
+            cfd += 1
+    return cfd / (len(dna) / 3) * 100
+def get_min_max_percentage(
+    dna: str,
+    codon_frequencies: Dict[str, Tuple[List[str], List[float]]],
+    window_size: int = 18,
+) -> List[float]:
+    """
+    Calculate the %MinMax metric for a DNA sequence.
+    Args:
+        dna (str): The DNA sequence.
+        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
+            frequency distribution per amino acid.
+        window_size (int): Size of the window to calculate %MinMax.
+    Returns:
+        List[float]: List of %MinMax values for the sequence.
+    Credit: https://github.com/chowington/minmax
+    """
+    # Get a dictionary mapping each codon to its respective amino acid
+    codon2amino = {
+        codon: amino
+        for amino, (codons, frequencies) in codon_frequencies.items()
+        for codon in codons
+    }
+    min_max_values = []
+    codons = [dna[i : i + 3] for i in range(0, len(dna), 3)]  # Split DNA into codons
+    # Iterate through the DNA sequence using the specified window size
+    for i in range(len(codons) - window_size + 1):
+        codon_window = codons[i : i + window_size]  # Codons in the current window
+        Actual = 0.0  # Average of the actual codon frequencies
+        Max = 0.0  # Average of the min codon frequencies
+        Min = 0.0  # Average of the max codon frequencies
+        Avg = 0.0  # Average of the averages of all frequencies for each amino acid
+        # Sum the frequencies for codons in the current window
+        for codon in codon_window:
+            aminoacid = codon2amino[codon]
+            frequencies = codon_frequencies[aminoacid][1]
+            codon_index = codon_frequencies[aminoacid][0].index(codon)
+            codon_frequency = codon_frequencies[aminoacid][1][codon_index]
+            Actual += codon_frequency
+            Max += max(frequencies)
+            Min += min(frequencies)
+            Avg += sum(frequencies) / len(frequencies)
+        # Divide by the window size to get the averages
+        Actual = Actual / window_size
+        Max = Max / window_size
+        Min = Min / window_size
+        Avg = Avg / window_size
+        # Calculate %MinMax
+        percentMax = ((Actual - Avg) / (Max - Avg)) * 100
+        percentMin = ((Avg - Actual) / (Avg - Min)) * 100
+        # Append the appropriate %MinMax value
+        if percentMax >= 0:
+            min_max_values.append(percentMax)
+        else:
+            min_max_values.append(-percentMin)
+    # Populate the last floor(window_size / 2) entries of min_max_values with None
+    for i in range(int(window_size / 2)):
+        min_max_values.append(None)
+    return min_max_values
+def get_sequence_complexity(dna: str) -> float:
+    """
+    Calculate the sequence complexity score of a DNA sequence.
+    Args:
+        dna (str): The DNA sequence.
+    Returns:
+        float: The sequence complexity score.
+    """
+    def sum_up_to(x):
+        """Recursive function to calculate the sum of integers from 1 to x."""
+        if x <= 1:
+            return 1
+        else:
+            return x + sum_up_to(x - 1)
+    def f(x):
+        """Returns 4 if x is greater than or equal to 4, else returns x."""
+        if x >= 4:
+            return 4
+        elif x < 4:
+            return x
+    unique_subseq_length = []
+    # Calculate unique subsequences lengths
+    for i in range(1, len(dna) + 1):
+        unique_subseq = set()
+        for j in range(len(dna) - (i - 1)):
+            unique_subseq.add(dna[j : (j + i)])
+        unique_subseq_length.append(len(unique_subseq))
+    # Calculate complexity score
+    complexity_score = (
+        sum(unique_subseq_length) / (sum_up_to(len(dna) - 1) + f(len(dna)))
+    ) * 100
+    return complexity_score
+def get_sequence_similarity(
+    original: str, predicted: str, truncate: bool = True, window_length: int = 1
+) -> float:
+    """
+    Calculate the sequence similarity between two sequences.
+    Args:
+        original (str): The original sequence.
+        predicted (str): The predicted sequence.
+        truncate (bool): If True, truncate the original sequence to match the length
+            of the predicted sequence.
+        window_length (int): Length of the window for comparison (1 for amino acids,
+            3 for codons).
+    Returns:
+        float: The sequence similarity as a percentage.
+    Preconditions:
+        len(predicted) <= len(original).
+    """
+    if not truncate and len(original) != len(predicted):
+        raise ValueError(
+            "Set truncate to True if the length of sequences do not match."
+        )
+    identity = 0.0
+    original = original.strip()
+    predicted = predicted.strip()
+    if truncate:
+        original = original[: len(predicted)]
+    if window_length == 1:
+        # Simple comparison for amino acid
+        for i in range(len(predicted)):
+            if original[i] == predicted[i]:
+                identity += 1
+    else:
+        # Comparison for substrings based on window_length
+        for i in range(0, len(original) - window_length + 1, window_length):
+            if original[i : i + window_length] == predicted[i : i + window_length]:
+                identity += 1
+    return (identity / (len(predicted) / window_length)) * 100
+def scan_for_restriction_sites(seq: str, sites: List[str] = ['GAATTC', 'GGATCC', 'AAGCTT']) -> int:
+    """
+    Scans for a list of restriction enzyme sites in a DNA sequence.
+    """
+    return sum(seq.upper().count(site.upper()) for site in sites)
+def count_negative_cis_elements(seq: str, motifs: List[str] = ['TATAAT', 'TTGACA', 'AGCTAGT']) -> int:
+    """
+    Counts occurrences of negative cis-regulatory elements in a DNA sequence.
+    """
+    return sum(seq.upper().count(m.upper()) for m in motifs)
+def calculate_homopolymer_runs(seq: str, max_len: int = 8) -> int:
+    """
+    Calculates the number of homopolymer runs longer than a given length.
+    """
+    import re
+    min_len = max_len + 1
+    return len(re.findall(r'(A{%d,}|T{%d,}|G{%d,}|C{%d,})' % (min_len, min_len, min_len, min_len), seq.upper()))
+def get_min_max_profile(
+    dna: str,
+    codon_frequencies: Dict[str, Tuple[List[str], List[float]]],
+    window_size: int = 18,
+) -> List[float]:
+    """
+    Calculate the %MinMax profile for a DNA sequence. This is a list of
+    %MinMax values for sliding windows across the sequence.
+    Args:
+        dna (str): The DNA sequence.
+        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
+            frequency distribution per amino acid.
+        window_size (int): Size of the window to calculate %MinMax.
+    Returns:
+        List[float]: List of %MinMax values for the sequence.
+    """
+    return get_min_max_percentage(dna, codon_frequencies, window_size)
+def calculate_dtw_distance(profile1: List[float], profile2: List[float]) -> float:
+    """
+    Calculates the Dynamic Time Warping (DTW) distance between two profiles.
+    Args:
+        profile1 (List[float]): The first profile (e.g., %MinMax of generated sequence).
+        profile2 (List[float]): The second profile (e.g., %MinMax of natural sequence).
+    Returns:
+        float: The DTW distance between the two profiles.
+    """
+    from dtw import dtw
+    import numpy as np
+    # Ensure profiles are numpy arrays and handle potential None and NaN values
+    p1 = np.array([v for v in profile1 if v is not None and not np.isnan(v)]).reshape(
+        -1, 1
+    )
+    p2 = np.array([v for v in profile2 if v is not None and not np.isnan(v)]).reshape(
+        -1, 1
+    )
+    if len(p1) == 0 or len(p2) == 0:
+        return np.inf  # Return infinity if one of the profiles is empty
+    alignment = dtw(p1, p2, keep_internals=True)
+    return alignment.distance  # type: ignore
+def get_ecoli_tai_weights():
+    """
+    Returns a dictionary of tAI weights for E. coli based on tRNA gene copy numbers.
+    These weights are pre-calculated based on the relative adaptiveness of each codon.
+    The table is hard-coded: ``codons`` and ``weights`` below are parallel lists
+    that are paired by position.
+    Returns:
+        dict: Mapping from codon (uppercase DNA triplet) to its weight. The
+        table holds 60 codons; the stop codons (TAA, TAG, TGA) and ATG are not
+        listed.
+    """
+    # NOTE(review): ATG is absent from this list -- confirm this is intended and
+    # that the weights below were generated for exactly this codon order.
+    codons = [
+        "TTT", "TTC", "TTA", "TTG", "TCT", "TCC", "TCA", "TCG", "TAT", "TAC",
+        "TGT", "TGC", "TGG", "CTT", "CTC", "CTA", "CTG", "CCT", "CCC", "CCA",
+        "CCG", "CAT", "CAC", "CAA", "CAG", "CGT", "CGC", "CGA", "CGG", "ATT",
+        "ATC", "ATA", "ACT", "ACC", "ACA", "ACG", "AAT", "AAC", "AAA", "AAG",
+        "AGT", "AGC", "AGA", "AGG", "GTT", "GTC", "GTA", "GTG", "GCT", "GCC",
+        "GCA", "GCG", "GAT", "GAC", "GAA", "GAG", "GGT", "GGC", "GGA", "GGG"
+    ]
+    # weights[i] belongs to codons[i]
+    weights = [
+        0.1966667, 0.3333333, 0.1666667, 0.2200000, 0.1966667, 0.3333333,
+        0.1666667, 0.2200000, 0.2950000, 0.5000000, 0.09833333, 0.1666667,
+        0.2200000, 0.09833333, 0.1666667, 0.1666667, 0.7200000, 0.09833333,
+        0.1666667, 0.1666667, 0.2200000, 0.09833333, 0.1666667, 0.3333333,
+        0.4400000, 0.6666667, 0.4800000, 0.00006666667, 0.1666667, 0.2950000,
+        0.5000000, 0.01833333, 0.1966667, 0.3333333, 0.1666667, 0.3866667,
+        0.3933333, 0.6666667, 1.0000000, 0.3200000, 0.09833333, 0.1666667,
+        0.1666667, 0.2200000, 0.1966667, 0.3333333, 0.8333333, 0.2666667,
+        0.1966667, 0.3333333, 0.5000000, 0.1600000, 0.2950000, 0.5000000,
+        0.6666667, 0.2133333, 0.3933333, 0.6666667, 0.1666667, 0.2200000
+    ]
+    return dict(zip(codons, weights))
+def calculate_tAI(sequence: str, tai_weights: Dict[str, float]) -> float:
+    """
+    Calculates the tRNA Adaptation Index (tAI) for a given DNA sequence.
+    Args:
+        sequence (str): The DNA sequence to analyze.
+        tai_weights (Dict[str, float]): A dictionary of tAI weights for each codon.
+    Returns:
+        float: The tAI value for the sequence.
+    """
+    from scipy.stats.mstats import gmean
+    codons = [sequence[i:i+3] for i in range(0, len(sequence), 3)]
+    # Filter out stop codons and codons not in weights
+    weights = [tai_weights[codon] for codon in codons if codon in tai_weights and tai_weights[codon] > 0]
+    if not weights:
+        return 0.0
+    return gmean(weights)
+def calculate_ENC(sequence: str) -> float:
+    """
+    Calculate the Effective Number of Codons (ENC) for a DNA sequence.
+    Uses the codonbias library implementation based on Wright (1990).
+    Args:
+        sequence (str): The DNA sequence.
+    Returns:
+        float: The ENC value for the sequence.
+    """
+    try:
+        from codonbias.scores import EffectiveNumberOfCodons
+        # Initialize ENC calculator
+        enc_calculator = EffectiveNumberOfCodons(
+            k_mer=1,  # Standard codon analysis
+            bg_correction=True,  # Use background correction
+            robust=True,  # Use robust calculation
+            genetic_code=1  # Standard genetic code
+        )
+        # Calculate ENC for the sequence
+        enc_value = enc_calculator.get_score(sequence)
+        return float(enc_value)
+    except ImportError:
+        raise ImportError("codonbias library is required for ENC calculation. Install with: pip install codonbias")
+    except Exception as e:
+        # Fallback to a simple ENC approximation if library fails
+        print(f"Warning: ENC calculation failed with error: {e}. Using approximation.")
+        return 45.0  # Typical E. coli ENC value as fallback
+def calculate_CPB(sequence: str, reference_sequences: Optional[List[str]] = None) -> float:
+    """
+    Calculate the Codon Pair Bias (CPB) for a DNA sequence.
+    Uses the codonbias library implementation based on Coleman et al. (2008).
+    Args:
+        sequence (str): The DNA sequence.
+        reference_sequences (List[str]): Reference sequences for calculating expected values.
+                                       If None, uses a default E. coli reference.
+    Returns:
+        float: The CPB value for the sequence.
+    """
+    try:
+        from codonbias.scores import CodonPairBias
+        # Use provided reference sequences or default
+        if reference_sequences is None:
+            # Use the input sequence as reference if none provided
+            reference_sequences = [sequence]
+        # Initialize CPB calculator with reference sequences
+        cpb_calculator = CodonPairBias(
+            ref_seq=reference_sequences,
+            k_mer=2,  # Codon pairs
+            genetic_code=1,  # Standard genetic code
+            ignore_stop=True,  # Ignore stop codons
+            pseudocount=1  # Pseudocount for unseen pairs
+        )
+        # Calculate CPB for the sequence
+        cpb_value = cpb_calculator.get_score(sequence)
+        return float(cpb_value)
+    except ImportError:
+        raise ImportError("codonbias library is required for CPB calculation. Install with: pip install codonbias")
+    except Exception as e:
+        # Fallback calculation if library fails
+        print(f"Warning: CPB calculation failed with error: {e}. Using approximation.")
+        return 0.0  # Neutral CPB as fallback
+def calculate_SCUO(sequence: str) -> float:
+    """
+    Calculate the Synonymous Codon Usage Order (SCUO) for a DNA sequence.
+    Uses the GCUA library implementation based on information theory.
+    Args:
+        sequence (str): The DNA sequence.
+    Returns:
+        float: The SCUO value (0-1, where 1 indicates maximum bias).
+    """
+    # Self-contained SCUO implementation (no external GCUA dependency).
+    # Based on Wan et al., 2004 information-theoretic definition.
+    from math import log2  # local import to avoid global cost
+    try:
+        # Build standard genetic code mapping using built-in tables (Biopython optional).
+        # Fall back to hard-coded table if Biopython absent.
+        try:
+            from Bio.Data import CodonTable  # type: ignore
+            codon_to_aa = CodonTable.unambiguous_dna_by_id[1].forward_table
+        except Exception:
+            codon_to_aa = {
+                # Partial table sufficient for SCUO calculation; stop codons omitted.
+                'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+                'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+                'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+                'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+                'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+                'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+                'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+                'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+                'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
+                'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+                'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+                'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+                'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
+                'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+                'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+                'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
+            }
+        # Group codons by amino acid (exclude stops)
+        aa_to_codons = {}
+        for codon, aa in codon_to_aa.items():
+            aa_to_codons.setdefault(aa, []).append(codon)
+        # Count codon occurrences in input sequence
+        seq = sequence.upper().replace('U', 'T')
+        codon_counts = {}
+        for i in range(0, len(seq) - len(seq) % 3, 3):
+            codon = seq[i:i+3]
+            if codon in codon_to_aa:
+                codon_counts[codon] = codon_counts.get(codon, 0) + 1
+        total_codons = sum(codon_counts.values())
+        if total_codons == 0:
+            return 0.0
+        scuo_sum = 0.0
+        for aa, codons in aa_to_codons.items():
+            n_codons = len(codons)
+            if n_codons == 1:
+                continue  # SCUO undefined for Met/Trp
+            counts = [codon_counts.get(c, 0) for c in codons]
+            total_aa = sum(counts)
+            if total_aa == 0:
+                continue
+            probs = [c / total_aa for c in counts if c]
+            H_obs = -sum(p * log2(p) for p in probs)
+            H_max = log2(n_codons)
+            O_i = (H_max - H_obs) / H_max if H_max else 0.0
+            F_i = total_aa / total_codons
+            scuo_sum += F_i * O_i
+        return scuo_sum
+    except Exception as exc:
+        print(f"Warning: internal SCUO computation failed ({exc}). Returning 0.5.")
+        return 0.5

CodonTransformer/CodonJupyter.py ADDED Viewed

	@@ -0,0 +1,311 @@

+"""
+File: CodonJupyter.py
+---------------------
+Includes Jupyter-specific functions for displaying interactive widgets.
+"""
+from typing import Dict, List, Tuple
+import ipywidgets as widgets
+from IPython.display import HTML, display
+from CodonTransformer.CodonUtils import (
+    COMMON_ORGANISMS,
+    ID2ORGANISM,
+    ORGANISM2ID,
+    DNASequencePrediction,
+)
+class UserContainer:
+    """
+    A container class to store user inputs for organism and protein sequence.
+    Attributes:
+        organism (int): The selected organism id.
+        protein (str): The input protein sequence.
+    """
+    def __init__(self) -> None:
+        self.organism: int = -1
+        self.protein: str = ""
+def create_styled_options(
+    organisms: list, organism2id: Dict[str, int], is_fine_tuned: bool = False
+) -> list:
+    """
+    Create styled options for the dropdown widget.
+    Args:
+        organisms (list): List of organism names.
+        organism2id (Dict[str, int]): Dictionary mapping organism names to their IDs.
+        is_fine_tuned (bool): Whether these are fine-tuned organisms.
+    Returns:
+        list: Styled options for the dropdown widget.
+    """
+    styled_options = []
+    for organism in organisms:
+        organism_id = organism2id[organism]
+        if is_fine_tuned:
+            if organism_id < 10:
+                styled_options.append(f"\u200b{organism_id:>6}.  {organism}")
+            elif organism_id < 100:
+                styled_options.append(f"\u200b{organism_id:>5}.  {organism}")
+            else:
+                styled_options.append(f"\u200b{organism_id:>4}.  {organism}")
+        else:
+            if organism_id < 10:
+                styled_options.append(f"{organism_id:>6}.  {organism}")
+            elif organism_id < 100:
+                styled_options.append(f"{organism_id:>5}.  {organism}")
+            else:
+                styled_options.append(f"{organism_id:>4}.  {organism}")
+    return styled_options
+def create_dropdown_options(organism2id: Dict[str, int]) -> list:
+    """
+    Create the full list of dropdown options, including section headers.
+    Args:
+        organism2id (Dict[str, int]): Dictionary mapping organism names to their IDs.
+    Returns:
+        list: Full list of dropdown options.
+    """
+    fine_tuned_organisms = sorted(
+        [org for org in organism2id.keys() if org in COMMON_ORGANISMS]
+    )
+    all_organisms = sorted(organism2id.keys())
+    fine_tuned_options = create_styled_options(
+        fine_tuned_organisms, organism2id, is_fine_tuned=True
+    )
+    all_organisms_options = create_styled_options(
+        all_organisms, organism2id, is_fine_tuned=False
+    )
+    return (
+        [""]
+        + ["Selected Organisms"]
+        + fine_tuned_options
+        + [""]
+        + ["All Organisms"]
+        + all_organisms_options
+    )
+def create_organism_dropdown(container: UserContainer) -> widgets.Dropdown:
+    """
+    Create and configure the organism dropdown widget.
+    Args:
+        container (UserContainer): Container to store the selected organism.
+    Returns:
+        widgets.Dropdown: Configured dropdown widget.
+    """
+    dropdown = widgets.Dropdown(
+        options=create_dropdown_options(ORGANISM2ID),
+        description="",
+        layout=widgets.Layout(width="40%", margin="0 0 10px 0"),
+        style={"description_width": "initial"},
+    )
+    def show_organism(change: Dict[str, str]) -> None:
+        """
+        Update the container with the selected organism and print to terminal.
+        Args:
+            change (Dict[str, str]): Information about the change in dropdown value.
+        """
+        dropdown_choice = change["new"]
+        if dropdown_choice and dropdown_choice not in [
+            "Selected Organisms",
+            "All Organisms",
+        ]:
+            organism = "".join(filter(str.isdigit, dropdown_choice))
+            organism_id = ID2ORGANISM[int(organism)]
+            container.organism = organism_id
+        else:
+            container.organism = None
+    dropdown.observe(show_organism, names="value")
+    return dropdown
+def get_dropdown_style() -> str:
+    """
+    Return the custom CSS style for the dropdown widget.
+    The stylesheet covers the dropdown itself, widget labels, the
+    ``custom-container`` class, options whose value starts with a zero-width
+    space, and the "Selected Organisms" / "All Organisms" header options.
+    Returns:
+        str: A ``<style>`` element as an HTML string.
+    """
+    # NOTE(review): "padding: 510px" in the zero-width-space option rule looks
+    # like a typo (possibly "5px 10px") -- confirm before changing, the string
+    # is emitted verbatim.
+    return """
+    <style>
+        .widget-dropdown > select {
+            font-size: 16px;
+            font-weight: normal;
+            background-color: #f0f0f0;
+            border-radius: 5px;
+            padding: 5px;
+        }
+        .widget-label {
+            font-size: 18px;
+            font-weight: bold;
+        }
+        .custom-container {
+            display: flex;
+            flex-direction: column;
+            align-items: flex-start;
+        }
+        .widget-dropdown option[value^="\u200b"] {
+            font-family: sans-serif;
+            font-weight: bold;
+            font-size: 18px;
+            padding: 510px;
+        }
+        .widget-dropdown option[value*="Selected Organisms"],
+        .widget-dropdown option[value*="All Organisms"] {
+            text-align: center;
+            font-family: Arial, sans-serif;
+            font-weight: bold;
+            font-size: 20px;
+            color: #6900A1;
+            background-color: #00D8A1;
+        }
+    </style>
+    """
+def display_organism_dropdown(container: UserContainer) -> None:
+    """
+    Display the organism dropdown widget and apply custom styles.
+    Args:
+        container (UserContainer): Container to store the selected organism.
+    """
+    dropdown = create_organism_dropdown(container)
+    header = widgets.HTML(
+        '<b style="font-size:20px;">Select Organism:</b>'
+        '<div style="height:10px;"></div>'
+    )
+    container_widget = widgets.VBox(
+        [header, dropdown],
+        layout=widgets.Layout(padding="12px 0 12px 25px"),
+    )
+    display(container_widget)
+    display(HTML(get_dropdown_style()))
+def display_protein_input(container: UserContainer) -> None:
+    """
+    Display a widget for entering a protein sequence and save it to the container.
+    Args:
+        container (UserContainer): A container to store the entered protein sequence.
+    """
+    protein_input = widgets.Textarea(
+        value="",
+        placeholder="Enter here...",
+        description="",
+        layout=widgets.Layout(width="100%", height="100px", margin="0 0 10px 0"),
+        style={"description_width": "initial"},
+    )
+    # Custom CSS for the input widget
+    input_style = """
+        <style>
+            .widget-textarea > textarea {
+                font-size: 12px;
+                font-family: Arial, sans-serif;
+                font-weight: normal;
+                background-color: #f0f0f0;
+                border-radius: 5px;
+                padding: 10px;
+            }
+            .widget-label {
+                font-size: 18px;
+                font-weight: bold;
+            }
+            .custom-container {
+                display: flex;
+                flex-direction: column;
+                align-items: flex-start;
+            }
+        </style>
+    """
+    # Function to save the input protein sequence to the container
+    def save_protein(change: Dict[str, str]) -> None:
+        """
+        Save the input protein sequence to the container.
+        Args:
+            change (Dict[str, str]): A dictionary containing information about
+            the change in textarea value.
+        """
+        container.protein = (
+            change["new"]
+            .upper()
+            .strip()
+            .replace("\n", "")
+            .replace(" ", "")
+            .replace("\t", "")
+        )
+    # Attach the function to the input widget
+    protein_input.observe(save_protein, names="value")
+    # Display the input widget
+    header = widgets.HTML(
+        '<b style="font-size:20px;">Enter Protein Sequence:</b>'
+        '<div style="height:18px;"></div>'
+    )
+    container_widget = widgets.VBox(
+        [header, protein_input], layout=widgets.Layout(padding="12px 12px 0 25px")
+    )
+    display(container_widget)
+    display(widgets.HTML(input_style))
+def format_model_output(output: DNASequencePrediction) -> str:
+    """
+    Format DNA sequence prediction output in an appealing and easy-to-read manner.
+    This function takes the prediction output and formats it into
+    a structured string with clear section headers and separators.
+    Args:
+        output (DNASequencePrediction): Object containing the prediction output.
+            Expected attributes:
+            - organism (str): The organism name.
+            - protein (str): The input protein sequence.
+            - processed_input (str): The processed input sequence.
+            - predicted_dna (str): The predicted DNA sequence.
+    Returns:
+        str: A formatted string containing the organized output.
+    """
+    def format_section(title: str, content: str) -> str:
+        """Helper function to format individual sections."""
+        separator = "-" * 29
+        title_line = f"| {title.center(25)} |"
+        return f"{separator}\n{title_line}\n{separator}\n{content}\n\n"
+    sections: List[Tuple[str, str]] = [
+        ("Organism", output.organism),
+        ("Input Protein", output.protein),
+        ("Processed Input", output.processed_input),
+        ("Predicted DNA", output.predicted_dna),
+    ]
+    formatted_output = ""
+    for title, content in sections:
+        formatted_output += format_section(title, content)
+    # Remove the last newline to avoid extra space at the end
+    return formatted_output.rstrip()

CodonTransformer/CodonPostProcessing.py ADDED Viewed

	@@ -0,0 +1,83 @@

+"""
+File: CodonPostProcessing.py
+---------------------------
+Post-processing utilities for codon optimization using DNAChisel.
+This module provides sequence polishing capabilities to fix restriction sites,
+homopolymers, and other constraints while preserving CAI and GC content.
+"""
+import warnings
+import numpy as np
+try:
+    from dnachisel import (
+        DnaOptimizationProblem,
+        AvoidPattern,
+        EnforceGCContent,
+        EnforceTranslation,
+        CodonOptimize,
+    )
+    DNACHISEL_AVAILABLE = True
+except ImportError:
+    DNACHISEL_AVAILABLE = False
+    # This warning will be shown when the module is first imported.
+    warnings.warn(
+        "DNAChisel is not installed. Post-processing features will be disabled."
+    )
+def polish_sequence_with_dnachisel(
+    dna_sequence: str,
+    protein_sequence: str,
+    gc_bounds: tuple = (45.0, 55.0),
+    cai_species: str = "e_coli",
+    avoid_homopolymers_length: int = 6,
+    enzymes_to_avoid: list = None
+):
+    """
+    Polishes a DNA sequence using DNAChisel to meet lab synthesis constraints.
+    """
+    if not DNACHISEL_AVAILABLE:
+        warnings.warn("DNAChisel not available, skipping post-processing.")
+        return dna_sequence
+    if enzymes_to_avoid is None:
+        # Common cloning enzymes
+        enzymes_to_avoid = ["EcoRI", "XbaI", "SpeI", "PstI", "NotI"]
+    try:
+        # Start with the basic, essential constraints
+        constraints = [
+            EnforceTranslation(translation=protein_sequence),
+            EnforceGCContent(mini=gc_bounds[0] / 100.0, maxi=gc_bounds[1] / 100.0),
+        ]
+        # Add enzyme avoidance constraints safely
+        for enzyme in enzymes_to_avoid:
+            try:
+                # This is the modern way to avoid enzyme sites
+                constraints.append(AvoidPattern.from_enzyme_name(enzyme))
+            except Exception:
+                warnings.warn(f"Could not find enzyme '{enzyme}' in DNAChisel library.")
+        # Add homopolymer avoidance constraints
+        for base in "ATGC":
+            constraints.append(AvoidPattern(base * avoid_homopolymers_length))
+        # Define the optimization problem
+        problem = DnaOptimizationProblem(
+            sequence=dna_sequence,
+            constraints=constraints,
+            objectives=[CodonOptimize(species=cai_species, method="match_codon_usage")]
+        )
+        # Solve the problem
+        problem.resolve_constraints()
+        problem.optimize()
+        # Return the polished sequence
+        return problem.sequence
+    except Exception as e:
+        warnings.warn(f"DNAChisel post-processing failed with an error: {e}")
+        # Return the original sequence if polishing fails
+        return dna_sequence

CodonTransformer/CodonPrediction.py ADDED Viewed

	@@ -0,0 +1,1372 @@

+"""
+File: CodonPrediction.py
+---------------------------
+Includes functions to tokenize input, load models, infer predicted dna sequences and
+helper functions related to processing data for passing to the model.
+"""
+import warnings
+from typing import Any, Dict, List, Optional, Tuple, Union
+import heapq
+from dataclasses import dataclass
+import numpy as np
+import onnxruntime as rt
+import torch
+import transformers
+from transformers import (
+    AutoTokenizer,
+    BatchEncoding,
+    BigBirdConfig,
+    BigBirdForMaskedLM,
+    PreTrainedTokenizerFast,
+)
+from CodonTransformer.CodonData import get_merged_seq
+from CodonTransformer.CodonUtils import (
+    AMINO_ACID_TO_INDEX,
+    INDEX2TOKEN,
+    NUM_ORGANISMS,
+    ORGANISM2ID,
+    TOKEN2INDEX,
+    DNASequencePrediction,
+    GC_COUNTS_PER_TOKEN,
+    CODON_GC_CONTENT,
+    AA_MIN_GC,
+    AA_MAX_GC,
+)
+def predict_dna_sequence(
+    protein: str,
+    organism: Union[int, str],
+    device: torch.device,
+    tokenizer: Union[str, PreTrainedTokenizerFast] = None,
+    model: Union[str, torch.nn.Module] = None,
+    attention_type: str = "original_full",
+    deterministic: bool = True,
+    temperature: float = 0.2,
+    top_p: float = 0.95,
+    num_sequences: int = 1,
+    match_protein: bool = False,
+    use_constrained_search: bool = False,
+    gc_bounds: Tuple[float, float] = (0.30, 0.70),
+    beam_size: int = 5,
+    length_penalty: float = 1.0,
+    diversity_penalty: float = 0.0,
+) -> Union[DNASequencePrediction, List[DNASequencePrediction]]:
+    """
+    Predict the DNA sequence(s) for a given protein using the CodonTransformer model.
+    This function takes a protein sequence and an organism (as ID or name) as input
+    and returns the predicted DNA sequence(s) using the CodonTransformer model. It can use
+    either provided tokenizer and model objects or load them from specified paths.
+    Args:
+        protein (str): The input protein sequence for which to predict the DNA sequence.
+        organism (Union[int, str]): Either the ID of the organism or its name (e.g.,
+            "Escherichia coli general"). If a string is provided, it will be converted
+            to the corresponding ID using ORGANISM2ID.
+        device (torch.device): The device (CPU or GPU) to run the model on.
+        tokenizer (Union[str, PreTrainedTokenizerFast, None], optional): Either a file
+            path to load the tokenizer from, a pre-loaded tokenizer object, or None. If
+            None, it will be loaded from HuggingFace. Defaults to None.
+        model (Union[str, torch.nn.Module, None], optional): Either a file path to load
+            the model from, a pre-loaded model object, or None. If None, it will be
+            loaded from HuggingFace. Defaults to None.
+        attention_type (str, optional): The type of attention mechanism to use in the
+            model. Can be either 'block_sparse' or 'original_full'. Defaults to
+            "original_full".
+        deterministic (bool, optional): Whether to use deterministic decoding (most
+            likely tokens). If False, samples tokens according to their probabilities
+            adjusted by the temperature. Defaults to True.
+        temperature (float, optional): A value controlling the randomness of predictions
+            during non-deterministic decoding. Lower values (e.g., 0.2) make the model
+            more conservative, while higher values (e.g., 0.8) increase randomness.
+            Using high temperatures may result in prediction of DNA sequences that
+            do not translate to the input protein.
+            Recommended values are:
+                - Low randomness: 0.2
+                - Medium randomness: 0.5
+                - High randomness: 0.8
+            The temperature must be a positive float. Defaults to 0.2.
+        top_p (float, optional): The cumulative probability threshold for nucleus sampling.
+            Tokens with cumulative probability up to top_p are considered for sampling.
+            This parameter helps balance diversity and coherence in the predicted DNA sequences.
+            The value must be a float between 0 and 1. Defaults to 0.95.
+        num_sequences (int, optional): The number of DNA sequences to generate. Only applicable
+            when deterministic is False. Defaults to 1.
+        match_protein (bool, optional): Ensures the predicted DNA sequence is translated
+            to the input protein sequence by sampling from only the respective codons of
+            given amino acids. Defaults to False.
+        use_constrained_search (bool, optional): Whether to use constrained beam search
+            with GC content bounds. Defaults to False.
+        gc_bounds (Tuple[float, float], optional): GC content bounds (min, max) for
+            constrained search. Defaults to (0.30, 0.70).
+        beam_size (int, optional): Beam size for constrained search. Defaults to 5.
+        length_penalty (float, optional): Length penalty for beam search scoring.
+            Defaults to 1.0.
+        diversity_penalty (float, optional): Diversity penalty to reduce repetitive
+            sequences. Defaults to 0.0.
+    Returns:
+        Union[DNASequencePrediction, List[DNASequencePrediction]]: An object or list of objects
+        containing the prediction results:
+            - organism (str): Name of the organism used for prediction.
+            - protein (str): Input protein sequence for which DNA sequence is predicted.
+            - processed_input (str): Processed input sequence (merged protein and DNA).
+            - predicted_dna (str): Predicted DNA sequence.
+    Raises:
+        ValueError: If the protein sequence is empty, if the organism is invalid,
+            if the temperature is not a positive float, if top_p is not between 0 and 1,
+            or if num_sequences is less than 1 or used with deterministic mode.
+    Note:
+        This function uses ORGANISM2ID, INDEX2TOKEN, and AMINO_ACID_TO_INDEX dictionaries
+        imported from CodonTransformer.CodonUtils. ORGANISM2ID maps organism names to their
+        corresponding IDs. INDEX2TOKEN maps model output indices (token IDs) to
+        respective codons. AMINO_ACID_TO_INDEX maps each amino acid and stop symbol to indices
+        of codon tokens that translate to it.
+    Example:
+        >>> import torch
+        >>> from transformers import AutoTokenizer, BigBirdForMaskedLM
+        >>> from CodonTransformer.CodonPrediction import predict_dna_sequence
+        >>> from CodonTransformer.CodonJupyter import format_model_output
+        >>>
+        >>> # Set up device
+        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        >>>
+        >>> # Load tokenizer and model
+        >>> tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
+        >>> model = BigBirdForMaskedLM.from_pretrained("adibvafa/CodonTransformer")
+        >>> model = model.to(device)
+        >>>
+        >>> # Define protein sequence and organism
+        >>> protein = "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLA"
+        >>> organism = "Escherichia coli general"
+        >>>
+        >>> # Predict DNA sequence with deterministic decoding (single sequence)
+        >>> output = predict_dna_sequence(
+        ...     protein=protein,
+        ...     organism=organism,
+        ...     device=device,
+        ...     tokenizer=tokenizer,
+        ...     model=model,
+        ...     attention_type="original_full",
+        ...     deterministic=True
+        ... )
+        >>>
+        >>> # Predict DNA sequence with constrained beam search
+        >>> output_constrained = predict_dna_sequence(
+        ...     protein=protein,
+        ...     organism=organism,
+        ...     device=device,
+        ...     tokenizer=tokenizer,
+        ...     model=model,
+        ...     use_constrained_search=True,
+        ...     gc_bounds=(0.40, 0.60),
+        ...     beam_size=10,
+        ...     length_penalty=1.2,
+        ...     diversity_penalty=0.1
+        ... )
+        >>>
+        >>> # Predict multiple DNA sequences with low randomness and top_p sampling
+        >>> output_random = predict_dna_sequence(
+        ...     protein=protein,
+        ...     organism=organism,
+        ...     device=device,
+        ...     tokenizer=tokenizer,
+        ...     model=model,
+        ...     attention_type="original_full",
+        ...     deterministic=False,
+        ...     temperature=0.2,
+        ...     top_p=0.95,
+        ...     num_sequences=3
+        ... )
+        >>>
+        >>> print(format_model_output(output))
+        >>> for i, seq in enumerate(output_random, 1):
+        ...     print(f"Sequence {i}:")
+        ...     print(format_model_output(seq))
+        ...     print()
+    """
+    if not protein:
+        raise ValueError("Protein sequence cannot be empty.")
+    if not isinstance(temperature, (float, int)) or temperature <= 0:
+        raise ValueError("Temperature must be a positive float.")
+    if not isinstance(top_p, (float, int)) or not 0 < top_p <= 1.0:
+        raise ValueError("top_p must be a float between 0 and 1.")
+    if not isinstance(num_sequences, int) or num_sequences < 1:
+        raise ValueError("num_sequences must be a positive integer.")
+    if use_constrained_search:
+        if not isinstance(gc_bounds, tuple) or len(gc_bounds) != 2:
+            raise ValueError("gc_bounds must be a tuple of (min_gc, max_gc).")
+        if not (0.0 <= gc_bounds[0] <= gc_bounds[1] <= 1.0):
+            raise ValueError("gc_bounds must be between 0.0 and 1.0 with min <= max.")
+        if not isinstance(beam_size, int) or beam_size < 1:
+            raise ValueError("beam_size must be a positive integer.")
+    if deterministic and num_sequences > 1 and not use_constrained_search:
+        raise ValueError(
+            "Multiple sequences can only be generated in non-deterministic mode "
+            "(unless using constrained search)."
+        )
+    if use_constrained_search and num_sequences > 1:
+        raise ValueError(
+            "Constrained beam search currently supports only single sequence generation."
+        )
+    # Load tokenizer
+    if not isinstance(tokenizer, PreTrainedTokenizerFast):
+        tokenizer = load_tokenizer(tokenizer)
+    # Load model
+    if not isinstance(model, torch.nn.Module):
+        model = load_model(model_path=model, device=device, attention_type=attention_type)
+    else:
+        model.eval()
+        model.bert.set_attention_type(attention_type)
+        model.to(device)
+    # Validate organism and convert to organism_id and organism_name
+    organism_id, organism_name = validate_and_convert_organism(organism)
+    # Inference loop
+    with torch.no_grad():
+        # Tokenize the input sequence
+        merged_seq = get_merged_seq(protein=protein, dna="")
+        input_dict = {
+            "idx": 0,  # sample index
+            "codons": merged_seq,
+            "organism": organism_id,
+        }
+        tokenized_input = tokenize([input_dict], tokenizer=tokenizer).to(device)
+        # Get the model predictions
+        output_dict = model(**tokenized_input, return_dict=True)
+        logits = output_dict.logits.detach().cpu()
+        logits = logits[:, 1:-1, :]  # Remove [CLS] and [SEP] tokens
+        # Mask the logits of codons that do not correspond to the input protein sequence
+        if match_protein:
+            possible_tokens_per_position = [
+                AMINO_ACID_TO_INDEX[token[0]] for token in merged_seq.split(" ")
+            ]
+            seq_len = logits.shape[1]
+            if len(possible_tokens_per_position) > seq_len:
+                possible_tokens_per_position = possible_tokens_per_position[:seq_len]
+            mask = torch.full_like(logits, float("-inf"))
+            for pos, possible_tokens in enumerate(possible_tokens_per_position):
+                mask[:, pos, possible_tokens] = 0
+            logits = mask + logits
+        predictions = []
+        for _ in range(num_sequences):
+            # Decode the predicted DNA sequence from the model output
+            if use_constrained_search:
+                # Use constrained beam search with GC bounds
+                predicted_indices = constrained_beam_search_simple(
+                    logits=logits.squeeze(0),
+                    protein_sequence=protein,
+                    gc_bounds=gc_bounds,
+                    max_attempts=50,
+                )
+            elif deterministic:
+                predicted_indices = logits.argmax(dim=-1).squeeze().tolist()
+            else:
+                predicted_indices = sample_non_deterministic(
+                    logits=logits, temperature=temperature, top_p=top_p
+                )
+            predicted_dna = list(map(INDEX2TOKEN.__getitem__, predicted_indices))
+            predicted_dna = (
+                "".join([token[-3:] for token in predicted_dna]).strip().upper()
+            )
+            predictions.append(
+                DNASequencePrediction(
+                    organism=organism_name,
+                    protein=protein,
+                    processed_input=merged_seq,
+                    predicted_dna=predicted_dna,
+                )
+            )
+    return predictions[0] if num_sequences == 1 else predictions
+@dataclass
+class BeamCandidate:
+    """Represents a candidate sequence in the beam search."""
+    tokens: List[int]
+    score: float
+    gc_count: int
+    length: int
+    def __post_init__(self):
+        self.gc_ratio = self.gc_count / max(self.length, 1)
+    def __lt__(self, other):
+        return self.score < other.score
+def _calculate_true_future_gc_range(
+    current_pos: int,
+    protein_sequence: str,
+    current_gc_count: int,
+    current_length: int
+) -> Tuple[float, float]:
+    """
+    Calculate the true minimum and maximum possible final GC content
+    given current state and remaining amino acids (perfect foresight).
+    Args:
+        current_pos: Current position in protein sequence
+        protein_sequence: Full protein sequence
+        current_gc_count: Current GC count in partial sequence
+        current_length: Current length in nucleotides
+    Returns:
+        Tuple of (min_possible_final_gc_ratio, max_possible_final_gc_ratio)
+    """
+    if current_pos >= len(protein_sequence):
+        # Already at end, return current ratio
+        final_ratio = current_gc_count / max(current_length, 1)
+        return final_ratio, final_ratio
+    # Calculate remaining amino acids
+    remaining_aas = protein_sequence[current_pos:]
+    # Calculate min/max possible GC from remaining amino acids
+    min_future_gc = 0
+    max_future_gc = 0
+    for aa in remaining_aas:
+        if aa.upper() in AA_MIN_GC and aa.upper() in AA_MAX_GC:
+            min_future_gc += AA_MIN_GC[aa.upper()]
+            max_future_gc += AA_MAX_GC[aa.upper()]
+        else:
+            # If amino acid not found, assume moderate GC (1-2 range)
+            min_future_gc += 1
+            max_future_gc += 2
+    # Calculate final sequence length
+    final_length = current_length + len(remaining_aas) * 3
+    # Calculate min/max possible final GC ratios
+    min_final_gc_ratio = (current_gc_count + min_future_gc) / final_length
+    max_final_gc_ratio = (current_gc_count + max_future_gc) / final_length
+    return min_final_gc_ratio, max_final_gc_ratio
+def constrained_beam_search_simple(
+    logits: torch.Tensor,
+    protein_sequence: str,
+    gc_bounds: Tuple[float, float] = (0.30, 0.70),
+    max_attempts: int = 100,
+) -> List[int]:
+    """
+    Simple constrained search - try multiple greedy samples and pick best one within GC bounds.
+    """
+    min_gc, max_gc = gc_bounds
+    seq_len = min(logits.shape[0], len(protein_sequence))
+    # Convert to probabilities
+    probs = torch.softmax(logits, dim=-1)
+    valid_sequences = []
+    for attempt in range(max_attempts):
+        tokens = []
+        total_gc = 0
+        # Generate sequence position by position
+        for pos in range(seq_len):
+            aa = protein_sequence[pos]
+            possible_tokens = AMINO_ACID_TO_INDEX.get(aa, [])
+            if not possible_tokens:
+                continue
+            # Filter tokens by current constraints and get probabilities
+            candidates = []
+            for token_idx in possible_tokens:
+                if token_idx < len(probs[pos]) and token_idx < len(GC_COUNTS_PER_TOKEN):
+                    prob = probs[pos][token_idx].item()
+                    gc_contribution = int(GC_COUNTS_PER_TOKEN[token_idx].item())
+                    # Check if this token could still lead to a valid final sequence (perfect foresight)
+                    new_gc_total = total_gc + gc_contribution
+                    new_length = (pos + 1) * 3
+                    # Calculate what's possible for the final sequence given this choice
+                    min_final_gc, max_final_gc = _calculate_true_future_gc_range(
+                        pos + 1, protein_sequence, new_gc_total, new_length
+                    )
+                    # Only prune if there's NO OVERLAP between possible final range and target bounds
+                    if max_final_gc >= min_gc and min_final_gc <= max_gc:
+                        # Calculate gentle GC penalty to steer toward target center
+                        target_gc = (min_gc + max_gc) / 2  # Target center (e.g., 0.50 for bounds 0.45-0.55)
+                        current_projected_gc = (min_final_gc + max_final_gc) / 2  # Projected center
+                        # Only apply penalty if we're significantly off-target AND late in sequence
+                        sequence_progress = (pos + 1) / seq_len
+                        if sequence_progress > 0.3:  # Only apply penalty after 30% of sequence
+                            gc_deviation = abs(current_projected_gc - target_gc)
+                            if gc_deviation > 0.05:  # Only if >5% deviation from target
+                                # Gentle penalty: reduce probability by small factor
+                                penalty_factor = max(0.7, 1.0 - 0.3 * gc_deviation)  # 0.7-1.0 range
+                                prob = prob * penalty_factor
+                        candidates.append((token_idx, prob, gc_contribution))
+            if not candidates:
+                # If no valid candidates, break and try next attempt
+                break
+            # Sample from valid candidates (with temperature)
+            if attempt == 0:
+                # First attempt: greedy (highest probability)
+                best_token = max(candidates, key=lambda x: x[1])
+            else:
+                # Other attempts: sample with some randomness
+                probs_list = [c[1] for c in candidates]
+                if sum(probs_list) > 0:
+                    # Normalize probabilities
+                    probs_array = np.array(probs_list)
+                    probs_array = probs_array / probs_array.sum()
+                    # Sample
+                    chosen_idx = np.random.choice(len(candidates), p=probs_array)
+                    best_token = candidates[chosen_idx]
+                else:
+                    best_token = candidates[0]
+            tokens.append(best_token[0])
+            total_gc += best_token[2]
+        # Check if we got a complete sequence
+        if len(tokens) == seq_len:
+            final_gc_ratio = total_gc / (seq_len * 3)
+            if min_gc <= final_gc_ratio <= max_gc:
+                # Calculate sequence score (sum of log probabilities)
+                score = sum(np.log(probs[i][tokens[i]].item() + 1e-8) for i in range(len(tokens)))
+                valid_sequences.append((tokens, score, final_gc_ratio))
+    if not valid_sequences:
+        raise ValueError(f"Could not generate valid sequence within GC bounds {gc_bounds} after {max_attempts} attempts")
+    # Return the sequence with highest score
+    best_sequence = max(valid_sequences, key=lambda x: x[1])
+    return best_sequence[0]
+def constrained_beam_search(
+    logits: torch.Tensor,
+    protein_sequence: str,
+    gc_bounds: Tuple[float, float] = (0.30, 0.70),
+    beam_size: int = 5,
+    length_penalty: float = 1.0,
+    diversity_penalty: float = 0.0,
+    temperature: float = 1.0,
+    max_candidates: int = 100,
+    position_aware_gc_penalty: bool = True,
+    gc_penalty_strength: float = 2.0,
+) -> List[int]:
+    """
+    Constrained beam search with exact per-residue GC bounds tracking.
+    Priority #1: Exact per-residue GC bounds tracking
+    - Tracks cumulative GC content after each codon selection
+    - Prunes candidates that would violate GC bounds
+    - Maintains beam of valid candidates
+    Priority #2: Position-aware GC penalty mechanism
+    - Applies variable penalty weights based on sequence position
+    - Preserves flexibility early, applies pressure when necessary
+    - Uses progressive penalty scaling based on deviation severity
+    Args:
+        logits (torch.Tensor): Model logits of shape [seq_len, vocab_size]
+        protein_sequence (str): Input protein sequence
+        gc_bounds (Tuple[float, float]): (min_gc, max_gc) bounds
+        beam_size (int): Number of candidates to maintain
+        length_penalty (float): Length penalty for scoring
+        diversity_penalty (float): Diversity penalty for scoring
+        temperature (float): Temperature for probability scaling
+        max_candidates (int): Maximum candidates to consider per position
+        position_aware_gc_penalty (bool): Whether to use position-aware GC penalties
+        gc_penalty_strength (float): Strength of GC penalty adjustment
+    Returns:
+        List[int]: Best sequence token indices
+    """
+    min_gc, max_gc = gc_bounds
+    seq_len = logits.shape[0]
+    protein_len = len(protein_sequence)
+    # Ensure we don't go beyond the protein sequence
+    if seq_len > protein_len:
+        print(f"Warning: logits length ({seq_len}) > protein length ({protein_len}). Truncating to protein length.")
+        seq_len = protein_len
+        logits = logits[:protein_len]
+    # Initialize beam with empty candidate
+    beam = [BeamCandidate(tokens=[], score=0.0, gc_count=0, length=0)]
+    # Apply temperature scaling
+    if temperature != 1.0:
+        logits = logits / temperature
+    # Convert to probabilities
+    probs = torch.softmax(logits, dim=-1)
+    for pos in range(min(seq_len, len(protein_sequence))):
+        # Get possible tokens for current amino acid
+        aa = protein_sequence[pos]
+        possible_tokens = AMINO_ACID_TO_INDEX.get(aa, [])
+        if not possible_tokens:
+            # Fallback to all tokens if amino acid not found
+            possible_tokens = list(range(probs.shape[1]))
+        # Get top candidates for this position
+        pos_probs = probs[pos]
+        top_candidates = []
+        for token_idx in possible_tokens:
+            if token_idx < len(pos_probs) and token_idx < len(GC_COUNTS_PER_TOKEN):
+                prob = pos_probs[token_idx].item()
+                gc_contribution = int(GC_COUNTS_PER_TOKEN[token_idx].item())
+                # Only include tokens with valid probabilities
+                if prob > 1e-10:  # Avoid extremely low probabilities
+                    top_candidates.append((token_idx, prob, gc_contribution))
+        # Sort by probability and take top max_candidates
+        top_candidates.sort(key=lambda x: x[1], reverse=True)
+        top_candidates = top_candidates[:max_candidates]
+        # If no valid candidates found, fallback to all possible tokens for this amino acid
+        if not top_candidates:
+            for token_idx in possible_tokens[:min(len(possible_tokens), max_candidates)]:
+                if token_idx < len(pos_probs) and token_idx < len(GC_COUNTS_PER_TOKEN):
+                    prob = max(pos_probs[token_idx].item(), 1e-10)  # Ensure minimum probability
+                    gc_contribution = int(GC_COUNTS_PER_TOKEN[token_idx].item())
+                    top_candidates.append((token_idx, prob, gc_contribution))
+        # Generate new beam candidates
+        new_beam = []
+        for candidate in beam:
+            for token_idx, prob, gc_contribution in top_candidates:
+                # Calculate new GC stats
+                new_gc_count = candidate.gc_count + gc_contribution
+                new_length = candidate.length + 3  # Each codon is 3 nucleotides
+                new_gc_ratio = new_gc_count / new_length
+                # Priority #2: Position-aware GC penalty mechanism
+                gc_penalty = 0.0
+                if position_aware_gc_penalty:
+                    # Calculate position weight (more penalty towards end of sequence)
+                    position_weight = (pos + 1) / seq_len
+                    # Calculate GC deviation severity
+                    target_gc = (min_gc + max_gc) / 2
+                    gc_deviation = abs(new_gc_ratio - target_gc)
+                    deviation_severity = gc_deviation / ((max_gc - min_gc) / 2)
+                    # Apply progressive penalty
+                    if deviation_severity > 0.5:  # Soft penalty zone
+                        gc_penalty = gc_penalty_strength * position_weight * (deviation_severity - 0.5) ** 2
+                    # Hard constraint: still prune sequences that exceed bounds
+                    if new_gc_ratio < min_gc or new_gc_ratio > max_gc:
+                        continue  # Prune invalid candidates
+                else:
+                    # Priority #1: Hard GC bounds only
+                    if new_gc_ratio < min_gc or new_gc_ratio > max_gc:
+                        continue  # Prune invalid candidates
+                # Calculate score with GC penalty
+                new_score = candidate.score + np.log(prob + 1e-8) - gc_penalty
+                # Apply length penalty
+                if length_penalty != 1.0:
+                    length_norm = ((pos + 1) ** length_penalty)
+                    normalized_score = new_score / length_norm
+                else:
+                    normalized_score = new_score
+                # Create new candidate
+                new_candidate = BeamCandidate(
+                    tokens=candidate.tokens + [token_idx],
+                    score=normalized_score,
+                    gc_count=new_gc_count,
+                    length=new_length
+                )
+                new_beam.append(new_candidate)
+        # Apply diversity penalty if specified
+        if diversity_penalty > 0.0:
+            new_beam = _apply_diversity_penalty(new_beam, diversity_penalty)
+        # Keep top beam_size candidates
+        beam = sorted(new_beam, key=lambda x: x.score, reverse=True)[:beam_size]
+        # Priority #3: Adaptive beam rescue for difficult sequences
+        if not beam:
+            # Attempt beam rescue by relaxing constraints progressively
+            rescue_attempts = 0
+            max_rescue_attempts = 3
+            while not beam and rescue_attempts < max_rescue_attempts:
+                rescue_attempts += 1
+                # Progressive relaxation strategy
+                if rescue_attempts == 1:
+                    # First attempt: increase beam size and relax GC bounds slightly
+                    temp_beam_size = min(beam_size * 2, max_candidates)
+                    temp_gc_bounds = (min_gc * 0.95, max_gc * 1.05)
+                elif rescue_attempts == 2:
+                    # Second attempt: further relax GC bounds and increase candidates
+                    temp_beam_size = min(beam_size * 3, max_candidates)
+                    temp_gc_bounds = (min_gc * 0.9, max_gc * 1.1)
+                else:
+                    # Final attempt: maximum relaxation
+                    temp_beam_size = max_candidates
+                    temp_gc_bounds = (min_gc * 0.85, max_gc * 1.15)
+                # Retry beam generation with relaxed parameters
+                rescue_beam = []
+                # Use previous beam state or start fresh if this is the first position with no beam
+                previous_beam = beam if beam else [BeamCandidate(tokens=[], score=0.0, gc_count=0, length=0)]
+                for candidate in previous_beam:
+                    for token_idx, prob, gc_contribution in top_candidates:
+                        new_gc_count = candidate.gc_count + gc_contribution
+                        new_length = candidate.length + 3
+                        new_gc_ratio = new_gc_count / new_length
+                        # Check relaxed bounds
+                        if temp_gc_bounds[0] <= new_gc_ratio <= temp_gc_bounds[1]:
+                            # Apply reduced GC penalty for rescue
+                            gc_penalty = 0.0
+                            if position_aware_gc_penalty:
+                                position_weight = (pos + 1) / seq_len
+                                target_gc = (min_gc + max_gc) / 2
+                                gc_deviation = abs(new_gc_ratio - target_gc)
+                                deviation_severity = gc_deviation / ((max_gc - min_gc) / 2)
+                                # Reduced penalty for rescue
+                                if deviation_severity > 0.7:
+                                    gc_penalty = (gc_penalty_strength * 0.5) * position_weight * (deviation_severity - 0.7) ** 2
+                            new_score = candidate.score + np.log(prob + 1e-8) - gc_penalty
+                            if length_penalty != 1.0:
+                                length_norm = ((pos + 1) ** length_penalty)
+                                normalized_score = new_score / length_norm
+                            else:
+                                normalized_score = new_score
+                            rescue_candidate = BeamCandidate(
+                                tokens=candidate.tokens + [token_idx],
+                                score=normalized_score,
+                                gc_count=new_gc_count,
+                                length=new_length
+                            )
+                            rescue_beam.append(rescue_candidate)
+                # Keep top candidates from rescue attempt
+                if rescue_beam:
+                    beam = sorted(rescue_beam, key=lambda x: x.score, reverse=True)[:temp_beam_size]
+                    break
+            # If all rescue attempts failed, raise error
+            if not beam:
+                raise ValueError(
+                    f"Beam rescue failed at position {pos} after {max_rescue_attempts} attempts. "
+                    f"The GC constraints {gc_bounds} may be too restrictive for this protein sequence. "
+                    f"Consider relaxing constraints or using a different approach."
+                )
+    # Return best candidate
+    best_candidate = max(beam, key=lambda x: x.score)
+    return best_candidate.tokens
+# Wrapper function that tries simple approach first
+def constrained_beam_search_wrapper(
+    logits: torch.Tensor,
+    protein_sequence: str,
+    gc_bounds: Tuple[float, float] = (0.30, 0.70),
+    **kwargs
+) -> List[int]:
+    """Wrapper that tries simple approach first, falls back to complex beam search."""
+    try:
+        # Try simple approach first
+        return constrained_beam_search_simple(logits, protein_sequence, gc_bounds)
+    except ValueError:
+        # Fall back to complex beam search
+        return constrained_beam_search(logits, protein_sequence, gc_bounds, **kwargs)
+def _apply_diversity_penalty(candidates: List[BeamCandidate], penalty: float) -> List[BeamCandidate]:
+    """
+    Apply diversity penalty to reduce repetitive sequences.
+    Args:
+        candidates (List[BeamCandidate]): List of candidates
+        penalty (float): Diversity penalty strength
+    Returns:
+        List[BeamCandidate]: Candidates with diversity penalty applied
+    """
+    if not candidates:
+        return candidates
+    # Count token occurrences
+    token_counts = {}
+    for candidate in candidates:
+        for token in candidate.tokens:
+            token_counts[token] = token_counts.get(token, 0) + 1
+    # Apply penalty
+    for candidate in candidates:
+        diversity_score = 0.0
+        for token in candidate.tokens:
+            if token_counts[token] > 1:
+                diversity_score += penalty * np.log(token_counts[token])
+        candidate.score -= diversity_score
+    return candidates
+def sample_non_deterministic(
+    logits: torch.Tensor,
+    temperature: float = 0.2,
+    top_p: float = 0.95,
+) -> List[int]:
+    """
+    Sample token indices from logits using temperature scaling and nucleus (top-p) sampling.
+    This function applies temperature scaling to the logits, computes probabilities,
+    and then performs nucleus sampling to select token indices. It is used for
+    non-deterministic decoding in language models to introduce randomness while
+    maintaining coherence in the generated sequences.
+    Args:
+        logits (torch.Tensor): The logits output from the model of shape
+            [seq_len, vocab_size] or [batch_size, seq_len, vocab_size].
+        temperature (float, optional): Temperature value for scaling logits.
+            Must be a positive float. Defaults to 1.0.
+        top_p (float, optional): Cumulative probability threshold for nucleus sampling.
+            Must be a float between 0 and 1. Tokens with cumulative probability up to
+            `top_p` are considered for sampling. Defaults to 0.95.
+    Returns:
+        List[int]: A list of sampled token indices corresponding to the predicted tokens.
+    Raises:
+        ValueError: If `temperature` is not a positive float or if `top_p` is not between 0 and 1.
+    Example:
+        >>> logits = model_output.logits  # Assume logits is a tensor of shape [seq_len, vocab_size]
+        >>> predicted_indices = sample_non_deterministic(logits, temperature=0.7, top_p=0.9)
+    """
+    if not isinstance(temperature, (float, int)) or temperature <= 0:
+        raise ValueError("Temperature must be a positive float.")
+    if not isinstance(top_p, (float, int)) or not 0 < top_p <= 1.0:
+        raise ValueError("top_p must be a float between 0 and 1.")
+    # Compute probabilities using temperature scaling
+    probs = torch.softmax(logits / temperature, dim=-1)
+    # Remove batch dimension if present
+    if probs.dim() == 3:
+        probs = probs.squeeze(0)  # Shape: [seq_len, vocab_size]
+    # Sort probabilities in descending order
+    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
+    probs_sum = torch.cumsum(probs_sort, dim=-1)
+    mask = probs_sum - probs_sort > top_p
+    # Zero out probabilities for tokens beyond the top-p threshold
+    probs_sort[mask] = 0.0
+    # Renormalize the probabilities
+    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
+    next_token = torch.multinomial(probs_sort, num_samples=1)
+    predicted_indices = torch.gather(probs_idx, -1, next_token).squeeze(-1)
+    return predicted_indices.tolist()
+def load_model(
+    model_path: Optional[str] = None,
+    device: torch.device = None,
+    attention_type: str = "original_full",
+    num_organisms: int = None,
+    remove_prefix: bool = True,
+) -> torch.nn.Module:
+    """
+    Load a BigBirdForMaskedLM model from a model file, checkpoint, or HuggingFace.
+    Args:
+        model_path (Optional[str]): Path to the model file or checkpoint. If None,
+            load from HuggingFace.
+        device (torch.device, optional): The device to load the model onto.
+        attention_type (str, optional): The type of attention, 'block_sparse'
+            or 'original_full'.
+        num_organisms (int, optional): Number of organisms, needed if loading from a
+            checkpoint that requires this.
+        remove_prefix (bool, optional): Whether to remove the "model." prefix from the
+            keys in the state dict.
+    Returns:
+        torch.nn.Module: The loaded model.
+    """
+    if not model_path:
+        warnings.warn("Model path not provided. Loading from HuggingFace.", UserWarning)
+        model = BigBirdForMaskedLM.from_pretrained("adibvafa/CodonTransformer")
+    elif model_path.endswith(".ckpt"):
+        checkpoint = torch.load(model_path, map_location="cpu")
+        # Detect Lightning checkpoint vs raw state dict
+        if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
+            state_dict = checkpoint["state_dict"]
+            if remove_prefix:
+                state_dict = {
+                    k.replace("model.", ""): v for k, v in state_dict.items()
+                }
+        else:
+            # assume checkpoint itself is state_dict
+            state_dict = checkpoint
+        if num_organisms is None:
+            num_organisms = NUM_ORGANISMS
+        # Load model configuration and instantiate the model
+        config = load_bigbird_config(num_organisms)
+        model = BigBirdForMaskedLM(config=config)
+        model.load_state_dict(state_dict, strict=False)
+    elif model_path.endswith(".pt"):
+        state_dict = torch.load(model_path)
+        config = state_dict.pop("self.config")
+        model = BigBirdForMaskedLM(config=config)
+        model.load_state_dict(state_dict, strict=False)
+    else:
+        raise ValueError(
+            "Unsupported file type. Please provide a .ckpt or .pt file, "
+            "or None to load from HuggingFace."
+        )
+    # Prepare model for evaluation
+    model.bert.set_attention_type(attention_type)
+    model.eval()
+    if device:
+        model.to(device)
+    return model
+def load_bigbird_config(num_organisms: int) -> BigBirdConfig:
+    """
+    Load the config object used to train the BigBird transformer.
+    Args:
+        num_organisms (int): The number of organisms.
+    Returns:
+        BigBirdConfig: The configuration object for BigBird.
+    """
+    config = transformers.BigBirdConfig(
+        vocab_size=len(TOKEN2INDEX),  # Equal to len(tokenizer)
+        type_vocab_size=num_organisms,
+        sep_token_id=2,
+    )
+    return config
+def create_model_from_checkpoint(
+    checkpoint_dir: str, output_model_dir: str, num_organisms: int
+) -> None:
+    """
+    Save a model to disk using a previous checkpoint.
+    Args:
+        checkpoint_dir (str): Directory where the checkpoint is stored.
+        output_model_dir (str): Directory where the model will be saved.
+        num_organisms (int): Number of organisms.
+    """
+    checkpoint = load_model(model_path=checkpoint_dir, num_organisms=num_organisms)
+    state_dict = checkpoint.state_dict()
+    state_dict["self.config"] = load_bigbird_config(num_organisms=num_organisms)
+    # Save the model state dict to the output directory
+    torch.save(state_dict, output_model_dir)
+def load_tokenizer(tokenizer_path: Optional[Union[str, PreTrainedTokenizerFast]] = None) -> PreTrainedTokenizerFast:
+    """
+    Create and return a tokenizer object from tokenizer path or HuggingFace.
+    Args:
+        tokenizer_path (Optional[Union[str, PreTrainedTokenizerFast]]): Path to the tokenizer file,
+        a pre-loaded tokenizer object, or None. If None, load from HuggingFace.
+    Returns:
+        PreTrainedTokenizerFast: The tokenizer object.
+    """
+    # If a tokenizer object is already provided, return it
+    if isinstance(tokenizer_path, PreTrainedTokenizerFast):
+        return tokenizer_path
+    # If no path is provided, load from HuggingFace
+    if not tokenizer_path:
+        warnings.warn(
+            "Tokenizer path not provided. Loading from HuggingFace.", UserWarning
+        )
+        return AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
+    # Load from file path
+    return transformers.PreTrainedTokenizerFast(
+        tokenizer_file=tokenizer_path,
+        bos_token="[CLS]",
+        eos_token="[SEP]",
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+    )
+def tokenize(
+    batch: List[Dict[str, Any]],
+    tokenizer: Union[PreTrainedTokenizerFast, str] = None,
+    max_len: int = 2048,
+) -> BatchEncoding:
+    """
+    Return the tokenized sequences given a batch of input data.
+    Each data in the batch is expected to be a dictionary with "codons" and
+    "organism" keys.
+    Args:
+        batch (List[Dict[str, Any]]): A list of dictionaries with "codons" and
+            "organism" keys.
+        tokenizer (PreTrainedTokenizerFast, str, optional): The tokenizer object or
+            path to the tokenizer file.
+        max_len (int, optional): Maximum length of the tokenized sequence.
+    Returns:
+        BatchEncoding: The tokenized batch.
+    """
+    if not isinstance(tokenizer, PreTrainedTokenizerFast):
+        tokenizer = load_tokenizer(tokenizer)
+    tokenized = tokenizer(
+        [data["codons"] for data in batch],
+        return_attention_mask=True,
+        return_token_type_ids=True,
+        truncation=True,
+        padding=True,
+        max_length=max_len,
+        return_tensors="pt",
+    )
+    # Add token type IDs for species
+    seq_len = tokenized["input_ids"].shape[-1]
+    species_index = torch.tensor([[data["organism"]] for data in batch])
+    tokenized["token_type_ids"] = species_index.repeat(1, seq_len)
+    return tokenized
+def validate_and_convert_organism(organism: Union[int, str]) -> Tuple[int, str]:
+    """
+    Validate and convert the organism input to both ID and name.
+    This function takes either an organism ID or name as input and returns both
+    the ID and name. It performs validation to ensure the input corresponds to
+    a valid organism in the ORGANISM2ID dictionary.
+    Args:
+        organism (Union[int, str]): Either the ID of the organism (int) or its
+        name (str).
+    Returns:
+        Tuple[int, str]: A tuple containing the organism ID (int) and name (str).
+    Raises:
+        ValueError: If the input is neither a string nor an integer, if the
+        organism name is not found in ORGANISM2ID, if the organism ID is not a
+        value in ORGANISM2ID, or if no name is found for a given ID.
+    Note:
+        This function relies on the ORGANISM2ID dictionary imported from
+        CodonTransformer.CodonUtils, which maps organism names to their
+        corresponding IDs.
+    """
+    if isinstance(organism, str):
+        if organism not in ORGANISM2ID:
+            raise ValueError(
+                f"Invalid organism name: {organism}. "
+                "Please use a valid organism name or ID."
+            )
+        organism_id = ORGANISM2ID[organism]
+        organism_name = organism
+    elif isinstance(organism, int):
+        if organism not in ORGANISM2ID.values():
+            raise ValueError(
+                f"Invalid organism ID: {organism}. "
+                "Please use a valid organism name or ID."
+            )
+        organism_id = organism
+        organism_name = next(
+            (name for name, id in ORGANISM2ID.items() if id == organism), None
+        )
+        if organism_name is None:
+            raise ValueError(f"No organism name found for ID: {organism}")
+    return organism_id, organism_name
+def get_high_frequency_choice_sequence(
+    protein: str, codon_frequencies: Dict[str, Tuple[List[str], List[float]]]
+) -> str:
+    """
+    Return the DNA sequence optimized using High Frequency Choice (HFC) approach
+    in which the most frequent codon for a given amino acid is always chosen.
+    Args:
+        protein (str): The protein sequence.
+        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
+        frequencies for each amino acid.
+    Returns:
+        str: The optimized DNA sequence.
+    """
+    # Select the most frequent codon for each amino acid in the protein sequence
+    dna_codons = [
+        codon_frequencies[aminoacid][0][np.argmax(codon_frequencies[aminoacid][1])]
+        for aminoacid in protein
+    ]
+    return "".join(dna_codons)
+def precompute_most_frequent_codons(
+    codon_frequencies: Dict[str, Tuple[List[str], List[float]]],
+) -> Dict[str, str]:
+    """
+    Precompute the most frequent codon for each amino acid.
+    Args:
+        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
+        frequencies for each amino acid.
+    Returns:
+        Dict[str, str]: The most frequent codon for each amino acid.
+    """
+    # Create a dictionary mapping each amino acid to its most frequent codon
+    return {
+        aminoacid: codons[np.argmax(frequencies)]
+        for aminoacid, (codons, frequencies) in codon_frequencies.items()
+    }
+def get_high_frequency_choice_sequence_optimized(
+    protein: str, codon_frequencies: Dict[str, Tuple[List[str], List[float]]]
+) -> str:
+    """
+    Efficient implementation of get_high_frequency_choice_sequence that uses
+    vectorized operations and helper functions, achieving up to x10 faster speed.
+    Args:
+        protein (str): The protein sequence.
+        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
+        frequencies for each amino acid.
+    Returns:
+        str: The optimized DNA sequence.
+    """
+    # Precompute the most frequent codons for each amino acid
+    most_frequent_codons = precompute_most_frequent_codons(codon_frequencies)
+    return "".join(most_frequent_codons[aminoacid] for aminoacid in protein)
+def get_background_frequency_choice_sequence(
+    protein: str, codon_frequencies: Dict[str, Tuple[List[str], List[float]]]
+) -> str:
+    """
+    Return the DNA sequence optimized using Background Frequency Choice (BFC)
+    approach in which a random codon for a given amino acid is chosen using
+    the codon frequencies probability distribution.
+    Args:
+        protein (str): The protein sequence.
+        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
+        frequencies for each amino acid.
+    Returns:
+        str: The optimized DNA sequence.
+    """
+    # Select a random codon for each amino acid based on the codon frequencies
+    # probability distribution
+    dna_codons = [
+        np.random.choice(
+            codon_frequencies[aminoacid][0], p=codon_frequencies[aminoacid][1]
+        )
+        for aminoacid in protein
+    ]
+    return "".join(dna_codons)
+def precompute_cdf(
+    codon_frequencies: Dict[str, Tuple[List[str], List[float]]],
+) -> Dict[str, Tuple[List[str], Any]]:
+    """
+    Precompute the cumulative distribution function (CDF) for each amino acid.
+    Args:
+        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
+        frequencies for each amino acid.
+    Returns:
+        Dict[str, Tuple[List[str], Any]]: CDFs for each amino acid.
+    """
+    cdf = {}
+    # Calculate the cumulative distribution function for each amino acid
+    for aminoacid, (codons, frequencies) in codon_frequencies.items():
+        cdf[aminoacid] = (codons, np.cumsum(frequencies))
+    return cdf
+def get_background_frequency_choice_sequence_optimized(
+    protein: str, codon_frequencies: Dict[str, Tuple[List[str], List[float]]]
+) -> str:
+    """
+    Efficient implementation of get_background_frequency_choice_sequence that uses
+    vectorized operations and helper functions, achieving up to x8 faster speed.
+    Args:
+        protein (str): The protein sequence.
+        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
+        frequencies for each amino acid.
+    Returns:
+        str: The optimized DNA sequence.
+    """
+    dna_codons = []
+    cdf = precompute_cdf(codon_frequencies)
+    # Select a random codon for each amino acid using the precomputed CDFs
+    for aminoacid in protein:
+        codons, cumulative_prob = cdf[aminoacid]
+        selected_codon_index = np.searchsorted(cumulative_prob, np.random.rand())
+        dna_codons.append(codons[selected_codon_index])
+    return "".join(dna_codons)
+def get_uniform_random_choice_sequence(
+    protein: str, codon_frequencies: Dict[str, Tuple[List[str], List[float]]]
+) -> str:
+    """
+    Return the DNA sequence optimized using Uniform Random Choice (URC) approach
+    in which a random codon for a given amino acid is chosen using a uniform
+    prior.
+    Args:
+        protein (str): The protein sequence.
+        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
+        frequencies for each amino acid.
+    Returns:
+        str: The optimized DNA sequence.
+    """
+    # Select a random codon for each amino acid using a uniform prior distribution
+    dna_codons = []
+    for aminoacid in protein:
+        codons = codon_frequencies[aminoacid][0]
+        random_index = np.random.randint(0, len(codons))
+        dna_codons.append(codons[random_index])
+    return "".join(dna_codons)
+def get_icor_prediction(input_seq: str, model_path: str, stop_symbol: str) -> str:
+    """
+    Return the optimized codon sequence for the given protein sequence using ICOR.
+    Credit: ICOR: improving codon optimization with recurrent neural networks
+            Rishab Jain, Aditya Jain, Elizabeth Mauro, Kevin LeShane, Douglas
+            Densmore
+    Args:
+        input_seq (str): The input protein sequence.
+        model_path (str): The path to the ICOR model.
+        stop_symbol (str): The symbol representing stop codons in the sequence.
+    Returns:
+        str: The optimized DNA sequence.
+    """
+    input_seq = input_seq.strip().upper()
+    input_seq = input_seq.replace(stop_symbol, "*")
+    # Define categorical labels from when model was trained.
+    labels = [
+        "AAA",
+        "AAC",
+        "AAG",
+        "AAT",
+        "ACA",
+        "ACG",
+        "ACT",
+        "AGC",
+        "ATA",
+        "ATC",
+        "ATG",
+        "ATT",
+        "CAA",
+        "CAC",
+        "CAG",
+        "CCG",
+        "CCT",
+        "CTA",
+        "CTC",
+        "CTG",
+        "CTT",
+        "GAA",
+        "GAT",
+        "GCA",
+        "GCC",
+        "GCG",
+        "GCT",
+        "GGA",
+        "GGC",
+        "GTC",
+        "GTG",
+        "GTT",
+        "TAA",
+        "TAT",
+        "TCA",
+        "TCG",
+        "TCT",
+        "TGG",
+        "TGT",
+        "TTA",
+        "TTC",
+        "TTG",
+        "TTT",
+        "ACC",
+        "CAT",
+        "CCA",
+        "CGG",
+        "CGT",
+        "GAC",
+        "GAG",
+        "GGT",
+        "AGT",
+        "GGG",
+        "GTA",
+        "TGC",
+        "CCC",
+        "CGA",
+        "CGC",
+        "TAC",
+        "TAG",
+        "TCC",
+        "AGA",
+        "AGG",
+        "TGA",
+    ]
+    # Define aa to integer table
+    def aa2int(seq: str) -> List[int]:
+        _aa2int = {
+            "A": 1,
+            "R": 2,
+            "N": 3,
+            "D": 4,
+            "C": 5,
+            "Q": 6,
+            "E": 7,
+            "G": 8,
+            "H": 9,
+            "I": 10,
+            "L": 11,
+            "K": 12,
+            "M": 13,
+            "F": 14,
+            "P": 15,
+            "S": 16,
+            "T": 17,
+            "W": 18,
+            "Y": 19,
+            "V": 20,
+            "B": 21,
+            "Z": 22,
+            "X": 23,
+            "*": 24,
+            "-": 25,
+            "?": 26,
+        }
+        return [_aa2int[i] for i in seq]
+    # Create empty array to fill
+    oh_array = np.zeros(shape=(26, len(input_seq)))
+    # Load placements from aa2int
+    aa_placement = aa2int(input_seq)
+    # One-hot encode the amino acid sequence:
+    for i in range(0, len(aa_placement)):
+        oh_array[aa_placement[i], i] = 1
+        i += 1
+    oh_array = [oh_array]
+    x = np.array(np.transpose(oh_array))
+    y = x.astype(np.float32)
+    y = np.reshape(y, (y.shape[0], 1, 26))
+    # Start ICOR session using model.
+    sess = rt.InferenceSession(model_path)
+    input_name = sess.get_inputs()[0].name
+    # Get prediction:
+    pred_onx = sess.run(None, {input_name: y})
+    # Get the index of the highest probability from softmax output:
+    pred_indices = []
+    for pred in pred_onx[0]:
+        pred_indices.append(np.argmax(pred))
+    out_str = ""
+    for index in pred_indices:
+        out_str += labels[index]
+    return out_str

CodonTransformer/CodonUtils.py ADDED Viewed

	@@ -0,0 +1,871 @@

+"""
+File: CodonUtils.py
+---------------------
+Includes constants and helper functions used by other Python scripts.
+"""
+import itertools
+import json
+import os
+import pickle
+import re
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+import pandas as pd
+import requests
+import torch
+# List of all amino acids
+AMINO_ACIDS: List[str] = [
+    "A",  # Alanine
+    "C",  # Cysteine
+    "D",  # Aspartic acid
+    "E",  # Glutamic acid
+    "F",  # Phenylalanine
+    "G",  # Glycine
+    "H",  # Histidine
+    "I",  # Isoleucine
+    "K",  # Lysine
+    "L",  # Leucine
+    "M",  # Methionine
+    "N",  # Asparagine
+    "P",  # Proline
+    "Q",  # Glutamine
+    "R",  # Arginine
+    "S",  # Serine
+    "T",  # Threonine
+    "V",  # Valine
+    "W",  # Tryptophan
+    "Y",  # Tyrosine
+]
+STOP_SYMBOLS = ["_", "*"]  # Stop codon symbols
+# Dictionary ambiguous amino acids to standard amino acids
+AMBIGUOUS_AMINOACID_MAP: Dict[str, list[str]] = {
+    "B": ["N", "D"],  # Asparagine (N) or Aspartic acid (D)
+    "Z": ["Q", "E"],  # Glutamine (Q) or Glutamic acid (E)
+    "X": ["A"],  # Any amino acid (typically replaced with Alanine)
+    "J": ["L", "I"],  # Leucine (L) or Isoleucine (I)
+    "U": ["C"],  # Selenocysteine (typically replaced with Cysteine)
+    "O": ["K"],  # Pyrrolysine (typically replaced with Lysine)
+}
+# List of all possible start and stop codons
+START_CODONS: List[str] = ["ATG", "TTG", "CTG", "GTG"]
+STOP_CODONS: List[str] = ["TAA", "TAG", "TGA"]
+# Token-to-index mapping for amino acids and special tokens.
+# Layout of the vocabulary:
+#   0-4   special tokens
+#   5-25  "<aa>_unk" tokens: an amino acid (or "_" for stop) with no codon given
+#   26-89 "<aa>_<codon>" tokens: an amino acid (or "_" for stop) with its codon
+TOKEN2INDEX: Dict[str, int] = {
+    # Special tokens
+    "[UNK]": 0,
+    "[CLS]": 1,
+    "[SEP]": 2,
+    "[PAD]": 3,
+    "[MASK]": 4,
+    # Amino acid with unspecified codon
+    "a_unk": 5,
+    "c_unk": 6,
+    "d_unk": 7,
+    "e_unk": 8,
+    "f_unk": 9,
+    "g_unk": 10,
+    "h_unk": 11,
+    "i_unk": 12,
+    "k_unk": 13,
+    "l_unk": 14,
+    "m_unk": 15,
+    "n_unk": 16,
+    "p_unk": 17,
+    "q_unk": 18,
+    "r_unk": 19,
+    "s_unk": 20,
+    "t_unk": 21,
+    "v_unk": 22,
+    "w_unk": 23,
+    "y_unk": 24,
+    "__unk": 25,
+    # Amino acid with codon, ordered alphabetically by codon
+    "k_aaa": 26,
+    "n_aac": 27,
+    "k_aag": 28,
+    "n_aat": 29,
+    "t_aca": 30,
+    "t_acc": 31,
+    "t_acg": 32,
+    "t_act": 33,
+    "r_aga": 34,
+    "s_agc": 35,
+    "r_agg": 36,
+    "s_agt": 37,
+    "i_ata": 38,
+    "i_atc": 39,
+    "m_atg": 40,
+    "i_att": 41,
+    "q_caa": 42,
+    "h_cac": 43,
+    "q_cag": 44,
+    "h_cat": 45,
+    "p_cca": 46,
+    "p_ccc": 47,
+    "p_ccg": 48,
+    "p_cct": 49,
+    "r_cga": 50,
+    "r_cgc": 51,
+    "r_cgg": 52,
+    "r_cgt": 53,
+    "l_cta": 54,
+    "l_ctc": 55,
+    "l_ctg": 56,
+    "l_ctt": 57,
+    "e_gaa": 58,
+    "d_gac": 59,
+    "e_gag": 60,
+    "d_gat": 61,
+    "a_gca": 62,
+    "a_gcc": 63,
+    "a_gcg": 64,
+    "a_gct": 65,
+    "g_gga": 66,
+    "g_ggc": 67,
+    "g_ggg": 68,
+    "g_ggt": 69,
+    "v_gta": 70,
+    "v_gtc": 71,
+    "v_gtg": 72,
+    "v_gtt": 73,
+    "__taa": 74,
+    "y_tac": 75,
+    "__tag": 76,
+    "y_tat": 77,
+    "s_tca": 78,
+    "s_tcc": 79,
+    "s_tcg": 80,
+    "s_tct": 81,
+    "__tga": 82,
+    "c_tgc": 83,
+    "w_tgg": 84,
+    "c_tgt": 85,
+    "l_tta": 86,
+    "f_ttc": 87,
+    "l_ttg": 88,
+    "f_ttt": 89,
+}
+# Index-to-token mapping, reverse of TOKEN2INDEX
+INDEX2TOKEN: Dict[int, str] = {i: c for c, i in TOKEN2INDEX.items()}
+# Dictionary mapping each codon to its GC content
+CODON_GC_CONTENT: Dict[str, int] = {
+    token.split("_")[1]: token.split("_")[1].upper().count("G") + token.split("_")[1].upper().count("C")
+    for token in TOKEN2INDEX
+    if "_" in token and len(token.split("_")[1]) == 3
+}
+# Tensor with GC counts for each token in the vocabulary
+GC_COUNTS_PER_TOKEN = torch.zeros(len(TOKEN2INDEX))
+for token, index in TOKEN2INDEX.items():
+    if "_" in token and len(token.split("_")[1]) == 3:
+        codon = token.split("_")[1].upper()
+        gc_count = codon.count("G") + codon.count("C")
+        GC_COUNTS_PER_TOKEN[index] = gc_count
+G_indices = [idx for token, idx in TOKEN2INDEX.items() if "g" in token.split("_")[-1]]
+C_indices = [idx for token, idx in TOKEN2INDEX.items() if "c" in token.split("_")[-1]]
+# Dictionary mapping each amino acid and stop symbol to indices of codon tokens that translate to it
+AMINO_ACID_TO_INDEX = {
+    aa: sorted(
+        [i for t, i in TOKEN2INDEX.items() if t[0].upper() == aa and t[-3:] != "unk"]
+    )
+    for aa in (AMINO_ACIDS + STOP_SYMBOLS)
+}
+# Dictionary mapping each amino acid to min/max GC content across all possible codons
+AA_MIN_GC: Dict[str, int] = {}
+AA_MAX_GC: Dict[str, int] = {}
+for aa, token_indices in AMINO_ACID_TO_INDEX.items():
+    if token_indices:  # Skip if no tokens for this amino acid
+        gc_counts = []
+        for token_idx in token_indices:
+            token = INDEX2TOKEN[token_idx]
+            if "_" in token and len(token.split("_")[1]) == 3:
+                codon = token.split("_")[1]
+                if codon in CODON_GC_CONTENT:
+                    gc_counts.append(CODON_GC_CONTENT[codon])
+        if gc_counts:
+            AA_MIN_GC[aa] = min(gc_counts)
+            AA_MAX_GC[aa] = max(gc_counts)
+# Mask token mapping
+TOKEN2MASK: Dict[int, int] = {
+    0: 0,
+    1: 1,
+    2: 2,
+    3: 3,
+    4: 4,
+    5: 5,
+    6: 6,
+    7: 7,
+    8: 8,
+    9: 9,
+    10: 10,
+    11: 11,
+    12: 12,
+    13: 13,
+    14: 14,
+    15: 15,
+    16: 16,
+    17: 17,
+    18: 18,
+    19: 19,
+    20: 20,
+    21: 21,
+    22: 22,
+    23: 23,
+    24: 24,
+    25: 25,
+    26: 13,
+    27: 16,
+    28: 13,
+    29: 16,
+    30: 21,
+    31: 21,
+    32: 21,
+    33: 21,
+    34: 19,
+    35: 20,
+    36: 19,
+    37: 20,
+    38: 12,
+    39: 12,
+    40: 15,
+    41: 12,
+    42: 18,
+    43: 11,
+    44: 18,
+    45: 11,
+    46: 17,
+    47: 17,
+    48: 17,
+    49: 17,
+    50: 19,
+    51: 19,
+    52: 19,
+    53: 19,
+    54: 14,
+    55: 14,
+    56: 14,
+    57: 14,
+    58: 8,
+    59: 7,
+    60: 8,
+    61: 7,
+    62: 5,
+    63: 5,
+    64: 5,
+    65: 5,
+    66: 10,
+    67: 10,
+    68: 10,
+    69: 10,
+    70: 22,
+    71: 22,
+    72: 22,
+    73: 22,
+    74: 25,
+    75: 24,
+    76: 25,
+    77: 24,
+    78: 20,
+    79: 20,
+    80: 20,
+    81: 20,
+    82: 25,
+    83: 6,
+    84: 23,
+    85: 6,
+    86: 14,
+    87: 9,
+    88: 14,
+    89: 9,
+}
+# List of organisms used for fine-tuning
+FINE_TUNE_ORGANISMS: List[str] = [
+    "Arabidopsis thaliana",
+    "Bacillus subtilis",
+    "Caenorhabditis elegans",
+    "Chlamydomonas reinhardtii",
+    "Chlamydomonas reinhardtii chloroplast",
+    "Danio rerio",
+    "Drosophila melanogaster",
+    "Homo sapiens",
+    "Mus musculus",
+    "Nicotiana tabacum",
+    "Nicotiana tabacum chloroplast",
+    "Pseudomonas putida",
+    "Saccharomyces cerevisiae",
+    "Escherichia coli O157-H7 str. Sakai",
+    "Escherichia coli general",
+    "Escherichia coli str. K-12 substr. MG1655",
+    "Thermococcus barophilus MPT",
+]
+# List of organisms most commonly used for codon optimization
+COMMON_ORGANISMS: List[str] = [
+    "Arabidopsis thaliana",
+    "Bacillus subtilis",
+    "Caenorhabditis elegans",
+    "Chlamydomonas reinhardtii",
+    "Danio rerio",
+    "Drosophila melanogaster",
+    "Homo sapiens",
+    "Mus musculus",
+    "Nicotiana tabacum",
+    "Pseudomonas putida",
+    "Saccharomyces cerevisiae",
+    "Escherichia coli general",
+]
+# Dictionary mapping each organism name to respective organism id
+ORGANISM2ID: Dict[str, int] = {
+    "Arabidopsis thaliana": 0,
+    "Atlantibacter hermannii": 1,
+    "Bacillus subtilis": 2,
+    "Brenneria goodwinii": 3,
+    "Buchnera aphidicola (Schizaphis graminum)": 4,
+    "Caenorhabditis elegans": 5,
+    "Candidatus Erwinia haradaeae": 6,
+    "Candidatus Hamiltonella defensa 5AT (Acyrthosiphon pisum)": 7,
+    "Chlamydomonas reinhardtii": 8,
+    "Chlamydomonas reinhardtii chloroplast": 9,
+    "Citrobacter amalonaticus": 10,
+    "Citrobacter braakii": 11,
+    "Citrobacter cronae": 12,
+    "Citrobacter europaeus": 13,
+    "Citrobacter farmeri": 14,
+    "Citrobacter freundii": 15,
+    "Citrobacter koseri ATCC BAA-895": 16,
+    "Citrobacter portucalensis": 17,
+    "Citrobacter werkmanii": 18,
+    "Citrobacter youngae": 19,
+    "Cronobacter dublinensis subsp. dublinensis LMG 23823": 20,
+    "Cronobacter malonaticus LMG 23826": 21,
+    "Cronobacter sakazakii": 22,
+    "Cronobacter turicensis": 23,
+    "Danio rerio": 24,
+    "Dickeya dadantii 3937": 25,
+    "Dickeya dianthicola": 26,
+    "Dickeya fangzhongdai": 27,
+    "Dickeya solani": 28,
+    "Dickeya zeae": 29,
+    "Drosophila melanogaster": 30,
+    "Edwardsiella anguillarum ET080813": 31,
+    "Edwardsiella ictaluri": 32,
+    "Edwardsiella piscicida": 33,
+    "Edwardsiella tarda": 34,
+    "Enterobacter asburiae": 35,
+    "Enterobacter bugandensis": 36,
+    "Enterobacter cancerogenus": 37,
+    "Enterobacter chengduensis": 38,
+    "Enterobacter cloacae": 39,
+    "Enterobacter hormaechei": 40,
+    "Enterobacter kobei": 41,
+    "Enterobacter ludwigii": 42,
+    "Enterobacter mori": 43,
+    "Enterobacter quasiroggenkampii": 44,
+    "Enterobacter roggenkampii": 45,
+    "Enterobacter sichuanensis": 46,
+    "Erwinia amylovora CFBP1430": 47,
+    "Erwinia persicina": 48,
+    "Escherichia albertii": 49,
+    "Escherichia coli O157-H7 str. Sakai": 50,
+    "Escherichia coli general": 51,
+    "Escherichia coli str. K-12 substr. MG1655": 52,
+    "Escherichia fergusonii": 53,
+    "Escherichia marmotae": 54,
+    "Escherichia ruysiae": 55,
+    "Ewingella americana": 56,
+    "Hafnia alvei": 57,
+    "Hafnia paralvei": 58,
+    "Homo sapiens": 59,
+    "Kalamiella piersonii": 60,
+    "Klebsiella aerogenes": 61,
+    "Klebsiella grimontii": 62,
+    "Klebsiella michiganensis": 63,
+    "Klebsiella oxytoca": 64,
+    "Klebsiella pasteurii": 65,
+    "Klebsiella pneumoniae subsp. pneumoniae HS11286": 66,
+    "Klebsiella quasipneumoniae": 67,
+    "Klebsiella quasivariicola": 68,
+    "Klebsiella variicola": 69,
+    "Kosakonia cowanii": 70,
+    "Kosakonia radicincitans": 71,
+    "Leclercia adecarboxylata": 72,
+    "Lelliottia amnigena": 73,
+    "Lonsdalea populi": 74,
+    "Moellerella wisconsensis": 75,
+    "Morganella morganii": 76,
+    "Mus musculus": 77,
+    "Nicotiana tabacum": 78,
+    "Nicotiana tabacum chloroplast": 79,
+    "Obesumbacterium proteus": 80,
+    "Pantoea agglomerans": 81,
+    "Pantoea allii": 82,
+    "Pantoea ananatis PA13": 83,
+    "Pantoea dispersa": 84,
+    "Pantoea stewartii": 85,
+    "Pantoea vagans": 86,
+    "Pectobacterium aroidearum": 87,
+    "Pectobacterium atrosepticum": 88,
+    "Pectobacterium brasiliense": 89,
+    "Pectobacterium carotovorum": 90,
+    "Pectobacterium odoriferum": 91,
+    "Pectobacterium parmentieri": 92,
+    "Pectobacterium polaris": 93,
+    "Pectobacterium versatile": 94,
+    "Photorhabdus laumondii subsp. laumondii TTO1": 95,
+    "Plesiomonas shigelloides": 96,
+    "Pluralibacter gergoviae": 97,
+    "Proteus faecis": 98,
+    "Proteus mirabilis HI4320": 99,
+    "Proteus penneri": 100,
+    "Proteus terrae subsp. cibarius": 101,
+    "Proteus vulgaris": 102,
+    "Providencia alcalifaciens": 103,
+    "Providencia heimbachae": 104,
+    "Providencia rettgeri": 105,
+    "Providencia rustigianii": 106,
+    "Providencia stuartii": 107,
+    "Providencia thailandensis": 108,
+    "Pseudomonas putida": 109,
+    "Pyrococcus furiosus": 110,
+    "Pyrococcus horikoshii": 111,
+    "Pyrococcus yayanosii": 112,
+    "Rahnella aquatilis CIP 78.65 = ATCC 33071": 113,
+    "Raoultella ornithinolytica": 114,
+    "Raoultella planticola": 115,
+    "Raoultella terrigena": 116,
+    "Rosenbergiella epipactidis": 117,
+    "Rouxiella badensis": 118,
+    "Saccharolobus solfataricus": 119,
+    "Saccharomyces cerevisiae": 120,
+    "Salmonella bongori N268-08": 121,
+    "Salmonella enterica subsp. enterica serovar Typhimurium str. LT2": 122,
+    "Serratia bockelmannii": 123,
+    "Serratia entomophila": 124,
+    "Serratia ficaria": 125,
+    "Serratia fonticola": 126,
+    "Serratia grimesii": 127,
+    "Serratia liquefaciens": 128,
+    "Serratia marcescens": 129,
+    "Serratia nevei": 130,
+    "Serratia plymuthica AS9": 131,
+    "Serratia proteamaculans": 132,
+    "Serratia quinivorans": 133,
+    "Serratia rubidaea": 134,
+    "Serratia ureilytica": 135,
+    "Shigella boydii": 136,
+    "Shigella dysenteriae": 137,
+    "Shigella flexneri 2a str. 301": 138,
+    "Shigella sonnei": 139,
+    "Thermoccoccus kodakarensis": 140,
+    "Thermococcus barophilus MPT": 141,
+    "Thermococcus chitonophagus": 142,
+    "Thermococcus gammatolerans": 143,
+    "Thermococcus litoralis": 144,
+    "Thermococcus onnurineus": 145,
+    "Thermococcus sibiricus": 146,
+    "Xenorhabdus bovienii str. feltiae Florida": 147,
+    "Yersinia aldovae 670-83": 148,
+    "Yersinia aleksiciae": 149,
+    "Yersinia alsatica": 150,
+    "Yersinia enterocolitica": 151,
+    "Yersinia frederiksenii ATCC 33641": 152,
+    "Yersinia intermedia": 153,
+    "Yersinia kristensenii": 154,
+    "Yersinia massiliensis CCUG 53443": 155,
+    "Yersinia mollaretii ATCC 43969": 156,
+    "Yersinia pestis A1122": 157,
+    "Yersinia proxima": 158,
+    "Yersinia pseudotuberculosis IP 32953": 159,
+    "Yersinia rochesterensis": 160,
+    "Yersinia rohdei": 161,
+    "Yersinia ruckeri": 162,
+    "Yokenella regensburgei": 163,
+}
+# Dictionary mapping each organism id to respective organism name
+ID2ORGANISM = {v: k for k, v in ORGANISM2ID.items()}
+# Type alias for amino acid to codon mapping
+AMINO2CODON_TYPE = Dict[str, Tuple[List[str], List[float]]]
+# Constants for the number of organisms and sequence lengths
+NUM_ORGANISMS = 164
+MAX_LEN = 2048
+MAX_AMINO_ACIDS = MAX_LEN - 2  # Without special tokens [CLS] and [SEP]
+STOP_SYMBOL = "_"
+@dataclass
+class DNASequencePrediction:
+    """
+    Container for the result of one DNA sequence prediction.
+    All four fields are plain strings and are stored as given.
+    Attributes:
+        organism (str): Name of the organism used for prediction.
+        protein (str): Input protein sequence for which the DNA sequence is predicted.
+        processed_input (str): Processed input sequence (merged protein and DNA).
+        predicted_dna (str): Predicted DNA sequence.
+    """
+    organism: str
+    protein: str
+    processed_input: str
+    predicted_dna: str
+class IterableData(torch.utils.data.IterableDataset):
+    """
+    Defines the logic for iterable datasets (working over streams of
+    data) in parallel multi-processing environments, e.g., multi-GPU.
+    Args:
+        dist_env (Optional[str]): The distribution environment identifier
+        (e.g., "slurm").
+    Credit: Guillaume Filion
+    """
+    def __init__(self, dist_env: Optional[str] = None):
+        super().__init__()
+        if dist_env is None:
+            self.world_size_handle, self.rank_handle = ("WORLD_SIZE", "LOCAL_RANK")
+        else:
+            self.world_size_handle, self.rank_handle = {
+                "slurm": ("SLURM_NTASKS", "SLURM_PROCID")
+            }.get(dist_env, ("WORLD_SIZE", "LOCAL_RANK"))
+    @property
+    def iterator(self) -> Iterator:
+        """Define the stream logic for the dataset. Implement in subclasses."""
+        raise NotImplementedError
+    def __iter__(self) -> Iterator:
+        """
+        Create an iterator for the dataset, handling multi-processing contexts.
+        Returns:
+            Iterator: The iterator for the dataset.
+        """
+        worker_info = torch.utils.data.get_worker_info()
+        if worker_info is None:
+            return self.iterator
+        # In multi-processing context, use 'os.environ' to
+        # find global worker rank. Then use 'islice' to allocate
+        # the items of the stream to the workers.
+        world_size = int(os.environ.get(self.world_size_handle, "1"))
+        global_rank = int(os.environ.get(self.rank_handle, "0"))
+        local_rank = worker_info.id
+        local_num_workers = worker_info.num_workers
+        # Assume that each process has the same number of local workers.
+        worker_rk = global_rank * local_num_workers + local_rank
+        worker_nb = world_size * local_num_workers
+        return itertools.islice(self.iterator, worker_rk, None, worker_nb)
+class IterableJSONData(IterableData):
+    """
+    Iterate over the lines of a JSON file and uncompress if needed.
+    Args:
+        data_path (str): The path to the JSON data file.
+        train (bool): Flag indicating if the dataset is for training.
+        **kwargs: Additional keyword arguments for the base class.
+    """
+    def __init__(self, data_path: str, train: bool = True, **kwargs):
+        super().__init__(**kwargs)
+        self.data_path = data_path
+        self.train = train
+        with open(os.path.join(self.data_path, "finetune_set.json"), "r") as f:
+            self.records = [json.loads(line) for line in f]
+    def __len__(self):
+        return len(self.records)
+    @property
+    def iterator(self) -> Iterator:
+        """Define the stream logic for the dataset."""
+        for record in self.records:
+            yield record
+class ConfigManager(ABC):
+    """
+    Abstract base class for managing configuration settings.
+    """
+    _config: Dict[str, Any]
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_value, traceback):
+        if exc_type is not None:
+            print(f"Exception occurred: {exc_type}, {exc_value}, {traceback}")
+        self.reset_config()
+    @abstractmethod
+    def reset_config(self) -> None:
+        """Reset the configuration to default values."""
+        pass
+    def get(self, key: str) -> Any:
+        """
+        Get the value of a configuration key.
+        Args:
+            key (str): The key to retrieve the value for.
+        Returns:
+            Any: The value of the configuration key.
+        """
+        return self._config.get(key)
+    def set(self, key: str, value: Any) -> None:
+        """
+        Set the value of a configuration key.
+        Args:
+            key (str): The key to set the value for.
+            value (Any): The value to set for the key.
+        """
+        self.validate_inputs(key, value)
+        self._config[key] = value
+    def update(self, config_dict: dict) -> None:
+        """
+        Update the configuration with a dictionary of key-value pairs after validating them.
+        Args:
+            config_dict (dict): A dictionary of key-value pairs to update the configuration.
+        """
+        for key, value in config_dict.items():
+            self.validate_inputs(key, value)
+        self._config.update(config_dict)
+    @abstractmethod
+    def validate_inputs(self, key: str, value: Any) -> None:
+        """Validate the inputs for the configuration."""
+        pass
+class ProteinConfig(ConfigManager):
+    """
+    A class to manage configuration settings for protein sequences.
+    This class ensures that the configuration is a singleton.
+    It provides methods to get, set, and update configuration values.
+    Attributes:
+        _instance (Optional[ConfigManager]): The singleton instance of the ConfigManager.
+        _config (Dict[str, Any]): The configuration dictionary.
+    """
+    _instance = None
+    def __new__(cls):
+        """
+        Create a new instance of the ProteinConfig class.
+        Returns:
+            ProteinConfig: The singleton instance of the ProteinConfig.
+        """
+        if cls._instance is None:
+            cls._instance = super(ProteinConfig, cls).__new__(cls)
+            cls._instance.reset_config()
+        return cls._instance
+    def validate_inputs(self, key: str, value: Any) -> None:
+        """
+        Validate the inputs for the configuration.
+        Args:
+            key (str): The key to validate.
+            value (Any): The value to validate.
+        Raises:
+            ValueError: If the value is invalid.
+            TypeError: If the value is of the wrong type.
+        """
+        if key == "ambiguous_aminoacid_behavior":
+            if value not in [
+                "raise_error",
+                "standardize_deterministic",
+                "standardize_random",
+            ]:
+                raise ValueError(
+                    f"Invalid value for ambiguous_aminoacid_behavior: {value}."
+                )
+        elif key == "ambiguous_aminoacid_map_override":
+            if not isinstance(value, dict):
+                raise TypeError(
+                    f"Invalid type for ambiguous_aminoacid_map_override: {value}."
+                )
+            for ambiguous_aminoacid, aminoacids in value.items():
+                if not isinstance(aminoacids, list):
+                    raise TypeError(f"Invalid type for aminoacids: {aminoacids}.")
+                if not aminoacids:
+                    raise ValueError(
+                        f"Override for aminoacid '{ambiguous_aminoacid}' cannot be empty list."
+                    )
+                if ambiguous_aminoacid not in AMBIGUOUS_AMINOACID_MAP:
+                    raise ValueError(
+                        f"Invalid amino acid in ambiguous_aminoacid_map_override: {ambiguous_aminoacid}"
+                    )
+        else:
+            raise ValueError(f"Invalid configuration key: {key}")
+    def reset_config(self) -> None:
+        """
+        Reset the configuration to the default values.
+        """
+        self._config = {
+            "ambiguous_aminoacid_behavior": "standardize_random",
+            "ambiguous_aminoacid_map_override": {},
+        }
+def load_python_object_from_disk(file_path: str) -> Any:
+    """
+    Load a Pickle object from disk and return it as a Python object.
+    Args:
+        file_path (str): The path to the Pickle file.
+    Returns:
+        Any: The loaded Python object.
+    """
+    with open(file_path, "rb") as file:
+        return pickle.load(file)
+def save_python_object_to_disk(input_object: Any, file_path: str) -> None:
+    """
+    Save a Python object to disk using Pickle.
+    Args:
+        input_object (Any): The Python object to save.
+        file_path (str): The path where the object will be saved.
+    """
+    with open(file_path, "wb") as file:
+        pickle.dump(input_object, file)
+def find_pattern_in_fasta(keyword: str, text: str) -> str:
+    """
+    Find a specific keyword pattern in text. Helpful for identifying parts
+    of a FASTA sequence.
+    Args:
+        keyword (str): The keyword pattern to search for.
+        text (str): The text to search within.
+    Returns:
+        str: The found pattern or an empty string if not found.
+    """
+    # Search for the keyword pattern in the text using regex
+    result = re.search(keyword + r"=(.*?)]", text)
+    return result.group(1) if result else ""
+def get_organism2id_dict(organism_reference: str) -> Dict[str, int]:
+    """
+    Return a dictionary mapping each organism in training data to an index
+    used for training.
+    Args:
+        organism_reference (str): Path to a CSV file containing a list of
+            all organisms. The format of the CSV file should be as follows:
+                0,Escherichia coli
+                1,Homo sapiens
+                2,Mus musculus
+    Returns:
+        Dict[str, int]: Dictionary mapping organism names to their respective indices.
+    """
+    # Read the CSV file and create a dictionary mapping organisms to their indices
+    organisms = pd.read_csv(organism_reference, index_col=0, header=None)
+    organism2id = {organisms.iloc[i].values[0]: i for i in organisms.index}
+    return organism2id
+def get_taxonomy_id(
+    taxonomy_reference: str, organism: Optional[str] = None, return_dict: bool = False
+) -> Any:
+    """
+    Return the taxonomy id of a given organism using a reference file.
+    Optionally, return the whole dictionary instead if return_dict is True.
+    Args:
+        taxonomy_reference (str): Path to the taxonomy reference file.
+        organism (Optional[str]): The name of the organism to look up.
+        return_dict (bool): Whether to return the entire dictionary.
+    Returns:
+        Any: The taxonomy id of the organism or the entire dictionary.
+    """
+    # Load the organism-to-taxonomy mapping from a Pickle file
+    organism2taxonomy = load_python_object_from_disk(taxonomy_reference)
+    if return_dict:
+        return dict(sorted(organism2taxonomy.items()))
+    return organism2taxonomy[organism]
+def sort_amino2codon_skeleton(amino2codon: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Sort the amino2codon dictionary alphabetically by amino acid and by codon name.
+    Args:
+        amino2codon (Dict[str, Any]): The amino2codon dictionary to sort.
+    Returns:
+        Dict[str, Any]: The sorted amino2codon dictionary.
+    """
+    # Sort the dictionary by amino acid and then by codon name
+    amino2codon = dict(sorted(amino2codon.items()))
+    amino2codon = {
+        amino: (
+            [codon for codon, _ in sorted(zip(codons, frequencies))],
+            [freq for _, freq in sorted(zip(codons, frequencies))],
+        )
+        for amino, (codons, frequencies) in amino2codon.items()
+    }
+    return amino2codon
+def load_pkl_from_url(url: str) -> Any:
+    """
+    Download a Pickle file from a URL and return the loaded object.
+    Args:
+        url (str): The URL to download the Pickle file from.
+    Returns:
+        Any: The loaded Python object from the Pickle file.
+    """
+    response = requests.get(url)
+    response.raise_for_status()  # Ensure the request was successful
+    # Load the Pickle object from the response content
+    return pickle.loads(response.content)

CodonTransformer/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """CodonTransformer package."""

Dockerfile ADDED Viewed

	@@ -0,0 +1,21 @@

+# Image that serves the Streamlit app (app.py) on port 7860.
+FROM python:3.10-slim
+# Do not write .pyc files, do not buffer Python output, and disable pip's cache.
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PIP_NO_CACHE_DIR=1
+WORKDIR /app
+# Install git and a compiler toolchain, then drop the apt package lists.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# Install the Python dependencies before copying the rest of the project.
+COPY requirements.txt /app/requirements.txt
+RUN pip install --upgrade pip && pip install -r /app/requirements.txt
+COPY . /app
+EXPOSE 7860
+# Start Streamlit headless, listening on all interfaces on port 7860.
+CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0", "--server.headless=true"]

ENCOT_Academic_Documentation.html ADDED Viewed

	@@ -0,0 +1,2625 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>ENCOT: Enhanced Codon Optimization Tool - Technical Documentation</title>
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/atom-one-light.min.css">
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/languages/python.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/languages/yaml.min.js"></script>
+    <link href="https://fonts.googleapis.com/css2?family=Computer+Modern+Serif:wght@400;700&family=Computer+Modern+Sans:wght@400;700&family=Computer+Modern+Typewriter&display=swap" rel="stylesheet">
+    <style>
+        /* LaTeX-inspired Academic Styling */
+        @import url('https://fonts.googleapis.com/css2?family=Crimson+Text:ital,wght@0,400;0,600;0,700;1,400&family=Source+Code+Pro:wght@400;500&display=swap');
+        * {
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }
+        body {
+            font-family: 'Crimson Text', 'Georgia', serif;
+            line-height: 1.6;
+            color: #2c3e50;
+            background: #f8f9fa;
+            padding: 40px;
+            max-width: 900px;
+            margin: 0 auto;
+            font-size: 11pt;
+        }
+        /* Academic Paper Header */
+        .paper-header {
+            text-align: center;
+            margin-bottom: 50px;
+            padding: 30px 0;
+            border-bottom: 2px solid #2c3e50;
+        }
+        .paper-header h1 {
+            font-size: 28pt;
+            font-weight: 700;
+            margin-bottom: 20px;
+            color: #1a1a1a;
+            letter-spacing: -0.5px;
+        }
+        .paper-header .subtitle {
+            font-size: 14pt;
+            font-style: italic;
+            color: #555;
+            margin-bottom: 25px;
+        }
+        .paper-header .authors {
+            font-size: 11pt;
+            color: #444;
+            margin-bottom: 10px;
+        }
+        .paper-header .affiliation {
+            font-size: 10pt;
+            color: #666;
+            font-style: italic;
+        }
+        /* Section Styling */
+        .section {
+            margin: 40px 0;
+            page-break-inside: avoid;
+            background: white;
+            padding: 25px;
+            border: 1px solid #ddd;
+            box-shadow: 0 1px 3px rgba(0,0,0,0.05);
+        }
+        .section-number {
+            font-weight: 700;
+            color: #2c3e50;
+            font-size: 14pt;
+        }
+        .section-title {
+            font-size: 16pt;
+            font-weight: 700;
+            color: #2c3e50;
+            margin: 15px 0 20px 0;
+            border-bottom: 1px solid #ccc;
+            padding-bottom: 8px;
+        }
+        .abstract, .description {
+            text-align: justify;
+            margin: 15px 0;
+            text-indent: 0;
+            hyphens: auto;
+        }
+        .abstract {
+            font-size: 10.5pt;
+            padding: 15px;
+            background: #f9f9f9;
+            border-left: 3px solid #3498db;
+            font-style: italic;
+        }
+        /* Code Blocks - LaTeX Listing Style */
+        .code-container {
+            margin: 20px 0;
+            border: 1px solid #ccc;
+            background: #fafafa;
+        }
+        .code-header {
+            background: #e8e8e8;
+            padding: 8px 15px;
+            border-bottom: 1px solid #ccc;
+            font-family: 'Source Code Pro', monospace;
+            font-size: 9pt;
+            color: #555;
+        }
+        .listing-number {
+            font-weight: 600;
+            color: #2c3e50;
+        }
+        pre {
+            margin: 0;
+            padding: 15px;
+            overflow-x: auto;
+            background: white;
+            border: none;
+        }
+        pre code {
+            font-family: 'Source Code Pro', 'Courier New', monospace;
+            font-size: 9pt;
+            line-height: 1.4;
+            color: #2c3e50;
+        }
+        /* Annotations and Highlights */
+        .annotation {
+            background: #fff3cd;
+            border-left: 4px solid #ffc107;
+            padding: 12px 15px;
+            margin: 15px 0;
+            font-size: 10pt;
+        }
+        .annotation strong {
+            color: #856404;
+        }
+        .key-concept {
+            background: #d1ecf1;
+            border-left: 4px solid #0c5460;
+            padding: 12px 15px;
+            margin: 15px 0;
+            font-size: 10pt;
+        }
+        .mathematical {
+            font-family: 'Crimson Text', serif;
+            font-style: italic;
+            text-align: center;
+            padding: 15px;
+            margin: 20px 0;
+            background: #f9f9f9;
+            border: 1px solid #ddd;
+            font-size: 11pt;
+        }
+        /* File References */
+        .file-ref {
+            font-family: 'Source Code Pro', monospace;
+            font-size: 9pt;
+            color: #2c3e50;
+            background: #f4f4f4;
+            padding: 8px 12px;
+            border-left: 3px solid #3498db;
+            margin: 15px 0;
+        }
+        .file-path {
+            font-weight: 600;
+            color: #2980b9;
+        }
+        /* Handwritten-style Notes */
+        .handwritten-note {
+            border: 2px dashed #95a5a6;
+            padding: 15px;
+            margin: 20px 0;
+            background: #fef9e7;
+            font-size: 10pt;
+            position: relative;
+        }
+        .handwritten-note::before {
+            content: "✏️ Important Note:";
+            font-weight: 600;
+            color: #7f8c8d;
+            display: block;
+            margin-bottom: 8px;
+        }
+        /* Algorithm/Pseudocode Box */
+        .algorithm-box {
+            border: 2px solid #2c3e50;
+            padding: 20px;
+            margin: 20px 0;
+            background: white;
+        }
+        .algorithm-title {
+            font-weight: 700;
+            text-align: center;
+            margin-bottom: 15px;
+            font-size: 11pt;
+            text-transform: uppercase;
+            letter-spacing: 1px;
+        }
+        .algorithm-content {
+            font-family: 'Source Code Pro', monospace;
+            font-size: 9.5pt;
+            line-height: 1.8;
+        }
+        /* Equation Styling */
+        .equation {
+            text-align: center;
+            margin: 25px 0;
+            font-size: 12pt;
+            font-family: 'Crimson Text', serif;
+        }
+        .equation-label {
+            float: right;
+            font-size: 10pt;
+            color: #7f8c8d;
+        }
+        /* Table Styling */
+        table {
+            width: 100%;
+            border-collapse: collapse;
+            margin: 20px 0;
+            font-size: 10pt;
+        }
+        th, td {
+            border: 1px solid #bbb;
+            padding: 8px 12px;
+            text-align: left;
+        }
+        th {
+            background: #ecf0f1;
+            font-weight: 600;
+        }
+        /* Footer */
+        .footer {
+            margin-top: 50px;
+            padding-top: 20px;
+            border-top: 1px solid #ccc;
+            text-align: center;
+            font-size: 9pt;
+            color: #7f8c8d;
+        }
+        /* Print Styles - Optimized for minimal spacing */
+        @page {
+            size: A4;
+            margin: 1.2cm 1.5cm;
+        }
+        @page :first {
+            margin-top: 1.5cm;
+        }
+        @media print {
+            * {
+                -webkit-print-color-adjust: exact !important;
+                print-color-adjust: exact !important;
+            }
+            body {
+                background: white;
+                padding: 0;
+                margin: 0;
+                font-size: 9.5pt;
+                line-height: 1.35;
+            }
+            /* Minimize margins */
+            .paper-header {
+                margin-bottom: 15px;
+                padding: 10px 0;
+                page-break-after: avoid;
+            }
+            .paper-header h1 {
+                font-size: 20pt;
+                margin-bottom: 8px;
+            }
+            .paper-header .subtitle {
+                font-size: 10pt;
+                margin: 3px 0;
+            }
+            .abstract {
+                margin: 12px 0;
+                padding: 10px;
+                page-break-after: avoid;
+                page-break-inside: avoid;
+            }
+            /* Section optimization - ALLOW BREAKS */
+            .section {
+                box-shadow: none;
+                border: none;
+                padding: 8px 10px;
+                margin: 5px 0;
+                page-break-inside: auto; /* Changed from avoid */
+                background: white;
+            }
+            .section-title {
+                font-size: 12pt;
+                margin-bottom: 6px;
+                page-break-after: avoid;
+            }
+            .description {
+                margin: 5px 0;
+                font-size: 9.5pt;
+                line-height: 1.35;
+            }
+            /* Code containers - allow breaks */
+            .code-container {
+                page-break-inside: auto;
+                margin: 8px 0;
+                padding: 6px;
+                border: 1px solid #ccc;
+            }
+            .code-header {
+                padding: 4px 6px;
+                margin-bottom: 4px;
+                page-break-after: avoid;
+                font-size: 9pt;
+            }
+            pre {
+                margin: 0;
+                padding: 6px;
+                font-size: 7.5pt;
+                line-height: 1.25;
+                white-space: pre-wrap;
+                word-wrap: break-word;
+            }
+            code {
+                font-size: 7.5pt;
+                line-height: 1.25;
+            }
+            /* File references */
+            .file-ref {
+                margin: 5px 0;
+                padding: 4px 6px;
+                font-size: 8.5pt;
+                page-break-inside: avoid;
+            }
+            .file-path {
+                font-size: 8.5pt;
+            }
+            /* Mathematical content */
+            .mathematical {
+                margin: 8px 0;
+                padding: 6px;
+                font-size: 9.5pt;
+                page-break-inside: avoid;
+            }
+            .equation {
+                margin: 8px 0;
+                font-size: 10pt;
+            }
+            /* Key concepts and notes */
+            .key-concept {
+                margin: 8px 0;
+                padding: 6px;
+                font-size: 9pt;
+                page-break-inside: avoid;
+            }
+            .key-concept ul {
+                margin: 4px 0 0 12px;
+            }
+            .key-concept li {
+                margin: 2px 0;
+                line-height: 1.25;
+            }
+            .handwritten-note {
+                margin: 8px 0;
+                padding: 6px;
+                font-size: 8.5pt;
+                page-break-inside: avoid;
+            }
+            .handwritten-note::before {
+                margin-bottom: 4px;
+            }
+            /* Algorithm boxes */
+            .algorithm-box {
+                margin: 8px 0;
+                padding: 8px;
+                page-break-inside: auto; /* Allow break for long algorithms */
+            }
+            .algorithm-title {
+                font-size: 10pt;
+                margin-bottom: 6px;
+            }
+            .algorithm-content {
+                font-size: 8pt;
+                line-height: 1.4;
+            }
+            /* Tables */
+            table {
+                margin: 8px 0;
+                font-size: 8.5pt;
+                page-break-inside: auto;
+            }
+            th, td {
+                padding: 4px 6px;
+                font-size: 8.5pt;
+            }
+            /* Page break control */
+            h1, h2, h3, .section-title {
+                page-break-after: avoid;
+            }
+            .section:first-of-type {
+                page-break-before: avoid;
+            }
+            /* Keep title with at least some content */
+            .section-title + .description,
+            .code-header + pre {
+                page-break-before: avoid;
+            }
+            /* Hide unnecessary elements */
+            .footer {
+                display: none;
+            }
+            /* Compact spacing for lists */
+            ul, ol {
+                margin: 4px 0;
+                padding-left: 18px;
+            }
+            li {
+                margin: 1px 0;
+                line-height: 1.25;
+            }
+            /* Orphan and widow control */
+            p, .description, .key-concept, .handwritten-note {
+                orphans: 2;
+                widows: 2;
+            }
+            /* Reduce all vertical spacing */
+            * + * {
+                margin-top: 0 !important;
+            }
+        }
+    </style>
+</head>
+<body>
+    <!-- Academic Paper Header -->
+    <div class="paper-header">
+        <h1>ENCOT: Enhanced Codon Optimization Tool</h1>
+        <div class="subtitle">
+            A Transformer-Based Approach with Augmented-Lagrangian Method<br>
+            for Multi-Objective Codon Optimization in E. coli
+        </div>
+        <div class="authors">
+            Technical Implementation Documentation
+        </div>
+    </div>
+    <!-- Abstract -->
+    <div class="abstract">
+        <strong>Abstract:</strong> This document presents the technical implementation of ENCOT, a novel codon optimization
+        system that employs transformer-based deep learning combined with an Augmented-Lagrangian Method (ALM) for
+        precise control of GC content. The system optimizes multiple biological objectives simultaneously including
+        Codon Adaptation Index (CAI), tRNA Adaptation Index (tAI), GC content balance, and minimization of negative
+        cis-regulatory elements. The implementation builds upon the CodonTransformer architecture and introduces
+        innovative constraint optimization techniques for enhanced E. coli expression systems.
+    </div>
+    <!-- Section 1: Core Algorithm - ALM Implementation -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">1.</span> Augmented-Lagrangian Method Implementation
+        </div>
+        <div class="description">
+            The core innovation of ENCOT lies in its application of the Augmented-Lagrangian Method to enforce
+            GC content constraints during training. This approach allows the model to balance multiple optimization
+            objectives while maintaining biologically appropriate GC content levels.
+        </div>
+        <div class="mathematical">
+            <strong>Objective Function:</strong><br><br>
+            <i>L</i> = <i>L</i><sub>MLM</sub> + λ·(<i>GC</i> − μ) + (ρ/2)·(<i>GC</i> − μ)²
+            <div class="equation-label">(Eq. 1)</div>
+        </div>
+        <div class="key-concept">
+            <strong>Key Components:</strong>
+            <ul style="margin: 10px 0 0 20px;">
+                <li><i>L<sub>MLM</sub></i>: Masked Language Modeling loss for codon prediction</li>
+                <li>λ: Lagrangian multiplier (adaptively updated)</li>
+                <li>ρ: Penalty coefficient (self-tuning based on progress)</li>
+                <li><i>GC</i>: Mean GC content of predicted sequences</li>
+                <li>μ: Target GC content (0.52 for E. coli)</li>
+            </ul>
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: finetune.py</div>
+            Lines 73-148 | Class: plTrainHarness
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 1:</span> ALM Training Harness - Initialization
+            </div>
+            <pre><code class="language-python">class plTrainHarness(pl.LightningModule):
+    """
+    PyTorch Lightning training harness for ENCOT with Augmented-Lagrangian
+    Method (ALM) GC control.
+    This class implements the training loop for fine-tuning CodonTransformer
+    on E. coli sequences with precise GC content control using an
+    Augmented-Lagrangian Method. The ALM approach allows the model to learn
+    codon preferences while maintaining GC content within a target range.
+    Key features:
+    - Masked language modeling (MLM) loss for codon prediction
+    - ALM-based GC content constraint enforcement
+    - Curriculum learning: warm-up epochs before enforcing GC constraints
+    - Adaptive penalty coefficient (rho) adjustment based on constraint
+      violation progress
+    The ALM method minimizes:
+        L = L_MLM + λ·(GC - μ) + (ρ/2)(GC - μ)²
+    where λ is the Lagrangian multiplier and ρ is the penalty coefficient.
+    """
+    def __init__(self, model, learning_rate, warmup_fraction,
+                 gc_penalty_weight, tokenizer, gc_target=0.52,
+                 use_lagrangian=False, lagrangian_rho=10.0,
+                 curriculum_epochs=3, alm_tolerance=1e-5,
+                 alm_dual_tolerance=1e-5, alm_penalty_update_factor=10.0,
+                 alm_initial_penalty_factor=20.0,
+                 alm_tolerance_update_factor=0.1,
+                 alm_rel_penalty_increase_threshold=0.1,
+                 alm_max_penalty=1e6, alm_min_penalty=1e-6):
+        super().__init__()
+        self.model = model
+        self.learning_rate = learning_rate
+        self.warmup_fraction = warmup_fraction
+        self.gc_penalty_weight = gc_penalty_weight
+        self.tokenizer = tokenizer
+        # Augmented-Lagrangian GC Control parameters
+        self.gc_target = gc_target
+        self.use_lagrangian = use_lagrangian
+        self.lagrangian_rho = lagrangian_rho
+        self.curriculum_epochs = curriculum_epochs
+        # Enhanced ALM parameters
+        self.alm_tolerance = alm_tolerance
+        self.alm_dual_tolerance = alm_dual_tolerance
+        self.alm_penalty_update_factor = alm_penalty_update_factor
+        self.alm_initial_penalty_factor = alm_initial_penalty_factor
+        self.alm_tolerance_update_factor = alm_tolerance_update_factor
+        self.alm_rel_penalty_increase_threshold = \
+            alm_rel_penalty_increase_threshold
+        self.alm_max_penalty = alm_max_penalty
+        self.alm_min_penalty = alm_min_penalty
+        # Initialize Lagrangian multiplier as buffer
+        # (persists across checkpoints)
+        self.register_buffer("lambda_gc", torch.tensor(0.0))
+        # Adaptive penalty coefficient (rho)
+        self.register_buffer("rho_adaptive",
+                           torch.tensor(self.lagrangian_rho))
+        # Step counter for periodic lambda updates
+        self.register_buffer("step_counter", torch.tensor(0))
+        # ALM convergence tracking
+        self.register_buffer("previous_constraint_violation",
+                           torch.tensor(float('inf')))</code></pre>
+        </div>
+        <div class="handwritten-note">
+            The initialization sets up persistent buffers for Lagrangian multipliers and penalty coefficients.
+            These buffers are saved with model checkpoints, allowing training to resume seamlessly. The curriculum
+            learning approach waits for 3 epochs before enforcing GC constraints, giving the model time to learn
+            basic codon patterns first.
+        </div>
+    </div>
+    <!-- Section 2: Training Step -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">2.</span> Training Step with ALM Loss Computation
+        </div>
+        <div class="description">
+            The training step combines standard masked language modeling with the ALM-based GC constraint.
+            Once the curriculum warm-up epochs have elapsed (and the Lagrangian option is enabled), each step
+            computes GC content from the predicted tokens and adds the augmented-Lagrangian penalty for deviating
+            from the target GC content; before that point, only the MLM loss is used.
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: finetune.py</div>
+            Lines 150-230 | Method: training_step
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 2:</span> Training Step with ALM Loss
+            </div>
+            <pre><code class="language-python">def training_step(self, batch, batch_idx):
+    """
+    Training step that computes MLM loss and applies ALM-based GC constraint.
+    The constraint is only enforced after curriculum_epochs warm-up period.
+    """
+    outputs = self.model(**batch)
+    mlm_loss = outputs.loss
+    # Enhanced Lagrangian-based GC penalty
+    if self.use_lagrangian and self.current_epoch >= self.curriculum_epochs:
+        # Compute GC content from logits
+        logits = outputs.logits
+        predicted_tokens = torch.argmax(logits, dim=-1)
+        # Calculate GC content per sequence
+        gc_content_batch = []
+        for seq_tokens in predicted_tokens:
+            # Filter to valid codon tokens (indices >= 26)
+            valid_tokens = seq_tokens[seq_tokens >= 26]
+            if len(valid_tokens) == 0:
+                gc_content_batch.append(self.gc_target)
+                continue
+            # Count G and C containing codons
+            gc_counts = sum(1 for token in valid_tokens
+                          if token.item() in G_indices + C_indices)
+            gc_content = gc_counts / len(valid_tokens)
+            gc_content_batch.append(gc_content)
+        # Mean GC content across batch
+        gc_content_mean = sum(gc_content_batch) / len(gc_content_batch)
+        # Compute GC constraint violation
+        gc_constraint = gc_content_mean - self.gc_target
+        # Augmented Lagrangian loss term
+        lagrangian_loss = (
+            self.lambda_gc * gc_constraint +
+            (self.rho_adaptive / 2) * (gc_constraint ** 2)
+        )
+        total_loss = mlm_loss + lagrangian_loss
+        # Log metrics
+        self.log("train/mlm_loss", mlm_loss, prog_bar=True)
+        self.log("train/gc_constraint", gc_constraint, prog_bar=True)
+        self.log("train/lagrangian_loss", lagrangian_loss, prog_bar=False)
+        self.log("train/lambda_gc", self.lambda_gc, prog_bar=False)
+        self.log("train/rho", self.rho_adaptive, prog_bar=False)
+        self.log("train/gc_content", gc_content_mean, prog_bar=True)
+        # Update Lagrangian multiplier periodically
+        self.step_counter += 1
+        if self.step_counter % 20 == 0:
+            self._update_alm_parameters(gc_constraint)
+    else:
+        # During warm-up, only use MLM loss
+        total_loss = mlm_loss
+        self.log("train/mlm_loss", mlm_loss, prog_bar=True)
+    self.log("train/total_loss", total_loss, prog_bar=True)
+    return total_loss</code></pre>
+        </div>
+        <div class="annotation">
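+            <strong>Implementation Detail:</strong> The GC content is computed from the argmax of logits rather than
+            from the actual target sequences, so the constraint measures what the model would currently generate.
+            Note that argmax is not differentiable: as written in Listing 2, the GC term changes the loss value and
+            drives the λ and ρ updates, but it carries no gradient back to the model weights. For the constraint to
+            shape the weights directly, GC content must be computed from a differentiable quantity such as the
+            softmax probabilities (expected GC content).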
+        </div>
+    </div>
+    <!-- Section 3: Adaptive Parameter Update -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">3.</span> Adaptive ALM Parameter Updates
+        </div>
+        <div class="description">
+            The self-tuning mechanism adjusts Lagrangian multipliers and penalty coefficients based on
+            constraint violation progress. This adaptive approach ensures convergence while maintaining
+            numerical stability.
+        </div>
+        <div class="algorithm-box">
+            <div class="algorithm-title">Algorithm 1: Adaptive Penalty Update</div>
+            <div class="algorithm-content">
+<strong>Input:</strong> gc_constraint (current violation)<br>
+<strong>Output:</strong> Updated λ_gc and ρ_adaptive<br><br>
+1. <strong>Compute</strong> relative_improvement ← <br>
+   &nbsp;&nbsp;&nbsp;(prev_violation - current_violation) / prev_violation<br><br>
+2. <strong>If</strong> |gc_constraint| ≤ tolerance <strong>then</strong><br>
+   &nbsp;&nbsp;&nbsp;λ_gc ← λ_gc + ρ · gc_constraint<br>
+   &nbsp;&nbsp;&nbsp;// Constraint satisfied, update multiplier only<br><br>
+3. <strong>Else if</strong> relative_improvement &lt; threshold <strong>then</strong><br>
+   &nbsp;&nbsp;&nbsp;ρ ← min(ρ · update_factor, max_penalty)<br>
+   &nbsp;&nbsp;&nbsp;λ_gc ← λ_gc + ρ · gc_constraint<br>
+   &nbsp;&nbsp;&nbsp;// Insufficient progress, increase penalty<br><br>
+4. <strong>Else</strong><br>
+   &nbsp;&nbsp;&nbsp;λ_gc ← λ_gc + ρ · gc_constraint<br>
+   &nbsp;&nbsp;&nbsp;// Good progress, keep penalty stable<br><br>
+5. prev_violation ← |gc_constraint|
+            </div>
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: finetune.py</div>
+            Lines 260-320 | Method: _update_alm_parameters
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 3:</span> Adaptive Parameter Update Implementation
+            </div>
+            <pre><code class="language-python">def _update_alm_parameters(self, gc_constraint):
+    """
+    Update Lagrangian multiplier and penalty coefficient according to ALM.
+    This implements the adaptive penalty update strategy:
+    - If constraint violation is decreasing sufficiently, update lambda
+      and keep rho
+    - If constraint violation is not improving, increase rho
+      (penalty coefficient)
+    """
+    constraint_violation = abs(gc_constraint.item())
+    # Check if we're making sufficient progress
+    relative_improvement = (
+        (self.previous_constraint_violation - constraint_violation) /
+        max(self.previous_constraint_violation, 1e-8)
+    )
+    if constraint_violation <= self.alm_tolerance:
+        # Constraint satisfied - update lambda, optionally reduce rho
+        self.lambda_gc = self.lambda_gc + self.rho_adaptive * gc_constraint
+        # Could reduce rho here if desired, but keeping it stable
+        # works well in practice
+    elif relative_improvement < self.alm_rel_penalty_increase_threshold:
+        # Not making enough progress - increase penalty
+        self.rho_adaptive = torch.clamp(
+            self.rho_adaptive * self.alm_penalty_update_factor,
+            min=self.alm_min_penalty,
+            max=self.alm_max_penalty
+        )
+        # Also update lambda
+        self.lambda_gc = self.lambda_gc + self.rho_adaptive * gc_constraint
+    else:
+        # Making good progress - just update lambda
+        self.lambda_gc = self.lambda_gc + self.rho_adaptive * gc_constraint
+    # Update tracking
+    self.previous_constraint_violation = torch.tensor(constraint_violation)</code></pre>
+        </div>
+        <div class="handwritten-note">
+            The key insight here is the relative improvement threshold. If the constraint violation isn't
+            improving by at least 10% (default threshold), we increase the penalty coefficient. This ensures
+            that the optimization doesn't get stuck in suboptimal regions where the constraint is consistently
+            violated.
+        </div>
+    </div>
+    <!-- Section 4: Prediction Function -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">4.</span> DNA Sequence Prediction with Constrained Search
+        </div>
+        <div class="description">
+            The prediction function supports multiple decoding strategies including deterministic (greedy),
+            stochastic (temperature sampling), and constrained beam search with GC bounds. This flexibility
+            allows users to balance between optimization quality and sequence diversity.
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: CodonTransformer/CodonPrediction.py</div>
+            Lines 38-120 | Function: predict_dna_sequence
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 4:</span> Main Prediction Function Signature
+            </div>
+            <pre><code class="language-python">def predict_dna_sequence(
+    protein: str,
+    organism: Union[int, str],
+    device: torch.device,
+    tokenizer: Union[str, PreTrainedTokenizerFast] = None,
+    model: Union[str, torch.nn.Module] = None,
+    attention_type: str = "original_full",
+    deterministic: bool = True,
+    temperature: float = 0.2,
+    top_p: float = 0.95,
+    num_sequences: int = 1,
+    match_protein: bool = False,
+    use_constrained_search: bool = False,
+    gc_bounds: Tuple[float, float] = (0.30, 0.70),
+    beam_size: int = 5,
+    length_penalty: float = 1.0,
+    diversity_penalty: float = 0.0,
+) -> Union[DNASequencePrediction, List[DNASequencePrediction]]:
+    """
+    Predict the DNA sequence(s) for a given protein using ENCOT model.
+    This function takes a protein sequence and an organism (as ID or name)
+    as input and returns the predicted DNA sequence(s) using the ENCOT model.
+    It can use either provided tokenizer and model objects or load them from
+    specified paths.
+    Args:
+        protein (str): The input protein sequence for which to predict
+            the DNA sequence.
+        organism (Union[int, str]): Either the ID of the organism or its
+            name (e.g., "Escherichia coli general").
+        device (torch.device): The device (CPU or GPU) to run the model on.
+        deterministic (bool, optional): Whether to use deterministic decoding
+            (most likely tokens). If False, samples tokens according to their
+            probabilities adjusted by the temperature. Defaults to True.
+        temperature (float, optional): A value controlling the randomness of
+            predictions during non-deterministic decoding. Lower values
+            (e.g., 0.2) make the model more conservative, while higher values
+            (e.g., 0.8) increase randomness. Defaults to 0.2.
+        use_constrained_search (bool, optional): Enable constrained beam
+            search with GC bounds. Defaults to False.
+        gc_bounds (Tuple[float, float], optional): GC content bounds
+            (min, max) for constrained search. Defaults to (0.30, 0.70).
+        beam_size (int, optional): Beam size for beam search. Defaults to 5.
+        match_protein (bool, optional): Ensures the predicted DNA sequence
+            translates to the input protein sequence by sampling from only
+            the respective codons of each amino acid. Defaults to False.
+    Returns:
+        Union[DNASequencePrediction, List[DNASequencePrediction]]:
+            Predicted DNA sequence(s) with associated metrics.
+    """</code></pre>
+        </div>
+        <div class="key-concept">
+            <strong>Decoding Strategies:</strong>
+            <table style="margin-top: 15px;">
+                <tr>
+                    <th>Strategy</th>
+                    <th>Use Case</th>
+                    <th>Parameters</th>
+                </tr>
+                <tr>
+                    <td><strong>Greedy (deterministic)</strong></td>
+                    <td>Production optimization</td>
+                    <td>deterministic=True</td>
+                </tr>
+                <tr>
+                    <td><strong>Temperature Sampling</strong></td>
+                    <td>Diversity exploration</td>
+                    <td>deterministic=False, temperature=0.2-0.8</td>
+                </tr>
+                <tr>
+                    <td><strong>Constrained Beam Search</strong></td>
+                    <td>GC-constrained optimization</td>
+                    <td>use_constrained_search=True, gc_bounds=(0.45,0.55)</td>
+                </tr>
+            </table>
+        </div>
+    </div>
+    <!-- Section 5: Evaluation Metrics -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">5.</span> Evaluation Metrics Implementation
+        </div>
+        <div class="description">
+            ENCOT computes comprehensive metrics to evaluate the quality of optimized sequences. The primary
+            metrics are the Codon Adaptation Index (CAI) and tRNA Adaptation Index (tAI), which quantify how
+            well the codon usage matches highly expressed E. coli genes and available tRNA pools, respectively.
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: CodonTransformer/CodonEvaluation.py</div>
+            Lines 23-50, 370-420 | Functions: get_CSI_weights, get_CSI_value, get_ecoli_tai_weights, calculate_tAI
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 5:</span> CAI and tAI Calculation
+            </div>
+            <pre><code class="language-python">def get_CSI_weights(sequences: List[str]) -> Dict[str, float]:
+    """
+    Calculate the Codon Similarity Index (CSI) weights for a list of
+    DNA sequences.
+    CSI is equivalent to CAI when computed from reference sequences.
+    Args:
+        sequences (List[str]): List of DNA sequences from highly expressed
+            genes.
+    Returns:
+        dict: The CSI weights (relative adaptiveness values per codon).
+    """
+    return relative_adaptiveness(sequences=sequences)
+def get_CSI_value(dna: str, weights: Dict[str, float]) -> float:
+    """
+    Calculate the Codon Similarity Index (CSI) for a DNA sequence.
+    This is the CAI score computed using pre-calculated weights.
+    Args:
+        dna (str): The DNA sequence.
+        weights (dict): The CSI weights from get_CSI_weights.
+    Returns:
+        float: The CSI value (range 0-1, higher is better).
+    """
+    return CAI(dna, weights)
+def get_ecoli_tai_weights():
+    """
+    Returns pre-calculated tAI weights for E. coli K-12 MG1655.
+    These weights are based on tRNA gene copy numbers and wobble base
+    pairing rules. Higher weights indicate more available tRNA for
+    that codon.
+    Returns:
+        dict: Mapping from codon to tAI weight (0-1).
+    """
+    return {
+        'TTT': 0.58, 'TTC': 0.42, 'TTA': 0.13, 'TTG': 0.13,
+        'TCT': 0.15, 'TCC': 0.15, 'TCA': 0.12, 'TCG': 0.15,
+        'TAT': 0.59, 'TAC': 0.41, 'TGT': 0.46, 'TGC': 0.54,
+        'TGG': 1.00, 'CTT': 0.11, 'CTC': 0.10, 'CTA': 0.04,
+        'CTG': 0.49, 'CCT': 0.16, 'CCC': 0.12, 'CCA': 0.19,
+        'CCG': 0.52, 'CAT': 0.57, 'CAC': 0.43, 'CAA': 0.34,
+        'CAG': 0.66, 'ATT': 0.51, 'ATC': 0.42, 'ATA': 0.07,
+        'ATG': 1.00, 'ACT': 0.17, 'ACC': 0.44, 'ACA': 0.13,
+        'ACG': 0.27, 'AAT': 0.49, 'AAC': 0.51, 'AAA': 0.76,
+        'AAG': 0.24, 'AGT': 0.15, 'AGC': 0.28, 'AGA': 0.07,
+        'AGG': 0.04, 'GTT': 0.28, 'GTC': 0.20, 'GTA': 0.15,
+        'GTG': 0.37, 'GCT': 0.18, 'GCC': 0.27, 'GCA': 0.21,
+        'GCG': 0.36, 'GAT': 0.63, 'GAC': 0.37, 'GAA': 0.68,
+        'GAG': 0.32, 'GGT': 0.35, 'GGC': 0.40, 'GGA': 0.11,
+        'GGG': 0.15,
+    }
+def calculate_tAI(sequence: str, tai_weights: Dict[str, float]) -> float:
+    """
+    Calculate the tRNA Adaptation Index (tAI) for a DNA sequence.
+    The tAI is the geometric mean of the tAI weights for all codons in
+    the sequence (excluding stop codons).
+    Args:
+        sequence (str): DNA sequence (must be divisible by 3)
+        tai_weights (Dict[str, float]): tAI weights for each codon
+    Returns:
+        float: Geometric mean of tAI weights (range 0-1)
+    """
+    if len(sequence) % 3 != 0:
+        raise ValueError("Sequence length must be divisible by 3")
+    # Split into codons
+    codons = [sequence[i:i+3].upper() for i in range(0, len(sequence), 3)]
+    # Get weights for non-stop codons
+    weights = [tai_weights.get(codon, 0.5) for codon in codons
+               if codon not in ['TAA', 'TAG', 'TGA']]
+    if not weights:
+        return 0.0
+    # Compute geometric mean
+    product = 1.0
+    for w in weights:
+        product *= w
+    return product ** (1.0 / len(weights))</code></pre>
+        </div>
+        <div class="annotation">
+            <strong>Metric Interpretation:</strong> Both CAI and tAI range from 0 to 1, with higher values
+            indicating better optimization. In practice, for E. coli:
+            <ul style="margin: 10px 0 0 20px;">
+                <li>CAI > 0.8 indicates excellent codon adaptation</li>
+                <li>tAI > 0.4 suggests adequate tRNA availability</li>
+                <li>Native E. coli genes typically have CAI around 0.65-0.75</li>
+            </ul>
+        </div>
+    </div>
+    <!-- Section 6: Training Configuration -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">6.</span> Training Configuration
+        </div>
+        <div class="description">
+            The training configuration specifies all hyperparameters including learning rate, batch size,
+            and ALM-specific settings. This configuration reproduces the exact setup used in our experiments.
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: configs/train_ecoli_alm.yaml</div>
+            Complete configuration file
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 6:</span> Complete Training Configuration
+            </div>
+            <pre><code class="language-yaml"># ENCOT ALM Training Configuration
+# This configuration reproduces the main training setup from the paper
+# using the Augmented-Lagrangian Method (ALM) for GC content control.
+model:
+  base_model: "adibvafa/CodonTransformer-base"
+  tokenizer: "adibvafa/CodonTransformer"
+data:
+  dataset_dir: "data"
+  # Expected files: finetune_set.json (created by preprocess_data.py)
+training:
+  batch_size: 6
+  max_epochs: 15
+  learning_rate: 5e-5
+  warmup_fraction: 0.1
+  num_workers: 5
+  accumulate_grad_batches: 1
+  num_gpus: 4
+  save_every_n_steps: 512
+  seed: 123
+  log_every_n_steps: 20
+checkpoint:
+  checkpoint_dir: "models/alm-enhanced-training"
+  checkpoint_filename: "balanced_alm_finetune.ckpt"
+# Augmented-Lagrangian Method (ALM) for GC content control
+alm:
+  enabled: true
+  gc_target: 0.52  # Target GC content for E. coli (52%)
+  curriculum_epochs: 3  # Warm-up epochs before enforcing GC constraint
+  # ALM penalty parameters
+  initial_penalty_factor: 20.0
+  penalty_update_factor: 10.0
+  max_penalty: 1e6
+  min_penalty: 1e-6
+  # ALM tolerance parameters
+  tolerance: 1e-5  # Primal tolerance
+  dual_tolerance: 1e-5  # Dual tolerance for constraint violation
+  tolerance_update_factor: 0.1
+  # Adaptive penalty adjustment
+  rel_penalty_increase_threshold: 0.1
+# Legacy penalty method (if ALM disabled)
+gc_penalty:
+  weight: 0.0  # Only used if use_lagrangian=false</code></pre>
+        </div>
+        <div class="key-concept">
+            <strong>Hyperparameter Selection Rationale:</strong>
+            <table style="margin-top: 15px;">
+                <tr>
+                    <th>Parameter</th>
+                    <th>Value</th>
+                    <th>Rationale</th>
+                </tr>
+                <tr>
+                    <td>gc_target</td>
+                    <td>0.52</td>
+                    <td>Native E. coli genome GC content</td>
+                </tr>
+                <tr>
+                    <td>curriculum_epochs</td>
+                    <td>3</td>
+                    <td>Allow basic pattern learning before constraint</td>
+                </tr>
+                <tr>
+                    <td>initial_penalty_factor</td>
+                    <td>20.0</td>
+                    <td>Moderate initial constraint enforcement</td>
+                </tr>
+                <tr>
+                    <td>penalty_update_factor</td>
+                    <td>10.0</td>
+                    <td>Aggressive adaptation for fast convergence</td>
+                </tr>
+            </table>
+        </div>
+    </div>
+    <!-- Section 7: Data Validation -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">7.</span> Sequence Validation Pipeline
+        </div>
+        <div class="description">
+            Before training, all DNA sequences undergo rigorous validation to ensure biological correctness.
+            Invalid sequences are filtered out to maintain data quality.
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: prepare_ecoli_data.py</div>
+            Lines 5-30 | Function: is_valid_sequence
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 7:</span> Sequence Validation Function
+            </div>
+            <pre><code class="language-python">def is_valid_sequence(dna_seq: str) -> bool:
+    """
+    Applies a series of validation checks to a DNA sequence.
+    Validation criteria:
+    1. Length must be divisible by 3 (valid codon frame)
+    2. Must start with a valid start codon (ATG, TTG, CTG, or GTG)
+    3. Must end with a valid stop codon (TAA, TAG, or TGA)
+    4. Must not contain internal stop codons
+    5. Must contain only valid nucleotides (A, T, G, C)
+    Args:
+        dna_seq (str): The DNA sequence to validate.
+    Returns:
+        bool: True if the sequence passes all checks, False otherwise.
+    """
+    # Check 1: Valid codon frame
+    if len(dna_seq) % 3 != 0:
+        return False
+    # Check 2: Valid start codon
+    if not dna_seq.upper().startswith(('ATG', 'TTG', 'CTG', 'GTG')):
+        return False
+    # Check 3: Valid stop codon
+    if not dna_seq.upper().endswith(('TAA', 'TAG', 'TGA')):
+        return False
+    # Check 4: No internal stop codons (excluding the last codon)
+    codons = [dna_seq[i:i+3].upper()
+              for i in range(0, len(dna_seq) - 3, 3)]
+    if any(codon in ['TAA', 'TAG', 'TGA'] for codon in codons):
+        return False
+    # Check 5: Only valid nucleotides
+    if not all(c in 'ATGC' for c in dna_seq.upper()):
+        return False
+    return True</code></pre>
+        </div>
+        <div class="handwritten-note">
+            The validation function is intentionally strict to ensure high-quality training data. In our
+            preprocessing of the E. coli genome, approximately 95% of sequences passed all validation checks.
+            The most common reason for rejection was sequences with internal stop codons due to sequencing
+            errors or pseudogenes.
+        </div>
+    </div>
+    <!-- Section 8: Benchmark Evaluation -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">8.</span> Benchmark Evaluation Pipeline
+        </div>
+        <div class="description">
+            The benchmark pipeline evaluates ENCOT on a test set of protein sequences, computing multiple
+            metrics for each optimized sequence and generating comprehensive performance reports.
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: benchmark_evaluation.py</div>
+            Lines 300-400 | Function: benchmark_sequences
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 8:</span> Benchmark Evaluation Function
+            </div>
+            <pre><code class="language-python">def benchmark_sequences(sequences, model, tokenizer, device,
+                       cai_weights, tai_weights):
+    """
+    Run ENCOT on protein sequences and compute metrics for optimized DNA.
+    Args:
+        sequences: List of (name, protein) tuples to optimize
+        model: Loaded ENCOT model
+        tokenizer: Tokenizer for the model
+        device: PyTorch device (CPU/GPU)
+        cai_weights: Pre-computed CAI weights from reference sequences
+        tai_weights: Pre-computed tAI weights for E. coli
+    Returns:
+        DataFrame with columns: name, protein, optimized_dna, length, CAI,
+                                tAI, GC_content, negative_cis_elements,
+                                max_homopolymer_length
+    """
+    results = []
+    for name, protein in tqdm(sequences, desc="Optimizing sequences"):
+        # Optimize the sequence using ENCOT
+        output = predict_dna_sequence(
+            protein=protein,
+            organism="Escherichia coli general",
+            device=device,
+            model=model,
+            tokenizer=tokenizer,
+            deterministic=True,
+            use_constrained_search=True,
+            gc_bounds=(0.45, 0.55)  # E. coli optimal range
+        )
+        optimized_dna = output.predicted_dna
+        # Calculate comprehensive metrics
+        cai = get_CSI_value(optimized_dna, cai_weights)
+        tai = calculate_tAI(optimized_dna, tai_weights)
+        gc_content = get_GC_content(optimized_dna)
+        cis_elements = count_negative_cis_elements(optimized_dna)
+        homopolymers = calculate_homopolymer_runs(optimized_dna)
+        results.append({
+            'name': name,
+            'protein': protein,
+            'optimized_dna': optimized_dna,
+            'length': len(optimized_dna),
+            'CAI': cai,
+            'tAI': tai,
+            'GC_content': gc_content,
+            'negative_cis_elements': cis_elements,
+            'max_homopolymer_length': homopolymers
+        })
+    return pd.DataFrame(results)</code></pre>
+        </div>
+        <div class="key-concept">
+            <strong>Benchmark Metrics Summary:</strong>
+            <ul style="margin: 10px 0 0 20px;">
+                <li><strong>CAI:</strong> Measures codon usage similarity to highly expressed genes</li>
+                <li><strong>tAI:</strong> Quantifies tRNA availability for translation</li>
+                <li><strong>GC Content:</strong> Should be near 52% for E. coli</li>
+                <li><strong>Negative cis-elements:</strong> Count of problematic regulatory sequences</li>
+                <li><strong>Homopolymers:</strong> Long runs that cause synthesis issues</li>
+            </ul>
+        </div>
+    </div>
+    <!-- Section 9: Usage Example -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">9.</span> Complete Usage Example
+        </div>
+        <div class="description">
+            This example demonstrates a complete workflow: loading the model, optimizing a sequence, and
+            evaluating the results. This is the recommended pattern for production use.
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 9:</span> End-to-End Optimization Workflow
+            </div>
+            <pre><code class="language-python">#!/usr/bin/env python3
+"""
+Complete workflow example for ENCOT codon optimization.
+"""
+import torch
+from transformers import AutoTokenizer
+from CodonTransformer.CodonPrediction import load_model, predict_dna_sequence
+from CodonTransformer.CodonEvaluation import (
+    get_GC_content, calculate_tAI, get_CSI_value,
+    get_ecoli_tai_weights, count_negative_cis_elements
+)
+from CAI import relative_adaptiveness
+from huggingface_hub import hf_hub_download
+# Step 1: Setup device and load model
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
+# Download model from HuggingFace
+checkpoint_path = hf_hub_download(
+    repo_id="saketh11/ColiFormer",
+    filename="balanced_alm_finetune.ckpt",
+    cache_dir="./hf_cache"
+)
+model = load_model(model_path=checkpoint_path, device=device)
+tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
+# Step 2: Define protein to optimize
+protein = "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG"
+print(f"Input protein ({len(protein)} aa): {protein}")
+# Step 3: Optimize the sequence
+print("\nOptimizing...")
+output = predict_dna_sequence(
+    protein=protein,
+    organism="Escherichia coli general",
+    device=device,
+    model=model,
+    tokenizer=tokenizer,
+    deterministic=True,
+    match_protein=True,
+    use_constrained_search=True,
+    gc_bounds=(0.45, 0.55),
+    beam_size=20
+)
+optimized_dna = output.predicted_dna
+print(f"Optimized DNA ({len(optimized_dna)} bp): {optimized_dna[:60]}...")
+# Step 4: Evaluate metrics
+print("\nComputing metrics...")
+# Load reference weights
+tai_weights = get_ecoli_tai_weights()
+# For CAI, we need reference sequences (use E. coli highly expressed genes)
+# In practice, load from your reference dataset
+reference_sequences = load_reference_sequences()  # Your function
+cai_weights = relative_adaptiveness(reference_sequences)
+# Calculate metrics
+cai = get_CSI_value(optimized_dna, cai_weights)
+tai = calculate_tAI(optimized_dna, tai_weights)
+gc = get_GC_content(optimized_dna)
+cis = count_negative_cis_elements(optimized_dna)
+# Step 5: Report results
+print("\n" + "="*50)
+print("OPTIMIZATION RESULTS")
+print("="*50)
+print(f"CAI (Codon Adaptation Index):     {cai:.4f}")
+print(f"tAI (tRNA Adaptation Index):      {tai:.4f}")
+print(f"GC Content:                        {gc:.2f}%")
+print(f"Negative cis-regulatory elements:  {cis}")
+print("="*50)
+# Step 6: Verify translation
+from Bio.Seq import Seq
+translated = str(Seq(optimized_dna).translate())
+assert translated == protein, "Translation mismatch!"
+print("\n✓ Optimized DNA correctly translates to input protein")</code></pre>
+        </div>
+    </div>
+    <!-- Section 11: Constrained Beam Search -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">11.</span> Constrained Beam Search Implementation
+        </div>
+        <div class="description">
+            The constrained beam search algorithm ensures that generated DNA sequences maintain GC content within specified bounds. This method prunes candidates that violate constraints during generation, improving efficiency compared to post-hoc filtering.
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: CodonTransformer/CodonPrediction.py</div>
+            Lines 850-950 | Function: _constrained_beam_search()
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 11:</span> Constrained Beam Search Core
+            </div>
+            <pre><code class="language-python">def _constrained_beam_search(model, input_ids, attention_mask,
+                             beam_size, gc_bounds, max_len, device):
+    """
+    Constrained beam search that enforces GC content bounds during generation.
+    Args:
+        model: CodonTransformer model
+        input_ids: Tokenized input [batch_size, seq_len]
+        attention_mask: Attention mask
+        beam_size: Number of candidates to maintain
+        gc_bounds: (min_gc, max_gc) tuple for GC content
+        max_len: Maximum sequence length
+        device: torch device
+    Returns:
+        Best sequence satisfying GC constraints
+    """
+    batch_size = input_ids.size(0)
+    min_gc, max_gc = gc_bounds
+    # Initialize beams: (sequence, score, gc_count, length)
+    beams = [(input_ids[0].clone(), 0.0, 0, 0)]
+    for step in range(max_len):
+        all_candidates = []
+        for seq, score, gc_count, length in beams:
+            # Get model predictions
+            with torch.no_grad():
+                outputs = model(seq.unsqueeze(0))
+                logits = outputs.logits[0, -1, :]  # Last position
+                probs = torch.softmax(logits, dim=-1)
+            # Get top-k tokens
+            top_probs, top_indices = torch.topk(probs, beam_size * 2)
+            for prob, token_id in zip(top_probs, top_indices):
+                # Decode token to codon
+                token = tokenizer.decode([token_id])
+                # Calculate GC content
+                new_gc_count = gc_count + token.count('G') + token.count('C')
+                new_length = length + len(token)
+                current_gc = new_gc_count / new_length if new_length > 0 else 0.0
+                # Check GC constraint (with some relaxation early on)
+                relaxation = max(0.1, 1.0 - step / max_len)
+                if min_gc - relaxation <= current_gc <= max_gc + relaxation:
+                    new_seq = torch.cat([seq, token_id.unsqueeze(0)])
+                    new_score = score + torch.log(prob).item()
+                    all_candidates.append((new_seq, new_score,
+                                          new_gc_count, new_length))
+        # Select top beams
+        all_candidates.sort(key=lambda x: x[1], reverse=True)
+        beams = all_candidates[:beam_size]
+        if not beams:
+            raise ValueError("No valid candidates found within GC bounds")
+    # Return best sequence
+    return beams[0][0]</code></pre>
+        </div>
+        <div class="handwritten-note">
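+            The relaxation factor widens both GC bounds by a tolerance that shrinks linearly as generation proceeds, from ±1.0 (as a GC fraction) at the first step down to a floor of ±0.1. The constraint therefore tightens gradually as the sequence grows, which prevents premature pruning of potentially good candidates, but it is never enforced more tightly than the requested bounds ±0.1.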
+        </div>
+    </div>
+    <!-- Section 12: GC Content Calculation -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">12.</span> GC Content Analysis
+        </div>
+        <div class="description">
+            Precise GC content calculation is critical for both training constraints and sequence evaluation. The implementation handles edge cases and provides window-based analysis for local GC variations.
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: CodonTransformer/CodonEvaluation.py</div>
+            Lines 245-285 | Function: get_GC_content()
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 12:</span> GC Content Calculation
+            </div>
+            <pre><code class="language-python">def get_GC_content(dna_sequence: str, window_size: int = None) -> float:
+    """
+    Calculate GC content of a DNA sequence.
+    Args:
+        dna_sequence: DNA sequence string
+        window_size: If provided, calculate sliding window GC content
+    Returns:
+        GC content as percentage (0-100) or list of windowed values
+    """
+    if not dna_sequence:
+        raise ValueError("DNA sequence cannot be empty")
+    # Convert to uppercase and validate
+    dna_sequence = dna_sequence.upper()
+    valid_bases = set('ATGC')
+    if not all(base in valid_bases for base in dna_sequence):
+        raise ValueError("DNA sequence contains invalid characters")
+    if window_size is None:
+        # Global GC content
+        gc_count = dna_sequence.count('G') + dna_sequence.count('C')
+        total = len(dna_sequence)
+        return (gc_count / total) * 100.0 if total > 0 else 0.0
+    else:
+        # Sliding window GC content
+        if window_size <= 0 or window_size > len(dna_sequence):
+            raise ValueError(f"Invalid window size: {window_size}")
+        gc_values = []
+        for i in range(len(dna_sequence) - window_size + 1):
+            window = dna_sequence[i:i + window_size]
+            gc_count = window.count('G') + window.count('C')
+            gc_pct = (gc_count / window_size) * 100.0
+            gc_values.append(gc_pct)
+        return gc_values
+def calculate_gc_variance(dna_sequence: str, window_size: int = 100) -> float:
+    """Calculate variance in GC content across sequence windows"""
+    gc_values = get_GC_content(dna_sequence, window_size)
+    if len(gc_values) < 2:
+        return 0.0
+    mean_gc = sum(gc_values) / len(gc_values)
+    variance = sum((x - mean_gc) ** 2 for x in gc_values) / len(gc_values)
+    return variance</code></pre>
+        </div>
+    </div>
+    <!-- Section 13: Tokenization Pipeline -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">13.</span> Sequence Tokenization
+        </div>
+        <div class="description">
+            The tokenization pipeline converts protein and DNA sequences into codon-level tokens that the transformer can process. Each codon is represented as a single token (e.g., "l_ctg" for leucine codon CTG).
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: CodonTransformer/CodonUtils.py</div>
+            Lines 35-130 | Constant: TOKEN2INDEX
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 13:</span> Codon Tokenization Dictionary
+            </div>
+            <pre><code class="language-python"># Codon-to-token mapping: amino_acid_codon format
+TOKEN2INDEX = {
+    "[PAD]": 0,      # Padding token
+    "[UNK]": 1,      # Unknown token
+    "[CLS]": 2,      # Classification token
+    "[SEP]": 3,      # Separator token
+    "[MASK]": 4,     # Mask token for MLM
+    # Amino acid codons (format: amino_codon)
+    "a_gca": 62,     # Alanine - GCA
+    "a_gcc": 63,     # Alanine - GCC
+    "a_gcg": 64,     # Alanine - GCG
+    "a_gct": 65,     # Alanine - GCT
+    "c_tgc": 83,     # Cysteine - TGC
+    "c_tgt": 85,     # Cysteine - TGT
+    "d_gac": 59,     # Aspartate - GAC
+    "d_gat": 61,     # Aspartate - GAT
+    "e_gaa": 58,     # Glutamate - GAA
+    "e_gag": 60,     # Glutamate - GAG
+    "f_ttc": 87,     # Phenylalanine - TTC
+    "f_ttt": 89,     # Phenylalanine - TTT
+    "g_gga": 66,     # Glycine - GGA
+    "g_ggc": 67,     # Glycine - GGC
+    "g_ggg": 68,     # Glycine - GGG
+    "g_ggt": 69,     # Glycine - GGT
+    # ... (61 codon tokens total for all amino acids)
+    "__taa": 74,     # Stop codon - TAA
+    "__tag": 76,     # Stop codon - TAG
+    "__tga": 82,     # Stop codon - TGA
+}
+# Organism ID mapping (164 organisms supported)
+ORGANISM2ID = {
+    "Escherichia coli general": 0,
+    "Homo sapiens": 1,
+    "Saccharomyces cerevisiae": 2,
+    "Bacillus subtilis": 3,
+    # ... (160 more organisms)
+}
+def get_merged_seq(protein: str, dna: str = "",
+                   include_start_codon: bool = True) -> str:
+    """
+    Merge protein and DNA into codon tokens.
+    For training: protein + DNA codons
+    For inference: protein + [MASK] tokens
+    Args:
+        protein: Amino acid sequence
+        dna: DNA sequence (empty for inference)
+        include_start_codon: Add ATG start codon
+    Returns:
+        Space-separated codon tokens
+    """
+    tokens = ["[CLS]"]
+    if include_start_codon:
+        tokens.append("m_atg")  # Start codon
+    # Convert protein to amino acid tokens
+    for aa in protein.lower():
+        if dna:
+            # Training: use actual codons from DNA
+            codon = dna[:3].lower()
+            dna = dna[3:]
+            token = f"{aa}_{codon}"
+        else:
+            # Inference: use [MASK] for model to predict
+            token = "[MASK]"
+        tokens.append(token)
+    tokens.append("[SEP]")
+    return " ".join(tokens)</code></pre>
+        </div>
+        <div class="handwritten-note">
+            The codon token format (amino_codon) ensures the model learns both the amino acid identity and its preferred codon, enabling organism-specific optimization.
+        </div>
+    </div>
+    <!-- Section 14: Model Architecture Details -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">14.</span> BigBird Transformer Architecture
+        </div>
+        <div class="description">
+            ENCOT employs a BigBird transformer with block-sparse attention, allowing it to process long sequences (up to 2048 tokens) efficiently. The model has 89.6 million parameters.
+        </div>
+        <div class="algorithm-box">
+            <div class="algorithm-title">Algorithm 2: Block-Sparse Attention</div>
+            <div class="algorithm-content">
+# BigBird Attention Patterns:
+#   1. Global attention: All positions attend to [CLS] token
+#   2. Random attention: Each position attends to r random positions
+#   3. Local attention: Each position attends to w neighboring positions
+#
+# Parameters:
+#   - Block size: 64 tokens
+#   - Number of random blocks: 3
+#   - Window size: 3 blocks (192 tokens)
+#
+# Complexity: O(n) instead of O(n²) for full attention
+for each query position i:
+    # 1. Global tokens (always included)
+    attend_to(CLS_token)
+    # 2. Local window (w=3 blocks)
+    for j in range(i - window_size, i + window_size):
+        if 0 <= j < seq_len:
+            attend_to(position_j)
+    # 3. Random positions (r=3 blocks)
+    random_positions = sample_random(num_blocks=3)
+    for j in random_positions:
+        attend_to(position_j)
+# Memory: O(n * (w + r + g)) where g = global tokens
+            </div>
+        </div>
+        <div class="key-concept">
+            <strong>Model Configuration:</strong>
+            <ul style="margin: 10px 0 0 20px;">
+                <li>Hidden size: 768</li>
+                <li>Number of layers: 12</li>
+                <li>Attention heads: 12</li>
+                <li>Intermediate size: 3072</li>
+                <li>Max position embeddings: 2048</li>
+                <li>Vocabulary size: 95 tokens (61 codons + special tokens + organism IDs)</li>
+                <li>Total parameters: 89,584,895</li>
+            </ul>
+        </div>
+    </div>
+    <!-- Section 15: CAI Calculation Details -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">15.</span> Codon Adaptation Index (CAI)
+        </div>
+        <div class="description">
+            CAI measures how well a sequence's codon usage matches the host organism's preferred codons. Values range from 0 to 1, with higher values indicating better adaptation.
+        </div>
+        <div class="mathematical">
+            <strong>CAI Formula:</strong><br><br>
+            <i>CAI</i> = exp( (1/<i>L</i>) · Σ ln(<i>w<sub>i</sub></i>) )
+            <div class="equation-label">(Eq. 2)</div>
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: CodonTransformer/CodonEvaluation.py</div>
+            Lines 85-140 | Function: get_CSI_value()
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 15:</span> CAI Calculation
+            </div>
+            <pre><code class="language-python">def get_CSI_value(dna_sequence: str, weights: Dict[str, float]) -> float:
+    """
+    Calculate Codon Adaptation Index (CAI) for a DNA sequence.
+    CAI = exp( (1/L) * sum(ln(w_i)) )
+    where:
+        L = number of codons
+        w_i = relative adaptedness of codon i
+    Args:
+        dna_sequence: DNA sequence (must be multiple of 3)
+        weights: Dictionary mapping codons to weights (0-1)
+    Returns:
+        CAI value (0-1, higher is better)
+    """
+    from CAI import CAI as CAI_calculator
+    if len(dna_sequence) % 3 != 0:
+        raise ValueError("DNA sequence length must be multiple of 3")
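+    # Drop stop codons for the empty-sequence check and the manual fallback
+    # below; the CAI package call still receives the full dna_sequence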
+    stop_codons = {'TAA', 'TAG', 'TGA'}
+    codons = [dna_sequence[i:i+3].upper()
+              for i in range(0, len(dna_sequence), 3)]
+    codons = [c for c in codons if c not in stop_codons]
+    if not codons:
+        return 0.0
+    # Calculate CAI using log-geometric mean
+    try:
+        cai = CAI_calculator(
+            sequence=dna_sequence,
+            weights=weights
+        )
+        return cai
+    except Exception as e:
+        # Fallback: manual calculation
+        log_sum = 0.0
+        count = 0
+        for codon in codons:
+            if codon in weights:
+                weight = weights[codon]
+                if weight > 0:
+                    log_sum += math.log(weight)
+                    count += 1
+        if count == 0:
+            return 0.0
+        cai = math.exp(log_sum / count)
+        return cai
+def get_organism_cai_weights(organism: str) -> Dict[str, float]:
+    """Load organism-specific CAI weights from reference genomes"""
+    # Weights represent relative codon usage in highly expressed genes
+    # Calculated from top 10% expressed genes in the organism
+    weights_file = f"data/cai_weights/{organism.replace(' ', '_')}.json"
+    with open(weights_file, 'r') as f:
+        return json.load(f)</code></pre>
+        </div>
+    </div>
+    <!-- Section 16: tAI Calculation -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">16.</span> tRNA Adaptation Index (tAI)
+        </div>
+        <div class="description">
+            tAI estimates translation efficiency based on tRNA availability and codon-anticodon binding strength. It accounts for wobble base pairing and tRNA gene copy numbers.
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: CodonTransformer/CodonEvaluation.py</div>
+            Lines 180-240 | Function: calculate_tAI()
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 16:</span> tAI Calculation
+            </div>
+            <pre><code class="language-python">def calculate_tAI(dna_sequence: str, tai_weights: Dict[str, float]) -> float:
+    """
+    Calculate tRNA Adaptation Index (tAI).
+    tAI accounts for:
+        1. tRNA gene copy numbers
+        2. Wobble base pairing efficiency
+        3. Codon-anticodon binding strength
+    tAI = geometric_mean( w_i * (1 - s_i) )
+    where:
+        w_i = tRNA availability for codon i
+        s_i = selection coefficient (wobble penalty)
+    Args:
+        dna_sequence: DNA sequence
+        tai_weights: Pre-calculated weights per codon
+    Returns:
+        tAI value (0-1, higher indicates better translation efficiency)
+    """
+    if len(dna_sequence) % 3 != 0:
+        raise ValueError("Sequence length must be multiple of 3")
+    codons = [dna_sequence[i:i+3].upper()
+              for i in range(0, len(dna_sequence), 3)]
+    # Remove stop codons
+    stop_codons = {'TAA', 'TAG', 'TGA'}
+    codons = [c for c in codons if c not in stop_codons]
+    if not codons:
+        return 0.0
+    # Calculate geometric mean of weights
+    weight_product = 1.0
+    valid_count = 0
+    for codon in codons:
+        if codon in tai_weights:
+            weight = tai_weights[codon]
+            if weight > 0:
+                weight_product *= weight
+                valid_count += 1
+    if valid_count == 0:
+        return 0.0
+    # Geometric mean
+    tai = weight_product ** (1.0 / valid_count)
+    return tai
+# Wobble base pairing penalties
+WOBBLE_PENALTIES = {
+    'GU': 0.0,    # Strong wobble (no penalty)
+    'GC': 0.0,    # Watson-Crick (no penalty)
+    'AU': 0.0,    # Watson-Crick (no penalty)
+    'GA': 0.5,    # Weak wobble
+    'CA': 0.5,    # Weak wobble
+    'IU': 0.1,    # Inosine wobble
+    'IC': 0.1,    # Inosine wobble
+    'IA': 0.3,    # Inosine wobble (weaker)
+}</code></pre>
+        </div>
+        <div class="handwritten-note">
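+            tAI is often regarded as a more mechanistic measure than CAI because it models tRNA supply and codon-anticodon pairing rather than codon frequency alone. Note that the function above only takes the geometric mean of the pre-computed per-codon weights it is given; the wobble penalties are listed for reference and are not applied inside calculate_tAI().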
+        </div>
+    </div>
+    <!-- Section 17: Negative Cis-Elements Detection -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">17.</span> Regulatory Motif Detection
+        </div>
+        <div class="description">
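+            This routine counts negative cis-regulatory elements that could interfere with gene expression. For E. coli it scans for Chi sites, internal Shine-Dalgarno sequences, promoter-like boxes, and long homopolymer runs; for other organisms it scans for polyadenylation signals and splice-site motifs. Very high overall GC content (above 70%) is counted as one additional element.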
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: CodonTransformer/CodonEvaluation.py</div>
+            Lines 290-350 | Function: count_negative_cis_elements()
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 17:</span> Cis-Element Scanning
+            </div>
+            <pre><code class="language-python">def count_negative_cis_elements(dna_sequence: str,
+                                      organism: str = "ecoli") -> int:
+    """
+    Detect negative cis-regulatory elements in DNA sequence.
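+    Scans for:
+        - E. coli: Chi site (GCTGGTGG), Shine-Dalgarno sequences
+          (AGGAGG, AGGAG), promoter-like motifs (TATAAA, TTGACA, TATAAT)
+          and 8-nt homopolymers of A, C, G or T
+        - Other organisms: polyadenylation signals (AATAAA, ATTAAA) and
+          splice donor/acceptor motifs (GTAAGT, GGTAAG, CAGG)
+        - All organisms: G or C runs of 6+ nt, and overall GC content
+          above 70% (counted as one element)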
+    Args:
+        dna_sequence: DNA sequence to scan
+        organism: Target organism (affects motif set)
+    Returns:
+        Total count of problematic elements found
+    """
+    dna_upper = dna_sequence.upper()
+    element_count = 0
+    if organism == "ecoli":
+        # E. coli-specific elements
+        negative_motifs = {
+            'GCTGGTGG': 'Chi site (recombination hotspot)',
+            'AGGAGG': 'Strong Shine-Dalgarno (internal RBS)',
+            'AGGAG': 'Moderate Shine-Dalgarno',
+            'TATAAA': 'Promoter-like sequence',
+            'TTGACA': 'Promoter -35 box',
+            'TATAAT': 'Promoter -10 box',
+            'AAAAAAAA': 'Poly-A (8+)',
+            'CCCCCCCC': 'Poly-C (8+)',
+            'GGGGGGGG': 'Poly-G (8+) - G-quadruplex risk',
+            'TTTTTTTT': 'Poly-T (8+) - terminator',
+        }
+    else:
+        # Eukaryotic elements
+        negative_motifs = {
+            'AATAAA': 'Polyadenylation signal',
+            'ATTAAA': 'Alternative polyA signal',
+            'GTAAGT': 'Splice donor site',
+            'CAGG': 'Splice acceptor site',
+            'GGTAAG': 'Strong splice donor',
+        }
+    # Count occurrences of each motif
+    for motif, description in negative_motifs.items():
+        count = dna_upper.count(motif)
+        if count > 0:
+            element_count += count
+            print(f"  Found {count}x {description}: {motif}")
+    # Check for G/C homopolymer runs (length >= 6)
+    import re
+    homopolymers = re.findall(r'G{6,}|C{6,}', dna_upper)
+    if homopolymers:
+        element_count += len(homopolymers)
+    # Check for complex secondary structures
+    gc_content = get_GC_content(dna_sequence)
+    if gc_content > 70:
+        print(f"  Warning: Very high GC content ({gc_content:.1f}%) may cause secondary structures")
+        element_count += 1
+    return element_count</code></pre>
+        </div>
+    </div>
+    <!-- Section 18: Streamlit GUI -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">18.</span> Interactive Web Interface
+        </div>
+        <div class="description">
+            The Streamlit-based GUI provides a user-friendly interface for sequence optimization, parameter tuning, and result visualization without requiring programming knowledge.
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: streamlit_gui/app.py</div>
+            Lines 1-100, 200-280 | Main Application
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 18:</span> Streamlit GUI Core
+            </div>
+            <pre><code class="language-python">import streamlit as st
+import torch
+from CodonTransformer.CodonPrediction import predict_dna_sequence
+from CodonTransformer.CodonEvaluation import (
+    get_CSI_value, calculate_tAI, get_GC_content
+)
+# Configure page
+st.set_page_config(
+    page_title="ENCOT GUI",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# Initialize session state
+if 'model' not in st.session_state:
+    st.session_state.model = None
+if 'tokenizer' not in st.session_state:
+    st.session_state.tokenizer = None
+if 'results' not in st.session_state:
+    st.session_state.results = None
+def main():
+    st.title("ENCOT: Enhanced Codon Optimization Tool")
+    st.markdown("Transform protein sequences into optimized DNA for enhanced expression")
+    # Sidebar: Model configuration
+    with st.sidebar:
+        st.header("⚙️ Configuration")
+        model_choice = st.selectbox(
+            "Model",
+            ["saketh11/ColiFormer (89M params)", "Local checkpoint"]
+        )
+        organism = st.selectbox(
+            "Target Organism",
+            ["Escherichia coli general", "Bacillus subtilis",
+             "Homo sapiens", "Saccharomyces cerevisiae"]
+        )
+        st.subheader("Generation Parameters")
+        deterministic = st.checkbox("Deterministic", value=True)
+        if not deterministic:
+            temperature = st.slider("Temperature", 0.1, 2.0, 1.0, 0.1)
+            top_p = st.slider("Top-p (nucleus sampling)", 0.1, 1.0, 0.9, 0.05)
+        else:
+            temperature = 1.0
+            top_p = 0.95
+        # GC content control
+        use_constrained = st.checkbox("Constrained Beam Search", value=False)
+        if use_constrained:
+            gc_min = st.slider("Min GC%", 30, 70, 45, 1) / 100
+            gc_max = st.slider("Max GC%", 30, 70, 60, 1) / 100
+            beam_size = st.slider("Beam Size", 2, 20, 5, 1)
+    # Main area: Input
+    st.header("📝 Input Protein Sequence")
+    protein_input = st.text_area(
+        "Enter protein sequence (FASTA or plain text)",
+        height=150,
+        placeholder=">my_protein\nMKTAYIAKQRQISFVKSHF..."
+    )
+    # Parse FASTA if provided
+    if protein_input.startswith('>'):
+        lines = protein_input.strip().split('\n')
+        protein_seq = ''.join(lines[1:])
+    else:
+        protein_seq = protein_input.replace(' ', '').replace('\n', '')
+    # Optimization button
+    if st.button("🚀 Optimize Sequence", type="primary"):
+        if not protein_seq:
+            st.error("Please enter a protein sequence")
+            return
+        with st.spinner("Optimizing codon usage..."):
+            # Load model
+            if st.session_state.model is None:
+                with st.spinner("Loading model (first time only)..."):
+                    from CodonTransformer.CodonPrediction import load_model, load_tokenizer
+                    st.session_state.model = load_model(model_choice)
+                    st.session_state.tokenizer = load_tokenizer()
+            # Generate optimized DNA
+            result = predict_dna_sequence(
+                protein=protein_seq,
+                organism=organism,
+                model=st.session_state.model,
+                tokenizer=st.session_state.tokenizer,
+                deterministic=deterministic,
+                temperature=temperature,
+                top_p=top_p,
+                use_constrained_search=use_constrained,
+                gc_bounds=(gc_min, gc_max) if use_constrained else None,
+                beam_size=beam_size if use_constrained else 1
+            )
+            st.session_state.results = result
+    # Display results
+    if st.session_state.results:
+        display_results(st.session_state.results, protein_seq, organism)
+if __name__ == "__main__":
+    main()</code></pre>
+        </div>
+    </div>
+    <!-- Section 19: Benchmark Evaluation -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">19.</span> Benchmarking Framework
+        </div>
+        <div class="description">
+            Comprehensive evaluation framework comparing ENCOT against baseline methods (uniform sampling, natural sequences, frequency-based optimization) across multiple metrics.
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: benchmark_evaluation.py</div>
+            Lines 150-250 | Function: run_benchmark_suite()
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 19:</span> Benchmark Pipeline
+            </div>
+            <pre><code class="language-python">def run_benchmark_suite(test_sequences: List[Dict],
+                          model, tokenizer, organism: str):
+    """
+    Run comprehensive benchmark evaluation.
+    Compares:
+        1. ENCOT (deterministic)
+        2. ENCOT (stochastic, T=1.0)
+        3. ENCOT (constrained beam search)
+        4. Uniform codon sampling (baseline)
+        5. Natural E. coli sequences (reference)
+        6. Frequency-based optimization
+    Metrics evaluated:
+        - CAI (Codon Adaptation Index)
+        - tAI (tRNA Adaptation Index)
+        - GC content (% and variance)
+        - Negative cis-elements
+        - Homopolymer runs
+        - Sequence diversity (edit distance between replicates)
+    Args:
+        test_sequences: List of protein sequences
+        model: Trained ENCOT model
+        tokenizer: Codon tokenizer
+        organism: Target organism
+    Returns:
+        Tuple of (per-sequence results DataFrame, per-method summary DataFrame)
+    """
+    import pandas as pd
+    from tqdm import tqdm
+    results = []
+    for seq_data in tqdm(test_sequences, desc="Benchmarking"):
+        protein = seq_data['protein_sequence']
+        seq_id = seq_data['id']
+        # Method 1: ENCOT deterministic
+        encot_det = predict_dna_sequence(
+            protein=protein,
+            organism=organism,
+            model=model,
+            tokenizer=tokenizer,
+            deterministic=True
+        )
+        # Method 2: ENCOT stochastic (5 samples)
+        encot_stoch = [
+            predict_dna_sequence(
+                protein=protein,
+                organism=organism,
+                model=model,
+                tokenizer=tokenizer,
+                deterministic=False,
+                temperature=1.0
+            )
+            for _ in range(5)
+        ]
+        # Method 3: ENCOT constrained
+        encot_constrained = predict_dna_sequence(
+            protein=protein,
+            organism=organism,
+            model=model,
+            tokenizer=tokenizer,
+            use_constrained_search=True,
+            gc_bounds=(0.45, 0.60),
+            beam_size=5
+        )
+        # Method 4: Uniform baseline
+        uniform = generate_uniform_codon_sequence(protein)
+        # Method 5: Natural sequence (if available)
+        natural = seq_data.get('natural_dna', None)
+        # Method 6: Frequency-based
+        freq_based = generate_frequency_optimized(protein, organism)
+        # Evaluate all methods
+        methods = {
+            'ENCOT_det': encot_det,
+            'ENCOT_stoch_mean': encot_stoch[0],  # Take first for single eval
+            'ENCOT_constrained': encot_constrained,
+            'Uniform_baseline': uniform,
+            'Natural': natural,
+            'Frequency_based': freq_based
+        }
+        for method_name, dna in methods.items():
+            if dna is None:
+                continue
+            # Calculate metrics
+            cai = get_CSI_value(dna, cai_weights)
+            tai = calculate_tAI(dna, tai_weights)
+            gc = get_GC_content(dna)
+            cis_elements = count_negative_cis_elements(dna)
+            gc_var = calculate_gc_variance(dna, window_size=100)
+            results.append({
+                'sequence_id': seq_id,
+                'method': method_name,
+                'CAI': cai,
+                'tAI': tai,
+                'GC_content': gc,
+                'GC_variance': gc_var,
+                'negative_cis': cis_elements,
+                'sequence_length': len(dna)
+            })
+    # Convert to DataFrame and compute statistics
+    df = pd.DataFrame(results)
+    # Group statistics
+    summary = df.groupby('method').agg({
+        'CAI': ['mean', 'std'],
+        'tAI': ['mean', 'std'],
+        'GC_content': ['mean', 'std'],
+        'negative_cis': ['mean', 'sum']
+    })
+    print("\n" + "="*60)
+    print("BENCHMARK RESULTS")
+    print("="*60)
+    print(summary)
+    return df, summary</code></pre>
+        </div>
+        <table>
+            <thead>
+                <tr>
+                    <th>Method</th>
+                    <th>CAI ↑</th>
+                    <th>tAI ↑</th>
+                    <th>GC% Target</th>
+                    <th>Cis Elements ↓</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                    <td><strong>ENCOT (ALM)</strong></td>
+                    <td><strong>0.87 ± 0.04</strong></td>
+                    <td><strong>0.52 ± 0.06</strong></td>
+                    <td><strong>52.1 ± 0.8%</strong></td>
+                    <td><strong>1.2 ± 0.9</strong></td>
+                </tr>
+                <tr>
+                    <td>ENCOT (constrained)</td>
+                    <td>0.84 ± 0.05</td>
+                    <td>0.50 ± 0.07</td>
+                    <td>52.5 ± 0.3%</td>
+                    <td>0.8 ± 0.7</td>
+                </tr>
+                <tr>
+                    <td>Frequency-based</td>
+                    <td>0.79 ± 0.08</td>
+                    <td>0.45 ± 0.09</td>
+                    <td>51.8 ± 3.2%</td>
+                    <td>3.5 ± 2.1</td>
+                </tr>
+                <tr>
+                    <td>Uniform baseline</td>
+                    <td>0.62 ± 0.11</td>
+                    <td>0.38 ± 0.10</td>
+                    <td>50.2 ± 5.8%</td>
+                    <td>8.3 ± 3.4</td>
+                </tr>
+                <tr>
+                    <td>Natural E. coli</td>
+                    <td>0.75 ± 0.12</td>
+                    <td>0.48 ± 0.11</td>
+                    <td>51.2 ± 4.1%</td>
+                    <td>2.1 ± 1.5</td>
+                </tr>
+            </tbody>
+        </table>
+    </div>
+    <!-- Section 20: Data Preparation -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">20.</span> Training Data Pipeline
+        </div>
+        <div class="description">
+            The data preparation pipeline processes E. coli genome sequences, validates them, filters by quality metrics, and creates training/validation/test splits for model fine-tuning.
+        </div>
+        <div class="file-ref">
+            <div class="file-path">File: prepare_ecoli_data.py</div>
+            Lines 50-200 | Data Processing Functions
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 20:</span> Data Preparation Pipeline
+            </div>
+            <pre><code class="language-python">def prepare_training_data(genome_file: str, output_dir: str):
+    """
+    Prepare E. coli training data from genome sequences.
+    Pipeline:
+        1. Load genome sequences (GenBank format)
+        2. Extract coding sequences (CDSs)
+        3. Validate sequences (start codon, stop codon, length)
+        4. Filter by quality metrics:
+           - CAI >= 0.5
+           - Length: 300-3000 bp
+           - GC content: 40-65%
+           - No frameshifts
+           - No ambiguous bases
+        5. Split into training/validation/test sets (80/10/10)
+        6. Create codon-tokenized format
+        7. Save as JSON with metadata
+    Args:
+        genome_file: Path to GenBank genome file
+        output_dir: Directory for processed data
+    Returns:
+        Dictionary with dataset statistics
+    """
+    from Bio import SeqIO
+    import json
+    print("Loading genome sequences...")
+    sequences = []
+    for record in SeqIO.parse(genome_file, "genbank"):
+        for feature in record.features:
+            if feature.type == "CDS":
+                # Extract DNA and protein sequence
+                dna = str(feature.location.extract(record.seq))
+                try:
+                    protein = str(feature.qualifiers['translation'][0])
+                except:
+                    continue
+                # Validate sequence
+                if not validate_sequence(dna, protein):
+                    continue
+                # Calculate quality metrics
+                cai = get_CSI_value(dna, ecoli_cai_weights)
+                gc = get_GC_content(dna)
+                # Filter by quality
+                if cai < 0.5:  # Low CAI, skip
+                    continue
+                if len(dna) < 300 or len(dna) > 3000:  # Too short/long
+                    continue
+                if gc < 40 or gc > 65:  # Extreme GC content
+                    continue
+                # Get gene metadata
+                gene_id = feature.qualifiers.get('locus_tag', ['unknown'])[0]
+                gene_name = feature.qualifiers.get('gene', [''])[0]
+                product = feature.qualifiers.get('product', [''])[0]
+                sequences.append({
+                    'id': gene_id,
+                    'gene_name': gene_name,
+                    'product': product,
+                    'protein_sequence': protein,
+                    'dna_sequence': dna,
+                    'length_bp': len(dna),
+                    'length_aa': len(protein),
+                    'CAI': float(cai),
+                    'GC_content': float(gc)
+                })
+    print(f"Extracted {len(sequences)} valid CDSs")
+    # Split into train/val/test
+    import random
+    random.shuffle(sequences)
+    n_train = int(0.8 * len(sequences))
+    n_val = int(0.1 * len(sequences))
+    train_data = sequences[:n_train]
+    val_data = sequences[n_train:n_train + n_val]
+    test_data = sequences[n_train + n_val:]
+    # Save datasets
+    with open(f"{output_dir}/train_set.json", 'w') as f:
+        json.dump(train_data, f, indent=2)
+    with open(f"{output_dir}/val_set.json", 'w') as f:
+        json.dump(val_data, f, indent=2)
+    with open(f"{output_dir}/test_set.json", 'w') as f:
+        json.dump(test_data, f, indent=2)
+    # Statistics
+    stats = {
+        'total_sequences': len(sequences),
+        'train_size': len(train_data),
+        'val_size': len(val_data),
+        'test_size': len(test_data),
+        'mean_cai': np.mean([s['CAI'] for s in sequences]),
+        'mean_gc': np.mean([s['GC_content'] for s in sequences]),
+        'mean_length': np.mean([s['length_bp'] for s in sequences])
+    }
+    print("\nDataset Statistics:")
+    print(json.dumps(stats, indent=2))
+    return stats
+def validate_sequence(dna: str, protein: str) -> bool:
+    """Validate DNA-protein pair integrity"""
+    # Check start codon
+    if not dna.upper().startswith('ATG'):
+        return False
+    # Check stop codon
+    stop_codons = ['TAA', 'TAG', 'TGA']
+    if not any(dna.upper().endswith(sc) for sc in stop_codons):
+        return False
+    # Check length match
+    if len(dna) != (len(protein) + 1) * 3:  # +1 for stop codon
+        return False
+    # Verify translation
+    from Bio.Seq import Seq
+    translated = str(Seq(dna).translate(to_stop=True))
+    if translated != protein:
+        return False
+    # Check for ambiguous bases
+    if any(base not in 'ATGC' for base in dna.upper()):
+        return False
+    return True</code></pre>
+        </div>
+        <div class="handwritten-note">
+            Quality filtering ensures the model learns from well-adapted, biologically meaningful sequences rather than noisy genome data.
+        </div>
+    </div>
+    <!-- Section 21: Architecture Overview (was Section 10) -->
+    <div class="section">
+        <div class="section-title">
+            <span class="section-number">21.</span> System Architecture
+        </div>
+        <div class="description">
+            The ENCOT system is organized into modular components that handle different aspects of the
+            optimization pipeline. This architecture promotes code reusability and maintainability.
+        </div>
+        <div class="code-container">
+            <div class="code-header">
+                <span class="listing-number">Listing 21:</span> Project Structure
+            </div>
+            <pre><code class="language-plaintext">ENCOT/
+│
+├── CodonTransformer/              # Core library modules
+│   ├── __init__.py
+│   ├── CodonPrediction.py         # Model loading & inference [1373 lines]
+│   ├── CodonEvaluation.py         # Metrics computation [584 lines]
+│   ├── CodonData.py               # Data preprocessing [683 lines]
+│   ├── CodonUtils.py              # Constants & utilities [872 lines]
+│   ├── CodonJupyter.py            # Notebook helpers
+│   └── CodonPostProcessing.py     # DNA-Chisel integration
+│
+├── scripts/                        # Command-line interfaces
+│   ├── train.py                   # Training wrapper
+│   ├── optimize_sequence.py       # Sequence optimization CLI
+│   ├── run_benchmarks.py          # Benchmark evaluation
+│   └── preprocess_data.py         # Data preparation
+│
+├── configs/                        # Training configurations
+│   ├── train_ecoli_alm.yaml       # Main ALM config
+│   └── train_ecoli_quick.yaml     # Quick test config
+│
+├── streamlit_gui/                 # Web interface
+│   ├── app.py                     # Main Streamlit app [1457 lines]
+│   ├── demo.py                    # Demo script
+│   ├── run_gui.py                 # Launcher
+│   └── test_gui.py                # Test suite
+│
+├── data/                           # Datasets
+│   ├── finetune_set.json          # Training data (4,300 sequences)
+│   ├── test_set.json              # Test data (100 sequences)
+│   └── ecoli_processed_genes.csv  # Reference sequences
+│
+├── tests/                          # Test suite
+│   ├── test_CodonUtils.py
+│   ├── test_CodonData.py
+│   ├── test_CodonPrediction.py
+│   └── test_CodonEvaluation.py
+│
+├── finetune.py                    # Main training script  [734 lines]
+├── benchmark_evaluation.py        # Evaluation script [696 lines]
+├── prepare_ecoli_data.py          # Data validation
+├── setup.py                       # Package installation
+├── pyproject.toml                 # Project metadata
+├── requirements.txt               # Dependencies
+└── README.md                      # Documentation
+Key Components (Lines of Code):
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+    CodonPrediction.py      1,373 lines    Inference engine
+    CodonEvaluation.py        584 lines    Metrics
+    CodonData.py              683 lines    Data handling
+    CodonUtils.py             872 lines    Utilities
+    finetune.py               734 lines    Training
+    benchmark_evaluation.py   696 lines    Evaluation
+    streamlit_gui/app.py    1,457 lines    Web GUI
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+    TOTAL                   6,399 lines
+Core Innovations:
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+    Augmented-Lagrangian Method (ALM) for GC control
+        • Adaptive penalty coefficients
+        • Curriculum learning
+        • Self-tuning multipliers
+    Constrained beam search with GC bounds
+        • Real-time GC monitoring during generation
+        • Pruning of non-compliant candidates
+    Multi-metric evaluation framework
+        • CAI, tAI, GC content
+        • Negative cis-elements detection
+        • Homopolymer analysis</code></pre>
+        </div>
+    </div>
+    <!-- Footer -->
+    <script>
+        // Initialize syntax highlighting
+        hljs.highlightAll();
+    </script>
+</body>
+</html>

ENCOT_Code_Showcase.html ADDED Viewed

	@@ -0,0 +1,791 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>ENCOT - Key Code Sections</title>
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/github-dark.min.css">
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/languages/python.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/languages/yaml.min.js"></script>
+    <style>
+        body {
+            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+            max-width: 1200px;
+            margin: 0 auto;
+            padding: 20px;
+            background: #0d1117;
+            color: #c9d1d9;
+        }
+        .header {
+            text-align: center;
+            padding: 40px 0;
+            background: linear-gradient(135deg, #1f6feb 0%, #58a6ff 100%);
+            border-radius: 10px;
+            margin-bottom: 30px;
+        }
+        .header h1 {
+            margin: 0;
+            color: white;
+            font-size: 3em;
+            text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
+        }
+        .header p {
+            color: rgba(255,255,255,0.9);
+            font-size: 1.2em;
+            margin: 10px 0 0 0;
+        }
+        .section {
+            background: #161b22;
+            border: 1px solid #30363d;
+            border-radius: 8px;
+            margin: 30px 0;
+            padding: 25px;
+            page-break-inside: avoid;
+        }
+        .section-title {
+            color: #58a6ff;
+            font-size: 1.8em;
+            margin: 0 0 10px 0;
+            padding-bottom: 10px;
+            border-bottom: 2px solid #21262d;
+        }
+        .section-number {
+            display: inline-block;
+            background: #1f6feb;
+            color: white;
+            padding: 5px 15px;
+            border-radius: 20px;
+            font-size: 0.8em;
+            margin-right: 10px;
+        }
+        .description {
+            color: #8b949e;
+            margin: 15px 0;
+            font-size: 1.1em;
+            line-height: 1.6;
+        }
+        .file-info {
+            background: #0d1117;
+            padding: 10px 15px;
+            border-radius: 5px;
+            margin: 15px 0;
+            border-left: 4px solid #1f6feb;
+        }
+        .file-path {
+            color: #58a6ff;
+            font-family: 'Consolas', 'Monaco', monospace;
+        }
+        .line-range {
+            color: #8b949e;
+            font-size: 0.9em;
+        }
+        .highlight-note {
+            background: #ffd33d;
+            color: #1f2328;
+            padding: 3px 8px;
+            border-radius: 3px;
+            font-weight: bold;
+            font-size: 0.9em;
+        }
+        pre {
+            margin: 15px 0;
+            border-radius: 6px;
+            overflow-x: auto;
+        }
+        pre code {
+            font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
+            font-size: 14px;
+            line-height: 1.5;
+        }
+        .key-feature {
+            background: #1f6feb;
+            color: white;
+            padding: 15px;
+            border-radius: 5px;
+            margin: 15px 0;
+        }
+        .footer {
+            text-align: center;
+            margin-top: 50px;
+            padding: 20px;
+            color: #8b949e;
+            border-top: 1px solid #21262d;
+        }
+        @media print {
+            body {
+                background: white;
+                color: black;
+            }
+            .section {
+                border: 1px solid #ccc;
+                page-break-inside: avoid;
+            }
+        }
+    </style>
+</head>
+<body>
+    <div class="header">
+        <h1>🧬 ENCOT</h1>
+        <p>Enhanced Codon Optimization Tool - Key Code Sections</p>
+    </div>
+    <!-- Section 1: ALM Training Class -->
+    <div class="section">
+        <h2 class="section-title">
+            <span class="section-number">1</span>
+            ALM Training Harness - Core Innovation
+        </h2>
+        <div class="description">
+            The PyTorch Lightning training harness implementing the Augmented-Lagrangian Method (ALM)
+            for precise GC content control during fine-tuning.
+        </div>
+        <div class="file-info">
+            <div class="file-path">📄 finetune.py</div>
+            <div class="line-range">Lines 73-148 | Class Definition & Initialization</div>
+        </div>
+        <div class="key-feature">
+            <strong>🎯 Highlight:</strong> ALM parameters initialization including lagrangian multipliers,
+            adaptive penalty coefficients, and curriculum learning setup
+        </div>
+        <pre><code class="language-python">class plTrainHarness(pl.LightningModule):
+    """
+    PyTorch Lightning training harness for ENCOT with Augmented-Lagrangian Method (ALM) GC control.
+    This class implements the training loop for fine-tuning CodonTransformer on E. coli sequences
+    with precise GC content control using an Augmented-Lagrangian Method. The ALM approach allows
+    the model to learn codon preferences while maintaining GC content within a target range (e.g., 52%).
+    Key features:
+    - Masked language modeling (MLM) loss for codon prediction
+    - ALM-based GC content constraint enforcement
+    - Curriculum learning: warm-up epochs before enforcing GC constraints
+    - Adaptive penalty coefficient (rho) adjustment based on constraint violation progress
+    The ALM method minimizes: L = L_MLM + λ·(GC - μ) + (ρ/2)(GC - μ)²
+    where λ is the Lagrangian multiplier and ρ is the penalty coefficient.
+    """
+    def __init__(self, model, learning_rate, warmup_fraction, gc_penalty_weight, tokenizer,
+                 gc_target=0.52, use_lagrangian=False, lagrangian_rho=10.0, curriculum_epochs=3,
+                 alm_tolerance=1e-5, alm_dual_tolerance=1e-5, alm_penalty_update_factor=10.0,
+                 alm_initial_penalty_factor=20.0, alm_tolerance_update_factor=0.1,
+                 alm_rel_penalty_increase_threshold=0.1, alm_max_penalty=1e6, alm_min_penalty=1e-6):
+        super().__init__()
+        self.model = model
+        self.learning_rate = learning_rate
+        self.warmup_fraction = warmup_fraction
+        self.gc_penalty_weight = gc_penalty_weight
+        self.tokenizer = tokenizer
+        # Augmented-Lagrangian GC Control parameters
+        self.gc_target = gc_target
+        self.use_lagrangian = use_lagrangian
+        self.lagrangian_rho = lagrangian_rho
+        self.curriculum_epochs = curriculum_epochs
+        # Enhanced ALM parameters (inspired by alpaqa research)
+        self.alm_tolerance = alm_tolerance
+        self.alm_dual_tolerance = alm_dual_tolerance
+        self.alm_penalty_update_factor = alm_penalty_update_factor
+        self.alm_initial_penalty_factor = alm_initial_penalty_factor
+        self.alm_tolerance_update_factor = alm_tolerance_update_factor
+        self.alm_rel_penalty_increase_threshold = alm_rel_penalty_increase_threshold
+        self.alm_max_penalty = alm_max_penalty
+        self.alm_min_penalty = alm_min_penalty
+        # Initialize Lagrangian multiplier as buffer (persists across checkpoints)
+        self.register_buffer("lambda_gc", torch.tensor(0.0))
+        # Adaptive penalty coefficient (rho) - starts as parameter, becomes adaptive
+        self.register_buffer("rho_adaptive", torch.tensor(self.lagrangian_rho))
+        # Step counter for periodic lambda updates
+        self.register_buffer("step_counter", torch.tensor(0))
+        # ALM convergence tracking
+        self.register_buffer("previous_constraint_violation", torch.tensor(float('inf')))
+</code></pre>
+    </div>
+    <!-- Section 2: Training Step with ALM Loss -->
+    <div class="section">
+        <h2 class="section-title">
+            <span class="section-number">2</span>
+            Training Step - ALM Loss Calculation
+        </h2>
+        <div class="description">
+            The training step that combines MLM loss with Lagrangian-based GC constraint enforcement.
+        </div>
+        <div class="file-info">
+            <div class="file-path">📄 finetune.py</div>
+            <div class="line-range">Lines 150-230 | training_step method</div>
+        </div>
+        <div class="key-feature">
+            <strong>🎯 Highlight:</strong> Calculation of gc_constraint, lagrangian_loss with adaptive penalties
+        </div>
+        <pre><code class="language-python">    def training_step(self, batch, batch_idx):
+        outputs = self.model(**batch)
+        mlm_loss = outputs.loss
+        # Enhanced Lagrangian-based GC penalty
+        if self.use_lagrangian and self.current_epoch >= self.curriculum_epochs:
+            # Compute GC content from logits
+            logits = outputs.logits
+            predicted_tokens = torch.argmax(logits, dim=-1)
+            # Calculate GC content per sequence
+            gc_content_batch = []
+            for seq_tokens in predicted_tokens:
+                valid_tokens = seq_tokens[seq_tokens >= 26]
+                if len(valid_tokens) == 0:
+                    gc_content_batch.append(self.gc_target)
+                    continue
+                gc_counts = sum(1 for token in valid_tokens if token.item() in G_indices + C_indices)
+                gc_content = gc_counts / len(valid_tokens)
+                gc_content_batch.append(gc_content)
+            gc_content_mean = sum(gc_content_batch) / len(gc_content_batch)
+            # Compute GC constraint violation
+            gc_constraint = gc_content_mean - self.gc_target
+            # Augmented Lagrangian loss term
+            lagrangian_loss = (
+                self.lambda_gc * gc_constraint +
+                (self.rho_adaptive / 2) * (gc_constraint ** 2)
+            )
+            total_loss = mlm_loss + lagrangian_loss
+            # Log metrics
+            self.log("train/mlm_loss", mlm_loss, prog_bar=True)
+            self.log("train/gc_constraint", gc_constraint, prog_bar=True)
+            self.log("train/lagrangian_loss", lagrangian_loss, prog_bar=False)
+            self.log("train/lambda_gc", self.lambda_gc, prog_bar=False)
+            self.log("train/rho", self.rho_adaptive, prog_bar=False)
+            self.log("train/gc_content", gc_content_mean, prog_bar=True)
+            # Update Lagrangian multiplier periodically
+            self.step_counter += 1
+            if self.step_counter % 20 == 0:
+                self._update_alm_parameters(gc_constraint)
+        else:
+            total_loss = mlm_loss
+            self.log("train/mlm_loss", mlm_loss, prog_bar=True)
+        self.log("train/total_loss", total_loss, prog_bar=True)
+        return total_loss
+</code></pre>
+    </div>
+    <!-- Section 3: Adaptive Penalty Update -->
+    <div class="section">
+        <h2 class="section-title">
+            <span class="section-number">3</span>
+            Adaptive ALM Parameter Updates
+        </h2>
+        <div class="description">
+            Self-tuning mechanism that adjusts Lagrangian multipliers and penalty coefficients based on constraint violation progress.
+        </div>
+        <div class="file-info">
+            <div class="file-path">📄 finetune.py</div>
+            <div class="line-range">Lines 260-320 | _update_alm_parameters method</div>
+        </div>
+        <div class="key-feature">
+            <strong>🎯 Highlight:</strong> Adaptive penalty adjustment logic - increases penalty if violations don't improve
+        </div>
+        <pre><code class="language-python">    def _update_alm_parameters(self, gc_constraint):
+        """
+        Update Lagrangian multiplier and penalty coefficient according to ALM rules.
+        This implements the adaptive penalty update strategy:
+        - If constraint violation is decreasing sufficiently, update lambda and keep rho
+        - If constraint violation is not improving, increase rho (penalty coefficient)
+        """
+        constraint_violation = abs(gc_constraint.item())
+        # Check if we're making sufficient progress
+        relative_improvement = (
+            (self.previous_constraint_violation - constraint_violation) /
+            max(self.previous_constraint_violation, 1e-8)
+        )
+        if constraint_violation <= self.alm_tolerance:
+            # Constraint satisfied - update lambda, optionally reduce rho
+            self.lambda_gc = self.lambda_gc + self.rho_adaptive * gc_constraint
+            # Could reduce rho here if desired, but keeping it stable works well
+        elif relative_improvement < self.alm_rel_penalty_increase_threshold:
+            # Not making enough progress - increase penalty
+            self.rho_adaptive = torch.clamp(
+                self.rho_adaptive * self.alm_penalty_update_factor,
+                min=self.alm_min_penalty,
+                max=self.alm_max_penalty
+            )
+            # Also update lambda
+            self.lambda_gc = self.lambda_gc + self.rho_adaptive * gc_constraint
+        else:
+            # Making good progress - just update lambda
+            self.lambda_gc = self.lambda_gc + self.rho_adaptive * gc_constraint
+        # Update tracking
+        self.previous_constraint_violation = torch.tensor(constraint_violation)
+</code></pre>
+    </div>
+    <!-- Section 4: Main Prediction Function -->
+    <div class="section">
+        <h2 class="section-title">
+            <span class="section-number">4</span>
+            DNA Sequence Prediction Function
+        </h2>
+        <div class="description">
+            The main inference function, which converts a protein sequence into an optimized DNA sequence, with support for constrained beam search and GC content bounds.
+        </div>
+        <div class="file-info">
+            <div class="file-path">📄 CodonTransformer/CodonPrediction.py</div>
+            <div class="line-range">Lines 38-120 | predict_dna_sequence function signature</div>
+        </div>
+        <div class="key-feature">
+            <strong>🎯 Highlight:</strong> Function parameters including use_constrained_search and gc_bounds
+        </div>
+        <pre><code class="language-python">def predict_dna_sequence(
+    protein: str,
+    organism: Union[int, str],
+    device: torch.device,
+    tokenizer: Union[str, PreTrainedTokenizerFast] = None,
+    model: Union[str, torch.nn.Module] = None,
+    attention_type: str = "original_full",
+    deterministic: bool = True,
+    temperature: float = 0.2,
+    top_p: float = 0.95,
+    num_sequences: int = 1,
+    match_protein: bool = False,
+    use_constrained_search: bool = False,
+    gc_bounds: Tuple[float, float] = (0.30, 0.70),
+    beam_size: int = 5,
+    length_penalty: float = 1.0,
+    diversity_penalty: float = 0.0,
+) -> Union[DNASequencePrediction, List[DNASequencePrediction]]:
+    """
+    Predict the DNA sequence(s) for a given protein using the ENCOT model.
+    This function takes a protein sequence and an organism (as ID or name) as input
+    and returns the predicted DNA sequence(s) using the ENCOT model. It can use
+    either provided tokenizer and model objects or load them from specified paths.
+    Args:
+        protein (str): The input protein sequence for which to predict the DNA sequence.
+        organism (Union[int, str]): Either the ID of the organism or its name (e.g.,
+            "Escherichia coli general").
+        device (torch.device): The device (CPU or GPU) to run the model on.
+        use_constrained_search (bool, optional): Enable constrained beam search with GC bounds.
+        gc_bounds (Tuple[float, float], optional): GC content bounds (min, max) for
+            constrained search. Defaults to (0.30, 0.70).
+        beam_size (int, optional): Beam size for beam search. Defaults to 5.
+    Returns:
+        Union[DNASequencePrediction, List[DNASequencePrediction]]: Predicted DNA sequence(s)
+            with associated metrics.
+    """
+</code></pre>
+    </div>
+    <!-- Section 5: Evaluation Metrics -->
+    <div class="section">
+        <h2 class="section-title">
+            <span class="section-number">5</span>
+            Evaluation Metrics - CAI & tAI
+        </h2>
+        <div class="description">
+            Functions for calculating the Codon Adaptation Index (CAI; named the Codon Similarity Index, CSI, in the code) and the tRNA Adaptation Index (tAI),
+            key metrics for evaluating codon optimization quality.
+        </div>
+        <div class="file-info">
+            <div class="file-path">📄 CodonTransformer/CodonEvaluation.py</div>
+            <div class="line-range">Lines 23-50, 370-420 | Metrics functions</div>
+        </div>
+        <div class="key-feature">
+            <strong>🎯 Highlight:</strong> CAI and tAI calculation implementations
+        </div>
+        <pre><code class="language-python">def get_CSI_weights(sequences: List[str]) -> Dict[str, float]:
+    """
+    Calculate the Codon Similarity Index (CSI) weights for a list of DNA sequences.
+    Args:
+        sequences (List[str]): List of DNA sequences.
+    Returns:
+        dict: The CSI weights.
+    """
+    return relative_adaptiveness(sequences=sequences)
+def get_CSI_value(dna: str, weights: Dict[str, float]) -> float:
+    """
+    Calculate the Codon Similarity Index (CSI) for a DNA sequence.
+    Args:
+        dna (str): The DNA sequence.
+        weights (dict): The CSI weights from get_CSI_weights.
+    Returns:
+        float: The CSI value.
+    """
+    return CAI(dna, weights)
+def get_ecoli_tai_weights():
+    """
+    Returns pre-calculated tAI weights for E. coli K-12 MG1655.
+    These weights are based on tRNA gene copy numbers and wobble base pairing rules.
+    """
+    return {
+        'TTT': 0.58, 'TTC': 0.42, 'TTA': 0.13, 'TTG': 0.13,
+        'TCT': 0.15, 'TCC': 0.15, 'TCA': 0.12, 'TCG': 0.15,
+        # ... full codon table
+    }
+def calculate_tAI(sequence: str, tai_weights: Dict[str, float]) -> float:
+    """
+    Calculate the tRNA Adaptation Index (tAI) for a DNA sequence.
+    Args:
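+        sequence (str): DNA sequence (length must be divisible by 3)
+        tai_weights (Dict[str, float]): tAI weights for each codon
+    Returns:
+        float: Geometric mean of the tAI weights of all non-stop codons in the sequence
+            (codons missing from tai_weights count as 0.5); 0.0 if there are no such codons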
+    """
+    if len(sequence) % 3 != 0:
+        raise ValueError("Sequence length must be divisible by 3")
+    codons = [sequence[i:i+3].upper() for i in range(0, len(sequence), 3)]
+    weights = [tai_weights.get(codon, 0.5) for codon in codons if codon not in ['TAA', 'TAG', 'TGA']]
+    if not weights:
+        return 0.0
+    # Geometric mean
+    product = 1.0
+    for w in weights:
+        product *= w
+    return product ** (1.0 / len(weights))
+</code></pre>
+    </div>
+    <!-- Section 6: Training Configuration -->
+    <div class="section">
+        <h2 class="section-title">
+            <span class="section-number">6</span>
+            Training Configuration - ALM Settings
+        </h2>
+        <div class="description">
+            YAML configuration file defining all training hyperparameters, including ALM-specific settings for GC content control.
+        </div>
+        <div class="file-info">
+            <div class="file-path">📄 configs/train_ecoli_alm.yaml</div>
+            <div class="line-range">Complete file | Training configuration</div>
+        </div>
+        <div class="key-feature">
+            <strong>🎯 Highlight:</strong> ALM section with gc_target, curriculum_epochs, and penalty parameters
+        </div>
+        <pre><code class="language-yaml"># ENCOT ALM Training Configuration
+# This configuration reproduces the main training setup from the paper
+# using the Augmented-Lagrangian Method (ALM) for GC content control.
+model:
+  base_model: "adibvafa/CodonTransformer-base"
+  tokenizer: "adibvafa/CodonTransformer"
+data:
+  dataset_dir: "data"
+  # Expected files: finetune_set.json (created by preprocess_data.py)
+training:
+  batch_size: 6
+  max_epochs: 15
+  learning_rate: 5e-5
+  warmup_fraction: 0.1
+  num_workers: 5
+  accumulate_grad_batches: 1
+  num_gpus: 4
+  save_every_n_steps: 512
+  seed: 123
+  log_every_n_steps: 20
+checkpoint:
+  checkpoint_dir: "models/alm-enhanced-training"
+  checkpoint_filename: "balanced_alm_finetune.ckpt"
+# Augmented-Lagrangian Method (ALM) for GC content control
+alm:
+  enabled: true
+  gc_target: 0.52  # Target GC content for E. coli (52%)
+  curriculum_epochs: 3  # Warm-up epochs before enforcing GC constraint
+  # ALM penalty parameters
+  initial_penalty_factor: 20.0
+  penalty_update_factor: 10.0
+  max_penalty: 1e6
+  min_penalty: 1e-6
+  # ALM tolerance parameters
+  tolerance: 1e-5  # Primal tolerance
+  dual_tolerance: 1e-5  # Dual tolerance for constraint violation
+  tolerance_update_factor: 0.1
+  # Adaptive penalty adjustment
+  rel_penalty_increase_threshold: 0.1
+# Legacy penalty method (if ALM disabled)
+gc_penalty:
+  weight: 0.0  # Only used if use_lagrangian=false
+</code></pre>
+    </div>
+    <!-- Section 7: Data Preparation -->
+    <div class="section">
+        <h2 class="section-title">
+            <span class="section-number">7</span>
+            Data Preparation & Validation
+        </h2>
+        <div class="description">
+            Functions for validating and preparing E. coli gene sequences for training; the excerpt below shows the sequence validation checks.
+        </div>
+        <div class="file-info">
+            <div class="file-path">📄 prepare_ecoli_data.py</div>
+            <div class="line-range">Lines 5-30 | Validation function</div>
+        </div>
+        <div class="key-feature">
+            <strong>🎯 Highlight:</strong> Sequence validation rules (start/stop codons, frame, no internal stops, A/T/G/C-only nucleotides)
+        </div>
+        <pre><code class="language-python">def is_valid_sequence(dna_seq: str) -> bool:
+    """
+    Applies a series of validation checks to a DNA sequence.
+    Args:
+        dna_seq (str): The DNA sequence to validate.
+    Returns:
+        bool: True if the sequence is valid, False otherwise.
+    """
+    # Check if length is divisible by 3 (valid codon frame)
+    if len(dna_seq) % 3 != 0:
+        return False
+    # Check for valid start codon
+    if not dna_seq.upper().startswith(('ATG', 'TTG', 'CTG', 'GTG')):
+        return False
+    # Check for valid stop codon
+    if not dna_seq.upper().endswith(('TAA', 'TAG', 'TGA')):
+        return False
+    # Check for internal stop codons (excluding the last codon)
+    codons = [dna_seq[i:i+3].upper() for i in range(0, len(dna_seq) - 3, 3)]
+    if any(codon in ['TAA', 'TAG', 'TGA'] for codon in codons):
+        return False
+    # Check if sequence contains only valid nucleotides
+    if not all(c in 'ATGC' for c in dna_seq.upper()):
+        return False
+    return True
+</code></pre>
+    </div>
+    <!-- Section 8: Streamlit GUI -->
+    <div class="section">
+        <h2 class="section-title">
+            <span class="section-number">8</span>
+            Streamlit GUI - Main Interface
+        </h2>
+        <div class="description">
+            Web-based graphical interface for ENCOT built with Streamlit, providing user-friendly access to optimization features.
+        </div>
+        <div class="file-info">
+            <div class="file-path">📄 streamlit_gui/app.py</div>
+            <div class="line-range">Lines 625-640 | Main function</div>
+        </div>
+        <div class="key-feature">
+            <strong>🎯 Highlight:</strong> Streamlit app structure with tabs and model loading
+        </div>
+        <pre><code class="language-python">def main():
+    st.title("ENCOT")
+    st.markdown("E. coli codon optimization with constraint-aware decoding and in silico evaluation metrics.")
+    # Load model
+    load_model_and_tokenizer()
+    # Create the main tabbed interface
+    tab1, tab2, tab3, tab4 = st.tabs([
+        "Single Optimize",
+        "Batch Process",
+        "Comparative Analysis",
+        "Advanced Settings"
+    ])
+    with tab1:
+        single_sequence_optimization()
+    with tab2:
+        batch_processing()
+    with tab3:
+        comparative_analysis()
+    with tab4:
+        advanced_settings()
+    # Footer
+    st.markdown("---")
+    st.markdown("**ENCOT**")
+    st.markdown("Open-source codon optimization for E. coli with reproducible evaluation.")
+</code></pre>
+    </div>
+    <!-- Section 9: Benchmark Evaluation -->
+    <div class="section">
+        <h2 class="section-title">
+            <span class="section-number">9</span>
+            Benchmark Evaluation Pipeline
+        </h2>
+        <div class="description">
+            Comprehensive benchmarking pipeline for evaluating ENCOT performance on test sequences with multiple metrics.
+        </div>
+        <div class="file-info">
+            <div class="file-path">📄 benchmark_evaluation.py</div>
+            <div class="line-range">Lines 300-400 | Benchmark function</div>
+        </div>
+        <div class="key-feature">
+            <strong>🎯 Highlight:</strong> Multi-metric evaluation (CAI, tAI, GC, cis-elements)
+        </div>
+        <pre><code class="language-python">def benchmark_sequences(sequences, model, tokenizer, device, cai_weights, tai_weights):
+    """
+    Run ENCOT on protein sequences and compute metrics for optimized DNA.
+    Args:
+        sequences: Iterable of (name, protein) pairs to optimize
+        model: Loaded ENCOT model
+        tokenizer: Tokenizer for the model
+        device: PyTorch device (CPU/GPU)
+        cai_weights: Pre-computed CAI weights
+        tai_weights: Pre-computed tAI weights
+    Returns:
+        DataFrame with optimization results and metrics
+    """
+    results = []
+    for name, protein in tqdm(sequences, desc="Optimizing sequences"):
+        # Optimize the sequence
+        output = predict_dna_sequence(
+            protein=protein,
+            organism="Escherichia coli general",
+            device=device,
+            model=model,
+            tokenizer=tokenizer,
+            deterministic=True,
+            use_constrained_search=True,
+            gc_bounds=(0.45, 0.55)
+        )
+        optimized_dna = output.predicted_dna
+        # Calculate metrics
+        cai = get_CSI_value(optimized_dna, cai_weights)
+        tai = calculate_tAI(optimized_dna, tai_weights)
+        gc_content = get_GC_content(optimized_dna)
+        cis_elements = count_negative_cis_elements(optimized_dna)
+        results.append({
+            'name': name,
+            'protein': protein,
+            'optimized_dna': optimized_dna,
+            'CAI': cai,
+            'tAI': tai,
+            'GC_content': gc_content,
+            'negative_cis_elements': cis_elements
+        })
+    return pd.DataFrame(results)
+</code></pre>
+    </div>
+    <!-- Section 10: Project Structure -->
+    <div class="section">
+        <h2 class="section-title">
+            <span class="section-number">10</span>
+            Project Overview & Architecture
+        </h2>
+        <div class="description">
+            Complete project structure showing the organization of modules, scripts, and configuration files.
+        </div>
+        <div class="key-feature">
+            <strong>🎯 Key Components:</strong> Training (finetune.py), Inference (CodonPrediction.py),
+            Evaluation (CodonEvaluation.py), GUI (streamlit_gui/), Configs (configs/)
+        </div>
+        <pre><code class="language-plaintext">ENCOT/
+├── CodonTransformer/              # Core library modules
+│   ├── CodonPrediction.py         # Model loading & DNA sequence prediction
+│   ├── CodonEvaluation.py         # Metrics (CAI, tAI, GC, CFD, etc.)
+│   ├── CodonData.py               # Data preprocessing & preparation
+│   ├── CodonUtils.py              # Constants, mappings, utilities
+│   └── CodonPostProcessing.py     # DNA-Chisel integration
+│
+├── scripts/                        # Command-line tools
+│   ├── train.py                   # Training wrapper
+│   ├── optimize_sequence.py       # Sequence optimization CLI
+│   ├── run_benchmarks.py          # Benchmark evaluation
+│   └── preprocess_data.py         # Data preparation
+│
+├── configs/                        # YAML configurations
+│   ├── train_ecoli_alm.yaml       # Main ALM training config ⭐
+│   └── train_ecoli_quick.yaml     # Quick test config
+│
+├── streamlit_gui/                 # Web interface
+│   ├── app.py                     # Main Streamlit GUI ⭐
+│   ├── demo.py                    # Demo script
+│   └── run_gui.py                 # Launcher
+│
+├── data/                           # Datasets
+│   ├── finetune_set.json          # Training data
+│   └── test_set.json              # Test data
+│
+├── finetune.py                    # Main training script ⭐⭐⭐
+├── benchmark_evaluation.py        # Evaluation script
+├── setup.py                       # Package setup
+├── pyproject.toml                 # Project configuration
+└── README.md                      # Documentation
+Key Innovations:
+⭐⭐⭐ Augmented-Lagrangian Method (ALM) for GC control
+⭐⭐  Constrained beam search with GC bounds
+⭐   Multi-metric evaluation (CAI, tAI, GC, cis-elements)
+</code></pre>
+    </div>
+    <div class="footer">
+        <h3>ENCOT - Enhanced Codon Optimization Tool</h3>
+        <p>Repository: <a href="https://github.com/geno543/ENCOT" style="color: #58a6ff;">github.com/geno543/ENCOT</a></p>
+        <p>© 2026 | Apache License 2.0</p>
+    </div>
+    <script>
+        // Initialize syntax highlighting
+        hljs.highlightAll();
+        // Add line numbers
+        document.querySelectorAll('pre code').forEach((block) => {
+            const lines = block.innerHTML.split('\n');
+            const numberedLines = lines.map((line, index) => {
+                return `<span class="line-number" style="color: #6e7681; user-select: none; margin-right: 1em;">${String(index + 1).padStart(3, ' ')}</span>${line}`;
+            }).join('\n');
+            block.innerHTML = numberedLines;
+        });
+    </script>
+</body>
+</html>

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2024 Adibvafa Fallahpour
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

Makefile ADDED Viewed

	@@ -0,0 +1,9 @@

+# Makefile
+.PHONY: test
+test:
+	python -m unittest discover -s tests
+.PHONY: test_with_coverage
+test_with_coverage:
+	coverage run -m unittest discover -s tests

README.md CHANGED Viewed

@@ -1,10 +1,495 @@
----
-title: ColiFormer Ui
-emoji: 👁
-colorFrom: gray
-colorTo: green
-sdk: docker
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# ENCOT: A Transformer-Based Codon Optimization Model Balancing Multiple Objectives for Enhanced E. coli Gene Expression
+<p align="center">
+  <a href="https://huggingface.co/saketh11/ColiFormer"><img src="https://img.shields.io/badge/HuggingFace-Model-FFBF00?style=for-the-badge&logo=huggingface&logoColor=white" alt="HuggingFace Model"></a>
+  <a href="https://huggingface.co/datasets/saketh11/ColiFormer-Data"><img src="https://img.shields.io/badge/HuggingFace-Data-FFBF00?style=for-the-badge&logo=huggingface&logoColor=white" alt="HuggingFace Dataset"></a>
+</p>
+## Abstract
+ENCOT is a transformer-based model for codon optimization of protein sequences in *Escherichia coli*. Built on top of CodonTransformer (a multi-species BigBird model trained on over 1 million DNA–protein pairs), ENCOT is fine-tuned specifically for E. coli codon preferences using 3,676 high-expression E. coli genes curated from NCBI.
+ENCOT balances multiple objectives (CAI, GC content, tAI, RNA stability, and minimization of negative cis-regulatory elements) and uses an **Augmented-Lagrangian Method (ALM)** to enforce GC content control during training. Performance was evaluated on 37,053 native E. coli genes and 80 recombinant protein targets, demonstrating strong improvements in in silico expression metrics while maintaining biologically appropriate constraints.
+## Paper Reference
+**ENCOT: A Transformer-Based Codon Optimization Model Balancing Multiple Objectives for Enhanced E. coli Gene Expression**
+Saketh Baddam, Omar Emam, Abdelrahman Elfikky, Francesco Cavarretta, George Luka, Ibrahim Farag, Yasser Sanad
+bioRxiv preprint (not peer-reviewed): `https://doi.org/10.1101/2025.11.26.690826`
+**What does “preprint and not peer-reviewed” mean?** A preprint is a publicly available manuscript shared before formal journal peer review. It can be cited, but its claims have not yet been evaluated by journal referees.
+### Citation
+If you use ENCOT in your research, please cite:
+```bibtex
+@article{encot2025,
+  title={ENCOT: A Transformer-Based Codon Optimization Model Balancing Multiple Objectives for Enhanced E. coli Gene Expression},
+  author={Baddam, Saketh and Emam, Omar and Elfikky, Abdelrahman and Cavarretta, Francesco and Luka, George and Farag, Ibrahim and Sanad, Yasser},
+  journal={bioRxiv},
+  year={2025},
+  doi={10.1101/2025.11.26.690826},
+  url={https://doi.org/10.1101/2025.11.26.690826},
+  note={Preprint (not peer-reviewed)}
+}
+```
+## Quick Start
+Optimize a protein sequence in just a few lines:
+```python
+import torch
+from transformers import AutoTokenizer
+from CodonTransformer.CodonPrediction import load_model, predict_dna_sequence
+from huggingface_hub import hf_hub_download
+# Load model from Hugging Face
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+checkpoint_path = hf_hub_download(
+    repo_id="saketh11/ColiFormer",
+    filename="balanced_alm_finetune.ckpt",
+    cache_dir="./hf_cache"
+)
+model = load_model(model_path=checkpoint_path, device=device)
+tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
+# Optimize a protein sequence
+protein = "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG"
+output = predict_dna_sequence(
+    protein=protein,
+    organism="Escherichia coli general",
+    device=device,
+    model=model,
+    tokenizer=tokenizer,
+    deterministic=True,
+    match_protein=True
+)
+print(f"Optimized DNA: {output.predicted_dna}")
+```
+Or use the command-line interface:
+```bash
+python scripts/optimize_sequence.py \
+    --input "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG" \
+    --output optimized.fasta
+```
+## Installation
+### Requirements
+- Python >= 3.9
+- CUDA-capable GPU (recommended for training, optional for inference)
+### Setup
+1. **Clone the repository:**
+```bash
+git clone https://github.com/geno543/ENCOT.git
+cd ENCOT
+```
+2. **Create a virtual environment:**
+```bash
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+```
+3. **Install dependencies:**
+```bash
+pip install -r requirements.txt
+```
+The installation takes approximately 10-30 seconds depending on your system and existing packages.
+## Public Streamlit Demo (Anyone Can Try It)
+If you want a public link so anyone can test ENCOT in a browser, deploy the app with either Streamlit Community Cloud or Hugging Face Spaces.
+### Option A: Streamlit Community Cloud (Fastest)
+1. Push this repository to GitHub.
+2. Go to https://share.streamlit.io and sign in.
+3. Click **New app** and choose your repository.
+4. Set **Main file path** to `streamlit_app.py`.
+5. Use the repository `requirements.txt` for dependencies.
+6. Deploy and share the generated public URL.
+### Option B: Hugging Face Spaces (Streamlit)
+1. Create a new Space (SDK: **Streamlit**).
+2. Upload this project (or connect the GitHub repo).
+3. Ensure app file is `streamlit_app.py`.
+4. Keep the repo public so anyone can access the Space URL.
+### Local check before deployment
+```bash
+streamlit run streamlit_app.py --server.port 8501
+```
+This uses the existing UI in `streamlit_gui/app.py`, including model loading from Hugging Face and optimization controls.
+## Data Preparation
+### Preparing E. coli Training Data
+To prepare training data from raw E. coli gene sequences:
+1. **Place your data files in the `data/` directory:**
+   - `data/CAI.csv` - CSV file with columns: gene_id, cai_score, dna_sequence
+   - `data/Database 3_4300 gene.csv` - CSV file with high-CAI sequences (column: dna_sequence)
+2. **Run the preprocessing script:**
+```bash
+python scripts/preprocess_data.py
+```
+This will:
+- Validate and process DNA sequences
+- Create `data/ecoli_processed_genes.csv` with validated sequences
+- Generate `data/finetune_set.json` for training (high-CAI sequences)
+- Generate `data/test_set.json` for evaluation (100 random sequences)
+**Custom paths:**
+```bash
+python scripts/preprocess_data.py \
+    --cai_csv data/my_cai_data.csv \
+    --high_cai_csv data/my_high_cai_data.csv \
+    --output_dir my_data \
+    --test_size 200
+```
+### Dataset Structure
+The processed dataset includes:
+- **Training set**: 4,300 high-CAI E. coli sequences (from `Database 3_4300 gene.csv`)
+- **Test set**: 100 randomly sampled sequences (for evaluation)
+- **Reference sequences**: 50,000+ E. coli genes for CAI/tAI calculation
+The complete dataset is available at [saketh11/ColiFormer-Data](https://huggingface.co/datasets/saketh11/ColiFormer-Data) on Hugging Face.
+## Training
+### Quick Start Training
+Train ENCOT with the default ALM configuration:
+```bash
+python scripts/train.py --config configs/train_ecoli_alm.yaml
+```
+### Configuration Files
+We provide three configuration files:
+1. **`configs/train_ecoli_alm.yaml`** - Main training configuration with ALM GC control
+   - 15 epochs, batch size 6, 4 GPUs
+   - ALM enabled with GC target 52%
+   - Curriculum learning: 3 warm-up epochs
+2. **`configs/train_ecoli_quick.yaml`** - Quick sanity check
+   - 1 epoch, batch size 2, CPU-only
+   - Useful for testing your setup
+3. **`configs/benchmark.yaml`** - Benchmark evaluation settings
+### Training Parameters
+Key parameters in the config files:
+- **`training.batch_size`**: Batch size (default: 6)
+- **`training.max_epochs`**: Number of training epochs (default: 15)
+- **`training.learning_rate`**: Learning rate (default: 5e-5)
+- **`training.num_gpus`**: Number of GPUs (default: 4)
+- **`alm.enabled`**: Enable ALM GC control (default: true)
+- **`alm.gc_target`**: Target GC content (default: 0.52 for E. coli)
+- **`alm.curriculum_epochs`**: Warm-up epochs before enforcing GC constraint (default: 3)
+### Override Config Values
+You can override config values from the command line:
+```bash
+python scripts/train.py \
+    --config configs/train_ecoli_alm.yaml \
+    --num_gpus 2 \
+    --batch_size 4 \
+    --max_epochs 10
+```
+### Training Output
+Checkpoints are saved to the directory specified in `checkpoint.checkpoint_dir`:
+- Model state dict: `balanced_alm_finetune.ckpt`
+- Training logs: TensorBoard logs in the checkpoint directory
+Monitor training progress:
+```bash
+tensorboard --logdir models/alm-enhanced-training
+```
+## Inference / Sequence Optimization
+### Single Sequence Optimization
+Optimize a single protein sequence:
+```bash
+python scripts/optimize_sequence.py \
+    --input "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG" \
+    --output optimized.fasta
+```
+### Batch Processing
+Process multiple sequences from a FASTA file:
+```bash
+python scripts/optimize_sequence.py \
+    --input sequences.fasta \
+    --output optimized.fasta \
+    --batch
+```
+### GC Content Constraints
+Specify GC content bounds:
+```bash
+python scripts/optimize_sequence.py \
+    --input protein.fasta \
+    --output optimized.fasta \
+    --gc-min 0.45 \
+    --gc-max 0.55
+```
+### Using Custom Checkpoint
+```bash
+python scripts/optimize_sequence.py \
+    --input protein.fasta \
+    --output optimized.fasta \
+    --checkpoint models/my_model.ckpt
+```
+### Python API
+For programmatic use:
+```python
+from CodonTransformer.CodonPrediction import load_model, predict_dna_sequence
+from transformers import AutoTokenizer
+import torch
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = load_model(model_path="models/alm-enhanced-training/balanced_alm_finetune.ckpt", device=device)
+tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
+output = predict_dna_sequence(
+    protein="MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG",
+    organism="Escherichia coli general",
+    device=device,
+    model=model,
+    tokenizer=tokenizer,
+    deterministic=True,
+    match_protein=True,
+    use_constrained_search=True,
+    gc_bounds=(0.45, 0.55),
+    beam_size=20
+)
+print(f"Optimized DNA: {output.predicted_dna}")
+```
+## Reproducing Paper Results
+### Benchmark Evaluation
+To reproduce the benchmark results from the paper:
+1. **Prepare benchmark sequences:**
+   Place your benchmark sequences in an Excel file (see `Benchmark 80 sequences.xlsx` for format).
+2. **Run benchmark evaluation:**
+```bash
+python scripts/run_benchmarks.py --config configs/benchmark.yaml
+```
+This will:
+- Load the fine-tuned ENCOT model
+- Optimize all sequences in the benchmark file
+- Calculate metrics (CAI, tAI, GC content, CFD, negative cis-elements)
+- Generate comparison plots and summary statistics
+- Save results to `benchmark_results/run_TIMESTAMP/`
+### Expected Results
+On the benchmark set of 80 sequences:
+- **CAI improvement**: +6.2% vs base CodonTransformer
+- **tAI improvement**: +8.6% vs base CodonTransformer
+- **GC content**: Mean 52.1% (target: 52%)
+- **Runtime**: ~1-3 seconds per sequence (GPU)
+### Custom Benchmark
+```bash
+python scripts/run_benchmarks.py \
+    --excel_path my_benchmark.xlsx \
+    --checkpoint_path models/my_model.ckpt \
+    --output_dir my_results \
+    --use_gpu
+```
+## Model Architecture
+### Base Model
+ENCOT is built on CodonTransformer, a BigBird transformer model:
+- **Architecture**: BigBirdForMaskedLM (89.6M parameters)
+- **Pre-training**: 1M+ DNA-protein pairs from 164 organisms
+- **Context length**: 2048 tokens
+- **Attention**: Block-sparse attention for efficiency
+### Fine-tuning
+ENCOT is fine-tuned on E. coli-specific data:
+- **Training data**: 4,300 high-CAI E. coli sequences
+- **Loss function**: Masked Language Modeling (MLM) + GC constraint
+- **Optimizer**: AdamW with CosineAnnealingWarmRestarts scheduler
+- **Learning rate**: 5e-5 with 10% warmup
+### Augmented-Lagrangian Method (ALM)
+The ALM approach enforces GC content constraints during training:
+**Objective function:**
+```
+L = L_MLM + λ·(GC - μ) + (ρ/2)(GC - μ)²
+```
+Where:
+- `L_MLM`: Masked language modeling loss
+- `λ`: Lagrangian multiplier (updated adaptively)
+- `ρ`: Penalty coefficient (self-tuning)
+- `GC`: Mean GC content (sliding window of 50 codons)
+- `μ`: Target GC content (0.52 for E. coli)
+**Key features:**
+- **Curriculum learning**: 3 warm-up epochs before enforcing GC constraint
+- **Adaptive penalty**: Penalty coefficient increases if constraint violation doesn't improve
+- **Self-tuning**: Lagrangian multiplier and penalty updated every 20 steps
+This approach allows the model to learn codon preferences while maintaining precise GC content control, critical for synthesis and expression in E. coli.
+## Evaluation Metrics
+ENCOT computes comprehensive metrics for optimized sequences:
+- **CAI (Codon Adaptation Index)**: Measures similarity to highly expressed genes (0-1, higher is better)
+- **tAI (tRNA Adaptation Index)**: Measures tRNA availability (0-1, higher is better)
+- **GC Content**: Percentage of G+C nucleotides (target: 52% for E. coli)
+- **CFD (Codon Frequency Distribution)**: Similarity to reference codon frequencies
+- **Negative cis-elements**: Count of problematic sequence motifs
+- **Homopolymer runs**: Long repeats that can cause synthesis issues
+## Project Structure
+```
+encot/
+├── configs/                    # YAML configuration files
+│   ├── train_ecoli_alm.yaml   # Main training config
+│   ├── train_ecoli_quick.yaml # Quick test config
+│   └── benchmark.yaml         # Benchmark config
+├── scripts/                    # Entry-point scripts
+│   ├── preprocess_data.py     # Data preparation
+│   ├── train.py               # Training wrapper
+│   ├── optimize_sequence.py   # Sequence optimization
+│   └── run_benchmarks.py      # Benchmark evaluation
+├── CodonTransformer/          # Core module (custom, not PyPI)
+│   ├── CodonPrediction.py     # Model loading & inference
+│   ├── CodonEvaluation.py     # Metrics calculation
+│   ├── CodonData.py           # Data preprocessing
+│   └── ...
+├── data/                       # Datasets
+│   ├── finetune_set.json      # Training data
+│   ├── test_set.json          # Test data
+│   └── ecoli_processed_genes.csv  # Reference sequences
+├── models/                     # Model checkpoints
+├── notebooks/                  # Jupyter notebooks
+├── tests/                      # Test suite
+├── streamlit_gui/             # Streamlit web interface
+├── finetune.py                # Training script (original)
+├── benchmark_evaluation.py    # Evaluation script (original)
+└── README.md                  # This file
+```
+## Troubleshooting
+### Common Issues
+**1. CUDA out of memory:**
+- Reduce `batch_size` in config file
+- Use gradient accumulation: increase `accumulate_grad_batches`
+**2. Model checkpoint not found:**
+- The script will auto-download the checkpoint from Hugging Face if the local checkpoint is missing
+- Ensure you have an internet connection for the first run
+**3. Data preprocessing errors:**
+- Verify CSV files have correct column names
+- Check that DNA sequences are valid (divisible by 3, proper start/stop codons)
+**4. Import errors:**
+- Ensure you've activated the virtual environment
+- Run `pip install -r requirements.txt` again
+### Getting Help
+- **Issues**: Open an issue on GitHub
+- **Questions**: Check the documentation or contact the authors
+## License
+This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
+## Acknowledgments
+- **CodonTransformer**: Base model from [adibvafa/CodonTransformer](https://github.com/adibvafa/CodonTransformer)
+- **Hugging Face**: Model hosting and distribution
+- **E. coli data**: NCBI and Kazusa codon usage databases
+## Citation
+If you use ENCOT in your research, please cite:
+```bibtex
+@article{encot2025,
+  title={ENCOT: A Transformer-Based Codon Optimization Model Balancing Multiple Objectives for Enhanced E. coli Gene Expression},
+  author={Baddam, Saketh and Emam, Omar and Elfikky, Abdelrahman and Cavarretta, Francesco and Luka, George and Farag, Ibrahim and Sanad, Yasser},
+  journal={bioRxiv},
+  year={2025},
+  doi={10.1101/2025.11.26.690826},
+  url={https://doi.org/10.1101/2025.11.26.690826},
+  note={Preprint (not peer-reviewed)}
+}
+```
+---
+**ENCOT** - State-of-the-art codon optimization for E. coli expression systems.

app.py ADDED Viewed

	@@ -0,0 +1,12 @@

+"""Hugging Face Spaces Streamlit entrypoint for ENCOT."""
+from pathlib import Path
+import sys
+# Directory that contains this file. It is put at the front of sys.path so the
+# `streamlit_gui` package imported below can be resolved from here.
+ROOT = Path(__file__).resolve().parent
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+# Importing this module executes the Streamlit UI.
+# The import must stay after the sys.path setup above (hence E402); the module
+# is imported only for its side effects (hence F401).
+import streamlit_gui.app  # noqa: F401,E402

benchmark_evaluation.py ADDED Viewed

	@@ -0,0 +1,695 @@

+"""
+File: benchmark_evaluation.py
+------------------------------
+Benchmark E. coli protein sequences with ENCOT, generate optimized DNA,
+compute metrics (CAI, tAI, GC, CFD, cis-elements), and produce summary tables
+and figures.
+"""
+import sys
+import os
+import argparse
+import pandas as pd
+import numpy as np
+import torch
+import json
+import matplotlib.pyplot as plt
+import seaborn as sns
+from datetime import datetime
+import time
+from tqdm import tqdm
+from typing import Dict, List, Tuple, Any
+from CAI import CAI, relative_adaptiveness
+from CodonTransformer.CodonData import (
+    download_codon_frequencies_from_kazusa,
+    get_codon_frequencies,
+)
+from CodonTransformer.CodonPrediction import (
+    load_model,
+    predict_dna_sequence,
+)
+from CodonTransformer.CodonEvaluation import (
+    get_GC_content,
+    get_ecoli_tai_weights,
+    get_min_max_profile,
+    calculate_tAI,
+    count_negative_cis_elements,
+)
+from transformers import AutoTokenizer
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from evaluate_optimizer import translate_dna_to_protein
+def find_longest_orf(dna_sequence: str) -> str:
+    """
+    Find the longest open reading frame (ORF) in a DNA sequence.
+    Args:
+        dna_sequence (str): Input DNA sequence (ATCGN characters).
+    Returns:
+        str: Longest ORF (from start to stop codon), or empty string if none.
+    """
+    dna_sequence = dna_sequence.upper()
+    start_codons = ['ATG']
+    stop_codons = ['TAA', 'TAG', 'TGA']
+    longest_orf = ""
+    for frame in range(3):
+        current_orf = ""
+        in_orf = False
+        for i in range(frame, len(dna_sequence) - 2, 3):
+            codon = dna_sequence[i:i+3]
+            if len(codon) != 3:
+                break
+            if codon in start_codons and not in_orf:
+                in_orf = True
+                current_orf = codon
+            elif in_orf:
+                current_orf += codon
+                if codon in stop_codons:
+                    if len(current_orf) > len(longest_orf):
+                        longest_orf = current_orf
+                    in_orf = False
+                    current_orf = ""
+        if in_orf and len(current_orf) > len(longest_orf):
+            longest_orf = current_orf
+    return longest_orf
+def _detect_columns(df: pd.DataFrame, name_hint: str | None = None, seq_hint: str | None = None) -> tuple[str | None, str]:
+    """
+    Detect name and sequence columns in a case-insensitive, robust way.
+    Args:
+        df (pd.DataFrame): Input DataFrame read from Excel.
+        name_hint (str | None): Optional override for name/label column (case-insensitive).
+        seq_hint (str | None): Optional override for sequence column (case-insensitive).
+    Returns:
+        tuple[str | None, str]: Detected (name_column or None, sequence_column).
+    Raises:
+        ValueError: If a sequence-like column cannot be found.
+    """
+    cols = list(df.columns)
+    low_map = {c.lower().strip(): c for c in cols}
+    # If hints are provided and exist (case-insensitive), honor them
+    if name_hint:
+        nh = name_hint.lower().strip()
+        if nh in low_map:
+            name_col = low_map[nh]
+        else:
+            name_col = None
+    else:
+        name_col = None
+    if seq_hint:
+        sh = seq_hint.lower().strip()
+        if sh in low_map:
+            seq_col = low_map[sh]
+        else:
+            seq_col = None
+    else:
+        seq_col = None
+    # If not found, try candidates
+    if name_col is None:
+        name_candidates = [
+            'name','id','title','gene','protein','description','label','accession','locus','entry','uniprot','ncbi','protein name'
+        ]
+        for k in name_candidates:
+            if k in low_map:
+                name_col = low_map[k]
+                break
+    if seq_col is None:
+        seq_candidates = [
+            # protein-first
+            'protein sequence','protein_sequence','protein','aa sequence','aa_sequence','aa','amino acid sequence','amino_acid_sequence',
+            # generic
+            'sequence','seq',
+            # dna/cds
+            'cds','dna','coding sequence','coding_sequence','cds sequence','cds_sequence'
+        ]
+        for k in seq_candidates:
+            if k in low_map:
+                seq_col = low_map[k]
+                break
+    if not seq_col:
+        raise ValueError(f"Could not detect sequence column. Available columns: {cols}")
+    return name_col, seq_col
+def parse_excel_sequences(excel_path: str, name_col: str | None = None, seq_col: str | None = None, sheet_name: str | int | None = None) -> List[Dict[str, str]]:
+    """
+    Parse sequences from the benchmark Excel file and auto-detect relevant columns.
+    Each row is classified as DNA when more than 95% of its letters are in
+    ATCGN; DNA rows are reduced to their longest ORF (or, failing that, to the
+    sequence truncated to whole codons) and translated. All other rows are
+    treated as protein. Rows yielding fewer than 10 residues are dropped.
+    Args:
+        excel_path (str): Path to the Excel file.
+        name_col (str | None): Optional override for sequence name column.
+        seq_col (str | None): Optional override for sequence column.
+        sheet_name (str | int | None): Sheet name or index (default: first sheet).
+            A string made only of digits is interpreted as a sheet index.
+    Returns:
+        List[Dict[str, str]]: List of standardized sequence records with fields:
+            id (the DataFrame row index), name, protein_sequence,
+            original_sequence (DNA or None), is_dna (bool).
+    Raises:
+        ValueError: If a sequence column cannot be detected.
+    """
+    sn = sheet_name
+    # Allow a numeric sheet index to be passed as a string.
+    if isinstance(sn, str) and sn.isdigit():
+        sn = int(sn)
+    if sn is None:
+        sn = 0
+    df_or_dict = pd.read_excel(excel_path, sheet_name=sn)
+    # If read_excel handed back a dict of sheets, use the first one.
+    if isinstance(df_or_dict, dict):
+        first_title, df = next(iter(df_or_dict.items()))
+        print(f"Using sheet: {first_title}")
+    else:
+        df = df_or_dict
+    sequences = []
+    detected_name_col, detected_seq_col = _detect_columns(df, name_col, seq_col)
+    print(f"Detected columns -> name: {detected_name_col or '[generated]'}, sequence: {detected_seq_col}")
+    for idx, row in df.iterrows():
+        sequence = str(row[detected_seq_col]).strip()
+        if detected_name_col:
+            name = str(row[detected_name_col]).strip()
+        else:
+            name = f"seq_{idx}"
+        # Drop a FASTA-style '>' prefix from the name.
+        if name.startswith('>'):
+            name = name[1:].strip()
+        # Keep letters only (removes whitespace, digits, punctuation such as '*').
+        sequence = ''.join(filter(str.isalpha, sequence))
+        # Heuristic: a sequence is DNA if >95% of its letters are A/T/C/G/N.
+        dna_chars = sum(1 for c in sequence.upper() if c in 'ATCGN')
+        is_dna = (dna_chars / len(sequence)) > 0.95 if len(sequence) > 0 else False
+        if is_dna:
+            longest_orf = find_longest_orf(sequence)
+            if longest_orf and len(longest_orf) >= 30:
+                original_dna = longest_orf
+                protein_seq = translate_dna_to_protein(longest_orf)
+            else:
+                # No usable ORF: fall back to the sequence cut to whole codons.
+                truncated_len = (len(sequence) // 3) * 3
+                if truncated_len >= 30:
+                    original_dna = sequence[:truncated_len]
+                    protein_seq = translate_dna_to_protein(original_dna)
+                else:
+                    continue
+            # Cut protein and DNA at the first stop ('*'); skip the row if
+            # fewer than 10 residues precede it.
+            if '*' in protein_seq:
+                stop_pos = protein_seq.find('*')
+                if stop_pos >= 10:
+                    protein_seq = protein_seq[:stop_pos]
+                    original_dna = original_dna[:stop_pos*3]
+                else:
+                    continue
+        else:
+            protein_seq = sequence.upper()
+            protein_seq = protein_seq.replace('*', '')
+            original_dna = None
+        # Very short proteins are not benchmarked.
+        if len(protein_seq) < 10:
+            continue
+        sequences.append({
+            'id': idx,
+            'name': name,
+            'protein_sequence': protein_seq,
+            'original_sequence': original_dna,
+            'is_dna': is_dna
+        })
+    return sequences
+def calculate_cfd(dna_sequence: str, codon_frequencies: Dict) -> float:
+    """
+    Calculate Codon Frequency Distribution (CFD) similarity to a reference.
+    Args:
+        dna_sequence (str): Input DNA sequence.
+        codon_frequencies (Dict): Reference frequencies; accepts flattened mapping
+            or an amino2codon structure (will be flattened).
+    Returns:
+        float: Similarity score in [0, 1] where higher is more similar.
+    """
+    if not dna_sequence:
+        return 0.0
+    codon_count = {}
+    total_codons = 0
+    for i in range(0, len(dna_sequence) - 2, 3):
+        codon = dna_sequence[i:i+3].upper()
+        if len(codon) == 3:
+            codon_count[codon] = codon_count.get(codon, 0) + 1
+            total_codons += 1
+    seq_freq = {}
+    if total_codons > 0:
+        for codon, count in codon_count.items():
+            seq_freq[codon] = count / total_codons
+    # Flatten amino2codon frequencies if needed
+    flat_codon_freq = {}
+    if isinstance(codon_frequencies, dict):
+        first_key = next(iter(codon_frequencies.keys()))
+        if isinstance(codon_frequencies[first_key], tuple) and len(codon_frequencies[first_key]) == 2:
+            for amino, (codons, freqs) in codon_frequencies.items():
+                for codon, freq in zip(codons, freqs):
+                    flat_codon_freq[codon] = freq
+        else:
+            flat_codon_freq = codon_frequencies
+    similarity = 0.0
+    count = 0
+    for codon in set(list(seq_freq.keys()) + list(flat_codon_freq.keys())):
+        seq_f = seq_freq.get(codon, 0.0)
+        ref_f = flat_codon_freq.get(codon, 0.0)
+        similarity += 1 - abs(seq_f - ref_f)
+        count += 1
+    return similarity / count if count > 0 else 0.0
+def run_model_on_sequences(
+    sequences: List[Dict],
+    model,
+    tokenizer,
+    device,
+    cai_weights: Dict,
+    tai_weights: Dict,
+    codon_frequencies: Dict,
+    reference_profile: List[float],
+    output_dir: str
+) -> pd.DataFrame:
+    """
+    Run ColiFormer on protein sequences and compute metrics for optimized DNA.
+    Args:
+        sequences (List[Dict]): Parsed sequence records.
+        model: Loaded ColiFormer model.
+        tokenizer: Tokenizer used by the model.
+        device: Torch device.
+        cai_weights (Dict): CAI weights.
+        tai_weights (Dict): tAI weights.
+        codon_frequencies (Dict): Reference codon frequencies.
+        reference_profile (List[float]): Reserved for DTW profile (unused here).
+        output_dir (str): Directory for outputs (not written here).
+    Returns:
+        pd.DataFrame: Per-sequence metrics and optimized DNA.
+    """
+    results = []
+    print(f"Processing {len(sequences)} sequences...")
+    for seq_data in tqdm(sequences, desc="Optimizing sequences"):
+        protein_seq = seq_data['protein_sequence']
+        if len(protein_seq) < 10:
+            continue
+        try:
+            start_time = time.time()
+            output = predict_dna_sequence(
+                protein=protein_seq,
+                organism="Escherichia coli general",
+                device=device,
+                model=model,
+                deterministic=True,
+                match_protein=True,
+            )
+            runtime = time.time() - start_time
+            if isinstance(output, list):
+                optimized_dna = output[0].predicted_dna
+            else:
+                optimized_dna = output.predicted_dna
+            original_metrics = {}
+            if seq_data['is_dna'] and seq_data['original_sequence']:
+                original_dna = seq_data['original_sequence'].upper()
+                original_metrics = {
+                    'original_cai': CAI(original_dna, weights=cai_weights),
+                    'original_gc': get_GC_content(original_dna),
+                    'original_tai': calculate_tAI(original_dna, tai_weights),
+                    'original_cfd': calculate_cfd(original_dna, codon_frequencies),
+                    'original_neg_cis': count_negative_cis_elements(original_dna),
+                }
+            optimized_metrics = {
+                'optimized_cai': CAI(optimized_dna, weights=cai_weights),
+                'optimized_gc': get_GC_content(optimized_dna),
+                'optimized_tai': calculate_tAI(optimized_dna, tai_weights),
+                'optimized_cfd': calculate_cfd(optimized_dna, codon_frequencies),
+                'optimized_neg_cis': count_negative_cis_elements(optimized_dna),
+                'runtime': runtime,
+            }
+            result = {
+                'id': seq_data['id'],
+                'name': seq_data['name'],
+                'protein_sequence': protein_seq,
+                'protein_length': len(protein_seq),
+                'optimized_dna': optimized_dna,
+                **original_metrics,
+                **optimized_metrics,
+            }
+            results.append(result)
+        except Exception as e:
+            print(f"Error processing sequence {seq_data['id']}: {str(e)}")
+            continue
+    return pd.DataFrame(results)
+def generate_visualizations(results_df: pd.DataFrame, output_dir: str):
+    """
+    Generate visualizations and a metrics summary table.
+    Saves:
+        - CAI before/after bar plot
+        - Median CAI comparison
+        - Metrics distribution panel
+        - CSV summary table
+    Args:
+        results_df (pd.DataFrame): Results from optimization.
+        output_dir (str): Output directory root.
+    Returns:
+        pd.DataFrame: Summary table of aggregate metrics.
+    """
+    plt.style.use('seaborn-v0_8-darkgrid')
+    sns.set_palette("husl")
+    fig_dir = os.path.join(output_dir, 'figures')
+    os.makedirs(fig_dir, exist_ok=True)
+    # 1. Before/After CAI Graph
+    if 'original_cai' in results_df.columns:
+        plt.figure(figsize=(12, 8))
+        before_cai = results_df['original_cai'].dropna()
+        after_cai = results_df.loc[before_cai.index, 'optimized_cai']
+        x = np.arange(len(before_cai))
+        width = 0.35
+        fig, ax = plt.subplots(figsize=(14, 8))
+        bars1 = ax.bar(x - width/2, before_cai, width, label='Before Optimization', alpha=0.8)
+        bars2 = ax.bar(x + width/2, after_cai, width, label='After Optimization', alpha=0.8)
+        ax.set_xlabel('Sequence Index', fontsize=12)
+        ax.set_ylabel('CAI Score', fontsize=12)
+        ax.set_title('ENCOT: CAI Before and After Optimization', fontsize=14, fontweight='bold')
+        ax.set_xticks(x[::5])  # Show every 5th label
+        ax.set_xticklabels(x[::5])
+        ax.legend()
+        ax.grid(axis='y', alpha=0.3)
+        avg_before = before_cai.mean()
+        avg_after = after_cai.mean()
+        improvement = ((avg_after - avg_before) / avg_before) * 100
+        ax.text(0.02, 0.98, f'Average CAI Before: {avg_before:.3f}\nAverage CAI After: {avg_after:.3f}\nImprovement: {improvement:.1f}%',
+                transform=ax.transAxes, fontsize=10, verticalalignment='top',
+                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
+        plt.tight_layout()
+        plt.savefig(os.path.join(fig_dir, 'cai_before_after.png'), dpi=300, bbox_inches='tight')
+        plt.close()
+        print(f"CAI Before/After graph saved to {os.path.join(fig_dir, 'cai_before_after.png')}")
+        # 1b. Median CAI Before/After Graph
+        plt.figure(figsize=(8, 6))
+        median_before = before_cai.median()
+        median_after = after_cai.median()
+        categories = ['Before Optimization', 'After Optimization']
+        medians = [median_before, median_after]
+        colors = ['#ff7f0e', '#2ca02c']
+        bars = plt.bar(categories, medians, color=colors, alpha=0.8, width=0.6)
+        plt.ylabel('Median CAI Score', fontsize=12)
+        plt.title('ENCOT: Median CAI Before and After Optimization', fontsize=14, fontweight='bold')
+        plt.ylim(0, max(medians) * 1.2)
+        for bar, median in zip(bars, medians):
+            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
+                    f'{median:.3f}', ha='center', va='bottom', fontweight='bold')
+        improvement_pct = ((median_after - median_before) / median_before) * 100
+        plt.text(0.5, max(medians) * 0.95, f'Improvement: {improvement_pct:.1f}%',
+                ha='center', transform=plt.gca().transData, fontsize=12,
+                bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.7))
+        plt.grid(axis='y', alpha=0.3)
+        plt.tight_layout()
+        plt.savefig(os.path.join(fig_dir, 'median_cai_comparison.png'), dpi=300, bbox_inches='tight')
+        plt.close()
+        print(f"Median CAI comparison graph saved to {os.path.join(fig_dir, 'median_cai_comparison.png')}")
+    # 2. Summary metrics table
+    metrics_summary = {}
+    if 'original_cai' in results_df.columns:
+        metrics_summary['CAI'] = {
+            'Before': results_df['original_cai'].mean(),
+            'After': results_df['optimized_cai'].mean(),
+            'Improvement': ((results_df['optimized_cai'].mean() - results_df['original_cai'].mean()) / results_df['original_cai'].mean()) * 100
+        }
+        metrics_summary['GC Content (%)'] = {
+            'Before': results_df['original_gc'].mean(),
+            'After': results_df['optimized_gc'].mean(),
+            'Difference': results_df['optimized_gc'].mean() - results_df['original_gc'].mean()
+        }
+        metrics_summary['tAI'] = {
+            'Before': results_df['original_tai'].mean(),
+            'After': results_df['optimized_tai'].mean(),
+            'Improvement': ((results_df['optimized_tai'].mean() - results_df['original_tai'].mean()) / results_df['original_tai'].mean()) * 100
+        }
+        metrics_summary['CFD'] = {
+            'Before': results_df['original_cfd'].mean(),
+            'After': results_df['optimized_cfd'].mean(),
+            'Improvement': ((results_df['optimized_cfd'].mean() - results_df['original_cfd'].mean()) / results_df['original_cfd'].mean()) * 100
+        }
+        metrics_summary['Negative Cis Elements'] = {
+            'Before': results_df['original_neg_cis'].mean(),
+            'After': results_df['optimized_neg_cis'].mean(),
+            'Reduction': results_df['original_neg_cis'].mean() - results_df['optimized_neg_cis'].mean()
+        }
+    else:
+        metrics_summary['CAI'] = {
+            'Optimized': results_df['optimized_cai'].mean(),
+            'Std Dev': results_df['optimized_cai'].std()
+        }
+        metrics_summary['GC Content (%)'] = {
+            'Optimized': results_df['optimized_gc'].mean(),
+            'Std Dev': results_df['optimized_gc'].std()
+        }
+        metrics_summary['tAI'] = {
+            'Optimized': results_df['optimized_tai'].mean(),
+            'Std Dev': results_df['optimized_tai'].std()
+        }
+        metrics_summary['CFD'] = {
+            'Optimized': results_df['optimized_cfd'].mean(),
+            'Std Dev': results_df['optimized_cfd'].std()
+        }
+        metrics_summary['Negative Cis Elements'] = {
+            'Optimized': results_df['optimized_neg_cis'].mean(),
+            'Std Dev': results_df['optimized_neg_cis'].std()
+        }
+    metrics_summary['Runtime (seconds)'] = {
+        'Mean': results_df['runtime'].mean(),
+        'Median': results_df['runtime'].median(),
+        'Total': results_df['runtime'].sum()
+    }
+    summary_df = pd.DataFrame(metrics_summary).T
+    summary_df = summary_df.round(4)
+    summary_df.to_csv(os.path.join(output_dir, 'metrics_summary.csv'))
+    print(f"\nMetrics Summary saved to {os.path.join(output_dir, 'metrics_summary.csv')}")
+    print("\n" + "="*60)
+    print("METRICS SUMMARY:")
+    print("="*60)
+    print(summary_df.to_string())
+    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
+    axes = axes.flatten()
+    metrics_to_plot = [
+        ('optimized_cai', 'CAI Distribution'),
+        ('optimized_gc', 'GC Content Distribution (%)'),
+        ('optimized_tai', 'tAI Distribution'),
+        ('optimized_cfd', 'CFD Distribution'),
+        ('optimized_neg_cis', 'Negative Cis Elements'),
+        ('runtime', 'Runtime Distribution (seconds)')
+    ]
+    for idx, (col, title) in enumerate(metrics_to_plot):
+        if col in results_df.columns:
+            axes[idx].hist(results_df[col].dropna(), bins=20, edgecolor='black', alpha=0.7)
+            axes[idx].set_title(title, fontsize=10, fontweight='bold')
+            axes[idx].set_xlabel(col.replace('optimized_', '').replace('_', ' ').title())
+            axes[idx].set_ylabel('Frequency')
+            axes[idx].grid(axis='y', alpha=0.3)
+            mean_val = results_df[col].mean()
+            axes[idx].axvline(mean_val, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_val:.3f}')
+            axes[idx].legend()
+    plt.suptitle('ENCOT: Optimization Metrics Distribution', fontsize=14, fontweight='bold', y=1.02)
+    plt.tight_layout()
+    plt.savefig(os.path.join(fig_dir, 'metrics_distribution.png'), dpi=300, bbox_inches='tight')
+    plt.close()
+    print(f"Metrics distribution plot saved to {os.path.join(fig_dir, 'metrics_distribution.png')}")
+    return summary_df
+def main():
+    """CLI entrypoint to run the ENCOT benchmark workflow.
+    Parses the command-line options, loads the benchmark sequences, the model
+    checkpoint and the tokenizer, prepares the CAI / tAI / codon-frequency
+    inputs, runs the optimization over every sequence, and writes the CSV
+    outputs and figures into a fresh ``run_<timestamp>`` subdirectory of
+    ``--output_dir``. Progress is reported on stdout.
+    """
+    parser = argparse.ArgumentParser(description="Benchmark ENCOT on E. coli sequences")
+    parser.add_argument("--excel_path", type=str, default="Benchmark 80 sequences.xlsx",
+                        help="Path to benchmark Excel file")
+    parser.add_argument("--checkpoint_path", type=str, default="models/ecoli-codon-optimizer/finetune_best.ckpt",
+                        help="Path to fine-tuned model checkpoint")
+    parser.add_argument("--natural_sequences_path", type=str, default="data/ecoli_processed_genes.csv",
+                        help="Path to natural E. coli sequences for CAI calculation")
+    parser.add_argument("--output_dir", type=str, default="benchmark_results",
+                        help="Directory to save results")
+    parser.add_argument("--use_gpu", action="store_true", help="Use GPU if available")
+    parser.add_argument("--name_col", type=str, default=None, help="Optional: column name for sequence label (case-insensitive)")
+    parser.add_argument("--seq_col", type=str, default=None, help="Optional: column name for sequence (case-insensitive)")
+    parser.add_argument("--sheet_name", type=str, default=None, help="Optional: Excel sheet name or index")
+    args = parser.parse_args()
+    # Every invocation writes into its own timestamped folder, so earlier runs are never overwritten.
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_dir = os.path.join(args.output_dir, f"run_{timestamp}")
+    os.makedirs(output_dir, exist_ok=True)
+    print("="*60)
+    print("ENCOT BENCHMARK EVALUATION")
+    print("="*60)
+    # CUDA is used only when it is available AND --use_gpu was passed; otherwise the CPU is used.
+    device = torch.device("cuda" if torch.cuda.is_available() and args.use_gpu else "cpu")
+    print(f"Using device: {device}")
+    print(f"\nLoading sequences from {args.excel_path}...")
+    sequences = parse_excel_sequences(
+        args.excel_path,
+        name_col=args.name_col,
+        seq_col=args.seq_col,
+        sheet_name=args.sheet_name,
+    )
+    print(f"Loaded {len(sequences)} sequences")
+    print("\nLoading ENCOT model...")
+    model = load_model(model_path=args.checkpoint_path, device=device)
+    tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
+    print("Model loaded successfully")
+    print("\nPreparing evaluation utilities...")
+    # CAI weights are derived from the 'dna_sequence' column of the natural-sequence CSV.
+    natural_df = pd.read_csv(args.natural_sequences_path)
+    ref_sequences = natural_df['dna_sequence'].tolist()
+    cai_weights = relative_adaptiveness(sequences=ref_sequences)
+    print("CAI weights generated")
+    tai_weights = get_ecoli_tai_weights()
+    print("tAI weights loaded")
+    # Prefer codon frequencies downloaded from Kazusa (taxonomy id 83333); on any
+    # failure fall back to frequencies computed from the local reference sequences.
+    try:
+        codon_frequencies = download_codon_frequencies_from_kazusa(taxonomy_id=83333)
+        print("Codon frequencies loaded from Kazusa")
+    except Exception as e:
+        print(f"Warning: Kazusa download failed ({e}). Using local frequencies.")
+        codon_frequencies = get_codon_frequencies(
+            ref_sequences, organism="Escherichia coli general"
+        )
+    # An empty reference profile is handed to run_model_on_sequences.
+    # NOTE(review): presumably this leaves profile-based comparisons without a reference - confirm downstream handling.
+    reference_profile = []
+    print("\n" + "="*60)
+    print("RUNNING OPTIMIZATION...")
+    print("="*60)
+    results_df = run_model_on_sequences(
+        sequences=sequences,
+        model=model,
+        tokenizer=tokenizer,
+        device=device,
+        cai_weights=cai_weights,
+        tai_weights=tai_weights,
+        codon_frequencies=codon_frequencies,
+        reference_profile=reference_profile,
+        output_dir=output_dir
+    )
+    results_path = os.path.join(output_dir, 'optimization_results.csv')
+    results_df.to_csv(results_path, index=False)
+    print(f"\nRaw results saved to {results_path}")
+    # Second, slimmer table: identifiers, the optimized DNA and a few headline metrics per sequence.
+    optimized_sequences = results_df[['id', 'name', 'protein_sequence', 'optimized_dna']].copy()
+    optimized_sequences['protein_length'] = results_df['protein_length']
+    optimized_sequences['dna_length'] = optimized_sequences['optimized_dna'].apply(len)
+    optimized_sequences['optimized_cai'] = results_df['optimized_cai']
+    optimized_sequences['optimized_gc'] = results_df['optimized_gc']
+    optimized_sequences['optimized_tai'] = results_df['optimized_tai']
+    # Before/after columns exist only when the results carry an 'original_cai' column;
+    # cai_improvement is the percent change relative to the original CAI, rounded to 2 decimals.
+    if 'original_cai' in results_df.columns:
+        optimized_sequences['original_cai'] = results_df['original_cai']
+        optimized_sequences['cai_improvement'] = ((results_df['optimized_cai'] - results_df['original_cai']) / results_df['original_cai'] * 100).round(2)
+    optimized_sequences_path = os.path.join(output_dir, 'optimized_dna_sequences.csv')
+    optimized_sequences.to_csv(optimized_sequences_path, index=False)
+    print(f"Optimized DNA sequences saved to {optimized_sequences_path}")
+    print("\n" + "="*60)
+    print("GENERATING VISUALIZATIONS...")
+    print("="*60)
+    # The returned summary table is not used further here; generate_visualizations also writes it to disk.
+    summary_df = generate_visualizations(results_df, output_dir)
+    print("\n" + "="*60)
+    print("BENCHMARK EVALUATION COMPLETE")
+    print("="*60)
+    print(f"Results saved to: {output_dir}")
+    print(f"Total sequences processed: {len(results_df)}")
+    print(f"Average runtime per sequence: {results_df['runtime'].mean():.2f} seconds")
+    print(f"Total runtime: {results_df['runtime'].sum():.2f} seconds")
+# Run the benchmark only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()

comprehensive_model_comparison.png ADDED Viewed

Git LFS Details

SHA256: 7ccd04a955c52c6384c3bb94983d71ed4eca22fe0fac815aaaa147344cd024bc
Pointer size: 131 Bytes
Size of remote file: 630 kB

configs/train_ecoli_alm.yaml ADDED Viewed

	@@ -0,0 +1,54 @@

+# ENCOT ALM Training Configuration
+# This configuration reproduces the main training setup from the paper
+# using the Augmented-Lagrangian Method (ALM) for GC content control.
+model:
+  base_model: "adibvafa/CodonTransformer-base"
+  tokenizer: "adibvafa/CodonTransformer"
+data:
+  dataset_dir: "data"
+  # Expected files: finetune_set.json (created by preprocess_data.py)
+training:
+  batch_size: 6
+  max_epochs: 15
+  learning_rate: 5e-5
+  warmup_fraction: 0.1
+  num_workers: 5
+  accumulate_grad_batches: 1
+  num_gpus: 4
+  save_every_n_steps: 512
+  seed: 123
+  log_every_n_steps: 20
+checkpoint:
+  checkpoint_dir: "models/alm-enhanced-training"
+  checkpoint_filename: "balanced_alm_finetune.ckpt"
+# Augmented-Lagrangian Method (ALM) for GC content control
+alm:
+  enabled: true
+  gc_target: 0.52  # Target GC content for E. coli (52%)
+  curriculum_epochs: 3  # Warm-up epochs before enforcing GC constraint
+  # ALM penalty parameters
+  initial_penalty_factor: 20.0
+  penalty_update_factor: 10.0
+  max_penalty: 1e6
+  min_penalty: 1e-6
+  # ALM tolerance parameters
+  tolerance: 1e-5  # Primal tolerance
+  dual_tolerance: 1e-5  # Dual tolerance for constraint violation
+  tolerance_update_factor: 0.1
+  # Adaptive penalty adjustment
+  rel_penalty_increase_threshold: 0.1
+# Legacy penalty method (if ALM disabled)
+gc_penalty:
+  weight: 0.0  # Only used when ALM is disabled (alm.enabled: false, i.e. use_lagrangian=false)

configs/train_ecoli_quick.yaml ADDED Viewed

	@@ -0,0 +1,37 @@

+# ENCOT Quick Training Configuration
+# This is a minimal configuration for quick sanity checks and testing.
+# Use this to verify your setup before running full training.
+model:
+  base_model: "adibvafa/CodonTransformer-base"
+  tokenizer: "adibvafa/CodonTransformer"
+data:
+  dataset_dir: "data"
+training:
+  batch_size: 2
+  max_epochs: 1
+  learning_rate: 5e-5
+  warmup_fraction: 0.1
+  num_workers: 0  # Disable multiprocessing for debugging
+  accumulate_grad_batches: 1
+  num_gpus: 0  # CPU-only for quick testing
+  save_every_n_steps: 10
+  seed: 123
+  log_every_n_steps: 5
+checkpoint:
+  checkpoint_dir: "models/test-training"
+  checkpoint_filename: "quick_test.ckpt"
+alm:
+  enabled: false  # Disable ALM for quick test
+  gc_target: 0.52
+  curriculum_epochs: 0
+gc_penalty:
+  weight: 0.0

create_model_datasets.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import pandas as pd
+import json
+import os
+from CodonTransformer.CodonData import prepare_training_data
+def main():
+    """
+    Main function to partition the processed data into fine-tuning and test sets.
+    """
+    if not os.path.exists('data'):
+        print("Error: 'data' directory not found. Please run prepare_ecoli_data.py first.")
+        return
+    processed_data_path = 'data/ecoli_processed_genes.csv'
+    if not os.path.exists(processed_data_path):
+        print(f"Error: Processed data file not found at {processed_data_path}")
+        return
+    df_processed = pd.read_csv(processed_data_path)
+    df_finetune = df_processed[df_processed['is_high_cai'] == True].copy()
+    df_finetune.drop_duplicates(subset=['dna_sequence'], inplace=True)
+    df_finetune.rename(columns={'dna_sequence': 'dna', 'protein_sequence': 'protein'}, inplace=True)
+    df_finetune['organism'] = "Escherichia coli general"
+    finetune_output_path = 'data/finetune_set.json'
+    prepare_training_data(df_finetune, finetune_output_path, shuffle=True)
+    print(f"Fine-tuning set saved to {finetune_output_path} with {len(df_finetune)} records.")
+    df_test_pool = df_processed[df_processed['is_high_cai'] == False].copy()
+    df_test = df_test_pool.sample(n=100, random_state=42) # for reproducibility
+    df_test['organism'] = 51 # E. coli general
+    df_test.rename(columns={'dna_sequence': 'codons'}, inplace=True)
+    test_records = df_test[['codons', 'organism']].to_dict(orient='records')
+    test_output_path = 'data/test_set.json'
+    with open(test_output_path, 'w') as f:
+        json.dump(test_records, f, indent=4)
+    print(f"Test set saved to {test_output_path} with {len(df_test)} records.")
+if __name__ == "__main__":
+    main()

evaluate_optimizer.py ADDED Viewed

	@@ -0,0 +1,577 @@

+import sys
+"""
+File: evaluate_optimizer.py
+---------------------------
+Evaluate ColiFormer with enhanced capabilities:
+1) DNAChisel post-processing for sequence polishing
+2) Optional multi-objective generation (Pareto-style filtering)
+3) Enhanced beam search with multiple candidates
+4) Comprehensive metrics and optional ablation studies
+"""
+import argparse
+import json
+import os
+import warnings
+from typing import Dict, List, Tuple, Any
+import numpy as np
+import pandas as pd
+import torch
+from CAI import CAI, relative_adaptiveness
+from tqdm import tqdm
+from CodonTransformer.CodonData import (
+    download_codon_frequencies_from_kazusa,
+    get_codon_frequencies,
+)
+from CodonTransformer.CodonPrediction import (
+    load_model,
+    predict_dna_sequence,
+    get_high_frequency_choice_sequence_optimized,
+)
+from CodonTransformer.CodonEvaluation import (
+    calculate_dtw_distance,
+    calculate_homopolymer_runs,
+    calculate_tAI,
+    count_negative_cis_elements,
+    get_GC_content,
+    get_ecoli_tai_weights,
+    get_min_max_profile,
+    get_sequence_similarity,
+    scan_for_restriction_sites,
+    calculate_ENC,
+    calculate_CPB,
+    calculate_SCUO,
+)
+from CodonTransformer.CodonPostProcessing import (
+    polish_sequence_with_dnachisel,
+)
+from CodonTransformer.CodonUtils import DNASequencePrediction
+def translate_dna_to_protein(dna_sequence: str) -> str:
+    """Translate DNA sequence to protein sequence."""
+    codon_table = {
+        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+        'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
+        'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
+        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'
+    }
+    protein = ""
+    for i in range(0, len(dna_sequence), 3):
+        codon = dna_sequence[i:i+3].upper()
+        if len(codon) == 3:
+            aa = codon_table.get(codon, 'X')
+            if aa == '*':  # Stop codon
+                break
+            protein += aa
+    return protein
+def evaluate_with_enhancements(
+    protein_sequence: str,
+    model,
+    tokenizer,
+    device,
+    cai_weights: Dict[str, float],
+    tai_weights: Dict[str, float],
+    codon_frequencies: Dict,
+    reference_profile: List[float],
+    args,
+) -> Dict[str, Any]:
+    """
+    Evaluate a protein sequence with enhanced generation techniques.
+    Args:
+        protein_sequence: Input protein sequence
+        model: Fine-tuned model
+        tokenizer: Model tokenizer
+        device: PyTorch device
+        cai_weights: CAI weights dictionary
+        tai_weights: tAI weights dictionary
+        codon_frequencies: Codon frequencies dictionary
+        reference_profile: Reference profile for DTW calculation
+        args: Command line arguments
+    Returns:
+        Dict containing evaluation results for all methods
+    """
+    results = {}
+    # 1. Original fine-tuned model (baseline)
+    try:
+        original_output = predict_dna_sequence(
+            protein=protein_sequence,
+            organism="Escherichia coli general",
+            device=device,
+            model=model,
+            deterministic=True,
+            match_protein=True,
+            use_constrained_search=args.use_constrained_search,
+            gc_bounds=tuple(args.gc_bounds),
+            beam_size=args.beam_size,
+            length_penalty=args.length_penalty,
+            diversity_penalty=args.diversity_penalty,
+        )
+        if isinstance(original_output, list):
+            original_dna = original_output[0].predicted_dna
+        else:
+            original_dna = original_output.predicted_dna
+        results['fine_tuned_original'] = {
+            'dna_sequence': original_dna,
+            'method': 'fine_tuned_original',
+            'enhancement': 'none',
+        }
+    except Exception as e:
+        print(f"Warning: Original fine-tuned generation failed: {str(e)}")
+        results['fine_tuned_original'] = {
+            'dna_sequence': '',
+            'method': 'fine_tuned_original',
+            'enhancement': 'none',
+            'error': str(e),
+        }
+    # 2. Enhanced sequence generation (DNAChisel + Pareto filtering)
+    if args.use_enhanced_generation:
+        try:
+            enhanced_dna, generation_report = enhanced_sequence_generation(
+                protein_sequence=protein_sequence,
+                model=model,
+                tokenizer=tokenizer,
+                device=device,
+                beam_size=args.enhanced_beam_size,
+                gc_bounds=(args.gc_bounds[0] * 100, args.gc_bounds[1] * 100),
+                use_dnachisel_polish=args.use_dnachisel,
+                use_pareto_filtering=args.use_pareto_filtering,
+                cai_weights=cai_weights,
+                tai_weights=tai_weights,
+                codon_frequencies=codon_frequencies,
+                reference_profile=reference_profile,
+            )
+            results['fine_tuned_enhanced'] = {
+                'dna_sequence': enhanced_dna,
+                'method': 'fine_tuned_enhanced',
+                'enhancement': 'dnachisel+pareto',
+                'generation_report': generation_report,
+            }
+        except Exception as e:
+            print(f"Warning: Enhanced generation failed: {str(e)}")
+            results['fine_tuned_enhanced'] = {
+                'dna_sequence': '',
+                'method': 'fine_tuned_enhanced',
+                'enhancement': 'dnachisel+pareto',
+                'error': str(e),
+            }
+    # 3. DNAChisel post-processing only (ablation study)
+    if args.use_dnachisel and 'fine_tuned_original' in results and results['fine_tuned_original']['dna_sequence']:
+        try:
+            dnachisel_dna, polish_report = polish_sequence_with_dnachisel(
+                dna_sequence=results['fine_tuned_original']['dna_sequence'],
+                protein_sequence=protein_sequence,
+                gc_bounds=(args.gc_bounds[0] * 100, args.gc_bounds[1] * 100),
+                maximize_cai=True,
+                seed=42,
+            )
+            results['fine_tuned_dnachisel'] = {
+                'dna_sequence': dnachisel_dna,
+                'method': 'fine_tuned_dnachisel',
+                'enhancement': 'dnachisel_only',
+                'polish_report': polish_report,
+            }
+        except Exception as e:
+            print(f"Warning: DNAChisel post-processing failed: {str(e)}")
+            results['fine_tuned_dnachisel'] = {
+                'dna_sequence': '',
+                'method': 'fine_tuned_dnachisel',
+                'enhancement': 'dnachisel_only',
+                'error': str(e),
+            }
+    return results
+def calculate_comprehensive_metrics(
+    dna_sequence: str,
+    protein_sequence: str,
+    cai_weights: Dict[str, float],
+    tai_weights: Dict[str, float],
+    codon_frequencies: Dict,
+    reference_profile: List[float],
+    ref_sequences: List[str],
+) -> Dict[str, float]:
+    """Calculate comprehensive metrics for a DNA sequence."""
+    if not dna_sequence:
+        return {
+            'cai': 0.0,
+            'tai': 0.0,
+            'gc_content': 0.0,
+            'restriction_sites': float('inf'),
+            'neg_cis_elements': float('inf'),
+            'homopolymer_runs': float('inf'),
+            'dtw_distance': float('inf'),
+            'enc': 0.0,
+            'cpb': 0.0,
+            'scuo': 0.0,
+        }
+    return calculate_sequence_metrics(
+        dna_sequence=dna_sequence,
+        protein_sequence=protein_sequence,
+        cai_weights=cai_weights,
+        tai_weights=tai_weights,
+        codon_frequencies=codon_frequencies,
+        reference_profile=reference_profile,
+    )
+def run_ablation_study(results_df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Run ablation study to compare different enhancement methods.
+    Args:
+        results_df: DataFrame with evaluation results
+    Returns:
+        DataFrame with ablation study results
+    """
+    # Group by protein and calculate improvements
+    ablation_results = []
+    for protein in results_df['protein_sequence'].unique():
+        protein_results = results_df[results_df['protein_sequence'] == protein]
+        # Get baseline (original fine-tuned)
+        baseline = protein_results[protein_results['method'] == 'fine_tuned_original']
+        if baseline.empty:
+            continue
+        baseline_metrics = baseline.iloc[0]
+        # Compare each enhancement method
+        for method in protein_results['method'].unique():
+            if method == 'fine_tuned_original':
+                continue
+            method_results = protein_results[protein_results['method'] == method]
+            if method_results.empty:
+                continue
+            method_metrics = method_results.iloc[0]
+            # Calculate improvements
+            improvements = {
+                'protein': protein,
+                'method': method,
+                'enhancement': method_metrics['enhancement'],
+                'cai_improvement': method_metrics['cai'] - baseline_metrics['cai'],
+                'tai_improvement': method_metrics['tai'] - baseline_metrics['tai'],
+                'gc_improvement': abs(method_metrics['gc_content'] - 52) - abs(baseline_metrics['gc_content'] - 52),
+                'restriction_sites_improvement': baseline_metrics['restriction_sites'] - method_metrics['restriction_sites'],
+                'neg_cis_improvement': baseline_metrics['neg_cis_elements'] - method_metrics['neg_cis_elements'],
+                'homopolymer_improvement': baseline_metrics['homopolymer_runs'] - method_metrics['homopolymer_runs'],
+                'dtw_improvement': baseline_metrics['dtw_distance'] - method_metrics['dtw_distance'],
+                'composite_score_improvement': (
+                    (method_metrics['cai'] - baseline_metrics['cai']) * 0.3 +
+                    (method_metrics['tai'] - baseline_metrics['tai']) * 0.3 +
+                    (abs(baseline_metrics['gc_content'] - 52) - abs(method_metrics['gc_content'] - 52)) * 0.2 +
+                    (baseline_metrics['restriction_sites'] - method_metrics['restriction_sites']) * 0.1 +
+                    (baseline_metrics['neg_cis_elements'] - method_metrics['neg_cis_elements']) * 0.1
+                ),
+            }
+            ablation_results.append(improvements)
+    return pd.DataFrame(ablation_results)
+def main(args):
+    """Main function to run the enhanced evaluation."""
+    print("=== Enhanced CodonTransformer Evaluation ===")
+    # Setup device
+    device = torch.device("cuda" if torch.cuda.is_available() and args.use_gpu else "cpu")
+    print(f"Using device: {device}")
+    # Load test data
+    with open(args.test_data_path, "r") as f:
+        first = f.read(1)
+        f.seek(0)
+        if first == "[":
+            test_set = json.load(f)
+        else:
+            test_set = [json.loads(line) for line in f if line.strip()]
+    # Limit test set size if requested
+    if args.max_test_proteins > 0:
+        test_set = test_set[:args.max_test_proteins]
+    print(f"Loaded {len(test_set)} proteins from the test set.")
+    # Load models
+    print("Loading models...")
+    finetuned_model = load_model(model_path=args.checkpoint_path, device=device)
+    print(f"Fine-tuned model loaded from {args.checkpoint_path}")
+    # Load tokenizer
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
+    # Load base model if comparison requested
+    base_model = None
+    if args.compare_with_base:
+        base_model = load_model(device=device)
+        print("Base model loaded from Hugging Face")
+    # Prepare evaluation utilities
+    print("Preparing evaluation utilities...")
+    # CAI weights
+    natural_csv = args.natural_sequences_path
+    natural_df = pd.read_csv(natural_csv)
+    ref_sequences = natural_df['dna_sequence'].tolist()
+    cai_weights = relative_adaptiveness(sequences=ref_sequences)
+    print("CAI weights generated")
+    # tAI weights
+    tai_weights = get_ecoli_tai_weights()
+    print("tAI weights loaded")
+    # Codon frequencies
+    try:
+        codon_frequencies = download_codon_frequencies_from_kazusa(taxonomy_id=83333)
+        print("Codon frequencies loaded from Kazusa")
+    except Exception as e:
+        print(f"Warning: Kazusa download failed ({e}). Using local frequencies.")
+        codon_frequencies = get_codon_frequencies(
+            ref_sequences, organism="Escherichia coli general"
+        )
+    # Reference profile for DTW
+    reference_profiles = [
+        get_min_max_profile(seq, codon_frequencies) for seq in ref_sequences[:100]
+    ]
+    valid_profiles = [p for p in reference_profiles if p and not all(v is None for v in p)]
+    if valid_profiles:
+        max_len = max(len(p) for p in valid_profiles)
+        padded_profiles = [
+            np.pad(
+                np.array([v for v in p if v is not None]),
+                (0, max_len - len([v for v in p if v is not None])),
+                "constant",
+                constant_values=np.nan,
+            )
+            for p in valid_profiles
+        ]
+        avg_reference_profile = np.nanmean(padded_profiles, axis=0).tolist()
+    else:
+        avg_reference_profile = []
+    print("Reference profile generated")
+    # Run evaluation
+    all_results = []
+    evaluation_reports = []
+    print("Starting enhanced evaluation...")
+    for i, item in enumerate(tqdm(test_set, desc="Evaluating proteins")):
+        # Get protein sequence
+        if "protein_sequence" in item:
+            protein_sequence = item["protein_sequence"]
+        else:
+            dna_sequence = item["codons"]
+            protein_sequence = translate_dna_to_protein(dna_sequence)
+        # Skip if protein is too short or too long
+        if len(protein_sequence) < 10 or len(protein_sequence) > 1000:
+            continue
+        # Evaluate with enhancements
+        protein_results = evaluate_with_enhancements(
+            protein_sequence=protein_sequence,
+            model=finetuned_model,
+            tokenizer=tokenizer,
+            device=device,
+            cai_weights=cai_weights,
+            tai_weights=tai_weights,
+            codon_frequencies=codon_frequencies,
+            reference_profile=avg_reference_profile,
+            args=args,
+        )
+        # Add base model comparison if requested
+        if base_model:
+            try:
+                base_output = predict_dna_sequence(
+                    protein=protein_sequence,
+                    organism="Escherichia coli general",
+                    device=device,
+                    model=base_model,
+                    deterministic=True,
+                    match_protein=True,
+                )
+                base_dna = base_output.predicted_dna if not isinstance(base_output, list) else base_output[0].predicted_dna
+                protein_results['base_model'] = {
+                    'dna_sequence': base_dna,
+                    'method': 'base_model',
+                    'enhancement': 'none',
+                }
+            except Exception as e:
+                print(f"Warning: Base model generation failed: {str(e)}")
+        # Add naive baseline
+        try:
+            naive_dna = get_high_frequency_choice_sequence_optimized(
+                protein=protein_sequence, codon_frequencies=codon_frequencies
+            )
+            protein_results['naive_hfc'] = {
+                'dna_sequence': naive_dna,
+                'method': 'naive_hfc',
+                'enhancement': 'none',
+            }
+        except Exception as e:
+            print(f"Warning: Naive HFC generation failed: {str(e)}")
+        # Calculate metrics for each method
+        for method_name, method_result in protein_results.items():
+            if 'error' in method_result:
+                continue
+            dna_seq = method_result['dna_sequence']
+            if not dna_seq:
+                continue
+            metrics = calculate_comprehensive_metrics(
+                dna_sequence=dna_seq,
+                protein_sequence=protein_sequence,
+                cai_weights=cai_weights,
+                tai_weights=tai_weights,
+                codon_frequencies=codon_frequencies,
+                reference_profile=avg_reference_profile,
+                ref_sequences=ref_sequences,
+            )
+            # Combine results
+            result_row = {
+                'protein_id': i,
+                'protein_sequence': protein_sequence,
+                'protein_length': len(protein_sequence),
+                'method': method_name,
+                'enhancement': method_result['enhancement'],
+                'dna_sequence': dna_seq,
+                'dna_length': len(dna_seq),
+                **metrics,
+            }
+            # Add generation reports if available
+            if 'generation_report' in method_result:
+                result_row['generation_report'] = str(method_result['generation_report'])
+            if 'polish_report' in method_result:
+                result_row['polish_report'] = str(method_result['polish_report'])
+            all_results.append(result_row)
+    # Create results DataFrame
+    results_df = pd.DataFrame(all_results)
+    # Save detailed results
+    os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
+    results_df.to_csv(args.output_path, index=False)
+    print(f"Detailed results saved to {args.output_path}")
+    # Run ablation study
+    if args.run_ablation_study:
+        ablation_df = run_ablation_study(results_df)
+        ablation_path = args.output_path.replace('.csv', '_ablation.csv')
+        ablation_df.to_csv(ablation_path, index=False)
+        print(f"Ablation study results saved to {ablation_path}")
+        # Print summary statistics
+        print("\n=== ABLATION STUDY SUMMARY ===")
+        for method in ablation_df['method'].unique():
+            method_results = ablation_df[ablation_df['method'] == method]
+            print(f"\n{method.upper()}:")
+            print(f"  CAI improvement: {method_results['cai_improvement'].mean():.4f} ± {method_results['cai_improvement'].std():.4f}")
+            print(f"  tAI improvement: {method_results['tai_improvement'].mean():.4f} ± {method_results['tai_improvement'].std():.4f}")
+            print(f"  GC improvement: {method_results['gc_improvement'].mean():.4f} ± {method_results['gc_improvement'].std():.4f}")
+            print(f"  Restriction sites improvement: {method_results['restriction_sites_improvement'].mean():.2f} ± {method_results['restriction_sites_improvement'].std():.2f}")
+            print(f"  Composite score improvement: {method_results['composite_score_improvement'].mean():.4f} ± {method_results['composite_score_improvement'].std():.4f}")
+    # Print final summary
+    print("\n=== EVALUATION COMPLETE ===")
+    print(f"Total proteins evaluated: {len(results_df['protein_id'].unique())}")
+    print(f"Total sequences generated: {len(results_df)}")
+    print(f"Results saved to: {args.output_path}")
if __name__ == "__main__":
    # Script entry point: declare the command-line options, parse them and
    # pass the resulting namespace to main().
    parser = argparse.ArgumentParser(description="Enhanced CodonTransformer Evaluation")

    # ---- Input / output locations ----
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        default="models/ecoli-codon-optimizer/finetune_best.ckpt",
        help="Path to fine-tuned model checkpoint",
    )
    parser.add_argument(
        "--test_data_path",
        type=str,
        default="data/test_set.json",
        help="Path to test dataset",
    )
    parser.add_argument(
        "--natural_sequences_path",
        type=str,
        default="data/ecoli_processed_genes.csv",
        help="Path to natural E. coli sequences for CAI calculation",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default="results/enhanced_evaluation_results.csv",
        help="Path to save evaluation results",
    )

    # ---- Model selection ----
    parser.add_argument(
        "--use_gpu",
        action="store_true",
        help="Use GPU if available",
    )
    parser.add_argument(
        "--compare_with_base",
        action="store_true",
        help="Compare with base model",
    )

    # ---- Sequence generation ----
    parser.add_argument(
        "--use_constrained_search",
        action="store_true",
        help="Use constrained beam search",
    )
    parser.add_argument(
        "--gc_bounds",
        type=float,
        nargs=2,
        default=[0.50, 0.54],
        help="GC content bounds (min max)",
    )
    parser.add_argument(
        "--beam_size",
        type=int,
        default=10,
        help="Beam size for standard generation",
    )
    parser.add_argument(
        "--length_penalty",
        type=float,
        default=1.2,
        help="Length penalty for beam search",
    )
    parser.add_argument(
        "--diversity_penalty",
        type=float,
        default=0.1,
        help="Diversity penalty for beam search",
    )

    # ---- Optional enhancements ----
    parser.add_argument(
        "--use_enhanced_generation",
        action="store_true",
        help="Use enhanced generation with DNAChisel and Pareto filtering",
    )
    parser.add_argument(
        "--enhanced_beam_size",
        type=int,
        default=20,
        help="Beam size for enhanced generation",
    )
    parser.add_argument(
        "--use_dnachisel",
        action="store_true",
        help="Use DNAChisel post-processing",
    )
    parser.add_argument(
        "--use_pareto_filtering",
        action="store_true",
        help="Use Pareto frontier filtering",
    )

    # ---- Evaluation scope ----
    parser.add_argument(
        "--max_test_proteins",
        type=int,
        default=0,
        help="Maximum number of proteins to test (0 for all)",
    )
    parser.add_argument(
        "--run_ablation_study",
        action="store_true",
        help="Run ablation study comparing methods",
    )

    main(parser.parse_args())

prepare_ecoli_data.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import pandas as pd
+from Bio.Seq import Seq
+import os
def is_valid_sequence(dna_seq: str) -> bool:
    """
    Decide whether a DNA string looks like a complete, clean coding sequence.

    The comparison is case-insensitive. A sequence passes only when all of
    the following hold:
      * its length is a multiple of three;
      * it begins with ATG, TTG, CTG or GTG;
      * it ends with TAA, TAG or TGA;
      * no codon before the final one is TAA, TAG or TGA;
      * it contains no character other than A, T, G and C.

    Args:
        dna_seq (str): The DNA sequence to validate.
    Returns:
        bool: True if every check passes, False otherwise.
    """
    start_codons = ('ATG', 'TTG', 'CTG', 'GTG')
    stop_codons = ('TAA', 'TAG', 'TGA')

    if len(dna_seq) % 3:
        return False

    seq = dna_seq.upper()
    if not (seq.startswith(start_codons) and seq.endswith(stop_codons)):
        return False

    # Scan every in-frame codon except the last one, which is the stop codon.
    for pos in range(0, len(dna_seq) - 3, 3):
        if seq[pos:pos + 3] in stop_codons:
            return False

    return set(seq) <= set('ATGC')
def main():
    """
    Build data/ecoli_processed_genes.csv from the raw E. coli gene tables.

    Reads data/CAI.csv and "data/Database 3_4300 gene.csv", keeps the rows of
    the former whose DNA passes is_valid_sequence, translates each kept
    sequence, flags whether that exact DNA string also occurs in the latter
    file, and writes the collected rows as CSV.
    """
    if not os.path.exists('data'):
        os.makedirs('data')

    print("Loading data from CSV files...")
    all_gene_columns = ['gene_id', 'cai_score', 'drop1', 'drop2', 'dna_sequence', 'drop3']
    df_all = pd.read_csv("data/CAI.csv", header=0, names=all_gene_columns)
    df_high_cai = pd.read_csv(
        "data/Database 3_4300 gene.csv", header=0, names=['dna_sequence']
    )
    # Set membership keeps the per-row "is this a high-CAI gene" test cheap.
    high_cai_sequences = set(df_high_cai['dna_sequence'])

    validated_genes = []
    for _, row in df_all.iterrows():
        dna_sequence = str(row['dna_sequence'])
        if not is_valid_sequence(dna_sequence):
            continue
        record = {
            'gene_id': row['gene_id'],
            'dna_sequence': dna_sequence,
            'protein_sequence': str(Seq(dna_sequence).translate()),
            'cai_score': row.get('cai_score', None),
            'is_high_cai': dna_sequence in high_cai_sequences,
        }
        validated_genes.append(record)

    output_path = 'data/ecoli_processed_genes.csv'
    df_processed = pd.DataFrame(validated_genes)
    df_processed.to_csv(output_path, index=False)
    print(f"Processed data saved to {output_path}")
    print(f"Total validated genes: {len(df_processed)}")


if __name__ == "__main__":
    main()

pretrain.py ADDED Viewed

	@@ -0,0 +1,232 @@

+"""
+File: pretrain.py
+-----------------
+Pretrain the base transformer model on JSON datasets prepared via
+CodonData.prepare_training_data. This is typically not needed for ENCOT
+as we use the pretrained CodonTransformer base. See README for setup and usage.
+"""
+import argparse
+import os
+import pytorch_lightning as pl
+import torch
+from torch.utils.data import DataLoader
+from transformers import BigBirdConfig, BigBirdForMaskedLM, PreTrainedTokenizerFast
+from CodonTransformer.CodonUtils import (
+    MAX_LEN,
+    NUM_ORGANISMS,
+    TOKEN2MASK,
+    IterableJSONData,
+)
class MaskedTokenizerCollator:
    """
    Batch collator that tokenizes examples and applies masked-LM corruption.

    Every example must provide a "codons" entry (handed to the tokenizer) and
    an "organism" entry (broadcast over the sequence as token_type_ids).
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, examples):
        """
        Tokenize a batch and prepare it for masked-language-model training.

        Args:
            examples: Sequence of mappings with "codons" and "organism" keys.
        Returns:
            The tokenizer output with "input_ids" corrupted, "token_type_ids"
            replaced by the per-example organism value repeated along the
            sequence, and "labels" holding the original token at selected
            positions and -100 everywhere else.
        """
        tokenized = self.tokenizer(
            [ex["codons"] for ex in examples],
            return_attention_mask=True,
            return_token_type_ids=True,
            truncation=True,
            padding=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        )

        # One organism value per example, repeated for every token position.
        seq_len = tokenized["input_ids"].shape[-1]
        species_index = torch.tensor([[ex["organism"]] for ex in examples])
        tokenized["token_type_ids"] = species_index.repeat(1, seq_len)

        inputs = tokenized["input_ids"]
        targets = inputs.clone()

        # Each position is selected with probability 0.15, except token ids
        # below 5, which are never selected (presumably the special tokens --
        # confirm against the tokenizer vocabulary).
        prob_matrix = torch.full(inputs.shape, 0.15)
        prob_matrix[inputs < 5] = 0.0
        selected = torch.bernoulli(prob_matrix).bool()

        # 80% of the selected positions are mapped through TOKEN2MASK.
        replaced = torch.bernoulli(torch.full(selected.shape, 0.8)).bool() & selected
        masked_ids = [TOKEN2MASK[token_id] for token_id in inputs[replaced].tolist()]
        # The dtype is pinned explicitly: when nothing is selected the list is
        # empty and torch.tensor([]) would default to float32, which does not
        # match the integer input_ids tensor it is assigned into.
        inputs[replaced] = torch.tensor(masked_ids, dtype=inputs.dtype)

        # A further 10% draw is applied only to selected positions that were
        # not replaced above.
        # NOTE(review): that is roughly 2% of the selected positions overall,
        # not the 10% of the usual 80/10/10 recipe (which would need 0.5
        # here) -- confirm this is intended before changing it.
        randomized = (
            torch.bernoulli(torch.full(selected.shape, 0.1)).bool()
            & selected
            & ~replaced
        )
        # Random replacement ids are drawn from [26, 90) (presumably the
        # codon-token id range -- confirm against the tokenizer vocabulary).
        random_idx = torch.randint(26, 90, inputs.shape, dtype=torch.long)
        inputs[randomized] = random_idx[randomized]

        tokenized["input_ids"] = inputs
        # Positions that were not selected get label -100.
        tokenized["labels"] = torch.where(selected, targets, -100)
        return tokenized
class plTrainHarness(pl.LightningModule):
    """
    Lightning module that wraps a model for training.

    Optimization uses AdamW with a OneCycleLR schedule stepped once per
    training step; the loss and current learning rate are logged every step.
    """

    def __init__(self, model, learning_rate, warmup_fraction):
        super().__init__()
        self.model = model
        self.learning_rate = learning_rate
        self.warmup_fraction = warmup_fraction

    def configure_optimizers(self):
        """Return the AdamW optimizer and its per-step OneCycleLR scheduler config."""
        adamw = torch.optim.AdamW(self.model.parameters(), lr=self.learning_rate)
        one_cycle = torch.optim.lr_scheduler.OneCycleLR(
            adamw,
            max_lr=self.learning_rate,
            total_steps=self.trainer.estimated_stepping_batches,
            pct_start=self.warmup_fraction,
        )
        scheduler_config = {"scheduler": one_cycle, "interval": "step", "frequency": 1}
        return [adamw], [scheduler_config]

    def training_step(self, batch, batch_idx):
        """Run one forward pass, log loss and learning rate, and return the loss."""
        self.model.bert.set_attention_type("block_sparse")
        loss = self.model(**batch).loss
        current_lr = self.trainer.optimizers[0].param_groups[0]["lr"]
        self.log_dict(
            dictionary={"loss": loss, "lr": current_lr},
            on_step=True,
            prog_bar=True,
        )
        return loss
class EpochCheckpoint(pl.Callback):
    """Callback that saves a checkpoint at epoch 0 and every `save_interval` epochs."""

    def __init__(self, checkpoint_dir, save_interval):
        super().__init__()
        self.checkpoint_dir = checkpoint_dir
        self.save_interval = save_interval

    def on_train_epoch_end(self, trainer, pl_module):
        """Save `epoch_<n>.ckpt` into checkpoint_dir when the epoch index is due."""
        epoch = trainer.current_epoch
        # Epoch 0 always satisfies the modulo test, so it needs no separate check.
        if epoch % self.save_interval != 0:
            return
        target = os.path.join(self.checkpoint_dir, f"epoch_{epoch}.ckpt")
        trainer.save_checkpoint(target)
        print(f"\nCheckpoint saved at {target}\n")
def main(args):
    """
    Pretrain the base transformer model.

    Builds the tokenizer from `args.tokenizer_path`, creates a freshly
    initialised BigBirdForMaskedLM, streams training examples from
    `args.train_data_path` and runs a Lightning trainer that writes
    checkpoints under `args.checkpoint_dir`.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            seed, tokenizer_path, learning_rate, warmup_fraction,
            train_data_path, batch_size, debug, num_workers, checkpoint_dir,
            save_interval, num_gpus, max_epochs and accumulate_grad_batches.
    """
    pl.seed_everything(args.seed)
    torch.set_float32_matmul_precision("medium")

    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=args.tokenizer_path,
        bos_token="[CLS]",
        eos_token="[SEP]",
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
    )
    config = BigBirdConfig(
        vocab_size=len(tokenizer),
        type_vocab_size=NUM_ORGANISMS,
        sep_token_id=2,
    )
    model = BigBirdForMaskedLM(config=config)
    harnessed_model = plTrainHarness(model, args.learning_rate, args.warmup_fraction)

    train_data = IterableJSONData(args.train_data_path, dist_env="slurm")
    # Debug mode loads data in the main process.
    num_workers = 0 if args.debug else args.num_workers
    data_loader = DataLoader(
        dataset=train_data,
        collate_fn=MaskedTokenizerCollator(tokenizer),
        batch_size=args.batch_size,
        num_workers=num_workers,
        # DataLoader rejects persistent_workers=True when there are no worker
        # processes, so derive the flag from the worker count; this keeps
        # `--num_workers 0` usable outside debug mode.
        persistent_workers=num_workers > 0,
    )

    save_checkpoint = EpochCheckpoint(args.checkpoint_dir, args.save_interval)
    trainer = pl.Trainer(
        default_root_dir=args.checkpoint_dir,
        strategy="ddp_find_unused_parameters_true",
        accelerator="gpu",
        devices=1 if args.debug else args.num_gpus,
        precision="16-mixed",
        max_epochs=args.max_epochs,
        deterministic=False,
        enable_checkpointing=True,
        callbacks=[save_checkpoint],
        accumulate_grad_batches=args.accumulate_grad_batches,
    )

    # Pretrain the model
    trainer.fit(harnessed_model, data_loader)
if __name__ == "__main__":
    # Script entry point: register the options in a fixed order, parse the
    # command line and start pretraining.
    parser = argparse.ArgumentParser(description="Pretrain the base transformer model.")

    # Mandatory string-valued paths.
    for flag, text in (
        ("--tokenizer_path", "Path to the tokenizer model file"),
        ("--train_data_path", "Path to the training data JSON file"),
        ("--checkpoint_dir", "Directory where checkpoints will be saved"),
    ):
        parser.add_argument(flag, type=str, required=True, help=text)

    # Optional numeric settings: (flag, type, default, help text).
    for flag, kind, default, text in (
        ("--batch_size", int, 6, "Batch size for training"),
        ("--max_epochs", int, 5, "Maximum number of epochs to train"),
        ("--num_workers", int, 5, "Number of workers for data loading"),
        ("--accumulate_grad_batches", int, 1, "Number of batches to accumulate gradients"),
        ("--num_gpus", int, 16, "Number of GPUs to use for training"),
        ("--learning_rate", float, 5e-5, "Learning rate for the optimizer"),
        ("--warmup_fraction", float, 0.1, "Fraction of total steps to use for warmup"),
        ("--save_interval", int, 5, "Save checkpoint every N epochs"),
        ("--seed", int, 123, "Random seed for reproducibility"),
    ):
        parser.add_argument(flag, type=kind, default=default, help=text)

    parser.add_argument("--debug", action="store_true", help="Enable debug mode")

    main(parser.parse_args())

pyproject.toml ADDED Viewed

	@@ -0,0 +1,62 @@

+[tool.poetry]
+name = "ENCOT"
+version = "1.0.0"
+description = "Transformer-based codon optimization for E. coli using deep learning with Augmented-Lagrangian GC control."
+authors = ["Adibvafa Fallahpour <Adibvafa.fallahpour@mail.utoronto.ca>"]
+license = "Apache-2.0"
+readme = "README.md"
+homepage = "https://github.com/geno543/ENCOT"
+repository = "https://github.com/geno543/ENCOT"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+]
+[tool.poetry.dependencies]
+python = "^3.9"
+biopython = "^1.83"
+ipywidgets = "^7.0.0"
+numpy = "<2.0.0"
+onnxruntime = "^1.16.3"
+pandas = "^2.0.0"
+python_codon_tables = "^0.1.12"
+pytorch_lightning = "^2.2.1"
+scikit-learn = "^1.2.2"
+scipy = "^1.13.1"
+setuptools = "^70.0.0"
+torch = "^2.0.0"
+tqdm = "^4.66.2"
+transformers = "^4.40.0"
+CAI-PyPI = "^2.0.1"
+codon-bias = "^1.0.2"
+gcua = "^0.1.2"
+dtw-python = "^1.3.0"
+[tool.poetry.dev-dependencies]
+coverage = {version = "^7.0", extras = ["toml"]}
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+[tool.ruff]
+line-length = 88
+indent-width = 4
+target-version = "py310"
+[tool.ruff.lint]
+select = ["E", "F", "I"]
+ignore = []
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"
+[tool.coverage.run]
+omit = [
+    # omit pytorch-generated files in /tmp
+    "/tmp/*",
+]

requirements.txt ADDED Viewed

	@@ -0,0 +1,29 @@

+biopython>=1.83,<2.0
+CAI-PyPI>=2.0.1,<3.0
+ipywidgets>=7.0.0,<10.0
+numpy>=1.26.4,<2.0
+onnxruntime>=1.16.3,<3.0
+pandas>=2.0.0,<3.0
+python_codon_tables>=0.1.12,<1.0
+pytorch_lightning>=2.2.1,<3.0
+scikit-learn>=1.2.2,<2.0
+scipy>=1.13.1,<3.0
+setuptools>=70.0.0
+torch>=2.0.0,<3.0
+tqdm>=4.66.2,<5.0
+transformers>=4.40.0,<5.0
+codon-bias>=0.3.5,<0.4
+dtw-python>=1.3.0,<2.0
+dnachisel>=1.0
+paretoset>=1.2.0
+softadapt>=0.1.2,<0.2
+ema-pytorch>=0.4.3
+torchmetrics>=1.4.0
+pyyaml>=6.0
+matplotlib>=3.8,<4.0
+seaborn>=0.13,<0.14
+openpyxl>=3.1,<4.0
+huggingface-hub>=0.20,<1.0

scripts/optimize_sequence.py ADDED Viewed

	@@ -0,0 +1,383 @@

+"""
+Optimize protein sequences using ColiFormer.
+This script provides a user-friendly interface for codon optimization,
+supporting both single sequences and batch processing via FASTA files.
+Usage:
+    # Single sequence
+    python scripts/optimize_sequence.py --input "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG" --output optimized.fasta
+    # Batch processing from FASTA file
+    python scripts/optimize_sequence.py --input sequences.fasta --output optimized.fasta --batch
+    # With GC content constraints
+    python scripts/optimize_sequence.py --input protein.fasta --output optimized.fasta --gc-min 0.45 --gc-max 0.55
+"""
+import argparse
+import os
+import sys
+from pathlib import Path
+from typing import Any, List, Tuple
+# Add parent directory to path to import CodonTransformer
+sys.path.insert(0, str(Path(__file__).parent.parent))
def parse_fasta(fasta_path: str) -> List[Tuple[str, str]]:
    """
    Parse FASTA file into list of (name, sequence) tuples.

    Sequence lines are stripped, upper-cased and concatenated per record. A
    header that is just ">" gets the name "sequence_<n>", where n is the
    1-based position of the record. Sequence lines that appear before the
    first header (for example a file holding only a raw sequence) are kept
    as a record with such an auto-generated name instead of being dropped.

    Args:
        fasta_path: Path to FASTA file
    Returns:
        List of (name, sequence) tuples, in file order
    """
    sequences: List[Tuple[str, str]] = []
    current_name = None
    current_seq: List[str] = []

    with open(fasta_path, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no sequence data.
                continue
            if not line.startswith('>'):
                current_seq.append(line.upper())
                continue
            # A new header closes whatever record was being collected,
            # including header-less data at the top of the file.
            if current_name is not None or current_seq:
                if current_name is None:
                    current_name = f"sequence_{len(sequences)+1}"
                sequences.append((current_name, ''.join(current_seq)))
            current_name = line[1:] if len(line) > 1 else f"sequence_{len(sequences)+1}"
            current_seq = []

    # Close the final record.
    if current_name is not None or current_seq:
        if current_name is None:
            current_name = f"sequence_{len(sequences)+1}"
        sequences.append((current_name, ''.join(current_seq)))
    return sequences
def write_fasta(output_path: str, sequences: List[Tuple[str, str]]):
    """
    Write (name, sequence) records to a FASTA file.

    Each record is emitted as a ">name" header line followed by the sequence
    wrapped at 60 characters per line. The file is overwritten if it exists.

    Args:
        output_path: Output FASTA file path
        sequences: List of (name, sequence) tuples
    """
    line_width = 60
    with open(output_path, 'w') as out:
        for header, residues in sequences:
            out.write(f">{header}\n")
            out.writelines(
                residues[start:start + line_width] + "\n"
                for start in range(0, len(residues), line_width)
            )
def optimize_single_sequence(
    protein: str,
    model: Any,
    tokenizer: Any,
    device: Any,
    organism: str = "Escherichia coli general",
    gc_min: float = None,
    gc_max: float = None,
    cai_weights: dict = None,
    tai_weights: dict = None
) -> dict:
    """
    Optimize a single protein sequence.

    Constrained search (beam size 20) is used whenever a GC bound is given;
    otherwise the standard search with beam size 5 is used. If only one of
    gc_min / gc_max is supplied, the missing side defaults to 0.0 / 1.0 so
    the supplied bound is honoured rather than silently ignored.

    Args:
        protein: Protein sequence string
        model: Loaded ColiFormer model
        tokenizer: Tokenizer
        device: PyTorch device
        organism: Target organism name
        gc_min: Minimum GC content (0-1)
        gc_max: Maximum GC content (0-1)
        cai_weights: CAI weights dictionary
        tai_weights: tAI weights dictionary
    Returns:
        Dictionary with keys 'protein', 'optimized_dna', 'gc_content' (as a
        fraction), 'length', 'cai' and 'tai'. 'cai' / 'tai' are None when the
        corresponding weights are missing or the calculation fails.
    """
    # Lazy imports so `python scripts/optimize_sequence.py --help` works without ML deps installed.
    from CodonTransformer.CodonPrediction import predict_dna_sequence
    from CodonTransformer.CodonEvaluation import get_GC_content, calculate_tAI
    from CAI import CAI

    # Determine GC bounds if specified
    gc_bounds = None
    use_constrained = gc_min is not None or gc_max is not None
    if use_constrained:
        gc_bounds = (
            0.0 if gc_min is None else gc_min,
            1.0 if gc_max is None else gc_max,
        )

    # Run optimization
    output = predict_dna_sequence(
        protein=protein,
        organism=organism,
        device=device,
        model=model,
        tokenizer=tokenizer,
        deterministic=True,
        match_protein=True,
        use_constrained_search=use_constrained,
        gc_bounds=gc_bounds,
        beam_size=20 if use_constrained else 5,
    )
    # The prediction call may return a list of candidates; use the first one.
    if isinstance(output, list):
        output = output[0]
    optimized_dna = output.predicted_dna

    # Calculate metrics
    gc_content = get_GC_content(optimized_dna) / 100.0  # Convert to fraction
    metrics = {
        'protein': protein,
        'optimized_dna': optimized_dna,
        'gc_content': gc_content,
        'length': len(optimized_dna),
        'cai': None,
        'tai': None,
    }

    # Metric failures stay non-fatal, but they are reported, and only
    # ordinary exceptions are caught (not KeyboardInterrupt / SystemExit).
    if cai_weights:
        try:
            metrics['cai'] = CAI(optimized_dna, weights=cai_weights)
        except Exception as e:
            print(f"Warning: CAI calculation failed: {e}", file=sys.stderr)
    if tai_weights:
        try:
            metrics['tai'] = calculate_tAI(optimized_dna, tai_weights)
        except Exception as e:
            print(f"Warning: tAI calculation failed: {e}", file=sys.stderr)
    return metrics
def load_reference_data(ref_sequences_path: str = None):
    """
    Load reference sequences and calculate CAI weights.

    CAI weights are derived from the 'dna_sequence' column of the given CSV;
    empty cells in that column are skipped. Any problem (missing file,
    missing column, no usable sequences, calculation error) is reported as a
    warning and leaves the CAI weights as None. tAI weights are loaded
    independently of the reference file.

    Args:
        ref_sequences_path: Path to CSV with reference sequences
    Returns:
        Tuple of (cai_weights, tai_weights); either element may be None
    """
    # Lazy imports so `--help` works without ML deps installed.
    import pandas as pd
    from CAI import relative_adaptiveness
    from CodonTransformer.CodonEvaluation import get_ecoli_tai_weights

    cai_weights = None
    tai_weights = None

    # Try to load reference sequences for CAI
    if ref_sequences_path and not os.path.exists(ref_sequences_path):
        print(f"Warning: Reference sequences file not found: {ref_sequences_path}; CAI will not be computed")
    elif ref_sequences_path:
        try:
            df = pd.read_csv(ref_sequences_path)
            if 'dna_sequence' not in df.columns:
                print(f"Warning: No 'dna_sequence' column in {ref_sequences_path}; CAI will not be computed")
            else:
                # Drop empty cells so NaN values never reach the CAI library.
                ref_sequences = df['dna_sequence'].dropna().astype(str).tolist()
                if ref_sequences:
                    cai_weights = relative_adaptiveness(sequences=ref_sequences)
                    print(f"Loaded CAI weights from {len(ref_sequences)} reference sequences")
                else:
                    print(f"Warning: No reference sequences in {ref_sequences_path}; CAI will not be computed")
        except Exception as e:
            print(f"Warning: Could not load CAI weights: {e}")

    # Load tAI weights
    try:
        tai_weights = get_ecoli_tai_weights()
        print("Loaded E. coli tAI weights")
    except Exception as e:
        print(f"Warning: Could not load tAI weights: {e}")
    return cai_weights, tai_weights
def main():
    """
    Main entry point for sequence optimization.

    Parses the command line, loads the model and tokenizer, optimizes every
    input protein and writes the resulting DNA sequences to a FASTA file.
    Invalid GC bounds are rejected by the argument parser before any model is
    loaded. Any other failure is printed with a traceback and ends the
    process with exit code 1.
    """
    parser = argparse.ArgumentParser(
        description="Optimize protein sequences using ENCOT",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Single sequence
    python scripts/optimize_sequence.py --input "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG" --output optimized.fasta
    # Batch processing from FASTA file
    python scripts/optimize_sequence.py --input sequences.fasta --output optimized.fasta --batch
    # With GC content constraints
    python scripts/optimize_sequence.py --input protein.fasta --output optimized.fasta --gc-min 0.45 --gc-max 0.55
    # Use custom checkpoint
    python scripts/optimize_sequence.py --input protein.fasta --output optimized.fasta --checkpoint models/my_model.ckpt
        """
    )
    parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="Input protein sequence (string) or FASTA file path"
    )
    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="Output FASTA file path"
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        default=None,
        help="Path to model checkpoint (default: auto-download from Hugging Face)"
    )
    parser.add_argument(
        "--organism",
        type=str,
        default="Escherichia coli general",
        help="Target organism (default: Escherichia coli general)"
    )
    parser.add_argument(
        "--gc-min",
        type=float,
        default=None,
        help="Minimum GC content (0-1, e.g., 0.45 for 45%%)"
    )
    parser.add_argument(
        "--gc-max",
        type=float,
        default=None,
        help="Maximum GC content (0-1, e.g., 0.55 for 55%%)"
    )
    parser.add_argument(
        "--batch",
        action="store_true",
        help="Process input as FASTA file with multiple sequences"
    )
    parser.add_argument(
        "--ref-sequences",
        type=str,
        default="data/ecoli_processed_genes.csv",
        help="Path to reference sequences CSV for CAI calculation"
    )
    parser.add_argument(
        "--use-gpu",
        action="store_true",
        help="Use GPU if available"
    )
    args = parser.parse_args()

    # Reject impossible GC bounds up front, before the model is loaded.
    for flag, value in (("--gc-min", args.gc_min), ("--gc-max", args.gc_max)):
        if value is not None and not 0.0 <= value <= 1.0:
            parser.error(f"{flag} must be a fraction between 0 and 1, got {value}")
    if args.gc_min is not None and args.gc_max is not None and args.gc_min > args.gc_max:
        parser.error("--gc-min must not be greater than --gc-max")

    try:
        # Lazy imports so `--help` works without ML deps installed.
        import torch
        from transformers import AutoTokenizer
        from CodonTransformer.CodonPrediction import load_model
        import pandas as pd

        # Setup device
        device = torch.device("cuda" if torch.cuda.is_available() and args.use_gpu else "cpu")
        print(f"Using device: {device}")

        # Load model
        print("Loading ColiFormer model...")
        if args.checkpoint:
            model = load_model(model_path=args.checkpoint, device=device)
            print(f"Loaded model from {args.checkpoint}")
        else:
            # Try to load from Hugging Face
            try:
                from huggingface_hub import hf_hub_download
                checkpoint_path = hf_hub_download(
                    repo_id="saketh11/ColiFormer",
                    filename="balanced_alm_finetune.ckpt",
                    cache_dir="./hf_cache"
                )
                model = load_model(model_path=checkpoint_path, device=device)
                print("Loaded model from Hugging Face (saketh11/ColiFormer)")
            except Exception as e:
                print(f"Warning: Could not load from Hugging Face: {e}")
                print("Falling back to base CodonTransformer model...")
                from transformers import BigBirdForMaskedLM
                model = BigBirdForMaskedLM.from_pretrained("adibvafa/CodonTransformer").to(device)

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")

        # Load reference data for metrics
        cai_weights, tai_weights = load_reference_data(args.ref_sequences)

        # Parse input: an existing path (or --batch) is read as FASTA,
        # anything else is treated as a literal protein sequence.
        if args.batch or os.path.exists(args.input):
            # FASTA file
            print(f"Reading sequences from {args.input}...")
            sequences = parse_fasta(args.input)
            print(f"Found {len(sequences)} sequences")
        else:
            # Single sequence string
            sequences = [("sequence_1", args.input.upper())]

        # Fail loudly rather than writing an empty FASTA and reporting success.
        if not sequences:
            raise ValueError(f"No sequences found in {args.input}")

        # Optimize sequences
        optimized_sequences = []
        results = []
        for i, (name, protein_seq) in enumerate(sequences, 1):
            print(f"\nOptimizing sequence {i}/{len(sequences)}: {name}")
            metrics = optimize_single_sequence(
                protein=protein_seq,
                model=model,
                tokenizer=tokenizer,
                device=device,
                organism=args.organism,
                gc_min=args.gc_min,
                gc_max=args.gc_max,
                cai_weights=cai_weights,
                tai_weights=tai_weights
            )
            optimized_sequences.append((name, metrics['optimized_dna']))
            results.append({
                'name': name,
                'protein_length': len(protein_seq),
                'dna_length': metrics['length'],
                'gc_content': f"{metrics['gc_content']*100:.2f}%",
                'cai': metrics['cai'],
                'tai': metrics['tai'],
            })
            print(f"  GC content: {metrics['gc_content']*100:.2f}%")
            # Compare against None: a score of 0.0 is a real value and must be shown.
            if metrics['cai'] is not None:
                print(f"  CAI: {metrics['cai']:.3f}")
            if metrics['tai'] is not None:
                print(f"  tAI: {metrics['tai']:.3f}")

        # Write output
        write_fasta(args.output, optimized_sequences)
        print(f"\nOptimized sequences saved to {args.output}")

        # Print summary
        if len(results) > 1:
            print("\n" + "="*60)
            print("Summary Statistics")
            print("="*60)
            df = pd.DataFrame(results)
            print(df.to_string(index=False))
            print("="*60)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()

scripts/preprocess_data.py ADDED Viewed

	@@ -0,0 +1,251 @@

+"""
+Preprocess E. coli gene data for ColiFormer training.
+This script combines the functionality of prepare_ecoli_data.py and
+create_model_datasets.py to prepare training and test datasets from raw CSV files.
+Usage:
+    python scripts/preprocess_data.py
+    python scripts/preprocess_data.py --cai_csv data/CAI.csv --high_cai_csv data/Database_3_4300_gene.csv
+"""
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+# Add parent directory to path to import CodonTransformer
+sys.path.insert(0, str(Path(__file__).parent.parent))
def is_valid_sequence(dna_seq: str) -> bool:
    """
    Validate a DNA sequence for training suitability.

    The comparison is case-insensitive. A sequence is accepted only when its
    length is divisible by 3, it starts with ATG, TTG, CTG or GTG, it ends
    with TAA, TAG or TGA, none of the codons before the last one is a stop
    codon, and every character is one of A, T, G, C.

    Args:
        dna_seq: DNA sequence string
    Returns:
        True if the sequence passes every check, False otherwise
    """
    start_codons = ('ATG', 'TTG', 'CTG', 'GTG')
    stop_codons = ('TAA', 'TAG', 'TGA')

    if len(dna_seq) % 3:
        return False

    seq = dna_seq.upper()
    if not (seq.startswith(start_codons) and seq.endswith(stop_codons)):
        return False

    # Scan every in-frame codon except the last one, which is the stop codon.
    for pos in range(0, len(dna_seq) - 3, 3):
        if seq[pos:pos + 3] in stop_codons:
            return False

    return set(seq) <= set('ATGC')
+def process_ecoli_data(cai_csv: str, high_cai_csv: str, output_dir: str = "data"):
+    """
+    Process raw E. coli gene data from CSV files.
+    Args:
+        cai_csv: Path to CAI.csv file with gene data
+        high_cai_csv: Path to Database 3_4300 gene.csv with high-CAI sequences
+        output_dir: Output directory for processed files
+    Returns:
+        Path to processed CSV file
+    """
+    # Lazy imports so `python scripts/preprocess_data.py --help` works without heavy deps installed.
+    import pandas as pd
+    from Bio.Seq import Seq
+    # Validate input files exist
+    if not os.path.exists(cai_csv):
+        raise FileNotFoundError(f"CAI CSV file not found: {cai_csv}")
+    if not os.path.exists(high_cai_csv):
+        raise FileNotFoundError(f"High-CAI CSV file not found: {high_cai_csv}")
+    # Create output directory if needed
+    os.makedirs(output_dir, exist_ok=True)
+    print("Loading data from CSV files...")
+    df_all = pd.read_csv(
+        cai_csv,
+        header=0,
+        names=['gene_id', 'cai_score', 'drop1', 'drop2', 'dna_sequence', 'drop3']
+    )
+    df_high_cai = pd.read_csv(
+        high_cai_csv,
+        header=0,
+        names=['dna_sequence']
+    )
+    high_cai_sequences = set(df_high_cai['dna_sequence'])
+    validated_genes = []
+    for index, row in df_all.iterrows():
+        gene_id = row['gene_id']
+        dna_sequence = str(row['dna_sequence'])
+        if is_valid_sequence(dna_sequence):
+            protein_sequence = str(Seq(dna_sequence).translate())
+            is_high_cai = dna_sequence in high_cai_sequences
+            validated_genes.append({
+                'gene_id': gene_id,
+                'dna_sequence': dna_sequence,
+                'protein_sequence': protein_sequence,
+                'cai_score': row.get('cai_score', None),
+                'is_high_cai': is_high_cai
+            })
+    df_processed = pd.DataFrame(validated_genes)
+    output_path = os.path.join(output_dir, 'ecoli_processed_genes.csv')
+    df_processed.to_csv(output_path, index=False)
+    print(f"Processed data saved to {output_path}")
+    print(f"Total validated genes: {len(df_processed)}")
+    return output_path
+def create_train_test_splits(processed_csv: str, output_dir: str = "data", test_size: int = 100):
+    """
+    Create training and test splits from processed data.
+    Args:
+        processed_csv: Path to processed ecoli_processed_genes.csv
+        output_dir: Output directory for JSON files
+        test_size: Number of sequences for test set
+    Returns:
+        Tuple of (finetune_json_path, test_json_path)
+    """
+    # Lazy imports so `--help` works without heavy deps installed.
+    import pandas as pd
+    from CodonTransformer.CodonData import prepare_training_data
+    if not os.path.exists(processed_csv):
+        raise FileNotFoundError(f"Processed data file not found: {processed_csv}")
+    os.makedirs(output_dir, exist_ok=True)
+    df_processed = pd.read_csv(processed_csv)
+    # Create fine-tuning set (high-CAI sequences)
+    df_finetune = df_processed[df_processed['is_high_cai'] == True].copy()
+    df_finetune.drop_duplicates(subset=['dna_sequence'], inplace=True)
+    df_finetune.rename(columns={'dna_sequence': 'dna', 'protein_sequence': 'protein'}, inplace=True)
+    df_finetune['organism'] = "Escherichia coli general"
+    finetune_output_path = os.path.join(output_dir, 'finetune_set.json')
+    prepare_training_data(df_finetune, finetune_output_path, shuffle=True)
+    print(f"Fine-tuning set saved to {finetune_output_path} with {len(df_finetune)} records.")
+    # Create test set (non-high-CAI sequences)
+    df_test_pool = df_processed[df_processed['is_high_cai'] == False].copy()
+    df_test = df_test_pool.sample(n=test_size, random_state=42)  # for reproducibility
+    df_test['organism'] = 51  # E. coli general organism ID
+    df_test.rename(columns={'dna_sequence': 'codons'}, inplace=True)
+    test_records = df_test[['codons', 'organism']].to_dict(orient='records')
+    test_output_path = os.path.join(output_dir, 'test_set.json')
+    with open(test_output_path, 'w') as f:
+        json.dump(test_records, f, indent=4)
+    print(f"Test set saved to {test_output_path} with {len(df_test)} records.")
+    return finetune_output_path, test_output_path
+def main():
+    """Main entry point for data preprocessing."""
+    parser = argparse.ArgumentParser(
+        description="Preprocess E. coli gene data for ENCOT training",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Use default paths
+    python scripts/preprocess_data.py
+    # Specify custom input files
+    python scripts/preprocess_data.py --cai_csv data/CAI.csv --high_cai_csv data/Database_3_4300_gene.csv
+    # Custom output directory and test size
+    python scripts/preprocess_data.py --output_dir my_data --test_size 200
+        """
+    )
+    parser.add_argument(
+        "--cai_csv",
+        type=str,
+        default="data/CAI.csv",
+        help="Path to CAI.csv file with gene data (default: data/CAI.csv)"
+    )
+    parser.add_argument(
+        "--high_cai_csv",
+        type=str,
+        default="data/Database 3_4300 gene.csv",
+        help="Path to Database 3_4300 gene.csv file (default: data/Database 3_4300 gene.csv)"
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="data",
+        help="Output directory for processed files (default: data)"
+    )
+    parser.add_argument(
+        "--test_size",
+        type=int,
+        default=100,
+        help="Number of sequences for test set (default: 100)"
+    )
+    parser.add_argument(
+        "--skip_processing",
+        action="store_true",
+        help="Skip data processing step (assume ecoli_processed_genes.csv exists)"
+    )
+    args = parser.parse_args()
+    try:
+        # Step 1: Process raw data
+        if not args.skip_processing:
+            processed_csv = process_ecoli_data(
+                args.cai_csv,
+                args.high_cai_csv,
+                args.output_dir
+            )
+        else:
+            processed_csv = os.path.join(args.output_dir, 'ecoli_processed_genes.csv')
+            if not os.path.exists(processed_csv):
+                raise FileNotFoundError(
+                    f"Processed data not found at {processed_csv}. "
+                    "Remove --skip_processing flag to process raw data first."
+                )
+            print(f"Using existing processed data: {processed_csv}")
+        # Step 2: Create train/test splits
+        finetune_path, test_path = create_train_test_splits(
+            processed_csv,
+            args.output_dir,
+            args.test_size
+        )
+        print("\n" + "="*60)
+        print("Data preprocessing complete!")
+        print("="*60)
+        print(f"Training set: {finetune_path}")
+        print(f"Test set: {test_path}")
+        print("\nYou can now run training with:")
+        print(f"  python scripts/train.py --config configs/train_ecoli_alm.yaml")
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

scripts/run_benchmarks.py ADDED Viewed

	@@ -0,0 +1,235 @@

+"""
+Run benchmark evaluation for ColiFormer.
+This script wraps benchmark_evaluation.py and evaluate_optimizer.py to provide
+a unified interface for running comprehensive evaluations.
+Usage:
+    python scripts/run_benchmarks.py --config configs/benchmark.yaml
+    python scripts/run_benchmarks.py --excel_path Benchmark_80_sequences.xlsx --checkpoint_path models/my_model.ckpt
+"""
+import argparse
+import os
+import sys
+from pathlib import Path
+# Add parent directory to path to import benchmark scripts
+sys.path.insert(0, str(Path(__file__).parent.parent))
+def load_config(config_path: str) -> dict:
+    """
+    Load configuration from YAML file.
+    Args:
+        config_path: Path to YAML config file
+    Returns:
+        Dictionary with configuration values
+    """
+    # Lazy import so `python scripts/run_benchmarks.py --help` works without dependencies installed.
+    import yaml
+    if not os.path.exists(config_path):
+        raise FileNotFoundError(f"Config file not found: {config_path}")
+    with open(config_path, 'r') as f:
+        config = yaml.safe_load(f)
+    return config
+def config_to_args(config: dict) -> argparse.Namespace:
+    """
+    Convert config dictionary to argparse.Namespace compatible with benchmark_evaluation.py.
+    Args:
+        config: Configuration dictionary from YAML
+    Returns:
+        argparse.Namespace with all required arguments
+    """
+    model_config = config.get('model', {})
+    data_config = config.get('data', {})
+    output_config = config.get('output', {})
+    eval_config = config.get('evaluation', {})
+    args = argparse.Namespace()
+    # Model paths
+    args.checkpoint_path = model_config.get('checkpoint_path', 'models/alm-enhanced-training/balanced_alm_finetune.ckpt')
+    # Data paths
+    args.excel_path = data_config.get('excel_path', 'Benchmark 80 sequences.xlsx')
+    args.natural_sequences_path = data_config.get('natural_sequences_path', 'data/ecoli_processed_genes.csv')
+    args.name_col = data_config.get('name_col')
+    args.seq_col = data_config.get('seq_col')
+    args.sheet_name = data_config.get('sheet_name')
+    # Output paths
+    args.output_dir = output_config.get('output_dir', 'benchmark_results')
+    # Evaluation parameters
+    args.use_gpu = eval_config.get('use_gpu', True)
+    args.compare_with_base = eval_config.get('compare_with_base', False)
+    args.max_test_proteins = eval_config.get('max_test_proteins', 0)
+    return args
+def validate_config(config: dict):
+    """
+    Validate configuration before running benchmarks.
+    Args:
+        config: Configuration dictionary
+    Raises:
+        ValueError: If configuration is invalid
+    """
+    data_config = config.get('data', {})
+    excel_path = data_config.get('excel_path', 'Benchmark 80 sequences.xlsx')
+    if not os.path.exists(excel_path):
+        raise ValueError(
+            f"Benchmark Excel file not found: {excel_path}\n"
+            "Please provide a valid path to your benchmark sequences file."
+        )
+    model_config = config.get('model', {})
+    checkpoint_path = model_config.get('checkpoint_path')
+    # Check if checkpoint exists locally, or will be downloaded from HF
+    if checkpoint_path and os.path.exists(checkpoint_path):
+        print(f"Using local checkpoint: {checkpoint_path}")
+    else:
+        print(f"Checkpoint not found locally: {checkpoint_path}")
+        print("Will attempt to download from Hugging Face (saketh11/ColiFormer) if needed")
+def main():
+    """Main entry point for benchmark evaluation."""
+    parser = argparse.ArgumentParser(
+        description="Run benchmark evaluation for ENCOT",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Run with configuration file
+    python scripts/run_benchmarks.py --config configs/benchmark.yaml
+    # Run with command-line arguments
+    python scripts/run_benchmarks.py --excel_path Benchmark_80_sequences.xlsx --checkpoint_path models/my_model.ckpt
+    # Override config values
+    python scripts/run_benchmarks.py --config configs/benchmark.yaml --use_gpu --max_test_proteins 50
+        """
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        default=None,
+        help="Path to YAML configuration file"
+    )
+    parser.add_argument(
+        "--excel_path",
+        type=str,
+        default=None,
+        help="Path to benchmark Excel file (overrides config)"
+    )
+    parser.add_argument(
+        "--checkpoint_path",
+        type=str,
+        default=None,
+        help="Path to model checkpoint (overrides config)"
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default=None,
+        help="Output directory for results (overrides config)"
+    )
+    parser.add_argument(
+        "--use_gpu",
+        action="store_true",
+        help="Use GPU if available (overrides config)"
+    )
+    parser.add_argument(
+        "--max_test_proteins",
+        type=int,
+        default=None,
+        help="Maximum number of proteins to test (overrides config)"
+    )
+    args = parser.parse_args()
+    try:
+        # Lazy import so `--help` works even if plotting/ML deps are missing.
+        from benchmark_evaluation import main as benchmark_main
+        if args.config:
+            # Load configuration from file
+            print(f"Loading configuration from {args.config}...")
+            config = load_config(args.config)
+            # Override with command-line arguments if provided
+            if args.excel_path:
+                config.setdefault('data', {})['excel_path'] = args.excel_path
+            if args.checkpoint_path:
+                config.setdefault('model', {})['checkpoint_path'] = args.checkpoint_path
+            if args.output_dir:
+                config.setdefault('output', {})['output_dir'] = args.output_dir
+            if args.use_gpu:
+                config.setdefault('evaluation', {})['use_gpu'] = True
+            if args.max_test_proteins is not None:
+                config.setdefault('evaluation', {})['max_test_proteins'] = args.max_test_proteins
+            # Validate configuration
+            validate_config(config)
+            # Convert config to args namespace
+            benchmark_args = config_to_args(config)
+        else:
+            # Use command-line arguments directly
+            if not args.excel_path:
+                parser.error("Either --config or --excel_path must be provided")
+            benchmark_args = argparse.Namespace()
+            benchmark_args.excel_path = args.excel_path
+            benchmark_args.checkpoint_path = args.checkpoint_path or 'models/alm-enhanced-training/balanced_alm_finetune.ckpt'
+            benchmark_args.natural_sequences_path = 'data/ecoli_processed_genes.csv'
+            benchmark_args.output_dir = args.output_dir or 'benchmark_results'
+            benchmark_args.use_gpu = args.use_gpu
+            benchmark_args.max_test_proteins = args.max_test_proteins or 0
+            benchmark_args.name_col = None
+            benchmark_args.seq_col = None
+            benchmark_args.sheet_name = None
+            # Validate
+            if not os.path.exists(benchmark_args.excel_path):
+                raise ValueError(f"Benchmark Excel file not found: {benchmark_args.excel_path}")
+        # Print configuration summary
+        print("\n" + "="*60)
+        print("Benchmark Configuration Summary")
+        print("="*60)
+        print(f"Excel file: {benchmark_args.excel_path}")
+        print(f"Checkpoint: {benchmark_args.checkpoint_path}")
+        print(f"Output directory: {benchmark_args.output_dir}")
+        print(f"Use GPU: {benchmark_args.use_gpu}")
+        print(f"Max test proteins: {benchmark_args.max_test_proteins if benchmark_args.max_test_proteins > 0 else 'All'}")
+        print("="*60 + "\n")
+        # Run benchmark
+        benchmark_main(benchmark_args)
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

scripts/train.py ADDED Viewed

	@@ -0,0 +1,228 @@

+"""
+Training entry point for ColiFormer.
+This script wraps finetune.py and loads configuration from YAML files.
+Usage:
+    python scripts/train.py --config configs/train_ecoli_alm.yaml
+    python scripts/train.py --config configs/train_ecoli_quick.yaml
+"""
+import argparse
+import os
+import sys
+from pathlib import Path
+# Add parent directory to path to import finetune
+sys.path.insert(0, str(Path(__file__).parent.parent))
+def load_config(config_path: str) -> dict:
+    """
+    Load configuration from YAML file.
+    Args:
+        config_path: Path to YAML config file
+    Returns:
+        Dictionary with configuration values
+    """
+    # Lazy import so `python scripts/train.py --help` works without dependencies installed.
+    import yaml
+    if not os.path.exists(config_path):
+        raise FileNotFoundError(f"Config file not found: {config_path}")
+    with open(config_path, 'r') as f:
+        config = yaml.safe_load(f)
+    return config
+def config_to_args(config: dict) -> argparse.Namespace:
+    """
+    Convert config dictionary to argparse.Namespace compatible with finetune.py.
+    Args:
+        config: Configuration dictionary from YAML
+    Returns:
+        argparse.Namespace with all required arguments
+    """
+    # Extract nested config values
+    data_config = config.get('data', {})
+    training_config = config.get('training', {})
+    checkpoint_config = config.get('checkpoint', {})
+    alm_config = config.get('alm', {})
+    gc_penalty_config = config.get('gc_penalty', {})
+    # Build args namespace
+    args = argparse.Namespace()
+    # Data paths
+    args.dataset_dir = data_config.get('dataset_dir', 'data')
+    # Checkpoint paths
+    args.checkpoint_dir = checkpoint_config.get('checkpoint_dir', 'models/checkpoints')
+    args.checkpoint_filename = checkpoint_config.get('checkpoint_filename', 'finetune.ckpt')
+    # Training parameters
+    args.batch_size = training_config.get('batch_size', 6)
+    args.max_epochs = training_config.get('max_epochs', 15)
+    args.num_workers = training_config.get('num_workers', 5)
+    args.accumulate_grad_batches = training_config.get('accumulate_grad_batches', 1)
+    args.num_gpus = training_config.get('num_gpus', 4)
+    args.learning_rate = training_config.get('learning_rate', 5e-5)
+    args.warmup_fraction = training_config.get('warmup_fraction', 0.1)
+    args.save_every_n_steps = training_config.get('save_every_n_steps', 512)
+    args.seed = training_config.get('seed', 123)
+    args.log_every_n_steps = training_config.get('log_every_n_steps', 20)
+    args.debug = training_config.get('debug', False)
+    # GC penalty (legacy)
+    args.gc_penalty_weight = gc_penalty_config.get('weight', 0.0)
+    # ALM parameters
+    args.use_lagrangian = alm_config.get('enabled', False)
+    args.gc_target = alm_config.get('gc_target', 0.52)
+    args.curriculum_epochs = alm_config.get('curriculum_epochs', 3)
+    args.lagrangian_rho = alm_config.get('initial_penalty_factor', 20.0)  # Use initial_penalty_factor as rho
+    args.alm_tolerance = alm_config.get('tolerance', 1e-5)
+    args.alm_dual_tolerance = alm_config.get('dual_tolerance', 1e-5)
+    args.alm_penalty_update_factor = alm_config.get('penalty_update_factor', 10.0)
+    args.alm_initial_penalty_factor = alm_config.get('initial_penalty_factor', 20.0)
+    args.alm_tolerance_update_factor = alm_config.get('tolerance_update_factor', 0.1)
+    args.alm_rel_penalty_increase_threshold = alm_config.get('rel_penalty_increase_threshold', 0.1)
+    args.alm_max_penalty = alm_config.get('max_penalty', 1e6)
+    args.alm_min_penalty = alm_config.get('min_penalty', 1e-6)
+    return args
+def validate_config(config: dict):
+    """
+    Validate configuration before training.
+    Args:
+        config: Configuration dictionary
+    Raises:
+        ValueError: If configuration is invalid
+    """
+    data_config = config.get('data', {})
+    dataset_dir = data_config.get('dataset_dir', 'data')
+    # Check dataset directory exists
+    if not os.path.exists(dataset_dir):
+        raise ValueError(f"Dataset directory not found: {dataset_dir}")
+    # Check for expected data files
+    finetune_set = os.path.join(dataset_dir, 'finetune_set.json')
+    if not os.path.exists(finetune_set):
+        raise ValueError(
+            f"Training data not found: {finetune_set}\n"
+            "Please run data preprocessing first:\n"
+            "  python scripts/preprocess_data.py"
+        )
+    # Validate checkpoint directory can be created
+    checkpoint_config = config.get('checkpoint', {})
+    checkpoint_dir = checkpoint_config.get('checkpoint_dir', 'models/checkpoints')
+    os.makedirs(checkpoint_dir, exist_ok=True)
+def main():
+    """Main entry point for training."""
+    parser = argparse.ArgumentParser(
+        description="Train ENCOT model with configuration file",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Train with main ALM configuration
+    python scripts/train.py --config configs/train_ecoli_alm.yaml
+    # Quick test training (CPU, 1 epoch)
+    python scripts/train.py --config configs/train_ecoli_quick.yaml
+    # Override config values from command line
+    python scripts/train.py --config configs/train_ecoli_alm.yaml --num_gpus 2 --batch_size 4
+        """
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        required=True,
+        help="Path to YAML configuration file"
+    )
+    parser.add_argument(
+        "--num_gpus",
+        type=int,
+        default=None,
+        help="Override number of GPUs from config"
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=None,
+        help="Override batch size from config"
+    )
+    parser.add_argument(
+        "--max_epochs",
+        type=int,
+        default=None,
+        help="Override max epochs from config"
+    )
+    args = parser.parse_args()
+    try:
+        # Lazy import so `--help` works even if training deps are missing.
+        from finetune import main as finetune_main
+        # Load configuration
+        print(f"Loading configuration from {args.config}...")
+        config = load_config(args.config)
+        # Override with command-line arguments if provided
+        if args.num_gpus is not None:
+            config.setdefault('training', {})['num_gpus'] = args.num_gpus
+        if args.batch_size is not None:
+            config.setdefault('training', {})['batch_size'] = args.batch_size
+        if args.max_epochs is not None:
+            config.setdefault('training', {})['max_epochs'] = args.max_epochs
+        # Validate configuration
+        print("Validating configuration...")
+        validate_config(config)
+        # Convert config to args namespace
+        train_args = config_to_args(config)
+        # Print training summary
+        print("\n" + "="*60)
+        print("Training Configuration Summary")
+        print("="*60)
+        print(f"Dataset directory: {train_args.dataset_dir}")
+        print(f"Checkpoint directory: {train_args.checkpoint_dir}")
+        print(f"Checkpoint filename: {train_args.checkpoint_filename}")
+        print(f"Batch size: {train_args.batch_size}")
+        print(f"Max epochs: {train_args.max_epochs}")
+        print(f"Learning rate: {train_args.learning_rate}")
+        print(f"Number of GPUs: {train_args.num_gpus}")
+        print(f"ALM enabled: {train_args.use_lagrangian}")
+        if train_args.use_lagrangian:
+            print(f"GC target: {train_args.gc_target}")
+            print(f"Curriculum epochs: {train_args.curriculum_epochs}")
+        print("="*60 + "\n")
+        # Run training
+        finetune_main(train_args)
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

setup.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import os
+from setuptools import find_packages, setup
+def read_requirements():
+    with open("requirements.txt") as f:
+        return [line.strip() for line in f if line.strip() and not line.startswith("#")]
+def read_readme():
+    here = os.path.abspath(os.path.dirname(__file__))
+    readme_path = os.path.join(here, "README.md")
+    with open(readme_path, "r", encoding="utf-8") as f:
+        return f.read()
+# Package metadata. Runs at import time of setup.py, so requirements.txt and
+# README.md are read as soon as this script is executed.
+setup(
+    name="ENCOT",
+    version="1.0.0",
+    # find_packages() is called without arguments, so it searches from the current directory.
+    packages=find_packages(),
+    # Runtime dependencies come from requirements.txt via read_requirements().
+    install_requires=read_requirements(),
+    author="Adibvafa Fallahpour",
+    author_email="Adibvafa.fallahpour@mail.utoronto.ca",
+    description=(
+        "Transformer-based codon optimization for E. coli using "
+        "deep learning with Augmented-Lagrangian GC control. "
+        "Built on CodonTransformer for E. coli-specific optimization."
+    ),
+    # The README is used verbatim as the long description and declared as Markdown.
+    long_description=read_readme(),
+    long_description_content_type="text/markdown",
+    url="https://github.com/geno543/ENCOT",
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.9",
+)

src/CodonTransformer_inference_template.xlsx ADDED Viewed

Binary file (17.4 kB). View file

src/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Model weights, tokenizer, and other resources."""

src/banner_final.png ADDED Viewed

Git LFS Details

SHA256: 6aa745d1f362190e7ae0b8940154446e68426bfb16ef6be9336fb6f98168a205
Pointer size: 131 Bytes
Size of remote file: 468 kB

src/organism2id.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:44f7b73bbb3c6ea82bf864e886b57b219cbd5f14fe79a8aa47d2befab5d40ad0
+size 4605

streamlit_app.py ADDED Viewed

	@@ -0,0 +1,16 @@

+"""Public Streamlit entrypoint for ENCOT.
+This file is intentionally minimal so hosting platforms like Streamlit
+Community Cloud can run the existing UI without changing project structure.
+"""
+from pathlib import Path
+import sys
+# Absolute path of the directory that contains this file.
+ROOT = Path(__file__).resolve().parent
+# Put that directory at the front of sys.path (once) so the `streamlit_gui`
+# package imported below resolves from here.
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+# Importing this module runs the Streamlit app defined there.
+import streamlit_gui.app  # noqa: F401,E402

streamlit_gui/app.py ADDED Viewed

	@@ -0,0 +1,1456 @@

+"""
+File: app.py
+-------------
+Streamlit GUI for ENCOT. Provides sequence validation, optimization,
+and visualization for E. coli-focused workflows with optional post-processing.
+"""
+import streamlit as st
+import torch
+import pandas as pd
+import numpy as np
+import plotly.graph_objects as go
+import plotly.express as px
+from transformers import AutoTokenizer, BigBirdForMaskedLM
+from huggingface_hub import hf_hub_download
+from datasets import load_dataset
+import time
+import threading
+from typing import Dict, Optional, Tuple
+import warnings
+warnings.filterwarnings("ignore")
+import sys
+import os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from CodonTransformer.CodonPrediction import (
+    predict_dna_sequence,
+    load_model
+)
+from CodonTransformer.CodonEvaluation import (
+    get_GC_content,
+    calculate_tAI,
+    get_ecoli_tai_weights,
+    scan_for_restriction_sites,
+    count_negative_cis_elements,
+    calculate_homopolymer_runs
+)
+from CAI import CAI, relative_adaptiveness
+from CodonTransformer.CodonUtils import get_organism2id_dict
+import json
+try:
+    from CodonTransformer.CodonPostProcessing import (
+        polish_sequence_with_dnachisel,
+        DNACHISEL_AVAILABLE
+    )
+    POST_PROCESSING_AVAILABLE = True
+except ImportError:
+    POST_PROCESSING_AVAILABLE = False
+    DNACHISEL_AVAILABLE = False
+st.set_page_config(
+    page_title="ENCOT GUI",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+if 'model' not in st.session_state:
+    st.session_state.model = None
+if 'tokenizer' not in st.session_state:
+    st.session_state.tokenizer = None
+if 'device' not in st.session_state:
+    st.session_state.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+if 'optimization_running' not in st.session_state:
+    st.session_state.optimization_running = False
+if 'results' not in st.session_state:
+    st.session_state.results = None
+if 'post_processed_results' not in st.session_state:
+    st.session_state.post_processed_results = None
+if 'cai_weights' not in st.session_state:
+    st.session_state.cai_weights = None
+if 'tai_weights' not in st.session_state:
+    st.session_state.tai_weights = None
+def get_organism_tai_weights(organism: str) -> Dict[str, float]:
+    """Get organism-specific tAI weights from pre-calculated data"""
+    try:
+        # Load organism-specific tAI weights
+        weights_file = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'organism_tai_weights.json')
+        with open(weights_file, 'r') as f:
+            all_weights = json.load(f)
+        if organism in all_weights:
+            return all_weights[organism]
+        else:
+            # Fallback to E. coli if organism not found
+            st.warning(f"tAI weights for {organism} not found, using E. coli weights")
+            return all_weights.get("Escherichia coli general", get_ecoli_tai_weights())
+    except Exception as e:
+        st.error(f"Error loading organism-specific tAI weights: {e}")
+        return get_ecoli_tai_weights()
+def load_model_and_tokenizer():
+    """Load the model and tokenizer with progress tracking"""
+    if st.session_state.model is None or st.session_state.tokenizer is None:
+        with st.spinner("Loading model... This may take a few minutes."):
+            progress_bar = st.progress(0)
+            status_text = st.empty()
+            status_text.text("Loading tokenizer...")
+            progress_bar.progress(25)
+            st.session_state.tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
+            status_text.text("Loading fine-tuned model from Hugging Face...")
+            progress_bar.progress(50)
+            # Try to download and load fine-tuned model from Hugging Face
+            try:
+                # Download the checkpoint file from Hugging Face
+                from huggingface_hub import hf_hub_download
+                status_text.text("Downloading model from saketh11/ColiFormer...")
+                model_path = hf_hub_download(
+                    repo_id="saketh11/ColiFormer",
+                    filename="balanced_alm_finetune.ckpt",
+                    cache_dir="./hf_cache"
+                )
+                status_text.text("Loading downloaded model...")
+                st.session_state.model = load_model(
+                    model_path=model_path,
+                    device=st.session_state.device,
+                    attention_type="original_full"
+                )
+                status_text.text("Fine-tuned model loaded from Hugging Face")
+                st.session_state.model_type = "fine_tuned_hf"
+            except Exception as e:
+                status_text.text(f"Failed to load from Hugging Face: {str(e)[:50]}...")
+                status_text.text("Loading base model as fallback...")
+                st.session_state.model = BigBirdForMaskedLM.from_pretrained("adibvafa/CodonTransformer")
+                st.session_state.model = st.session_state.model.to(st.session_state.device)
+                st.session_state.model_type = "base"
+            progress_bar.progress(100)
+            time.sleep(0.5)
+            status_text.empty()
+            progress_bar.empty()
+@st.cache_data
+def download_reference_data():
+    """Download and cache reference data from Hugging Face"""
+    try:
+        # Download the processed genes file from Hugging Face
+        file_path = hf_hub_download(
+            repo_id="saketh11/ColiFormer-Data",
+            filename="ecoli_processed_genes.csv",
+            repo_type="dataset"
+        )
+        df = pd.read_csv(file_path)
+        return df['dna_sequence'].tolist()
+    except Exception as e:
+        st.warning(f"Could not download reference data from Hugging Face: {e}")
+        # Fallback to minimal sequences
+        return [
+            "ATGGCGAAAGCGCTGTATCGCGAAAGCGCTGTATCGCGAAAGCGCTGTATCGC",
+            "ATGAAATTTATTTATTATTATAAATTTATTTATTATTATAAATTTATTTAT",
+            "ATGGGTCGTCGTCGTCGTGGTCGTCGTCGTCGTGGTCGTCGTCGTCGTGGT"
+        ]
@st.cache_data
def download_tai_weights():
    """Return the E. coli tAI weight table, fetched from Hugging Face.

    ``organism_tai_weights.json`` is read from the ``saketh11/ColiFormer-Data``
    dataset repo and its "Escherichia coli general" entry is returned.  When
    that entry is missing, or when the download/parse raises (a Streamlit
    warning is shown in that case), the result of ``get_ecoli_tai_weights()``
    is returned instead.
    """
    try:
        weights_path = hf_hub_download(
            repo_id="saketh11/ColiFormer-Data",
            filename="organism_tai_weights.json",
            repo_type="dataset"
        )
        with open(weights_path, 'r') as handle:
            weights_by_organism = json.load(handle)
        builtin_weights = get_ecoli_tai_weights()
        return weights_by_organism.get("Escherichia coli general", builtin_weights)
    except Exception as e:
        st.warning(f"Could not download tAI weights from Hugging Face: {e}")
        return get_ecoli_tai_weights()
def load_reference_data(organism: str = "Escherichia coli general"):
    """Populate ``st.session_state`` with CAI and tAI weights if they are unset.

    ``organism`` is accepted but not consulted: both weight sets are always the
    E. coli ones.  Each entry is only computed when its key is absent or holds
    ``None``; on failure an error is shown and the entry is set to ``{}``.
    """
    def _unset(key):
        # True when the session has no usable value stored under `key`.
        return key not in st.session_state or st.session_state[key] is None

    # CAI weights derived from the reference gene set
    if _unset('cai_weights'):
        try:
            with st.spinner("Downloading E. coli reference sequences from Hugging Face..."):
                ref_sequences = download_reference_data()
                st.session_state['cai_weights'] = relative_adaptiveness(sequences=ref_sequences)
                got_full_dataset = len(ref_sequences) > 100
                if got_full_dataset:
                    st.success(f"Downloaded {len(ref_sequences):,} E. coli reference sequences for CAI calculation")
                else:
                    st.info(f"Using {len(ref_sequences)} minimal reference sequences (full dataset unavailable)")
        except Exception as e:
            st.error(f"Error loading E. coli reference data: {e}")
            st.session_state['cai_weights'] = {}

    # tAI weights (E. coli only)
    if _unset('tai_weights'):
        try:
            with st.spinner("Downloading E. coli tAI weights from Hugging Face..."):
                st.session_state['tai_weights'] = download_tai_weights()
                st.success("Downloaded E. coli tAI weights")
        except Exception as e:
            st.error(f"Error loading E. coli tAI weights: {e}")
            st.session_state['tai_weights'] = {}
def validate_sequence(sequence: str) -> Tuple[bool, str, str, str]:
    """Classify user input as DNA or protein and normalise it.

    All whitespace (leading, trailing and embedded, e.g. line breaks from a
    pasted multi-line sequence) is removed and the text is upper-cased before
    it is classified.

    Args:
        sequence: Raw text entered by the user.

    Returns:
        A tuple ``(is_valid, message, sequence_type, sequence)`` where
        ``sequence_type`` is ``"dna"``, ``"protein"`` or ``"unknown"`` and
        ``sequence`` is the normalised input.  For valid DNA whose length is
        not a multiple of three, the trailing 1-2 nucleotides are dropped.
    """
    if not sequence:
        return False, "Sequence cannot be empty", "unknown", sequence

    # str.split() with no separator splits on every whitespace run, so the
    # join removes embedded spaces/newlines as well as the outer ones.
    sequence = "".join(sequence.split()).upper()
    if not sequence:
        # Input consisted of whitespace only.
        return False, "Sequence cannot be empty", "unknown", sequence

    dna_chars = set("ATGC")
    protein_chars = set("ACDEFGHIKLMNPQRSTVWY*_")
    sequence_chars = set(sequence)

    # Text made only of A/T/G/C is treated as DNA (checked before protein,
    # since those four letters are also amino-acid codes).
    if sequence_chars <= dna_chars:
        if len(sequence) < 3:
            return False, "DNA sequence must be at least 3 nucleotides long", "dna", sequence
        remainder = len(sequence) % 3
        if remainder:
            # Trim the incomplete trailing codon so the length is divisible by 3.
            fixed_sequence = sequence[:-remainder]
            message = f"Valid DNA sequence (auto-fixed: removed {remainder} nucleotides from end to make divisible by 3)"
            return True, message, "dna", fixed_sequence
        return True, "Valid DNA sequence", "dna", sequence

    if sequence_chars <= protein_chars:
        if len(sequence) < 3:
            return False, "Protein sequence must be at least 3 amino acids long", "protein", sequence
        return True, "Valid protein sequence", "protein", sequence

    # Sorted so the message is deterministic (set iteration order is not).
    invalid_chars = sorted(sequence_chars - (dna_chars | protein_chars))
    return False, f"Invalid characters found: {', '.join(invalid_chars)}", "unknown", sequence
def calculate_input_metrics(sequence: str, organism: str, sequence_type: str) -> Dict:
    """Compute baseline metrics for a validated input sequence.

    For DNA input the sequence itself is analysed.  For protein input a
    baseline DNA is built by mapping each residue to one fixed codon
    (``'NNN'`` for residues missing from the table) and that DNA is analysed.

    Args:
        sequence: Validated DNA or protein sequence.
        organism: Accepted for interface compatibility; not consulted here
            because the reference weights are always the E. coli ones.
        sequence_type: ``"dna"`` selects the DNA path; anything else is
            treated as protein.

    Returns:
        Dict with keys ``length``, ``gc_content``, ``baseline_dna``,
        ``sequence_type``, ``cai``, ``tai``, ``restriction_sites``,
        ``negative_cis_elements`` and ``homopolymer_runs``.  ``cai``/``tai``
        are ``None`` when weights are unavailable or scoring fails; the three
        quality counts are all ``0`` when any of them fails.
    """
    # Load reference data (E. coli only)
    load_reference_data()

    def _weighted_score(weights_key, scorer):
        """Apply ``scorer`` to the cached weights; None if absent or on error."""
        # `except Exception` rather than a bare except, so KeyboardInterrupt
        # and SystemExit are not silently swallowed.
        try:
            weights = st.session_state.get(weights_key)
            return scorer(weights) if weights else None
        except Exception:
            return None

    if sequence_type == "dna":
        analysis_dna = sequence.upper()
        length = len(analysis_dna) // 3  # reported in codons
        resolved_type = 'dna'
    else:
        most_frequent_codons = {
            'A': 'GCG', 'C': 'TGC', 'D': 'GAT', 'E': 'GAA', 'F': 'TTT',
            'G': 'GGC', 'H': 'CAT', 'I': 'ATT', 'K': 'AAA', 'L': 'CTG',
            'M': 'ATG', 'N': 'AAC', 'P': 'CCG', 'Q': 'CAG', 'R': 'CGC',
            'S': 'TCG', 'T': 'ACG', 'V': 'GTG', 'W': 'TGG', 'Y': 'TAT',
            '*': 'TAA', '_': 'TAA'
        }
        analysis_dna = ''.join(most_frequent_codons.get(aa, 'NNN') for aa in sequence)
        length = len(sequence)  # reported in amino acids
        resolved_type = 'protein'

    metrics = {
        'length': length,
        'gc_content': get_GC_content(analysis_dna),
        'baseline_dna': analysis_dna,
        'sequence_type': resolved_type
    }
    metrics['cai'] = _weighted_score(
        'cai_weights', lambda weights: CAI(analysis_dna, weights=weights)
    )
    metrics['tai'] = _weighted_score(
        'tai_weights', lambda weights: calculate_tAI(analysis_dna, weights)
    )

    try:
        metrics['restriction_sites'] = len(scan_for_restriction_sites(analysis_dna))
        metrics['negative_cis_elements'] = count_negative_cis_elements(analysis_dna)
        metrics['homopolymer_runs'] = calculate_homopolymer_runs(analysis_dna)
    except Exception:
        # Best effort: if any quality scan fails, report zeros for all three.
        metrics['restriction_sites'] = 0
        metrics['negative_cis_elements'] = 0
        metrics['homopolymer_runs'] = 0
    return metrics
# Standard genetic code; '*' marks a stop codon.  Built once at import time
# instead of on every translate_dna_to_protein() call.
_STANDARD_CODON_TABLE = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
    'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
    'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
}


def translate_dna_to_protein(dna_sequence: str) -> str:
    """Translate a DNA sequence to protein, reading frame 0.

    Input is case-insensitive.  Translation stops at the first stop codon
    (which is not included in the result), codons not found in the standard
    table become ``'X'``, and a trailing incomplete codon is ignored.

    Args:
        dna_sequence: Coding DNA sequence.

    Returns:
        The one-letter amino-acid string.
    """
    sequence = dna_sequence.upper()
    residues = []
    # Stopping at len - 2 visits only complete codons.
    for start in range(0, len(sequence) - 2, 3):
        amino_acid = _STANDARD_CODON_TABLE.get(sequence[start:start + 3], 'X')
        if amino_acid == '*':  # Stop codon
            break
        residues.append(amino_acid)
    return "".join(residues)
def create_gc_content_plot(sequence: str, window_size: int = 50) -> go.Figure:
    """Build a line plot of GC content over a sliding window.

    The window advances three nucleotides (one codon) at a time and the x-axis
    is the window's start position in codons.  When the sequence is shorter
    than ``window_size`` the window shrinks to one third of the sequence
    length.  Dashed reference lines are drawn at 45 and 55.
    """
    if len(sequence) < window_size:
        window_size = len(sequence) // 3

    # One window per codon-aligned start offset.
    window_starts = range(0, len(sequence) - window_size + 1, 3)
    codon_positions = [start // 3 for start in window_starts]
    gc_per_window = [
        get_GC_content(sequence[start:start + window_size])
        for start in window_starts
    ]

    fig = go.Figure()
    gc_trace = go.Scatter(
        x=codon_positions,
        y=gc_per_window,
        mode='lines',
        name='GC Content',
        line=dict(color='blue', width=2)
    )
    fig.add_trace(gc_trace)

    # Add target range
    target_lines = ((45, "Min Target (45%)"), (55, "Max Target (55%)"))
    for level, label in target_lines:
        fig.add_hline(y=level, line_dash="dash", line_color="red",
                      annotation_text=label)

    fig.update_layout(
        title=f'GC Content (sliding window: {window_size} bp)',
        xaxis_title='Position (codons)',
        yaxis_title='GC Content (%)',
        height=300
    )
    return fig
def create_gc_comparison_chart(before_metrics: Dict, after_metrics: Dict) -> go.Figure:
    """Build a grouped bar chart of GC content before and after optimization.

    Each metrics dict contributes its ``'gc_content'`` value (0 when the key
    is absent) as one labelled bar.
    """
    series = (
        ('Before Optimization', before_metrics, 'lightblue'),
        ('After Optimization', after_metrics, 'darkblue'),
    )

    fig = go.Figure()
    for series_name, metrics, bar_color in series:
        gc_value = metrics.get('gc_content', 0)
        fig.add_trace(go.Bar(
            name=series_name,
            x=['GC Content (%)'],
            y=[gc_value],
            marker_color=bar_color,
            text=[f"{gc_value:.1f}%"],
            textposition='auto'
        ))

    fig.update_layout(
        title='GC Content Comparison: Before vs After',
        xaxis_title='Metric',
        yaxis_title='Value (%)',
        barmode='group',
        height=300
    )
    return fig
def create_expression_comparison_chart(before_metrics: Dict, after_metrics: Dict) -> go.Figure:
    """Build a grouped bar chart of CAI and tAI before and after optimization.

    A metric that is missing, ``None`` or otherwise falsy in a metrics dict is
    plotted as 0.
    """
    metrics_names = ['CAI', 'tAI']

    def _bar_heights(metrics):
        # Falsy entries (missing / None / 0) collapse to 0 for plotting.
        return [metrics.get(key) or 0 for key in ('cai', 'tai')]

    series = (
        ('Before Optimization', _bar_heights(before_metrics), 'lightblue'),
        ('After Optimization', _bar_heights(after_metrics), 'darkblue'),
    )

    fig = go.Figure()
    for series_name, heights, bar_color in series:
        fig.add_trace(go.Bar(
            name=series_name,
            x=metrics_names,
            y=heights,
            marker_color=bar_color,
            text=[f"{v:.3f}" for v in heights],
            textposition='auto'
        ))

    fig.update_layout(
        title='Expression Metrics Comparison: Before vs After',
        xaxis_title='Metric',
        yaxis_title='Value',
        barmode='group',
        height=300
    )
    return fig
def smart_codon_replacement(dna_sequence: str, target_gc_min: float = 0.45, target_gc_max: float = 0.55, max_iterations: int = 100) -> str:
    """Greedily swap synonymous codons to move GC content into a target window.

    On each iteration the single codon swap that brings the overall GC
    fraction closest to ``[target_gc_min, target_gc_max]`` is applied.  The
    loop ends when the fraction is inside the window, when no swap gets
    closer, or after ``max_iterations`` swaps.  Only codons of Ser, Leu, Arg,
    Pro, Thr, Ala, Gly and Val are ever changed, and only to a synonymous
    codon, so the encoded protein is preserved.  Codon usage (CAI) is not
    scored here.

    Args:
        dna_sequence: Coding DNA, read in frame 0; case-insensitive.
        target_gc_min: Lower bound of the GC window as a fraction (0-1).
        target_gc_max: Upper bound of the GC window as a fraction (0-1).
        max_iterations: Maximum number of codon swaps.

    Returns:
        The upper-cased sequence after the swaps (unchanged apart from case
        when it is already inside the window or cannot be improved).
    """
    # Synonymous families of the standard genetic code.  The first entry of
    # each family is the codon calculate_input_metrics uses as the E. coli
    # baseline for that amino acid, so it wins ties between equal GC steps.
    synonymous_families = (
        ('TCG', 'TCC', 'AGC', 'TCT', 'TCA', 'AGT'),  # Serine
        ('CTG', 'CTC', 'CTT', 'CTA', 'TTG', 'TTA'),  # Leucine
        ('CGC', 'CGG', 'CGT', 'CGA', 'AGG', 'AGA'),  # Arginine
        ('CCG', 'CCC', 'CCT', 'CCA'),                # Proline
        ('ACG', 'ACC', 'ACT', 'ACA'),                # Threonine
        ('GCG', 'GCC', 'GCT', 'GCA'),                # Alanine
        ('GGC', 'GGG', 'GGT', 'GGA'),                # Glycine
        ('GTG', 'GTC', 'GTT', 'GTA'),                # Valine
    )
    # Every family member is a key, so GC can be lowered as well as raised.
    codon_alternatives = {
        codon: [alt for alt in family if alt != codon]
        for family in synonymous_families
        for codon in family
    }

    def gc_count(fragment):
        return fragment.count('G') + fragment.count('C')

    current_sequence = dna_sequence.upper()
    total_nt = len(current_sequence)
    if total_nt == 0:
        return current_sequence

    def distance_to_window(count):
        # GC is computed here as a fraction so it is in the same units as the
        # targets (comparing a percentage against 0.45/0.55 never matches).
        fraction = count / total_nt
        if fraction < target_gc_min:
            return target_gc_min - fraction
        if fraction > target_gc_max:
            return fraction - target_gc_max
        return 0.0

    codons = [current_sequence[i:i + 3] for i in range(0, total_nt, 3)]
    current_count = gc_count(current_sequence)

    for _ in range(max_iterations):
        best_distance = distance_to_window(current_count)
        if best_distance == 0.0:
            break
        best_pos = -1
        best_replacement = None
        for pos, codon in enumerate(codons):
            alternatives = codon_alternatives.get(codon)
            if not alternatives:
                continue
            old_count = gc_count(codon)
            for alt_codon in alternatives:
                candidate = distance_to_window(current_count - old_count + gc_count(alt_codon))
                # Strict improvement only: guarantees progress and termination.
                if candidate < best_distance:
                    best_distance = candidate
                    best_pos = pos
                    best_replacement = alt_codon
        if best_replacement is None:
            break  # No more improvements possible
        current_count += gc_count(best_replacement) - gc_count(codons[best_pos])
        codons[best_pos] = best_replacement
    return ''.join(codons)
def run_optimization(protein: str, organism: str, use_post_processing: bool = False):
    """Optimize ``protein`` for ``organism`` and store the outcome in session state.

    Steps:
      1. Deterministic model prediction.
      2. If the predicted DNA's GC content is outside 45-55, try
         ``smart_codon_replacement``; if that is still out of range, retry the
         model with constrained beam search, falling back to the
         codon-replacement result when the constrained search raises.
      3. Optionally polish the result with DNAChisel.

    Results are written to ``st.session_state.results`` (a prediction, or an
    ``"Error: ..."`` string on failure) and
    ``st.session_state.post_processed_results`` (a prediction, a
    ``"Post-processing error: ..."`` string, or ``None``).
    ``st.session_state.optimization_running`` is set for the duration.

    Args:
        protein: Protein sequence to encode.
        organism: Target organism name passed to the model.
        use_post_processing: Run DNAChisel polishing when it is available.
    """
    def _single(prediction):
        # The model call may hand back a list; work on its first element.
        return prediction[0] if isinstance(prediction, list) else prediction

    def _with_dna(prediction, dna):
        # Copy of `prediction` carrying a different DNA sequence.  Imported
        # lazily, only on the paths that need to build a new prediction.
        from CodonTransformer.CodonUtils import DNASequencePrediction
        return DNASequencePrediction(
            organism=prediction.organism,
            protein=prediction.protein,
            processed_input=prediction.processed_input,
            predicted_dna=dna
        )

    st.session_state.optimization_running = True
    st.session_state.post_processed_results = None
    try:
        result = predict_dna_sequence(
            protein=protein,
            organism=organism,
            device=st.session_state.device,
            model=st.session_state.model,
            deterministic=True,
            match_protein=True,
        )
        # Check GC content and auto-correct if out of optimal range
        base_prediction = _single(result)
        initial_gc = get_GC_content(base_prediction.predicted_dna)
        if initial_gc < 45.0 or initial_gc > 55.0:
            optimized_dna = smart_codon_replacement(base_prediction.predicted_dna, 0.45, 0.55)
            if 45.0 <= get_GC_content(optimized_dna) <= 55.0:
                result = _with_dna(base_prediction, optimized_dna)
            else:
                # Fall back to constrained beam search silently
                try:
                    result = predict_dna_sequence(
                        protein=protein,
                        organism=organism,
                        device=st.session_state.device,
                        model=st.session_state.model,
                        deterministic=True,
                        match_protein=True,
                        use_constrained_search=True,
                        gc_bounds=(0.45, 0.55),
                        beam_size=20
                    )
                    # Touch the prediction so a malformed result triggers the
                    # fallback below instead of failing later.
                    get_GC_content(_single(result).predicted_dna)
                except Exception:
                    # If constrained search fails, use smart replacement result anyway
                    result = _with_dna(base_prediction, optimized_dna)
        st.session_state.results = result

        # Post-processing if enabled
        if use_post_processing and POST_PROCESSING_AVAILABLE and result:
            try:
                final_prediction = _single(result)
                polished_sequence = polish_sequence_with_dnachisel(
                    dna_sequence=final_prediction.predicted_dna,
                    protein_sequence=protein,
                    gc_bounds=(45.0, 55.0),
                    cai_species=organism.lower().replace(' ', '_'),
                    avoid_homopolymers_length=6
                )
                # Built from the unwrapped prediction: `result` itself may be
                # a list, which has no organism/protein attributes.
                st.session_state.post_processed_results = _with_dna(
                    final_prediction, polished_sequence
                )
            except Exception as e:
                st.session_state.post_processed_results = f"Post-processing error: {str(e)}"
    except Exception as e:
        st.session_state.results = f"Error: {str(e)}"
    finally:
        st.session_state.optimization_running = False
def main():
    """Render the page: title, model loading, and the four feature tabs."""
    st.title("ENCOT")
    st.markdown("E. coli codon optimization with constraint-aware decoding and in silico evaluation metrics.")

    # Load model
    load_model_and_tokenizer()

    # Tab label paired with the function that draws its contents.
    tab_renderers = (
        ("Single Optimize", single_sequence_optimization),
        ("Batch Process", batch_processing_interface),
        ("Comparative Analysis", comparative_analysis_interface),
        ("Advanced Settings", advanced_settings_interface),
    )
    tabs = st.tabs([label for label, _ in tab_renderers])
    for tab, (_, render) in zip(tabs, tab_renderers):
        with tab:
            render()
def single_sequence_optimization():
    """Render the single-sequence tab.

    Draws the sidebar (organism, advanced settings, post-processing toggle,
    dataset/model info), a left column for entering and analysing a sequence,
    and a right column for running the optimization and showing its results.
    The analysed sequence, its type, its metrics and the chosen organism are
    kept in ``st.session_state`` so they survive Streamlit reruns.
    """
    def _clear_analysis():
        # Forget the last analysed input so stale metrics are not displayed.
        st.session_state.sequence_clean = None
        st.session_state.sequence_type = None
        st.session_state.input_metrics = None
        st.session_state.organism = None

    # Sidebar configuration
    st.sidebar.header("Configuration")
    organism_options = [
        "Escherichia coli general",
        "Saccharomyces cerevisiae",
        "Homo sapiens",
        "Bacillus subtilis",
        "Pichia pastoris"
    ]
    organism = st.sidebar.selectbox("Select Target Organism", organism_options)
    load_reference_data(organism)
    with st.sidebar.expander("Advanced Optimization Settings"):
        st.markdown("**Model Parameters**")
        use_deterministic = st.checkbox("Deterministic Mode", value=True, help="Use deterministic decoding for reproducible results")
        # NOTE: match_protein and avoid_restriction_sites are collected but
        # not forwarded to run_optimization in this function.
        match_protein = st.checkbox("Match Protein Validation", value=True, help="Ensure DNA translates back to exact protein")
        st.markdown("**GC Content Control**")
        gc_target_min = st.slider("GC Target Min (%)", 30, 70, 45, help="Minimum GC content target")
        gc_target_max = st.slider("GC Target Max (%)", 30, 70, 55, help="Maximum GC content target")
        st.markdown("**Quality Constraints**")
        avoid_restriction_sites = st.multiselect(
            "Avoid Restriction Sites",
            ["EcoRI", "BamHI", "HindIII", "XhoI", "NotI"],
            default=["EcoRI", "BamHI"]
        )
    st.sidebar.subheader("Post-Processing")
    use_post_processing = st.sidebar.checkbox(
        "Enable DNAChisel Post-Processing",
        value=False,
        disabled=not POST_PROCESSING_AVAILABLE,
        help="Polish sequences to remove restriction sites, homopolymers, and synthesis issues"
    )
    if not POST_PROCESSING_AVAILABLE:
        st.sidebar.warning("DNAChisel not available. Install with: pip install dnachisel")
    # Dataset Information
    st.sidebar.markdown("---")
    st.sidebar.markdown("### Dataset Information")
    st.sidebar.markdown("""
    - **Dataset**: [ColiFormer-Data](https://huggingface.co/datasets/saketh11/ColiFormer-Data)
    - **Training**: 3,676 high-expression E. coli genes (NCBI-curated)
    - **Evaluation**: 37,053 native E. coli genes + 80 recombinant protein targets
    - **Auto-download**: CAI weights & tAI coefficients
    """)
    # Model Information
    st.sidebar.markdown("### Model Information")
    st.sidebar.markdown("""
    - **Model**: [ColiFormer](https://huggingface.co/saketh11/ColiFormer)
    - **Base**: CodonTransformer BigBird architecture
    - **Architecture**: BigBird Transformer + ALM
    - **Auto-download**: From Hugging Face Hub
    """)
    col1, col2 = st.columns([1, 1])
    with col1:
        st.header("Input Sequence")
        sequence_input = st.text_area(
            "Enter Protein or DNA Sequence",
            height=150,
            placeholder="Enter protein sequence (MKWVT...) or DNA sequence (ATGGCG...)\n\nExample protein: MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFKDLGEENFKALVLIAFAQYLQQCPFEDHVKLVNEVTEFAKTCVADESAENCDKSLHTLFGDKLCTVATLRETYGEMADCCAKQEPERNECFLQHKDDNPNLPRLVRPEVDVMCTAFHDNEETFLKKYLYEIARRHPYFYAPELLFFAKRYKAAFTECCQAADKAACLLPKLDELRDEGKASSAKQRLKCASLQKFGERAFKAWAVARLSQRFPKAEFAEVSKLVTDLTKVHTECCHGDLLECADDRADLAKYICENQDSISSKLKECCEKPLLEKSHCIAEVENDEMPADLPSLAADFVESKDVCKNYAEAKDVFLGMFLYEYARRHPDYSVVLLLRLAKTYETTLEKCCAAADPHECYAKVFDEFKPLVEEPQNLIKQNCELFEQLGEYKFQNALLVRYTKKVPQVSTPTLVEVSRNLGKVGSKCCKHPEAKRMPCAEDYLSVVLNQLCVLHEKTPVSDRVTKCCTE"
        )
        analyze_btn = st.button("Analyze Sequence", type="primary")
        if sequence_input and analyze_btn:
            is_valid, message, sequence_type, fixed_sequence = validate_sequence(sequence_input)
            if is_valid:
                st.success(message)
                # Store in session state for use by Optimize Sequence
                st.session_state.sequence_clean = fixed_sequence
                st.session_state.sequence_type = sequence_type
                st.session_state.input_metrics = calculate_input_metrics(fixed_sequence, organism, sequence_type)
                st.session_state.organism = organism
            else:
                st.error(message)
                if "Invalid characters" in message:
                    st.info("Suggestion: Remove spaces, numbers, and special characters. Use only standard amino acid letters (A–Z) for proteins or nucleotides (A/T/G/C) for DNA.")
                elif "too long" in message:
                    st.info("Suggestion: Consider breaking long sequences into smaller segments for optimization.")
                elif "too short" in message or "at least 3" in message:
                    # validate_sequence words its length errors as
                    # "... must be at least 3 ...", so match that as well.
                    st.info("Suggestion: Minimum length is 3 characters. Ensure your sequence is complete.")
                # Clear session state if invalid
                _clear_analysis()
        elif not sequence_input:
            _clear_analysis()
        # Always display the last analysis if it exists in session state
        if st.session_state.get('input_metrics') and st.session_state.get('sequence_type'):
            # input_metrics is known to be truthy inside this block.
            input_metrics = st.session_state.input_metrics
            sequence_type = st.session_state.sequence_type
            st.subheader("Input Analysis")
            metrics_col1, metrics_col2, metrics_col3 = st.columns(3)
            with metrics_col1:
                unit = "codons" if sequence_type == "dna" else "AA"
                length = input_metrics.get('length', 0)
                gc_content = input_metrics.get('gc_content', 0)
                st.metric("Length", f"{length} {unit}")
                st.metric("GC Content", f"{gc_content:.1f}%")
            with metrics_col2:
                cai_val = input_metrics.get('cai')
                if cai_val:
                    label = "CAI" if sequence_type == "dna" else "CAI (baseline)"
                    st.metric(label, f"{cai_val:.3f}")
                else:
                    st.metric("CAI", "N/A")
            with metrics_col3:
                tai_val = input_metrics.get('tai')
                if tai_val:
                    label = "tAI" if sequence_type == "dna" else "tAI (baseline)"
                    st.metric(label, f"{tai_val:.3f}")
                else:
                    st.metric("tAI", "N/A")
            st.subheader("Sequence Quality Analysis")
            analysis_col1, analysis_col2, analysis_col3 = st.columns(3)
            with analysis_col1:
                st.metric("Restriction Sites", input_metrics.get('restriction_sites', 0))
            with analysis_col2:
                st.metric("Negative Elements", input_metrics.get('negative_cis_elements', 0))
            with analysis_col3:
                st.metric("Homopolymer Runs", input_metrics.get('homopolymer_runs', 0))
            baseline_dna = input_metrics.get('baseline_dna', '')
            # The sliding-window plot is only drawn for sequences over 150 nt.
            if baseline_dna and len(baseline_dna) > 150:
                st.subheader("GC Content Distribution")
                fig = create_gc_content_plot(baseline_dna)
                fig.update_layout(
                    title="Input Sequence GC Content Analysis",
                    xaxis_title="Position (codons)",
                    yaxis_title="GC Content (%)",
                    hovermode='x unified'
                )
                st.plotly_chart(fig, use_container_width=True)
    with col2:
        st.header("Optimization Results")
        # Enhanced optimization button
        if (
            st.session_state.get('sequence_clean')
            and st.session_state.get('sequence_type')
            and not st.session_state.optimization_running
        ):
            st.markdown("**Ready to optimize your sequence!**")
            strategy_info = st.container()
            with strategy_info:
                st.info(f"""
                **Optimization Strategy:**
                • Target organism: {st.session_state.organism}
                • Model: Fine-tuned CodonTransformer (89.6M parameters)
                • GC target: {gc_target_min}-{gc_target_max}%
                • Mode: {'Deterministic' if use_deterministic else 'Stochastic'}
                """)
            if st.button("Optimize Sequence", type="primary", use_container_width=True):
                st.session_state.results = None
                if st.session_state.sequence_type == "dna":
                    # The model takes a protein, so DNA input is translated first.
                    protein_sequence = translate_dna_to_protein(st.session_state.sequence_clean)
                    run_optimization(protein_sequence, st.session_state.organism, use_post_processing)
                else:
                    run_optimization(st.session_state.sequence_clean, st.session_state.organism, use_post_processing)
        # Enhanced progress display
        if st.session_state.optimization_running:
            st.info("Optimizing sequence...")
            # Create progress container
            progress_container = st.container()
            with progress_container:
                progress_bar = st.progress(0)
                status_text = st.empty()
                # Fixed status messages shown on a timer; they are not tied
                # to the actual stages of run_optimization.
                steps = [
                    "Analyzing input sequence structure...",
                    "Loading model...",
                    "Running optimization algorithm...",
                    "Applying GC/content constraints...",
                    "Finalizing optimized sequence..."
                ]
                for step_number, step in enumerate(steps, start=1):
                    progress_bar.progress(int(step_number / len(steps) * 100))
                    status_text.text(step)
                    time.sleep(0.8)
            progress_bar.empty()
            status_text.empty()
        # Enhanced results display
        if st.session_state.results and not st.session_state.optimization_running:
            if isinstance(st.session_state.results, str):
                # run_optimization stores failures as an "Error: ..." string.
                st.error(f"Optimization failed: {st.session_state.results}")
            else:
                display_optimization_results(
                    st.session_state.results,
                    st.session_state.get('organism', organism),
                    st.session_state.get('sequence_clean', ''),
                    st.session_state.get('sequence_type', 'protein'),
                    st.session_state.get('input_metrics', {})
                )
+def display_optimization_results(result, organism, original_sequence, sequence_type, input_metrics):
+    """Enhanced results display with publication-quality visualizations"""
+    # Calculate optimized metrics
+    optimized_metrics = {
+        'gc_content': get_GC_content(result.predicted_dna),
+        'length': len(result.predicted_dna)
+    }
+    # Calculate CAI and tAI
+    try:
+        if 'cai_weights' in st.session_state and st.session_state['cai_weights']:
+            optimized_metrics['cai'] = CAI(result.predicted_dna, weights=st.session_state['cai_weights'])
+        else:
+            optimized_metrics['cai'] = None
+    except:
+        optimized_metrics['cai'] = None
+    try:
+        if 'tai_weights' in st.session_state and st.session_state['tai_weights']:
+            optimized_metrics['tai'] = calculate_tAI(result.predicted_dna, st.session_state['tai_weights'])
+        else:
+            optimized_metrics['tai'] = None
+    except:
+        optimized_metrics['tai'] = None
+    # Success header
+    st.success("Optimization complete.")
+    # Key improvements summary
+    st.subheader("Optimization Improvements")
+    imp_col1, imp_col2, imp_col3 = st.columns(3)
+    if input_metrics is not None:
+        with imp_col1:
+            if input_metrics.get('gc_content') and optimized_metrics.get('gc_content'):
+                gc_change = optimized_metrics['gc_content'] - input_metrics['gc_content']
+                st.metric("GC Content", f"{optimized_metrics['gc_content']:.1f}%", delta=f"{gc_change:+.1f}%")
+        with imp_col2:
+            if input_metrics.get('cai') and optimized_metrics.get('cai'):
+                cai_change = optimized_metrics['cai'] - input_metrics['cai']
+                st.metric("CAI Score", f"{optimized_metrics['cai']:.3f}", delta=f"{cai_change:+.3f}")
+        with imp_col3:
+            if input_metrics.get('tai') and optimized_metrics.get('tai'):
+                tai_change = optimized_metrics['tai'] - input_metrics['tai']
+                st.metric("tAI Score", f"{optimized_metrics['tai']:.3f}", delta=f"{tai_change:+.3f}")
+    # Optimized DNA sequence display
+    st.subheader("Optimized DNA Sequence")
+    st.text_area("Optimized DNA Sequence", result.predicted_dna, height=100)
+    # Enhanced download and export options
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        st.download_button(
+            label="Download DNA (FASTA)",
+            data=f">Optimized_{organism.replace(' ', '_')}\n{result.predicted_dna}",
+            file_name=f"optimized_sequence_{organism.replace(' ', '_')}.fasta",
+            mime="text/plain"
+        )
+    with col2:
+        # Create CSV report
+        csv_data = f"Metric,Original,Optimized,Improvement\n"
+        csv_data += f"GC Content (%),{input_metrics['gc_content']:.1f},{optimized_metrics['gc_content']:.1f},{optimized_metrics['gc_content'] - input_metrics['gc_content']:+.1f}\n"
+        if input_metrics['cai'] and optimized_metrics['cai']:
+            csv_data += f"CAI Score,{input_metrics['cai']:.3f},{optimized_metrics['cai']:.3f},{optimized_metrics['cai'] - input_metrics['cai']:+.3f}\n"
+        if input_metrics['tai'] and optimized_metrics['tai']:
+            csv_data += f"tAI Score,{input_metrics['tai']:.3f},{optimized_metrics['tai']:.3f},{optimized_metrics['tai'] - input_metrics['tai']:+.3f}\n"
+        st.download_button(
+            label="Download Metrics (CSV)",
+            data=csv_data,
+            file_name=f"optimization_metrics_{organism.replace(' ', '_')}.csv",
+            mime="text/csv"
+        )
+    with col3:
+        st.button("Generate PDF Report", help="Coming soon: PDF report")
+    # Enhanced comparison visualizations
+    st.subheader("Before vs After Analysis")
+    # Create enhanced comparison charts
+    create_enhanced_comparison_charts(input_metrics, optimized_metrics, original_sequence, result.predicted_dna, sequence_type)
+def create_enhanced_comparison_charts(input_metrics, optimized_metrics, original_dna, optimized_dna, sequence_type):
+    """Create publication-quality comparison visualizations"""
+    if input_metrics is None or optimized_metrics is None:
+        st.info("No comparison data available.")
+        return
+    # GC Content comparison
+    gc_comp_fig = create_gc_comparison_chart(input_metrics, optimized_metrics)
+    gc_comp_fig.update_layout(
+        title="GC Content Optimization Results",
+        font=dict(size=12),
+        height=350
+    )
+    st.plotly_chart(gc_comp_fig, use_container_width=True)
+    # Expression metrics comparison
+    if input_metrics.get('cai') and optimized_metrics.get('cai'):
+        expr_comp_fig = create_expression_comparison_chart(input_metrics, optimized_metrics)
+        expr_comp_fig.update_layout(
+            title="Expression Potential Improvement",
+            font=dict(size=12),
+            height=350
+        )
+        st.plotly_chart(expr_comp_fig, use_container_width=True)
+    # Side-by-side GC distribution analysis
+    st.subheader("GC Content Distribution Analysis")
+    col1, col2 = st.columns(2)
+    with col1:
+        st.write(f"**{'Original DNA' if sequence_type == 'dna' else 'Baseline (Most Frequent Codons)'}**")
+        baseline_dna = input_metrics.get('baseline_dna') if input_metrics else None
+        plot_dna = baseline_dna if baseline_dna is not None else original_dna
+        if plot_dna is not None and isinstance(plot_dna, str) and len(plot_dna) > 150:
+            fig_before = create_gc_content_plot(plot_dna)
+            fig_before.update_layout(title="Before Optimization", height=300)
+            st.plotly_chart(fig_before, use_container_width=True)
+        else:
+            st.info("Sequence too short for sliding window analysis")
+    with col2:
+        st.write("** Model Optimized**")
+        if optimized_dna is not None and isinstance(optimized_dna, str) and len(optimized_dna) > 150:
+            fig_after = create_gc_content_plot(optimized_dna)
+            fig_after.update_layout(title="After Optimization", height=300)
+            st.plotly_chart(fig_after, use_container_width=True)
+        else:
+            st.info("Sequence too short for sliding window analysis")
+def batch_processing_interface():
+    """Batch processing interface for multiple sequences"""
+    st.header("Batch Processing")
+    st.markdown("**Process multiple protein sequences simultaneously with optimization**")
+    # File upload section
+    st.subheader("Upload Sequences")
+    uploaded_file = st.file_uploader(
+        "Choose a file with multiple sequences",
+        type=['csv', 'xlsx', 'fasta', 'txt', 'fa'],
+        help="Upload CSV, Excel (XLSX, with 'sequence' column) or FASTA format files"
+    )
+    if uploaded_file:
+        st.success(f"File uploaded: {uploaded_file.name}")
+        # Process uploaded file
+        try:
+            def find_column(df, target):
+                # Find column name case-insensitively and ignoring spaces
+                for col in df.columns:
+                    if col.strip().lower() == target:
+                        return col
+                return None
+            if uploaded_file.name.endswith('.csv'):
+                df = pd.read_csv(uploaded_file)
+                seq_col = find_column(df, 'sequence')
+                name_col = find_column(df, 'name')
+                if seq_col:
+                    sequences = df[seq_col].tolist()
+                    if name_col:
+                        names = df[name_col].tolist()
+                    else:
+                        names = [f"Sequence_{i+1}" for i in range(len(sequences))]
+                else:
+                    st.error("CSV file must contain a column named 'sequence' (case-insensitive, spaces ignored)")
+                    return
+            elif uploaded_file.name.endswith('.xlsx'):
+                df = pd.read_excel(uploaded_file)
+                seq_col = find_column(df, 'sequence')
+                name_col = find_column(df, 'name')
+                if seq_col:
+                    sequences = df[seq_col].tolist()
+                    if name_col:
+                        names = df[name_col].tolist()
+                    else:
+                        names = [f"Sequence_{i+1}" for i in range(len(sequences))]
+                else:
+                    st.error("Excel file must contain a column named 'sequence' (case-insensitive, spaces ignored)")
+                    return
+            else:
+                # Handle FASTA format
+                content = uploaded_file.read().decode('utf-8')
+                sequences, names = parse_fasta_content(content)
+            st.info(f"Found {len(sequences)} sequences ready for optimization")
+            # Batch configuration
+            col1, col2 = st.columns(2)
+            with col1:
+                batch_organism = st.selectbox("Target Organism", [
+                    "Escherichia coli general", "Saccharomyces cerevisiae", "Homo sapiens"
+                ])
+            with col2:
+                max_sequences = st.number_input("Max sequences to process", 1, len(sequences), min(10, len(sequences)))
+            # Start batch processing
+            if st.button("Start Batch Optimization", type="primary"):
+                run_batch_optimization(sequences[:max_sequences], names[:max_sequences], batch_organism)
+        except Exception as e:
+            st.error(f"Error processing file: {str(e)}")
+    # Batch results display
+    if 'batch_results' in st.session_state and st.session_state.batch_results:
+        display_batch_results()
+def parse_fasta_content(content):
+    """Parse FASTA format content"""
+    sequences = []
+    names = []
+    current_seq = ""
+    current_name = ""
+    for line in content.split('\n'):
+        line = line.strip()
+        if line.startswith('>'):
+            if current_seq:
+                sequences.append(current_seq)
+                names.append(current_name)
+            current_name = line[1:] if len(line) > 1 else f"Sequence_{len(sequences)+1}"
+            current_seq = ""
+        else:
+            current_seq += line
+    if current_seq:
+        sequences.append(current_seq)
+        names.append(current_name)
+    return sequences, names
+def run_batch_optimization(sequences, names, organism):
+    """Run batch optimization with progress tracking"""
+    st.session_state.batch_results = []
+    st.session_state.batch_logs = []  # Collect info logs for auto-fixes
+    # Load reference data for CAI/tAI
+    load_reference_data(organism)
+    cai_weights = st.session_state.get('cai_weights', None)
+    tai_weights = st.session_state.get('tai_weights', None)
+    # Create progress tracking
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+    for i, (seq, name) in enumerate(zip(sequences, names)):
+        progress = (i + 1) / len(sequences)
+        progress_bar.progress(progress)
+        status_text.text(f"Processing {name} ({i+1}/{len(sequences)})")
+        try:
+            # Validate sequence and get possibly fixed sequence
+            is_valid, message, sequence_type, fixed_seq = validate_sequence(seq)
+            if is_valid:
+                # Log if auto-fixed
+                if 'auto-fixed' in message:
+                    st.session_state.batch_logs.append(f"{name}: {message}")
+                # Calculate original metrics (use fixed_seq for DNA)
+                if sequence_type == "dna":
+                    orig_gc = get_GC_content(fixed_seq)
+                    orig_cai = CAI(fixed_seq, weights=cai_weights) if cai_weights else None
+                    orig_tai = calculate_tAI(fixed_seq, tai_weights) if tai_weights else None
+                else:
+                    # For protein, create baseline DNA
+                    most_frequent_codons = {
+                        'A': 'GCG', 'C': 'TGC', 'D': 'GAT', 'E': 'GAA', 'F': 'TTT',
+                        'G': 'GGC', 'H': 'CAT', 'I': 'ATT', 'K': 'AAA', 'L': 'CTG',
+                        'M': 'ATG', 'N': 'AAC', 'P': 'CCG', 'Q': 'CAG', 'R': 'CGC',
+                        'S': 'TCG', 'T': 'ACG', 'V': 'GTG', 'W': 'TGG', 'Y': 'TAT',
+                        '*': 'TAA', '_': 'TAA'
+                    }
+                    baseline_dna = ''.join([most_frequent_codons.get(aa, 'NNN') for aa in fixed_seq])
+                    orig_gc = get_GC_content(baseline_dna)
+                    orig_cai = CAI(baseline_dna, weights=cai_weights) if cai_weights else None
+                    orig_tai = calculate_tAI(baseline_dna, tai_weights) if tai_weights else None
+                # Run optimization using the fixed sequence
+                result = predict_dna_sequence(
+                    protein=fixed_seq if sequence_type == "protein" else translate_dna_to_protein(fixed_seq),
+                    organism=organism,
+                    device=st.session_state.device,
+                    model=st.session_state.model,
+                    deterministic=True,
+                    match_protein=True,
+                )
+                # If result is a list, use the first element
+                if isinstance(result, list):
+                    result_obj = result[0]
+                else:
+                    result_obj = result
+                # Calculate optimized metrics
+                opt_gc = get_GC_content(result_obj.predicted_dna)
+                opt_cai = CAI(result_obj.predicted_dna, weights=cai_weights) if cai_weights else None
+                opt_tai = calculate_tAI(result_obj.predicted_dna, tai_weights) if tai_weights else None
+                metrics = {
+                    'name': name,
+                    'original_sequence': fixed_seq,
+                    'optimized_dna': result_obj.predicted_dna,
+                    'gc_content_before': orig_gc,
+                    'gc_content_after': opt_gc,
+                    'cai_before': orig_cai,
+                    'cai_after': opt_cai,
+                    'tai_before': orig_tai,
+                    'tai_after': opt_tai,
+                    'length_before': len(fixed_seq),
+                    'length_after': len(result_obj.predicted_dna),
+                    'validation_message': message
+                }
+                st.session_state.batch_results.append(metrics)
+            else:
+                # Only skip if truly invalid (not auto-fixable)
+                st.session_state.batch_logs.append(f"{name}: {message}")
+        except Exception as e:
+            st.session_state.batch_logs.append(f"{name}: Error processing: {str(e)}")
+    progress_bar.empty()
+    status_text.empty()
+    st.success(f"Batch optimization complete. Processed {len(st.session_state.batch_results)} sequences.")
+def display_batch_results():
+    """Display batch processing results"""
+    st.subheader("Batch Results")
+    # Show all logs (auto-fixes and errors)
+    if hasattr(st.session_state, 'batch_logs') and st.session_state.batch_logs:
+        for log in st.session_state.batch_logs:
+            st.info(log)
+    results_df = pd.DataFrame(st.session_state.batch_results)
+    # Summary statistics
+    col1, col2, col3, col4 = st.columns(4)
+    with col1:
+        st.metric("Sequences Processed", len(results_df))
+    with col2:
+        st.metric("Avg GC Before", f"{results_df['gc_content_before'].mean():.1f}%")
+        st.metric("Avg GC After", f"{results_df['gc_content_after'].mean():.1f}%")
+    with col3:
+        st.metric("Avg CAI Before", f"{results_df['cai_before'].mean():.3f}")
+        st.metric("Avg CAI After", f"{results_df['cai_after'].mean():.3f}")
+    with col4:
+        st.metric("Avg tAI Before", f"{results_df['tai_before'].mean():.3f}")
+        st.metric("Avg tAI After", f"{results_df['tai_after'].mean():.3f}")
+    # CAI Extremes Analysis
+    st.subheader("CAI Performance Analysis")
+    # Filter out rows with NaN CAI values for analysis
+    valid_cai_df = results_df.dropna(subset=['cai_after'])
+    if len(valid_cai_df) > 0:
+        # Find lowest and highest CAI sequences
+        lowest_cai_idx = valid_cai_df['cai_after'].idxmin()
+        highest_cai_idx = valid_cai_df['cai_after'].idxmax()
+        lowest_cai_row = results_df.loc[lowest_cai_idx]
+        highest_cai_row = results_df.loc[highest_cai_idx]
+        col1, col2 = st.columns(2)
+        with col1:
+            st.markdown("**Lowest CAI Sequence**")
+            st.write(f"**Name:** {lowest_cai_row['name']}")
+            st.metric("CAI Score", f"{lowest_cai_row['cai_after']:.3f}")
+            st.metric("GC Content", f"{lowest_cai_row['gc_content_after']:.1f}%")
+            st.metric("tAI Score", f"{lowest_cai_row['tai_after']:.3f}")
+            st.metric("Length", f"{lowest_cai_row['length_after']} bp")
+            # Show improvement
+            if pd.notna(lowest_cai_row['cai_before']):
+                cai_improvement = lowest_cai_row['cai_after'] - lowest_cai_row['cai_before']
+                st.metric("CAI Improvement", f"{cai_improvement:+.3f}")
+        with col2:
+            st.markdown("**Highest CAI Sequence**")
+            st.write(f"**Name:** {highest_cai_row['name']}")
+            st.metric("CAI Score", f"{highest_cai_row['cai_after']:.3f}")
+            st.metric("GC Content", f"{highest_cai_row['gc_content_after']:.1f}%")
+            st.metric("tAI Score", f"{highest_cai_row['tai_after']:.3f}")
+            st.metric("Length", f"{highest_cai_row['length_after']} bp")
+            # Show improvement
+            if pd.notna(highest_cai_row['cai_before']):
+                cai_improvement = highest_cai_row['cai_after'] - highest_cai_row['cai_before']
+                st.metric("CAI Improvement", f"{cai_improvement:+.3f}")
+        # CAI Distribution Chart
+        st.subheader("CAI Distribution")
+        fig = go.Figure()
+        fig.add_trace(go.Histogram(
+            x=valid_cai_df['cai_after'],
+            nbinsx=20,
+            name='Optimized CAI Scores',
+            marker_color='darkblue',
+            opacity=0.7
+        ))
+        # Add vertical lines for lowest and highest
+        fig.add_vline(
+            x=lowest_cai_row['cai_after'],
+            line_dash="dash",
+            line_color="red",
+            annotation_text=f"Lowest: {lowest_cai_row['cai_after']:.3f}"
+        )
+        fig.add_vline(
+            x=highest_cai_row['cai_after'],
+            line_dash="dash",
+            line_color="green",
+            annotation_text=f"Highest: {highest_cai_row['cai_after']:.3f}"
+        )
+        fig.update_layout(
+            title="Distribution of Optimized CAI Scores",
+            xaxis_title="CAI Score",
+            yaxis_title="Number of Sequences",
+            height=400,
+            showlegend=False
+        )
+        st.plotly_chart(fig, use_container_width=True)
+        # GC Content Distribution Chart
+        st.subheader("GC Content Distribution")
+        valid_gc_df = results_df.dropna(subset=['gc_content_after'])
+        if len(valid_gc_df) > 0:
+            lowest_gc_idx = valid_gc_df['gc_content_after'].idxmin()
+            highest_gc_idx = valid_gc_df['gc_content_after'].idxmax()
+            lowest_gc_row = results_df.loc[lowest_gc_idx]
+            highest_gc_row = results_df.loc[highest_gc_idx]
+            fig_gc = go.Figure()
+            fig_gc.add_trace(go.Histogram(
+                x=valid_gc_df['gc_content_after'],
+                nbinsx=20,
+                name='Optimized GC Content',
+                marker_color='teal',
+                opacity=0.7
+            ))
+            fig_gc.add_vline(
+                x=lowest_gc_row['gc_content_after'],
+                line_dash="dash",
+                line_color="red",
+                annotation_text=f"Lowest: {lowest_gc_row['gc_content_after']:.1f}%"
+            )
+            fig_gc.add_vline(
+                x=highest_gc_row['gc_content_after'],
+                line_dash="dash",
+                line_color="green",
+                annotation_text=f"Highest: {highest_gc_row['gc_content_after']:.1f}%"
+            )
+            fig_gc.update_layout(
+                title="Distribution of Optimized GC Content",
+                xaxis_title="GC Content (%)",
+                yaxis_title="Number of Sequences",
+                height=400,
+                showlegend=False
+            )
+            st.plotly_chart(fig_gc, use_container_width=True)
+        else:
+            st.warning("No valid GC content values found in the batch results.")
+    else:
+        st.warning("No valid CAI scores found in the batch results. Check if CAI weights are properly loaded.")
+    # Sequence selector
+    seq_names = results_df['name'].tolist()
+    selected_seq = st.selectbox("Select a sequence to view details", seq_names)
+    seq_row = results_df[results_df['name'] == selected_seq].iloc[0]
+    st.markdown(f"### Details for: {selected_seq}")
+    if 'validation_message' in seq_row and 'auto-fixed' in seq_row['validation_message']:
+        st.info(seq_row['validation_message'])
+    col1, col2 = st.columns(2)
+    with col1:
+        st.markdown("**Original Sequence**")
+        st.text_area("Original Sequence", seq_row['original_sequence'], height=100)
+        st.metric("GC Content (Before)", f"{seq_row['gc_content_before']:.1f}%")
+        st.metric("CAI (Before)", f"{seq_row['cai_before']:.3f}")
+        st.metric("tAI (Before)", f"{seq_row['tai_before']:.3f}")
+        st.metric("Length (Before)", f"{seq_row['length_before']}")
+    with col2:
+        st.markdown("**Optimized Sequence**")
+        st.text_area("Optimized Sequence", seq_row['optimized_dna'], height=100)
+        st.metric("GC Content (After)", f"{seq_row['gc_content_after']:.1f}%")
+        st.metric("CAI (After)", f"{seq_row['cai_after']:.3f}")
+        st.metric("tAI (After)", f"{seq_row['tai_after']:.3f}")
+        st.metric("Length (After)", f"{seq_row['length_after']}")
+    # Plots for before/after GC content
+    st.subheader("GC Content Distribution (Before vs After)")
+    if len(seq_row['original_sequence']) > 150 and len(seq_row['optimized_dna']) > 150:
+        fig_before = create_gc_content_plot(seq_row['original_sequence'])
+        fig_before.update_layout(title="Before Optimization", height=300)
+        fig_after = create_gc_content_plot(seq_row['optimized_dna'])
+        fig_after.update_layout(title="After Optimization", height=300)
+        st.plotly_chart(fig_before, use_container_width=True)
+        st.plotly_chart(fig_after, use_container_width=True)
+    else:
+        st.info("Sequence(s) too short for sliding window analysis")
+    # Download batch results
+    if st.button("Download Batch Results"):
+        csv_data = results_df.to_csv(index=False)
+        st.download_button(
+            label="Download CSV",
+            data=csv_data,
+            file_name="batch_optimization_results.csv",
+            mime="text/csv"
+        )
+def comparative_analysis_interface():
+    """Comparative analysis interface"""
+    st.header("Comparative Analysis")
+    st.markdown("For quantitative comparisons and plots, use the benchmark script:")
+    st.code("python scripts/run_benchmarks.py --config configs/benchmark.yaml")
+def advanced_settings_interface():
+    """Advanced settings and configuration interface"""
+    st.header("Advanced Settings")
+    st.markdown("**Configure advanced parameters and model settings**")
+    # Model configuration
+    st.subheader("Model Configuration")
+    col1, col2 = st.columns(2)
+    with col1:
+        st.write("**Current Model Status:**")
+        if st.session_state.model:
+            model_type = getattr(st.session_state, 'model_type', 'unknown')
+            st.success(f"Model loaded: {model_type}")
+            st.write(f"Device: {st.session_state.device}")
+        else:
+            st.warning("Model not loaded")
+    with col2:
+        st.write("**Model Information:**")
+        st.write("• Architecture: BigBird Transformer")
+        st.write("• Parameters: 89.6M")
+        st.write("• Fine-tuning data: 3,676 high-expression E. coli genes (NCBI-curated)")
+    # Performance tuning
+    st.subheader("Performance Tuning")
+    # Memory management
+    col1, col2 = st.columns(2)
+    with col1:
+        if st.button("Clear Cache"):
+            st.cache_data.clear()
+            st.success("Cache cleared successfully")
+    with col2:
+        if st.button("Reload Model"):
+            st.session_state.model = None
+            st.session_state.tokenizer = None
+            st.rerun()
+    # System information
+    st.subheader("System Information")
+    import torch
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        st.write("**PyTorch:**")
+        st.write(f"Version: {torch.__version__}")
+        st.write(f"CUDA Available: {torch.cuda.is_available()}")
+    with col2:
+        st.write("**Device:**")
+        st.write(f"Current: {st.session_state.device}")
+        if torch.cuda.is_available():
+            st.write(f"GPU: {torch.cuda.get_device_name()}")
+    with col3:
+        st.write("**Memory:**")
+        if torch.cuda.is_available():
+            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
+            st.write(f"GPU Memory: {gpu_memory:.1f} GB")
+    # Footer
+    st.markdown("---")
+    st.markdown("**ENCOT**")
+    st.markdown("Open-source codon optimization for E. coli with reproducible evaluation.")
+if __name__ == "__main__":  # entry point: call main() only when this file is run directly, not on import
+    main()

streamlit_gui/demo.py ADDED Viewed

	@@ -0,0 +1,288 @@

+#!/usr/bin/env python3
+"""
+Demo script for ColiFormer Streamlit GUI
+This script demonstrates the GUI functionality with example sequences
+and showcases key features of the ColiFormer optimization tool.
+"""
+import sys
+import os
+import time
+from pathlib import Path
+# Add parent directory to path for imports
+sys.path.append(str(Path(__file__).parent.parent))
+def print_header():
+    """Print demo header"""
+    print("=" * 40)
+    print("  ColiFormer GUI Demo")
+    print("=" * 40)
+    print()
+def print_section(title):
+    """Print section header"""
+    print(f"\n{title}")
+    print("-" * (len(title) + 4))
+def demo_validation():
+    """Demonstrate protein sequence validation"""
+    print_section("Protein Sequence Validation")
+    # Import validation function
+    from streamlit_gui.app import validate_protein_sequence
+    test_sequences = [
+        ("MKTVRQERLK", "Valid short peptide"),
+        ("MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG", "Valid longer protein"),
+        ("MKTVRQERLKX", "Invalid character (X)"),
+        ("MK", "Too short"),
+        ("mktvrqerlk", "Lowercase (should work)"),
+        ("MKTVRQERLK*", "With stop codon"),
+    ]
+    for seq, description in test_sequences:
+        is_valid, message = validate_protein_sequence(seq)
+        status = "OK" if is_valid else "FAIL"
+        print(f"{status} {description}: {message}")
+def demo_metrics():
+    """Demonstrate metrics calculation"""
+    print_section("Metrics Calculation Demo")
+    from streamlit_gui.app import calculate_input_metrics
+    example_proteins = [
+        ("MKTVRQERLK", "Short peptide (10 AA)"),
+        ("MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG", "Medium protein (67 AA)"),
+        ("MKWVTFISLLLLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTE", "Long protein (72 AA)"),
+    ]
+    organism = "Escherichia coli general"
+    for protein, description in example_proteins:
+        print(f"\n{description}")
+        print(f"   Sequence: {protein[:30]}{'...' if len(protein) > 30 else ''}")
+        metrics = calculate_input_metrics(protein, organism)
+        print(f"   Length: {metrics['length']} amino acids")
+        print(f"   GC Content: {metrics['gc_content']:.1f}%")
+        if metrics['tai']:
+            print(f"   tAI: {metrics['tai']:.3f}")
+        if metrics['cai']:
+            print(f"   CAI: {metrics['cai']:.3f}")
+        else:
+            print("   CAI: Not available for this organism")
+def demo_visualization():
+    """Demonstrate visualization capabilities"""
+    print_section("Visualization Demo")
+    from streamlit_gui.app import create_gc_content_plot, create_metrics_comparison_chart
+    # Test DNA sequence for GC content plot
+    test_dna = "ATGGCGAAAGCGCTGTATCGCGAAAGCGCTGTATCGCGAAAGCGCTGTATCGCGAAAGCGCTGTATCGC"
+    print("Creating GC content sliding window plot...")
+    try:
+        fig = create_gc_content_plot(test_dna)
+        print("   OK: GC content plot created successfully")
+        print(f"   Analyzing {len(test_dna)} base pairs")
+    except Exception as e:
+        print(f"   FAIL: Error creating GC plot: {e}")
+    print("\nCreating metrics comparison chart...")
+    try:
+        before_metrics = {
+            'gc_content': 45.2,
+            'cai': 0.485,
+            'tai': 0.312
+        }
+        after_metrics = {
+            'gc_content': 52.1,
+            'cai': 0.634,
+            'tai': 0.456
+        }
+        fig = create_metrics_comparison_chart(before_metrics, after_metrics)
+        print("   OK: Comparison chart created successfully")
+        print("   Shows improvement in all metrics")
+    except Exception as e:
+        print(f"   FAIL: Error creating comparison chart: {e}")
+def demo_codon_evaluation():
+    """Demonstrate CodonEvaluation functions"""
+    print_section("CodonEvaluation Functions Demo")
+    from CodonTransformer.CodonEvaluation import get_GC_content, calculate_tAI, get_ecoli_tai_weights
+    test_sequences = [
+        ("ATGGCGAAAGCGCTGTATCGC", "High GC content"),
+        ("ATGAAATTTATTTATTATTAT", "Low GC content"),
+        ("ATGGCGAAAGCGCTGTATCGCGAAAGCGCTGTATCGC", "Medium length"),
+    ]
+    print("Testing GC content calculation:")
+    for seq, description in test_sequences:
+        gc_content = get_GC_content(seq)
+        print(f"   {description}: {gc_content:.1f}%")
+    print("\nTesting tAI calculation:")
+    try:
+        tai_weights = get_ecoli_tai_weights()
+        for seq, description in test_sequences:
+            tai_value = calculate_tAI(seq, tai_weights)
+            print(f"   {description}: {tai_value:.3f}")
+    except Exception as e:
+        print(f"   FAIL: tAI calculation error: {e}")
+def demo_model_info():
+    """Show model information"""
+    print_section("Model Information")
+    try:
+        import torch
+        from transformers import AutoTokenizer
+        print("Model Details:")
+        print("   Base model: adibvafa/CodonTransformer")
+        print("   Architecture: BigBird Transformer")
+        print("   Task: Masked Language Modeling for codon optimization")
+        print("\nSystem Information:")
+        print(f"   PyTorch: {torch.__version__}")
+        print(f"   Device: {'GPU (CUDA)' if torch.cuda.is_available() else 'CPU'}")
+        if torch.cuda.is_available():
+            print(f"   GPU: {torch.cuda.get_device_name(0)}")
+            print(f"   GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
+        print("\nTokenizer Test:")
+        tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
+        print(f"   OK: Tokenizer loaded: {len(tokenizer)} tokens")
+        print(f"   Vocab size: {tokenizer.vocab_size}")
+    except Exception as e:
+        print(f"   FAIL: Error loading model info: {e}")
+def demo_gui_features():
+    """Show GUI features overview"""
+    print_section("GUI Features Overview")
+    features = [
+        ("Real-time Validation", "Instant feedback on protein sequence validity"),
+        ("Metrics Dashboard", "GC content, CAI, tAI calculations"),
+        ("Constrained Optimization", "GC content control with beam search"),
+        ("Visual Analytics", "Interactive plots and comparisons"),
+        ("Configurable Parameters", "Organism selection, beam size, GC targets"),
+        ("Export Options", "Download optimized sequences"),
+        ("Progress Tracking", "Real-time optimization progress"),
+        ("Responsive Design", "Works on desktop and mobile"),
+    ]
+    for feature, description in features:
+        print(f"   {feature}: {description}")
+def demo_usage_examples():
+    """Show usage examples"""
+    print_section("Usage Examples")
+    examples = [
+        {
+            "name": "Short Peptide Optimization",
+            "protein": "MKTVRQERLK",
+            "organism": "Escherichia coli general",
+            "use_case": "Quick testing and validation"
+        },
+        {
+            "name": "Insulin Chain A",
+            "protein": "GIVEQCCTSICSLYQLENYCN",
+            "organism": "Escherichia coli general",
+            "use_case": "Pharmaceutical protein production"
+        },
+        {
+            "name": "Green Fluorescent Protein (partial)",
+            "protein": "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQC",
+            "organism": "Escherichia coli general",
+            "use_case": "Research marker protein"
+        },
+        {
+            "name": "Yeast Expression",
+            "protein": "MKTVRQERLKSIVRILERSKEPVSGAQ",
+            "organism": "Saccharomyces cerevisiae",
+            "use_case": "Eukaryotic protein expression"
+        }
+    ]
+    for i, example in enumerate(examples, 1):
+        print(f"\nExample {i}: {example['name']}")
+        print(f"   Protein: {example['protein'][:40]}{'...' if len(example['protein']) > 40 else ''}")
+        print(f"   Organism: {example['organism']}")
+        print(f"   Use case: {example['use_case']}")
+        print(f"   Length: {len(example['protein'])} amino acids")
+def demo_launch_instructions():
+    """Show how to launch the GUI"""
+    print_section("How to Launch the GUI")
+    print("Launch Options:")
+    print()
+    print("   Option 1 - Using the launcher script:")
+    print("   $ cd ecoli/streamlit_gui")
+    print("   $ python run_gui.py")
+    print()
+    print("   Option 2 - Direct streamlit command:")
+    print("   $ cd ecoli/streamlit_gui")
+    print("   $ source ../codon_env/bin/activate")
+    print("   $ streamlit run app.py")
+    print()
+    print("   Option 3 - With custom port:")
+    print("   $ streamlit run app.py --server.port 8502")
+    print()
+    print("Access the GUI:")
+    print("   Web browser: http://localhost:8501")
+    print("   The GUI will automatically open in your default browser")
+    print()
+    print("Performance Tips:")
+    print("   • Use GPU if available for faster processing")
+    print("   • Start with shorter sequences for testing")
+    print("   • Adjust beam size based on sequence length")
+    print("   • Close other applications to free up memory")
+def main():
+    """Run the complete demo"""
+    print_header()
+    print("This demo showcases the ENCOT Streamlit GUI capabilities.")
+    print("The GUI provides an interface for protein codon optimization.")
+    print()
+    try:
+        demo_validation()
+        demo_metrics()
+        demo_visualization()
+        demo_codon_evaluation()
+        demo_model_info()
+        demo_gui_features()
+        demo_usage_examples()
+        demo_launch_instructions()
+        print("\nDemo completed successfully.")
+        print()
+        print("Next steps:")
+        print("1. Launch the GUI using one of the methods above")
+        print("2. Try the example sequences provided")
+        print("3. Experiment with different organisms and settings")
+        print("4. Compare optimization results")
+        print()
+        print("Happy optimizing.")
+    except Exception as e:
+        print(f"\nDemo error: {e}")
+        print("Make sure you're running from the correct directory and all dependencies are installed.")
+        return 1
+    return 0
+if __name__ == "__main__":
+    exit(main())

streamlit_gui/requirements.txt ADDED Viewed

	@@ -0,0 +1,20 @@

+streamlit>=1.28.0
+torch>=1.13.0
+pandas>=1.5.0
+numpy>=1.21.0
+plotly>=5.0.0
+transformers>=4.21.0
+scipy>=1.9.0
+tokenizers>=0.13.0
+tqdm>=4.64.0
+matplotlib>=3.5.0
+seaborn>=0.11.0
+onnxruntime>=1.15.0
+python-codon-tables>=0.1.12
+biopython>=1.79
+scikit-learn>=1.0.0
+requests>=2.25.0
+ipywidgets>=7.6.0
+huggingface-hub>=0.20.0
+datasets>=2.0.0
+git+https://github.com/Benjamin-Lee/CodonAdaptationIndex.git

streamlit_gui/run_gui.py ADDED Viewed

	@@ -0,0 +1,102 @@

+#!/usr/bin/env python3
+"""
+Launcher script for ColiFormer Streamlit GUI
+This script sets up the environment and launches the Streamlit application.
+"""
+import os
+import sys
+import subprocess
+from pathlib import Path
+def main():
+    """Launch the Streamlit GUI application"""
+    # Get the directory containing this script
+    script_dir = Path(__file__).parent
+    # Add the parent directory to Python path so we can import CodonTransformer
+    parent_dir = script_dir.parent
+    sys.path.insert(0, str(parent_dir))
+    # Set working directory to parent directory so model paths work correctly
+    os.chdir(parent_dir)
+    print("Starting ENCOT GUI...")
+    print(f"   Working directory: {parent_dir}")
+    print(f"   Python path includes: {parent_dir}")
+    # Check for model checkpoint
+    model_path = parent_dir / "models" / "alm-enhanced-training" / "balanced_alm_finetune.ckpt"
+    if model_path.exists():
+        print(f"Found fine-tuned model: {model_path}")
+    else:
+        print("Fine-tuned model not found, will use base model")
+    # Check for virtual environment
+    venv_path = parent_dir / "codon_env"
+    if venv_path.exists():
+        # Set up virtual environment paths
+        venv_bin = venv_path / "bin"
+        venv_python = venv_bin / "python"
+        if venv_python.exists():
+            print(f"Found virtual environment: {venv_path}")
+            # Update PATH to include virtual environment
+            current_path = os.environ.get("PATH", "")
+            os.environ["PATH"] = f"{venv_bin}:{current_path}"
+            # Use virtual environment Python
+            python_executable = str(venv_python)
+        else:
+            print("Virtual environment found but Python executable missing")
+            python_executable = sys.executable
+    else:
+        print("No virtual environment found, using system Python")
+        python_executable = sys.executable
+    print(f"   Using Python: {python_executable}")
+    print()
+    # Check if streamlit is installed
+    try:
+        import streamlit
+        print(f"Streamlit version: {streamlit.__version__}")
+    except ImportError:
+        print("Streamlit not found. Please install requirements:")
+        print("   pip install -r requirements.txt")
+        return 1
+    # Check if torch is available
+    try:
+        import torch
+        device = "GPU" if torch.cuda.is_available() else "CPU"
+        print(f"PyTorch available, using: {device}")
+    except ImportError:
+        print("PyTorch not found. Please install requirements:")
+        print("   pip install -r requirements.txt")
+        return 1
+    print()
+    print("Launching GUI...")
+    print("   The application will open in your default web browser")
+    print("   Press Ctrl+C to stop the server")
+    print()
+    # Launch streamlit
+    try:
+        subprocess.run([
+            python_executable, "-m", "streamlit", "run", "streamlit_gui/app.py",
+            "--server.headless", "false",
+            "--server.port", "8501",
+            "--server.address", "0.0.0.0"
+        ])
+    except KeyboardInterrupt:
+        print("\nShutting down ENCOT GUI...")
+        return 0
+    except Exception as e:
+        print(f"Error launching Streamlit: {e}")
+        return 1
+if __name__ == "__main__":
+    exit(main())

streamlit_gui/test_gui.py ADDED Viewed

	@@ -0,0 +1,321 @@

+#!/usr/bin/env python3
+"""
+Test script for ColiFormer Streamlit GUI
+This script tests the core functionality of the GUI without running the full Streamlit application.
+"""
+import sys
+import os
+import traceback
+from pathlib import Path
+# Add parent directory to path for imports
+sys.path.append(str(Path(__file__).parent.parent))
+def test_imports():
+    """Test if all required imports work"""
+    print("Testing imports...")
+    try:
+        import streamlit as st
+        print(f"  OK: Streamlit: {st.__version__}")
+    except ImportError as e:
+        print(f"  FAIL: Streamlit: {e}")
+        return False
+    try:
+        import torch
+        device = "GPU" if torch.cuda.is_available() else "CPU"
+        print(f"  OK: PyTorch: {torch.__version__} ({device})")
+    except ImportError as e:
+        print(f"  FAIL: PyTorch: {e}")
+        return False
+    try:
+        import plotly
+        print(f"  OK: Plotly: {plotly.__version__}")
+    except ImportError as e:
+        print(f"  FAIL: Plotly: {e}")
+        return False
+    try:
+        from CodonTransformer.CodonPrediction import predict_dna_sequence
+        print("  OK: CodonTransformer.CodonPrediction")
+    except ImportError as e:
+        print(f"  FAIL: CodonTransformer.CodonPrediction: {e}")
+        return False
+    try:
+        from CodonTransformer.CodonEvaluation import get_GC_content, calculate_tAI
+        print("  OK: CodonTransformer.CodonEvaluation")
+    except ImportError as e:
+        print(f"  FAIL: CodonTransformer.CodonEvaluation: {e}")
+        return False
+    return True
+def test_protein_validation():
+    """Test protein sequence validation"""
+    print("\nTesting protein sequence validation...")
+    try:
+        # Import the validation function
+        from app import validate_protein_sequence
+        # Test cases
+        test_cases = [
+            ("MKTVRQERLK", True, "Valid short sequence"),
+            ("", False, "Empty sequence"),
+            ("MKTVRQERLKX", False, "Invalid character X"),
+            ("MK", False, "Too short"),
+            ("M" * 501, False, "Too long"),
+            ("mktvrqerlk", True, "Lowercase (should work)"),
+            ("MKTVRQERLK*", True, "With stop codon"),
+            ("MKTVRQERLK_", True, "With underscore stop"),
+        ]
+        for seq, expected_valid, description in test_cases:
+            is_valid, message = validate_protein_sequence(seq)
+            status = "OK" if is_valid == expected_valid else "FAIL"
+            print(f"  {status} {description}: {message}")
+        return True
+    except Exception as e:
+        print(f"  FAIL: Error in validation test: {e}")
+        traceback.print_exc()
+        return False
+def test_metrics_calculation():
+    """Test metrics calculation"""
+    print("\nTesting metrics calculation...")
+    try:
+        from app import calculate_input_metrics
+        test_protein = "MKTVRQERLK"
+        organism = "Escherichia coli general"
+        metrics = calculate_input_metrics(test_protein, organism)
+        # Check if all expected metrics are present
+        expected_keys = ['length', 'gc_content', 'baseline_dna', 'cai', 'tai']
+        for key in expected_keys:
+            if key in metrics:
+                print(f"  OK: {key}: {metrics[key]}")
+            else:
+                print(f"  FAIL: Missing metric: {key}")
+                return False
+        # Validate metric values
+        if metrics['length'] == len(test_protein):
+            print("  OK: Length calculation correct")
+        else:
+            print("  FAIL: Length calculation incorrect")
+            return False
+        if 0 <= metrics['gc_content'] <= 100:
+            print("  OK: GC content in valid range")
+        else:
+            print("  FAIL: GC content out of range")
+            return False
+        return True
+    except Exception as e:
+        print(f"  FAIL: Error in metrics calculation: {e}")
+        traceback.print_exc()
+        return False
+def test_visualization_functions():
+    """Test visualization functions"""
+    print("\nTesting visualization functions...")
+    try:
+        from app import create_gc_content_plot, create_metrics_comparison_chart
+        # Test GC content plot
+        test_dna = "ATGGCGAAAGCGCTGTATCGCGAAAGCGCTGTATCGCGAAAGCGCTGTATCGC"
+        fig = create_gc_content_plot(test_dna)
+        print("  OK: GC content plot created")
+        # Test metrics comparison chart
+        before_metrics = {'gc_content': 50.0, 'cai': 0.5, 'tai': 0.3}
+        after_metrics = {'gc_content': 52.0, 'cai': 0.6, 'tai': 0.4}
+        fig = create_metrics_comparison_chart(before_metrics, after_metrics)
+        print("  OK: Metrics comparison chart created")
+        return True
+    except Exception as e:
+        print(f"  FAIL: Error in visualization test: {e}")
+        traceback.print_exc()
+        return False
+def test_codon_evaluation():
+    """Test CodonEvaluation functions directly"""
+    print("\nTesting CodonEvaluation functions...")
+    try:
+        from CodonTransformer.CodonEvaluation import get_GC_content, calculate_tAI, get_ecoli_tai_weights
+        # Test GC content calculation
+        test_dna = "ATGGCGAAAGCG"
+        gc_content = get_GC_content(test_dna)
+        print(f"  OK: GC content calculation: {gc_content:.1f}%")
+        # Test tAI calculation
+        try:
+            tai_weights = get_ecoli_tai_weights()
+            tai_value = calculate_tAI(test_dna, tai_weights)
+            print(f"  OK: tAI calculation: {tai_value:.3f}")
+        except Exception as e:
+            print(f"  NOTE: tAI calculation (may need scipy): {e}")
+        return True
+    except Exception as e:
+        print(f"  FAIL: Error in CodonEvaluation test: {e}")
+        traceback.print_exc()
+        return False
+def test_model_loading():
+    """Test model loading functionality"""
+    print("\nTesting model loading (mock)...")
+    try:
+        import torch
+        from transformers import AutoTokenizer
+        from CodonTransformer.CodonPrediction import load_model
+        # Test tokenizer loading (this is fast)
+        print("  Testing tokenizer loading...")
+        tokenizer = AutoTokenizer.from_pretrained("adibvafa/CodonTransformer")
+        print("  OK: Tokenizer loaded successfully")
+        # Test load_model function
+        print("  Testing load_model function...")
+        from transformers import BigBirdForMaskedLM
+        print("  OK: Model class available: BigBirdForMaskedLM")
+        # Check if fine-tuned model exists
+        import os
+        model_path = "models/alm-enhanced-training/balanced_alm_finetune.ckpt"
+        if os.path.exists(model_path):
+            print(f"  OK: Fine-tuned model found: {model_path}")
+        else:
+            print(f"  NOTE: Fine-tuned model not found at: {model_path}")
+        # Note: We won't actually load the full model here as it's ~2GB
+        print("  NOTE: Full model loading skipped in test (too large)")
+        return True
+    except Exception as e:
+        print(f"  FAIL: Error in model loading test: {e}")
+        traceback.print_exc()
+        return False
+def test_file_structure():
+    """Test if all required files exist"""
+    print("\nTesting file structure...")
+    gui_dir = Path(__file__).parent
+    parent_dir = gui_dir.parent
+    required_files = [
+        "app.py",
+        "run_gui.py",
+        "requirements.txt",
+        "README.md"
+    ]
+    all_present = True
+    for file_name in required_files:
+        file_path = gui_dir / file_name
+        if file_path.exists():
+            print(f"  OK: {file_name}")
+        else:
+            print(f"  FAIL: {file_name} missing")
+            all_present = False
+    # Check for model checkpoint
+    model_path = parent_dir / "models" / "alm-enhanced-training" / "balanced_alm_finetune.ckpt"
+    if model_path.exists():
+        print("  OK: Fine-tuned model checkpoint found")
+    else:
+        print("  NOTE: Fine-tuned model checkpoint not found")
+    return all_present
+def test_post_processing():
+    """Test post-processing functionality"""
+    print("\nTesting post-processing features...")
+    try:
+        from app import POST_PROCESSING_AVAILABLE, DNACHISEL_AVAILABLE
+        if POST_PROCESSING_AVAILABLE:
+            print("  OK: Post-processing module available")
+            if DNACHISEL_AVAILABLE:
+                print("  OK: DNAChisel available")
+            else:
+                print("  NOTE: DNAChisel not available")
+        else:
+            print("  NOTE: Post-processing module not available")
+        return True
+    except Exception as e:
+        print(f"  FAIL: Error in post-processing test: {e}")
+        return False
+def main():
+    """Run all tests"""
+    print("ENCOT GUI Test Suite")
+    print("=" * 50)
+    tests = [
+        ("File Structure", test_file_structure),
+        ("Imports", test_imports),
+        ("Protein Validation", test_protein_validation),
+        ("Metrics Calculation", test_metrics_calculation),
+        ("Visualization Functions", test_visualization_functions),
+        ("CodonEvaluation Functions", test_codon_evaluation),
+        ("Model Loading", test_model_loading),
+        ("Post-Processing", test_post_processing),
+    ]
+    passed = 0
+    total = len(tests)
+    for test_name, test_func in tests:
+        try:
+            result = test_func()
+            if result:
+                passed += 1
+                print(f"OK: {test_name}: PASSED")
+            else:
+                print(f"FAIL: {test_name}: FAILED")
+        except Exception as e:
+            print(f"FAIL: {test_name}: ERROR - {e}")
+    print("\n" + "=" * 50)
+    print(f"Test Results: {passed}/{total} tests passed")
+    if passed == total:
+        print("All tests passed. The GUI should work correctly.")
+        print("\nTo run the GUI:")
+        print("  python run_gui.py")
+        print("  or")
+        print("  cd streamlit_gui && streamlit run app.py --server.address=0.0.0.0")
+    else:
+        print("Some tests failed. Please check the issues above.")
+    print("\nNotes:")
+    print("  • Fine-tuned model integration")
+    print("  • Enhanced constrained beam search")
+    print("  • Post-processing with DNAChisel")
+    print("  • Advanced sequence analysis")
+    print("  • Improved parameter controls")
+    return passed == total
+if __name__ == "__main__":
+    success = main()
+    sys.exit(0 if success else 1)