# ==============================================================================
#  Neuromodulatory Control Network (NCN) 2.0 Model Catalogue Interface
#  Copyright (c) 2026 Michael Morgan. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
#  ============================================================================
#  MANDATORY ATTRIBUTION & ARCHITECTURAL CITATION RIDER:
#  ============================================================================
#  By using, copying, modifying, distributing, or re-implementing this model 
#  architecture (the Neuromodulatory Control Network / NCN), you agree to 
#  prominently credit and cite the original creator, Michael Morgan, in all 
#  academic papers, technical reports, repositories, and commercial/for-profit 
#  product documentation that leverage these ideas.
#
#  The mandatory citation format must include:
#  - Author: Michael Morgan (2026)
#  - Project: Neuromodulatory Control Network (NCN) Architecture
#  - Links: GitHub:        https://github.com/Mmorgan-ML
#           Hugging Face:  https://huggingface.co/Mmorgan-ML
#           Twitter/X:     @Mmorgan_ML
#           Email:         mmorgankorea@gmail.com
# ==============================================================================

import os
import torch
import torch.nn.functional as F
import gradio as gr
from transformers import PreTrainedTokenizerFast
from safetensors.torch import load_file

# Import your dynamic architecture package
from ncn_architecture.config import NCNConfig
from ncn_architecture.model import ModulatedLLM

# Run on the GPU whenever PyTorch reports a usable CUDA device; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# In-memory caches keyed by the `model_choice` string, so a model and its tokenizer
# are read from disk only on the first request that selects them.
MODEL_CACHE = {}
TOKENIZER_CACHE = {}

# Selectable model profiles: UI label -> (weights file, tokenizer file).
# Adding a model only requires a new entry here.
_MODEL_REGISTRY = {
    "NCN 2M (TinyStories)": (
        "models/ncn_2m_tinystories/model.safetensors",
        "models/ncn_2m_tinystories/tokenizer.json",
    ),
}

# Head count used whenever it cannot be derived from the checkpoint.
_DEFAULT_NHEAD = 12


def _infer_nhead(bias_length, num_layers, default=_DEFAULT_NHEAD):
    """Derive the attention head count from the length of ``ncn.layer2.bias``.

    Inverts the relation ``bias_length = (2 * nhead + 1) * num_layers`` using
    integer arithmetic only, so no float rounding is involved.

    Args:
        bias_length: Number of elements in the NCN output bias.
        num_layers: Number of transformer layers found in the checkpoint.
        default: Value returned when no positive head count can be derived.

    Returns:
        The derived head count, or ``default`` if the result would be < 1.
    """
    if num_layers < 1:
        return default
    nhead = (bias_length // num_layers - 1) // 2
    return nhead if nhead >= 1 else default


def load_and_configure_model(model_choice):
    """
    Load a registered model and its tokenizer, rebuilding the configuration
    from the tensor shapes stored in the checkpoint.

    The hyper-parameters (vocabulary size, widths, layer count, head count,
    NCN hidden size) are read from the weights themselves so that the
    instantiated architecture matches the checkpoint before the strict
    ``load_state_dict`` call. Results are memoised in ``MODEL_CACHE`` and
    ``TOKENIZER_CACHE``.

    Args:
        model_choice: Profile label; must be a key of ``_MODEL_REGISTRY``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is moved to ``device`` and
        put in eval mode.

    Raises:
        ValueError: If ``model_choice`` is not a registered profile.
        FileNotFoundError: If the weights file or the tokenizer file is missing.
        KeyError: If the checkpoint lacks one of the tensors the configuration
            is derived from.
    """
    if model_choice not in _MODEL_REGISTRY:
        raise ValueError("Selected model profile is not registered.")

    if model_choice in MODEL_CACHE:
        return MODEL_CACHE[model_choice], TOKENIZER_CACHE[model_choice]

    weights_path, tokenizer_path = _MODEL_REGISTRY[model_choice]
    if not (os.path.exists(weights_path) and os.path.exists(tokenizer_path)):
        raise FileNotFoundError("Model weight file or tokenizer file could not be located in the specified path.")

    # safetensors loading never executes pickled code.
    state_dict = load_file(weights_path)

    # Rebuild the hyper-parameters from the shapes of the stored tensors.
    vocab_size, d_model = state_dict["token_embeddings.weight"].shape
    max_position_embeddings = state_dict["position_embeddings.weight"].shape[0]
    dim_feedforward = state_dict["transformer_layers.0.feed_forward.linear1.weight"].shape[0]

    # Keys look like "transformer_layers.<index>.<...>"; count the distinct indices.
    layer_indices = {
        int(key.split(".")[1])
        for key in state_dict
        if key.startswith("transformer_layers.")
    }
    num_layers = len(layer_indices) if layer_indices else 12

    # Head count: derived from the NCN projection bias when it is present.
    nhead = _DEFAULT_NHEAD
    if "ncn.layer2.bias" in state_dict:
        nhead = _infer_nhead(state_dict["ncn.layer2.bias"].shape[0], num_layers)

    # NCN hidden width: taken from the first NCN layer when it is present.
    ncn_hidden_dim = 128
    if "ncn.layer1.weight" in state_dict:
        ncn_hidden_dim = state_dict["ncn.layer1.weight"].shape[0]

    config = NCNConfig(
        vocab_size=vocab_size,
        d_model=d_model,
        nhead=nhead,
        num_layers=num_layers,
        dim_feedforward=dim_feedforward,
        max_position_embeddings=max_position_embeddings,
        ncn_hidden_dim=ncn_hidden_dim,
    )

    # strict=True makes any remaining mismatch between config and checkpoint fail loudly.
    model = ModulatedLLM(config)
    model.load_state_dict(state_dict, strict=True)
    model.to(device)
    model.eval()

    # The tokenizer is built directly from the local JSON file.
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    MODEL_CACHE[model_choice] = model
    TOKENIZER_CACHE[model_choice] = tokenizer

    return model, tokenizer


def _apply_repetition_penalty(logits, token_ids, penalty):
    """Dampen, in place, the logits of every token id present in ``token_ids``.

    Negative logits are multiplied by ``penalty`` and non-negative ones are
    divided by it, so with ``penalty > 1`` an already-used token always
    becomes less likely.

    Args:
        logits: Tensor indexed as ``[batch, vocab]``; modified in place.
        token_ids: Integer tensor indexed as ``[batch, seq]`` with the tokens
            seen so far.
        penalty: Repetition penalty factor.

    Returns:
        The same ``logits`` tensor.
    """
    seen = torch.gather(logits, 1, token_ids)
    seen = torch.where(seen < 0, seen * penalty, seen / penalty)
    # Repeated ids all carry the same penalised value, so write order is irrelevant.
    logits.scatter_(1, token_ids, seen)
    return logits


def _filter_top_k(logits, top_k):
    """Return ``logits`` with everything below the k-th best score set to -inf."""
    # torch.topk raises if k exceeds the size of the dimension, so clamp to the vocabulary.
    k = min(int(top_k), logits.size(-1))
    if k <= 0:
        return logits
    kth_best = torch.topk(logits, k)[0][..., -1:]
    return logits.masked_fill(logits < kth_best, float("-inf"))


def _filter_top_p(logits, top_p):
    """Return ``logits`` restricted to the smallest set whose probability mass exceeds ``top_p``."""
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    drop_sorted = cumulative_probs > top_p
    # Shift right by one so the first token that crosses the threshold is kept,
    # and always keep the single most likely token.
    drop_sorted[..., 1:] = drop_sorted[..., :-1].clone()
    drop_sorted[..., 0] = False
    # Map the mask from sorted order back to vocabulary order.
    drop = drop_sorted.scatter(1, sorted_indices, drop_sorted)
    return logits.masked_fill(drop, float("-inf"))


def _sample_next_token(logits, temperature, top_k, top_p):
    """Pick the next token id (shape ``[batch, 1]``): greedy at temperature 0, otherwise sampled."""
    if temperature <= 0.0:
        return torch.argmax(logits, dim=-1, keepdim=True)

    scaled = logits / temperature
    if top_k > 0:
        scaled = _filter_top_k(scaled, top_k)
    if top_p < 1.0:
        scaled = _filter_top_p(scaled, top_p)
    return torch.multinomial(F.softmax(scaled, dim=-1), num_samples=1)


def _position_table_size(model):
    """Return the number of rows of ``model.position_embeddings.weight``, or None if absent."""
    table = getattr(model, "position_embeddings", None)
    weight = getattr(table, "weight", None)
    return None if weight is None else weight.shape[0]


@torch.no_grad()
def generate_text(model_choice, prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
    """
    Generate a continuation of ``prompt`` with an autoregressive sampling loop.

    After the first forward pass only the newest token is fed to the model,
    together with the cached key/value and RNN state it returned.

    Args:
        model_choice: Profile label passed to ``load_and_configure_model``.
        prompt: Text to continue. Empty or whitespace-only input is rejected.
        max_new_tokens: Upper bound on generated tokens (coerced to ``int``).
        temperature: Softmax temperature; ``0`` selects greedy decoding.
        top_p: Nucleus threshold; values ``>= 1.0`` disable the filter.
        top_k: Number of best tokens kept (coerced to ``int`` and clamped to
            the vocabulary size); values ``<= 0`` disable the filter.
        repetition_penalty: Penalty for tokens already present in the prompt
            or the output; ``1.0`` disables it.

    Returns:
        The decoded prompt plus continuation, or a human-readable message when
        the prompt is empty, the model cannot be loaded, or the prompt does
        not fit the model's position table.
    """
    if not prompt or not prompt.strip():
        return "Please enter a starting prompt to begin generating a story."

    try:
        model, tokenizer = load_and_configure_model(model_choice)
    except Exception as e:
        return f"Error loading model: {str(e)}"

    # UI sliders may deliver floats; range() and torch.topk need real ints.
    max_new_tokens = int(max_new_tokens)
    top_k = int(top_k)

    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    generated_ids = input_ids.clone()

    # Keep the sequence inside the learned position table.
    # NOTE(review): assumes the model indexes `position_embeddings` by absolute
    # token position (prompt + generated so far) - confirm against ModulatedLLM.
    max_positions = _position_table_size(model)
    if max_positions is not None:
        # Step i feeds a context of prompt_len + i tokens, which must not exceed the table.
        budget = max_positions - generated_ids.shape[1] + 1
        if budget <= 0:
            return (
                f"The prompt is too long for this model "
                f"({generated_ids.shape[1]} tokens; the limit is {max_positions}). Please shorten it."
            )
        max_new_tokens = min(max_new_tokens, budget)

    past_key_values = None
    past_rnn_state = None

    for _ in range(max_new_tokens):
        if past_key_values is None:
            # No cache yet: run the whole sequence through the model.
            step_input, rnn_state_in = generated_ids, None
        else:
            # Cache available: only the newest token needs processing.
            step_input, rnn_state_in = generated_ids[:, -1:], past_rnn_state

        outputs = model(
            input_ids=step_input,
            past_key_values=past_key_values,
            use_cache=True,
            past_rnn_state=rnn_state_in,
        )
        logits, past_key_values, _, past_rnn_state = outputs
        next_token_logits = logits[:, -1, :]

        if repetition_penalty != 1.0:
            _apply_repetition_penalty(next_token_logits, generated_ids, repetition_penalty)

        next_token = _sample_next_token(next_token_logits, temperature, top_k, top_p)
        generated_ids = torch.cat([generated_ids, next_token], dim=-1)

        # Stop as soon as the end-of-sequence token has been produced.
        if next_token.item() == tokenizer.eos_token_id:
            break

    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)


# --- GRADIO INTERFACE CONSTRUCTION ---
# Everything below runs at import time: it builds the `demo` Blocks app, wires the
# two buttons to their callbacks, and launches the app when run as a script.

# Custom stylesheet: hides the page footer and colours the two action buttons
# (matched through the `elem_classes` given to the buttons further down).
css = """
footer {visibility: hidden}
.primary-btn {background-color: #5c4ff2 !important; color: white !important;}
.clear-btn {background-color: #374151 !important; color: white !important;}
"""

with gr.Blocks(title="Michael Morgan Model Catalogue", css=css) as demo:
    gr.Markdown("# Michael Morgan Model Catalogue")
    gr.Markdown("Select a model, enter a story starter, and adjust the generation settings.")

    with gr.Row():
        # Left-hand column: model choice, prompt, sampling settings and action buttons
        with gr.Column(scale=1):
            # The selected label is passed to generate_text as `model_choice`.
            model_dropdown = gr.Dropdown(
                choices=["NCN 2M (TinyStories)"],
                value="NCN 2M (TinyStories)",
                label="Model",
                interactive=True
            )

            prompt_input = gr.Textbox(
                lines=5,
                placeholder="Type your story starter here...",
                label="Story starter"
            )

            # Sampling controls live in a collapsed accordion to keep the layout compact.
            # Each slider feeds the generate_text parameter of the same meaning.
            with gr.Accordion("Generation settings", open=False):
                max_tokens = gr.Slider(
                    minimum=1,
                    maximum=512,
                    value=128,
                    step=1,
                    label="Max new tokens"
                )
                # A temperature of 0 makes generate_text pick the arg-max token.
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.5,
                    value=0.7,
                    step=0.05,
                    label="Temperature (0 = greedy)"
                )
                # At 1.0 generate_text skips nucleus filtering.
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.8,
                    step=0.05,
                    label="Top-p"
                )
                # The minimum is 1, so top-k filtering cannot be switched off from the UI.
                top_k = gr.Slider(
                    minimum=1,
                    maximum=100,
                    value=25,
                    step=1,
                    label="Top-k"
                )
                # At 1.0 generate_text skips the repetition penalty.
                rep_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.1,
                    step=0.05,
                    label="Repetition penalty"
                )

            with gr.Row():
                clear_btn = gr.Button("Clear", elem_classes=["clear-btn"])
                generate_btn = gr.Button("Generate", variant="primary", elem_classes=["primary-btn"])

        # Right-hand column: read-only box that receives the generated text
        with gr.Column(scale=1):
            output_display = gr.Textbox(
                lines=12,
                placeholder="The generated story will appear here...",
                label="Generated story",
                interactive=False
            )

    # Ready-made story starters; each example targets the prompt textbox.
    gr.Markdown("### Try these examples")
    gr.Examples(
        examples=[
            ["Once upon a time, there was a little dragon who"],
            ["Lily found a tiny wooden key buried in the sand box. She wondered what"],
            ["One sunny morning, a big friendly dog named Max decided to"],
            ["Tom had a bright yellow balloon. When he let go of the string, the balloon"]
        ],
        inputs=prompt_input
    )

    # Technical footer details
    # NOTE(review): the text says "CPU-only", but the module-level `device` selects
    # CUDA whenever it is available - confirm which statement is intended.
    gr.Markdown(
        "Model: SupraLabs/NCN-2M-TinyStories | License: Apache 2.0 | CPU-only | © 2026 Michael Morgan"
    )

    # Event handlers
    # The order of `inputs` must match the positional parameters of generate_text.
    generate_btn.click(
        fn=generate_text,
        inputs=[model_dropdown, prompt_input, max_tokens, temperature, top_p, top_k, rep_penalty],
        outputs=output_display
    )

    # Clear resets both the prompt and the output box to empty strings.
    clear_btn.click(
        fn=lambda: ("", ""),
        inputs=None,
        outputs=[prompt_input, output_display]
    )

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()