import gradio as gr
import torch
import numpy as np
import struct
import lzma
import json
from huggingface_hub import hf_hub_download
from transformers import T5Config, T5ForConditionalGeneration, AutoTokenizer

# Fetch the xz-compressed quantized checkpoint from the Hugging Face Hub;
# the returned value is later opened as a file path by the loader below.
model_path = hf_hub_download(
    repo_id="ag14850/Mosquito",
    filename="mosquito_tiny.bin.xz",
)

def unpack_nbits(data, bits, count):
    """Unpack ``count`` unsigned integers of ``bits`` bits each from ``data``.

    Supported layouts (one packed group -> values):

    * 8 bits: one byte per value.
    * 4 bits: one byte -> 2 values, high nibble first.
    * 6 bits: three bytes -> 4 values, read most-significant bit first.
    * 5 bits: five bytes -> 8 values, bytes taken as a little-endian
      integer with the first value in the lowest bits.
    * 7 bits: seven bytes -> 8 values, same little-endian scheme.

    Trailing bytes that do not form a complete group are ignored. If the
    buffer holds fewer than ``count`` values, the shorter array is returned.

    Args:
        data: Packed bytes (``bytes``, ``bytearray`` or ``memoryview``).
        bits: Width of each packed value; one of 4, 5, 6, 7 or 8.
        count: Maximum number of values to return.

    Returns:
        A 1-D ``np.uint8`` array with at most ``count`` elements. For
        ``bits == 8`` this is a view onto ``data``.

    Raises:
        ValueError: If ``bits`` is not a supported width. (Previously such
            widths silently produced an empty array.)
    """
    if bits == 8:
        return np.frombuffer(data, dtype=np.uint8)[:count]

    # bits -> (bytes per packed group, order in which bits are consumed).
    layouts = {
        4: (1, "big"),
        6: (3, "big"),
        5: (5, "little"),
        7: (7, "little"),
    }
    if bits not in layouts:
        raise ValueError(f"unsupported bit width: {bits}")
    group_bytes, bit_order = layouts[bits]

    # bytes() keeps accepting any iterable of ints, as the old loop did.
    raw = np.frombuffer(bytes(data), dtype=np.uint8)

    # Only complete groups are decoded; a partial trailing group is dropped.
    usable = (raw.size // group_bytes) * group_bytes
    if usable == 0:
        return np.empty(0, dtype=np.uint8)

    # Flatten to a bit stream in the layout's order, one row per value.
    bit_rows = np.unpackbits(raw[:usable], bitorder=bit_order).reshape(-1, bits)

    # Column weights: 1, 2, 4, ... for little order, reversed for big order.
    weights = (1 << np.arange(bits)).astype(np.uint8)
    if bit_order == "big":
        weights = weights[::-1]

    # Each value is < 2**bits <= 128, so the uint8 dot product cannot overflow.
    values = bit_rows @ weights
    return values[:count]

def load_quantized_model(path):
    """Rebuild a T5 model from an xz-compressed quantized checkpoint.

    File layout as read here (all integers little-endian):

    * header: two ``u8`` fields (unpacked but not used) and a ``u16``
      tensor count;
    * per tensor: ``u16`` name length, UTF-8 name, ``u8`` rank, one ``u32``
      per dimension, ``u8`` bit width, then either
      ``f32 scale, f32 zero-point, u32 byte length, packed values`` when the
      bit width is below 16, or ``u32 byte length, float16 values`` otherwise;
    * trailer: ``u32`` length followed by the model config as UTF-8 JSON.

    Args:
        path: Location of the ``.xz`` checkpoint file.

    Returns:
        A ``T5ForConditionalGeneration`` in eval mode with float32 weights.
    """
    with lzma.open(path, 'rb') as fh:
        blob = fh.read()

    # The first two header bytes are read only to step past them.
    _version, _default_bits, tensor_count = struct.unpack_from('<BBH', blob, 0)
    pos = 4

    weights = {}

    for _ in range(tensor_count):
        # Tensor name.
        (name_len,) = struct.unpack_from('<H', blob, pos)
        pos += 2
        name_end = pos + name_len
        name = blob[pos:name_end].decode('utf-8')
        pos = name_end

        # Rank followed by that many u32 dimensions.
        (ndim,) = struct.unpack_from('<B', blob, pos)
        pos += 1
        shape = struct.unpack_from(f'<{ndim}I', blob, pos)
        pos += 4 * ndim
        element_count = 1
        if shape:
            element_count = int(np.prod(shape))

        (bits,) = struct.unpack_from('<B', blob, pos)
        pos += 1

        if bits >= 16:
            # Stored as raw float16; widen to float32.
            (byte_len,) = struct.unpack_from('<I', blob, pos)
            pos += 4
            raw = blob[pos:pos + byte_len]
            pos += byte_len

            half = np.frombuffer(raw, dtype=np.float16).reshape(shape)
            weights[name] = torch.from_numpy(half.astype(np.float32))
            continue

        # Quantized tensor: affine parameters, then the packed payload.
        scale, zp = struct.unpack_from('<ff', blob, pos)
        pos += 8
        (byte_len,) = struct.unpack_from('<I', blob, pos)
        pos += 4
        payload = blob[pos:pos + byte_len]
        pos += byte_len

        levels = unpack_nbits(payload, bits, element_count)
        # Dequantize: (q - zero_point) * scale.
        restored = ((levels.astype(np.float32) - zp) * scale).reshape(shape)
        weights[name] = torch.from_numpy(restored)

    # Trailing JSON config describes the architecture to instantiate.
    (config_len,) = struct.unpack_from('<I', blob, pos)
    pos += 4
    config_dict = json.loads(blob[pos:pos + config_len].decode('utf-8'))

    model = T5ForConditionalGeneration(T5Config.from_dict(config_dict))
    model.load_state_dict(weights)
    model.eval()

    return model

# Decode the downloaded checkpoint into a model, then load the
# google/t5-v1_1-base tokenizer used to encode questions and decode answers.
model = load_quantized_model(model_path)
tokenizer = AutoTokenizer.from_pretrained(
    "google/t5-v1_1-base",
    legacy=False,
)

def ask(question):
    """Generate an answer string for ``question`` with the loaded model.

    The question is wrapped in a ``question: ...`` prompt, tokenized with
    truncation at 128 tokens, and decoded with 6-beam search limited to 24
    new tokens. Special tokens are stripped from the returned text.
    """
    prompt = f"question: {question}"
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=128,
        truncation=True,
    )

    # Decoding settings, kept together so they are easy to see at a glance.
    generation_settings = {
        "max_new_tokens": 24,
        "num_beams": 6,
        "no_repeat_ngram_size": 2,
        "repetition_penalty": 20.0,
        "early_stopping": True,
    }
    generated = model.generate(**encoded, **generation_settings)

    # Only the first returned sequence is decoded.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Markdown shown in the UI: a heading plus a small table of example
# question/answer pairs. The empty first and last entries reproduce the
# leading and trailing newline of the rendered block.
sample_qa = "\n".join(
    (
        "",
        "## 📊 Sample Questions & Answers",
        "",
        "| Question | Answer |",
        "|----------|--------|",
        "| How do vaccines work? | Vaccines stimulate the immune system to recognize and fight specific pathogens. |",
        "| Why do we sneeze? | Sneezes clear irritants from the nasal passages. |",
        "| What is empathy? | Empathy is the ability to understand and share the feelings of another person. |",
        "",
    )
)

# Example questions offered through gr.Examples; each inner list is one
# example row feeding the single question input.
_EXAMPLE_QUESTIONS = [
    ["How do vaccines work?"],
    ["Why do we sneeze?"],
    ["What is empathy?"],
    ["Why is the sky blue?"],
    ["What causes earthquakes?"],
]

with gr.Blocks() as demo:
    # Title and one-line description.
    gr.Markdown("# 🦟 Mosquito - Tiny Knowledge Model")
    gr.Markdown(
        "A **7.3M parameter** model that answers general knowledge questions. Smaller than a mosquito's brain!"
    )

    # Static table of sample questions and answers.
    gr.Markdown(sample_qa)

    gr.Markdown("---")
    gr.Markdown("## Try it yourself:")

    # Question input and answer output share one row.
    with gr.Row():
        question = gr.Textbox(
            label="Question",
            placeholder="Why do we dream?",
        )
        answer = gr.Textbox(label="Answer")

    # Clicking the button sends the question text through ask().
    submit_btn = gr.Button("Ask", variant="primary")
    submit_btn.click(
        fn=ask,
        inputs=question,
        outputs=answer,
    )

    gr.Examples(examples=_EXAMPLE_QUESTIONS, inputs=question)

demo.launch()