import gradio as gr
import torch
from transformers import AutoTokenizer, TextIteratorStreamer, AutoModelForCausalLM, AutoConfig
import requests
import json
from peft import PeftModel
from threading import Thread
import os

# --- Configuration ---
# Hub repository that provides the tokenizer, the config, the model weights
# and the PEFT adapter loaded below.
BASE_MODEL_PATH = "algorythmtechnologies/zenith_coder_v1.1"
# Name of the environment variable for the Hugging Face token
HF_TOKEN_ENV_VAR = "HUGGING_FACE_HUB_TOKEN"

# --- Model Loading ---

# Get the Hugging Face token from environment variables
hf_token = os.environ.get(HF_TOKEN_ENV_VAR)

# Fail fast at import time: every Hub call below is given this token.
if not hf_token:
    raise ValueError(f"Environment variable {HF_TOKEN_ENV_VAR} not set. Please set it in your Space secrets.")

# Load the tokenizer from the Hub, using the token for private models.
# `token=` replaces the deprecated `use_auth_token=` keyword.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, token=hf_token)

# Load the config from the user's repo.
# NOTE(review): the model load below passes trust_remote_code=True but this
# call does not — confirm whether the repo's config needs custom code too.
config = AutoConfig.from_pretrained(BASE_MODEL_PATH, token=hf_token)

# Correct the base model path in the config
config._name_or_path = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"

# Load the base model from the Hub using the corrected config
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    config=config,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    token=hf_token,
)

# Move model to the appropriate device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model.to(device)

# The PEFT adapter is loaded from the same repository as the base weights and
# wrapped around the base model; eval() switches the result to inference mode.
model = PeftModel.from_pretrained(base_model, BASE_MODEL_PATH, token=hf_token)
model.eval()

# --- Web Search Function ---
def search(query, timeout=10):
    """Perform a web search using the Serper API.

    Args:
        query: Search terms. Surrounding whitespace is ignored.
        timeout: Seconds to wait for the Serper API before giving up, so a
            stalled connection cannot block the chat handler forever.

    Returns:
        A list with the entries of the response's ``organic`` field (empty
        when the query is blank or the field is missing), or a ``str``
        describing the problem when the API key is missing or the request
        fails. Callers tell the two cases apart by type.
    """
    serper_api_key = os.environ.get("SERPER_API_KEY")
    if not serper_api_key:
        return "SERPER_API_KEY not found. Please set it as an environment variable in your Hugging Face Space secrets."

    # Nothing to look up: skip the network round trip for a blank query.
    query = (query or "").strip()
    if not query:
        return []

    url = "https://google.serper.dev/search"
    headers = {
        'X-API-KEY': serper_api_key,
        'Content-Type': 'application/json'
    }
    try:
        response = requests.post(url, headers=headers, json={"q": query}, timeout=timeout)
        response.raise_for_status()
        results = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        return f"Error during web search: {e}"
    return results.get('organic', [])

# --- Response Generation ---
def _history_to_chat_messages(history):
    """Convert Gradio chat history into role/content dicts for the chat template.

    Accepts both (user, assistant) pairs and ready-made role/content dicts;
    ``None`` is treated as an empty history.
    """
    chat_messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            # Already role/content shaped: forward plain-text turns only.
            role = turn.get("role")
            content = turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str) and content:
                chat_messages.append({"role": role, "content": content})
            continue
        user_msg, assistant_msg = turn
        chat_messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            chat_messages.append({"role": "assistant", "content": assistant_msg})
    return chat_messages


def generate_response(message, history):
    """Stream a model response for the chat UI, with optional web search.

    A message starting with "search for " (case-insensitive) triggers a web
    search; the snippets of the first five results are prepended to the
    prompt as context.

    Args:
        message: The new user message.
        history: Previous turns, as (user, assistant) pairs or role/content
            dicts.

    Yields:
        The text generated so far, growing with every streamed chunk. For a
        failed or empty search, a single status message is yielded instead.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here once the stream has ended.
    """
    # Handle web search command
    search_prefix = "search for "
    if message.lower().startswith(search_prefix):
        search_query = message[len(search_prefix):]
        search_results = search(search_query)

        if isinstance(search_results, str):  # Error case
            yield search_results
            return

        if not search_results:
            yield "No search results found."
            return

        context = " ".join(res.get('snippet', '') for res in search_results[:5])
        # Prepend context to the user's message
        message = f"Based on the following search results, answer the user's query.\nContext: {context}\n\nUser Query: {message}"

    # Format chat history and new message using the tokenizer's chat template
    chat_messages = _history_to_chat_messages(history)
    chat_messages.append({"role": "user", "content": message})
    prompt = tokenizer.apply_chat_template(chat_messages, tokenize=False, add_generation_prompt=True)

    # The rendered template already carries any special tokens it needs;
    # adding them again here could duplicate them (e.g. a second BOS).
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
    generation_errors = []

    def _run_generation():
        # Runs in the worker thread. On failure the streamer must still be
        # closed, otherwise the consuming loop below would wait forever.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            generation_errors.append(exc)
            streamer.end()

    # Run generation in a separate thread so text can be yielded as it arrives
    thread = Thread(target=_run_generation)
    thread.start()

    # Yield generated text as it becomes available
    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        yield generated_text

    thread.join()
    if generation_errors:
        # Surface the worker's failure to the caller instead of ending silently.
        raise generation_errors[0]

# --- Gradio UI ---
# Page layout: a title, a short description, and the chat interface that
# streams answers from generate_response.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky", secondary_hue="blue")) as demo:
    gr.HTML("<h1 align='center'>Zenith V1.1 Coder</h1>")
    # The command is written as a code span: a bare <query> would be parsed
    # as an HTML tag and vanish from the rendered text.
    gr.Markdown("This Space is running [zenith_coder_v1.1](https://huggingface.co/algorythmtechnologies/zenith_coder_v1.1).<br>You can ask coding questions or use the `search for <query>` command to browse the web.")
    
    gr.ChatInterface(
        generate_response,
        chatbot=gr.Chatbot(
            height=600,
            avatar_images=(None, "https://i.imgur.com/9kAC4pG.png"),
            bubble_full_width=False,
        ),
        # The placeholder is plain text, so the angle brackets are shown as typed.
        textbox=gr.Textbox(
            placeholder="Ask me a question or type 'search for <your query>'...",
            container=False,
            scale=7,
        ),
        submit_btn="Send",
        retry_btn=None,
        undo_btn=None,
        clear_btn="Clear History",
    )

if __name__ == "__main__":
    # No token reminder here: a missing Hugging Face token already raises
    # ValueError while the module loads, so this point is never reached
    # without one.
    demo.launch(share=True)