"""Sheikh AI — a Gradio chat app serving TinyLlama (GGUF) via llama-cpp.

On first run the quantized GGUF weights are downloaded from Hugging Face;
afterwards the cached local file is reused. Inference runs on CPU through
llama.cpp and responses are streamed token-by-token into the chat UI.
"""

import os
import urllib.request
from collections.abc import Iterator

import gradio as gr
from llama_cpp import Llama

# Download the GGUF weights from Hugging Face if not already present locally.
GGUF_URL = (
    "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
    "/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
)
MODEL_FILENAME = "TinyLlama-1.1B-Chat.Q4_K_M.gguf"

if not os.path.exists(MODEL_FILENAME):
    print(f"Downloading model from Hugging Face: {GGUF_URL}")
    urllib.request.urlretrieve(GGUF_URL, MODEL_FILENAME)
    print("Download complete!")

# Load the GGUF model with llama-cpp; use every available CPU core.
llm = Llama(model_path=MODEL_FILENAME, n_ctx=4096, n_threads=os.cpu_count())

DESCRIPTION = "# Sheikh AI — TinyLlama (GGUF from HF)"
DESCRIPTION += "\n\nNote: Running on CPU with GGUF — downloaded automatically.\n"

MAX_NEW_TOKENS = 1024


def format_conversation(system_prompt: str, chat_history: list[dict], user_input: str) -> str:
    """Render the system prompt, prior turns, and the new user message
    into the TinyLlama/Zephyr chat-template string expected by the model.

    Args:
        system_prompt: Instructions injected as the ``<|system|>`` turn.
        chat_history: Prior turns as ``{"role": ..., "content": ...}`` dicts
            (Gradio "messages" format); roles other than user/assistant are
            silently skipped.
        user_input: The message being answered now.

    Returns:
        The full prompt, ending with an open ``<|assistant|>`` tag so the
        model continues as the assistant.
    """
    chat = f"<|system|>\n{system_prompt.strip()}\n"
    for turn in chat_history:
        if turn["role"] == "user":
            chat += f"<|user|>\n{turn['content'].strip()}\n"
        elif turn["role"] == "assistant":
            chat += f"<|assistant|>\n{turn['content'].strip()}\n"
    chat += f"<|user|>\n{user_input.strip()}\n<|assistant|>\n"
    return chat


def generate(
    message: str,
    chat_history: list[dict],
    max_new_tokens: int = MAX_NEW_TOKENS,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repeat_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream a model reply for *message*, yielding the growing partial text.

    Each yielded value is the full response so far (Gradio replaces the
    bubble content on every yield rather than appending).
    """
    system_prompt = (
        "You are SheikhGPT, a wise Islamic scholar AI. You respond only to Islamic-related questions "
        "based on the Qur'an, Hadith, and the understanding of classical scholars. Do not answer "
        "questions unrelated to Islam. Speak humbly, respectfully, and provide sources when possible."
    )
    prompt = format_conversation(system_prompt, chat_history, message)

    stream = llm(
        prompt,
        max_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repeat_penalty=repeat_penalty,
        # TinyLlama-Chat ends each turn with the </s> EOS token; stopping on
        # an empty string (the previous value) never terminates generation
        # at the intended boundary.
        stop=["</s>"],
        stream=True,
    )

    partial = ""
    for chunk in stream:
        partial += chunk["choices"][0]["text"]
        yield partial


demo = gr.ChatInterface(
    fn=generate,
    # generate() reads chat_history as role/content dicts, so the interface
    # must use the "messages" history format (the default is tuple pairs).
    type="messages",
    additional_inputs=[
        gr.Slider(label="Max new tokens", minimum=32, maximum=2048, value=MAX_NEW_TOKENS, step=32),
        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=0.6, step=0.1),
        gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, value=0.9, step=0.05),
        gr.Slider(label="Top-k", minimum=1, maximum=100, value=50, step=1),
        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, value=1.2, step=0.05),
    ],
    examples=[
        ["What are the five pillars of Islam?"],
        ["Is it allowed to pray in shoes?"],
        ["Explain the meaning of Surah Al-Fatiha."],
        ["Is music haram according to Islamic scholars?"],
        ["Can I make up missed fasts after Ramadan?"],
    ],
    description=DESCRIPTION,
    css_paths="style.css",
)

if __name__ == "__main__":
    demo.launch()