import os
import threading

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama


# Deployment settings. Each one can be overridden through an environment
# variable of the same name; the literal below is the fallback.
MODEL_REPO_ID = os.environ.get("MODEL_REPO_ID", "EREN121232/MAJESTIC-FIN-R1-gguf")
MODEL_FILENAME = os.environ.get("MODEL_FILENAME", "MAJESTIC-FIN-R1-Q8_0.gguf")
MODEL_LABEL = os.environ.get("MODEL_LABEL", "MAJESTIC-FIN-R1 Q8_0")

# Context window size passed to the model as n_ctx.
N_CTX = int(os.environ.get("N_CTX", "4096"))

# Thread count lookup order: CPU_CORES, then N_THREADS, then the detected
# CPU count (2 when os.cpu_count() cannot determine it).
N_THREADS = int(
    os.environ.get(
        "CPU_CORES",
        os.environ.get("N_THREADS", str(os.cpu_count() or 2)),
    )
)

# Lazily created model instance; get_model() fills it in on first use.
_MODEL = None
# Guards creation of _MODEL.
_MODEL_LOCK = threading.Lock()
# Held around each chat completion call so only one runs at a time.
_INFER_LOCK = threading.Lock()


def get_model() -> Llama:
    """Return the shared ``Llama`` instance, creating it on the first call.

    The first caller resolves the model file with ``hf_hub_download`` using
    ``MODEL_REPO_ID`` / ``MODEL_FILENAME`` and builds the ``Llama`` object
    from ``N_CTX`` and ``N_THREADS`` with ``n_gpu_layers=0``. The whole
    check-and-create sequence runs while holding ``_MODEL_LOCK``, so
    concurrent callers wait and then receive the same instance.
    """
    global _MODEL
    with _MODEL_LOCK:
        # Fast exit once the model has been built by an earlier call.
        if _MODEL is not None:
            return _MODEL

        gguf_path = hf_hub_download(
            filename=MODEL_FILENAME,
            repo_id=MODEL_REPO_ID,
        )
        llama_options = {
            "n_ctx": N_CTX,
            "n_threads": N_THREADS,
            "n_gpu_layers": 0,
            "verbose": False,
        }
        _MODEL = Llama(model_path=gguf_path, **llama_options)
        return _MODEL


def generate(prompt: str, system_prompt: str, temperature: float, max_tokens: int, top_p: float, repeat_penalty: float) -> str:
    """Run one chat completion and return the assistant's reply text.

    Args:
        prompt: User message. Surrounding whitespace is stripped; an empty
            or missing prompt short-circuits with a hint message and the
            model is never loaded.
        system_prompt: Optional system message. Skipped when empty or
            missing after stripping.
        temperature: Sampling temperature, coerced to ``float``.
        max_tokens: Completion token limit, coerced to ``int``.
        top_p: Nucleus sampling threshold, coerced to ``float``.
        repeat_penalty: Repetition penalty, coerced to ``float``.

    Returns:
        The stripped content of the first choice, or the string
        ``"Please enter a prompt."`` when no prompt was supplied. An empty
        string is returned if the completion carries no content.
    """
    # A caller may pass None for a text field (e.g. a null in an API
    # request); treat that like an empty string instead of raising
    # AttributeError on .strip().
    prompt = (prompt or "").strip()
    system_prompt = (system_prompt or "").strip()

    if not prompt:
        return "Please enter a prompt."

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    llm = get_model()
    # Only one completion runs at a time on the shared model instance.
    with _INFER_LOCK:
        response = llm.create_chat_completion(
            messages=messages,
            temperature=float(temperature),
            max_tokens=int(max_tokens),
            top_p=float(top_p),
            repeat_penalty=float(repeat_penalty),
        )

    # NOTE(review): the message content is assumed to be possibly None in
    # some responses; guard so .strip() cannot fail on it.
    content = response["choices"][0]["message"]["content"]
    return (content or "").strip()


# UI definition. Components are created in the same order as before, since
# creation order inside the Blocks context determines the page layout.
with gr.Blocks(title="MAJESTIC FIN R1 Free API") as demo:
    gr.Markdown(
        f"""
        # MAJESTIC FIN R1 Free API

        Public CPU deployment for `{MODEL_LABEL}` backed by `llama-cpp-python`.
        The API endpoint name is `/chat`.
        """
    )

    # Main input and output boxes.
    prompt = gr.Textbox(
        lines=8,
        label="Prompt",
        placeholder="Ask about finance, markets, accounting, or your fine-tuned task.",
    )
    output = gr.Textbox(lines=14, label="Response")

    # Sampling controls, collapsed by default.
    with gr.Accordion("Generation Settings", open=False):
        system_prompt = gr.Textbox(
            lines=4,
            label="System Prompt",
            value="You are MAJESTIC-FIN-R1, a helpful finance-focused assistant.",
        )
        temperature = gr.Slider(
            minimum=0.0, maximum=1.5, step=0.05, value=0.7, label="Temperature"
        )
        max_tokens = gr.Slider(
            minimum=64, maximum=1024, step=32, value=256, label="Max Tokens"
        )
        top_p = gr.Slider(
            minimum=0.1, maximum=1.0, step=0.05, value=0.95, label="Top P"
        )
        repeat_penalty = gr.Slider(
            minimum=1.0, maximum=1.5, step=0.05, value=1.1, label="Repeat Penalty"
        )

    run_button = gr.Button("Generate", variant="primary")

    # Clickable sample prompts that fill the prompt box.
    gr.Examples(
        inputs=prompt,
        examples=[
            ["Summarize the key risks in a company's balance sheet."],
            ["Explain EBITDA vs free cash flow in simple terms."],
            ["Give a short market outlook for a cautious investor."],
        ],
    )

    # Both triggers call generate() with the same arguments, in the order of
    # its positional parameters.
    _generation_inputs = [prompt, system_prompt, temperature, max_tokens, top_p, repeat_penalty]
    _listener_options = {
        "fn": generate,
        "outputs": output,
        "show_progress": "minimal",
        "concurrency_limit": 1,
    }

    # Button click: this listener is the one registered under api_name "chat".
    run_button.click(
        inputs=list(_generation_inputs),
        api_name="chat",
        **_listener_options,
    )

    # Submitting from the prompt box runs the same handler.
    prompt.submit(
        inputs=list(_generation_inputs),
        **_listener_options,
    )


if __name__ == "__main__":
    # Enable the request queue (at most 16 waiting requests), then start the server.
    demo.queue(max_size=16)
    demo.launch()