from time import sleep

import streamlit as st

# for GPU inference, uncomment the following line
# from unsloth import FastLanguageModel, is_bfloat16_supported
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

AI_MODE = "ON"

if AI_MODE == "ON":
    # model_id = "choco-conoz/TwinLlama-3.1-8B"
    model_id = "choco-conoz/TwinLlama-3.2-1B"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)
    # for GPU inference, uncomment the following line
    # model = FastLanguageModel.for_inference(model)
    processor = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=10,  # default only; overridden per call in main()
    )
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),  # Llama-3 end-of-turn token
    ]
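# A minimal sketch of the Unsloth GPU path (an assumption, not part of this
# app as written): it requires the `unsloth` package and a CUDA GPU, and the
# max_seq_length / 4-bit settings below are illustrative choices.
#
# from unsloth import FastLanguageModel
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name=model_id,
#     max_seq_length=2048,
#     dtype=None,          # auto-detects bfloat16 when supported
#     load_in_4bit=True,
# )
# FastLanguageModel.for_inference(model)  # enables Unsloth's fast inference mode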

def main():
    st.title('DEMO - SFT')
    st.subheader('Instruction/Response')
    st.markdown(
        '<div>produced by Conoz (https://www.conoz.com)</div>',
        unsafe_allow_html=True,
    )

    # Korean notice; the English translation follows below.
    st.markdown(
        '<div>'
        'basic space hardware에서 응답시간은 3분 정도 소요됩니다. '
        '영어, 한국어 등으로 질문할 수 있습니다.<br>'
        '코노즈에서 Llama-3.2-1B model을 SFT로 학습한 모델을 사용합니다. '
        '알파카 chat template을 사용합니다.<br>'
        '코노즈에서 Llama-3.1-8B 모델을 SFT로 학습한 모델로 사용할 수도 있지만 '
        'basic space hardware 에선 동작하지 않습니다.'
        '</div>',
        unsafe_allow_html=True,
    )
    st.markdown(
        '<div>'
        'Response time on basic space hardware takes about 3 minutes. '
        'You can ask questions in English, Korean, etc. '
        'This demo uses a model fine-tuned with SFT from Llama-3.2-1B by Conoz. '
        'It uses the Alpaca chat template.<br>'
        'A model fine-tuned with SFT from Llama-3.1-8B by Conoz is also available, '
        'but it does not work on the basic space hardware.'
        '</div>',
        unsafe_allow_html=True,
    )
    st.markdown('<br>', unsafe_allow_html=True)

    query = st.text_input(
        'Input your topic of interest (10 to 1000 characters).',
        placeholder='e.g. What is the capital of South Korea?',
    )

    alpaca_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}
"""

    if st.button("Send"):
        if not query:
            st.error("Please enter a query.")
            return
        if len(query) < 10:
            st.error("Please enter a query with at least 10 characters.")
            return
        if len(query) > 1000:
            st.error("Please enter a query with at most 1000 characters.")
            return

        with st.spinner("Generating response..."):
            user_prompt = alpaca_template.format(query, "")
            if AI_MODE == "ON":
                # For chat models, build the prompt from a message list instead:
                # messages = [{"role": "user", "content": query}]
                # user_prompt = tokenizer.apply_chat_template(
                #     messages, tokenize=False, add_generation_prompt=True)
                outputs = processor(
                    user_prompt,
                    max_new_tokens=256,
                    use_cache=True,
                    do_sample=True,
                    temperature=0.6,
                    top_p=0.9,
                    # eos_token_id=terminators,
                )
                # Drop the echoed prompt; keep only the newly generated text.
                response = outputs[0]["generated_text"][len(user_prompt):]
            else:
                sleep(3)
                response = "AI_MODE is OFF. Please turn it ON to get a response."

        st.subheader('Response:')
        st.write(response)


if __name__ == "__main__":
    main()
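# To try the app locally (assuming this file is saved as app.py, the usual
# entry point for a Streamlit-based Hugging Face Space):
#
#   streamlit run app.py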