from time import sleep

import streamlit as st

# for GPU inference, uncomment the following line
# from unsloth import FastLanguageModel, is_bfloat16_supported
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

AI_MODE = "ON"

if AI_MODE == "ON":
    # model_id = "choco-conoz/TwinLlama-3.1-8B"
    model_id = "choco-conoz/TwinLlama-3.2-1B"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)
    # for GPU inference, uncomment the following line
    # model = FastLanguageModel.for_inference(model)
    processor = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=10,  # default only; overridden per call in main()
    )
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),  # Llama-3 end-of-turn token
    ]
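# A minimal sketch of the Unsloth GPU path (an assumption, not part of this
# app as written): it requires the `unsloth` package and a CUDA GPU, and the
# max_seq_length / 4-bit settings below are illustrative choices.
#
# from unsloth import FastLanguageModel
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name=model_id,
#     max_seq_length=2048,
#     dtype=None,          # auto-detects bfloat16 when supported
#     load_in_4bit=True,
# )
# FastLanguageModel.for_inference(model)  # enables Unsloth's fast inference mode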

def main():
    st.title('DEMO - SFT')
    st.subheader('Instruction/Response')
    st.markdown(
        '<div>produced by Conoz (https://www.conoz.com)</div>',
        unsafe_allow_html=True,
    )

    # Korean notice; the English translation follows below.
    st.markdown(
        '<div>'
        'basic space hardware에서 응답시간은 3분 정도 소요됩니다. '
        '영어, 한국어 등으로 질문할 수 있습니다.<br>'
        '코노즈에서 Llama-3.2-1B model을 SFT로 학습한 모델을 사용합니다. '
        '알파카 chat template을 사용합니다.<br>'
        '코노즈에서 Llama-3.1-8B 모델을 SFT로 학습한 모델로 사용할 수도 있지만 '
        'basic space hardware 에선 동작하지 않습니다.'
        '</div>',
        unsafe_allow_html=True,
    )
    st.markdown(
        '<div>'
        'Response time on basic space hardware takes about 3 minutes. '
        'You can ask questions in English, Korean, etc. '
        'This demo uses a model fine-tuned with SFT from Llama-3.2-1B by Conoz. '
        'It uses the Alpaca chat template.<br>'
        'A model fine-tuned with SFT from Llama-3.1-8B by Conoz is also available, '
        'but it does not work on the basic space hardware.'
        '</div>',
        unsafe_allow_html=True,
    )
    st.markdown('<br>', unsafe_allow_html=True)

    query = st.text_input(
        'Input your topic of interest (10 to 1000 characters).',
        placeholder='e.g. What is the capital of South Korea?',
    )

    alpaca_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}
"""

    if st.button("Send"):
        if not query:
            st.error("Please enter a query.")
            return
        if len(query) < 10:
            st.error("Please enter a query with at least 10 characters.")
            return
        if len(query) > 1000:
            st.error("Please enter a query with at most 1000 characters.")
            return

        with st.spinner("Generating response..."):
            user_prompt = alpaca_template.format(query, "")
            if AI_MODE == "ON":
                # For chat models, build the prompt from a message list instead:
                # messages = [{"role": "user", "content": query}]
                # user_prompt = tokenizer.apply_chat_template(
                #     messages, tokenize=False, add_generation_prompt=True)
                outputs = processor(
                    user_prompt,
                    max_new_tokens=256,
                    use_cache=True,
                    do_sample=True,
                    temperature=0.6,
                    top_p=0.9,
                    # eos_token_id=terminators,
                )
                # Drop the echoed prompt; keep only the newly generated text.
                response = outputs[0]["generated_text"][len(user_prompt):]
            else:
                sleep(3)
                response = "AI_MODE is OFF. Please turn it ON to get a response."

        st.subheader('Response:')
        st.write(response)


if __name__ == "__main__":
    main()
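# To try the app locally (assuming this file is saved as app.py, the usual
# entry point for a Streamlit-based Hugging Face Space):
#
#   streamlit run app.py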