import streamlit as st
import torch
from transformers import BitsAndBytesConfig  # unused here; kept for optional quantized loading
from llama_index.llms.huggingface import HuggingFaceLLM


# Function to convert messages to prompt
def messages_to_prompt(messages):
    """Convert a sequence of chat messages into a Zephyr-format prompt string.

    Zephyr-7B expects each turn to be tagged with a role marker
    (``<|system|>``, ``<|user|>``, ``<|assistant|>``) and terminated with
    ``</s>``. These tokens had been stripped from the original file (every
    role branch produced the same bare-newline string and the system-prompt
    guard was trivially true), which broke the chat template; they are
    restored here.

    Args:
        messages: iterable of objects with ``.role`` and ``.content``
            attributes (llama_index ``ChatMessage``-like).

    Returns:
        A single prompt string ending with ``<|assistant|>\\n`` so the model
        generates the next assistant turn.
    """
    prompt = ""
    for message in messages:
        if message.role == 'system':
            prompt += f"<|system|>\n{message.content}</s>\n"
        elif message.role == 'user':
            prompt += f"<|user|>\n{message.content}</s>\n"
        elif message.role == 'assistant':
            prompt += f"<|assistant|>\n{message.content}</s>\n"

    # ensure we start with a system prompt, insert blank if needed
    if not prompt.startswith("<|system|>\n"):
        prompt = "<|system|>\n</s>\n" + prompt

    # add final assistant prompt
    prompt = prompt + "<|assistant|>\n"
    return prompt


# Function to convert completion to prompt
def completion_to_prompt(completion):
    """Wrap a raw completion request in the Zephyr chat template.

    Uses an empty system turn, puts *completion* in the user turn, and ends
    with the assistant tag so the model continues as the assistant.
    """
    return f"<|system|>\n</s>\n<|user|>\n{completion}</s>\n<|assistant|>\n"


# Load the LLM without quantization
@st.cache_resource
def load_llm():
    """Load and cache the Zephyr-7B model (CPU, no quantization).

    ``st.cache_resource`` ensures the model is loaded once per Streamlit
    session rather than on every script rerun.
    """
    return HuggingFaceLLM(
        model_name="HuggingFaceH4/zephyr-7b-beta",
        tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
        context_window=3900,
        max_new_tokens=256,
        generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        device_map="cpu"  # Use CPU
    )


llm = load_llm()

# Streamlit app interface
st.title("LLM Text Generation App")

# Text input for the prompt
user_input = st.text_area("Enter your prompt:", "")

# Button to generate response
if st.button("Generate Response"):
    if user_input.strip() != "":
        # Generate response based on the prompt
        with st.spinner("Generating response..."):
            response = llm.complete(user_input)
        # Display the generated response
        st.write("Generated Response:")
        st.write(str(response))
    else:
        st.warning("Please enter a valid prompt.")