import torch
import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer

# Pick the GPU if one is available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model and tokenizer from Hugging Face.
# @st.cache_resource keeps them in memory across Streamlit reruns,
# so they are not reloaded on every widget interaction.
model_name = "khaledsayed1/llama_QA"  # Replace with your actual model if different

@st.cache_resource
def load_model_and_tokenizer(name):
    model = AutoModelForCausalLM.from_pretrained(name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(name)
    return model, tokenizer

model, tokenizer = load_model_and_tokenizer(model_name)

# Title of the web page
st.title("Question Answering using LLaMA Model")

# Description for the user
st.write("Enter your question below, and the model will generate an answer.")

# Sidebar for controlling model parameters
st.sidebar.header("Model Parameters")

# Get the user-controlled values from the sidebar.
# Temperature starts at 0.1 because sampling requires a strictly positive value.
temperature = st.sidebar.slider("Temperature", 0.1, 1.5, 0.7, 0.1)
top_k = st.sidebar.slider("Top-k", 1, 100, 50, 1)
top_p = st.sidebar.slider("Top-p (nucleus sampling)", 0.0, 1.0, 0.95, 0.01)
max_new_tokens = st.sidebar.slider("Max New Tokens", 1, 200, 128, 1)

# User input (question)
user_question = st.text_input("Your Question:", "")

# If a question is entered, process it and show the answer
if user_question:
    # Prompt template; the Arabic labels mean "Question:" and "Answer:"
    alpaca_prompt = """السؤال: {}
الإجابة: """
    formatted_prompt = alpaca_prompt.format(user_question)

    # Tokenize the input and move it to the model's device
    inputs = tokenizer([formatted_prompt], return_tensors="pt").to(device)

    # Generate the output using the model
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,  # Upper bound on the number of generated tokens
        do_sample=True,                 # Required for temperature/top-k/top-p to take effect
        temperature=temperature,        # Controls randomness of the output
        top_k=top_k,                    # Restrict sampling to the k most likely next tokens
        top_p=top_p,                    # Nucleus sampling: smallest token set with cumulative probability >= top_p
        use_cache=True,                 # Reuse cached key/value attention states for faster decoding
    )

    # Decode the generated tokens back to text
    decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Strip the prompt labels from the generated text
    clean_output = decoded_output[0].replace("السؤال:", "").replace("الإجابة:", "").strip()

    # Split the output into sentences so they can be shown as bullet points
    bullet_points = clean_output.split(".")

    # Display the model's answer as bullet points
    st.subheader("Model's Answer:")
    for point in bullet_points:
        # Skip empty fragments and an echo of the question itself
        if point.strip() and point.strip() != user_question:
            st.markdown(f"- {point.strip()}")  # Use markdown to render each bullet
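
# Usage note (the filename app.py is an assumption, not given in the source):
# launch the app from a terminal with
#
#   streamlit run app.py
#
# Streamlit re-executes this script from top to bottom on every widget
# interaction, which is why the model load above is wrapped in
# @st.cache_resource rather than executed at module level on each rerun.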