"""Gradio demo for Meta-Llama-3-8B-Instruct via the transformers pipeline."""

import os

import gradio as gr
from transformers import pipeline

# Hugging Face access token (the Llama 3 repo is gated, so this is required).
token = os.getenv("HF_TOKEN")

# Use a pipeline as a high-level helper. device_map="auto" places the model
# on GPU when available; torch_dtype="auto" picks the checkpoint's dtype.
pipe = pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    token=token,
    torch_dtype="auto",
    device_map="auto",
)


def generate_response(prompt):
    """Generate a single-turn chat completion for *prompt*.

    Args:
        prompt: The user's input text.

    Returns:
        The assistant's reply string, or a fallback message if no
        assistant turn is found in the pipeline output.
    """
    messages = [{"role": "user", "content": prompt}]
    # do_sample=True is required for temperature to take effect; without it,
    # recent transformers versions ignore the temperature argument.
    response = pipe(
        messages,
        max_new_tokens=160,
        temperature=0.7,
        do_sample=True,
    )
    # For chat-style input, generated_text is the full message list
    # (user turn + generated turns). The newly generated assistant reply
    # is the LAST assistant message, so scan in reverse.
    for msg in reversed(response[0]["generated_text"]):
        if isinstance(msg, dict) and msg.get("role") == "assistant":
            return msg.get("content")
    return "No assistant response found."


if __name__ == "__main__":
    # Guarded so importing this module does not start the web server.
    gr.Interface(
        fn=generate_response,
        inputs=gr.Textbox(lines=4, label="Prompt"),
        outputs=gr.Textbox(label="Generated Response"),
        title="Meta LLaMA 3 8B Instruct",
        description="Gradio demo for Meta-Llama-3-8B-Instruct using Hugging Face Transformers pipeline",
    ).launch()