import time
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Checkpoint identifier shared by the tokenizer and model loads below.
MODEL_PATH = "Janushi/FoodExtract-gemma-3-270m-fine-tune-v1"

# Tokenizer and weights both come from the same checkpoint; the two loads
# are independent of each other.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# "auto" leaves the dtype and the device placement to the library; the
# attention implementation is pinned explicitly to "eager".
loaded_model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    dtype="auto",
    device_map="auto",
    attn_implementation="eager",
)

# Single text-generation pipeline, built once at import time and reused for
# every request.
loaded_model_pipeline = pipeline(
    "text-generation",
    model=loaded_model,
    tokenizer=tokenizer,
)

def pred_on_text(input_text: str) -> tuple[str, list, float]:
    """Run the food/drink extraction pipeline on one piece of text.

    The text is wrapped as a single ``user`` chat message and passed to the
    module-level ``loaded_model_pipeline`` with a cap of 256 new tokens.

    Args:
        input_text: Text to extract food and drink items from.

    Returns:
        A 3-tuple ``(generated_text, raw_output, total_time)``:

        * ``generated_text`` -- ``content`` of the last message in the first
          result's ``generated_text`` conversation (the model's reply).
        * ``raw_output`` -- the unmodified object returned by the pipeline.
        * ``total_time`` -- duration of the pipeline call in seconds,
          rounded to 4 decimal places.
    """
    chat = [{"role": "user", "content": input_text}]

    # perf_counter() is monotonic and high resolution, so the measured
    # duration cannot be skewed (or go negative) if the system clock is
    # adjusted mid-call, which time.time() does not guarantee.
    start_time = time.perf_counter()
    raw_output = loaded_model_pipeline(
        text_inputs=chat,
        max_new_tokens=256,
        disable_compile=True,
    )
    total_time = round(time.perf_counter() - start_time, 4)

    # For chat input the result holds the conversation with the reply
    # appended; read the last message instead of hard-coding index 1 so this
    # keeps working if more messages (e.g. a system prompt) are prepended.
    generated_text = raw_output[0]["generated_text"][-1]["content"]
    return generated_text, raw_output, total_time

# Markdown-formatted blurb for the app, assembled line by line.
# The final empty entry preserves the trailing newline of the text.
description = "\n".join(
    (
        "Extract food and drink items from text using a fine-tuned Gemma-3-270M.",
        "Fine-tuned on mrdbourke/FoodExtract-1k dataset.",
        "",
        "**Input:** Any text or image caption",
        "**Output:** Structured food/drink extraction",
        "",
        "**Example:**",
        '- Input: "eggs, bacon and toast with orange juice"',
        "- Output: food_or_drink: 1, foods: eggs, bacon, toast, drinks: orange juice",
        "",
    )
)

# Single free-text input box.
_input_box = gr.TextArea(lines=4, label="Input Text")

# One output component per value returned by ``pred_on_text``, in order:
# generated text, raw pipeline output, elapsed seconds.
_output_boxes = [
    gr.TextArea(lines=4, label="Generated Text"),
    gr.TextArea(lines=7, label="Raw Output"),
    gr.Number(label="Generation Time (s)"),
]

# Sample prompts; each is wrapped in its own list because an example row
# carries one value per input component.
_example_rows = [
    [prompt]
    for prompt in (
        "A plate of grilled tofu, salad with avocado and tomatoes",
        "Indian breakfast with roti, tea and fried potatoes",
        "cheese tacos",
        "A photo of a dog sitting on a beach",
    )
]

# Gradio app object wiring the input box to ``pred_on_text`` and its outputs.
demo = gr.Interface(
    title="🍳 BiteSight — Food Extraction with Fine-Tuned Gemma-3-270M",
    description=description,
    fn=pred_on_text,
    inputs=_input_box,
    outputs=_output_boxes,
    examples=_example_rows,
)

if __name__ == "__main__":
    # Only start the server when run as a script; share is explicitly
    # turned off.
    demo.launch(share=False)