File size: 4,787 Bytes
dea4bf5
 
 
 
 
 
 
 
5c3f18e
 
dea4bf5
 
 
 
bc4258e
 
 
 
 
 
 
 
dea4bf5
 
034f47f
dea4bf5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
034f47f
dea4bf5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c3f18e
 
dea4bf5
 
 
bc4258e
dea4bf5
bc4258e
dea4bf5
 
5c3f18e
dea4bf5
 
5c3f18e
 
dea4bf5
 
bc4258e
034f47f
 
 
 
 
 
 
bc4258e
dea4bf5
 
 
0e83ca4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc4258e
 
 
 
 
 
 
0e83ca4
 
 
 
 
 
bc4258e
 
0e83ca4
 
 
bc4258e
 
 
0e83ca4
 
dea4bf5
bc4258e
 
 
 
 
dea4bf5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import gradio as gr
import requests
from annif_client import AnnifClient
import os


# --- VLM (Visual Language Model) configuration ---
# Base URL of the OpenAI-compatible VLM service; required, fail fast at startup.
VLM_API_BASE_URL = os.getenv("VLM_API_BASE_URL")
if not VLM_API_BASE_URL:
    raise RuntimeError("VLM_API_BASE_URL environment variable must be set.")
# Optional API key; when non-empty it is sent as an "X-API-Key" header by get_caption().
VLM_API_KEY = os.getenv("VLM_API_KEY", "")
# Chat-completions endpoint (OpenAI schema) used for caption generation.
VLM_API_ENDPOINT = f"{VLM_API_BASE_URL}/v1/chat/completions"


# --- Annif configuration ---
# Optional override for the Annif REST API base URL; when unset, AnnifClient()
# falls back to its built-in default endpoint.
ANNIF_API_BASE_URL = os.getenv("ANNIF_API_BASE_URL")
if ANNIF_API_BASE_URL:
    # The client expects a versioned base ending in "v1/"; reject misconfiguration early.
    if not ANNIF_API_BASE_URL.endswith("v1/"):
        raise RuntimeError("ANNIF_API_BASE_URL should end with 'v1/'")
    annif = AnnifClient(api_base=ANNIF_API_BASE_URL)
else:
    annif = AnnifClient()


def get_caption(image, prompt):
    """Generate a caption for *image* by calling the VLM chat-completions API.

    Args:
        image: PIL image; it is re-encoded as a base64 JPEG data URL.
        prompt: Text instruction sent alongside the image.

    Returns:
        The caption string extracted from the model response.

    Raises:
        gr.Error: user-facing message when the VLM request fails for any
            reason (the underlying exception is printed for the admin).
    """
    import base64
    import io

    # Encode the image as base64 JPEG for the OpenAI vision message schema.
    buf = io.BytesIO()
    image.save(buf, format="JPEG")
    img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

    # Prepare payload for VLM (OpenAI chat-completions schema).
    payload = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"},
                    },
                ],
            }
        ],
        "max_tokens": 300,
    }
    headers = {"X-API-Key": VLM_API_KEY} if VLM_API_KEY else {}
    try:
        # Timeout keeps a stalled VLM backend from hanging the UI forever.
        response = requests.post(
            VLM_API_ENDPOINT, json=payload, headers=headers, timeout=120
        )
        response.raise_for_status()
        data = response.json()
        # OpenAI-compatible schema: caption text lives in the first choice.
        caption = data["choices"][0]["message"]["content"]
    except Exception as e:
        print(f"VLM API error: {e}")  # Detailed error for admin
        raise gr.Error("Sorry, there was a problem generating a caption.")
    return caption


def get_subjects(caption, project_id):
    """Suggest subjects for *caption* via the Annif API.

    Args:
        caption: Free text to index (typically the VLM-generated caption).
        project_id: Annif project identifier, e.g. "yso-en".

    Returns:
        Dict mapping subject label -> score; empty when Annif returns no
        suggestions (the dict comprehension yields {} naturally, so no
        separate empty-check is needed).

    Raises:
        gr.Error: user-facing message when the Annif call fails (the
            underlying exception is printed for the admin).
    """
    try:
        results = annif.suggest(project_id=project_id, text=caption)
        return {result["label"]: result["score"] for result in results}
    except Exception as e:
        print(f"Annif API error: {e}")  # Detailed error for admin
        raise gr.Error("Sorry, there was a problem getting subject suggestions.")


def process_image(image, project_id):
    """Run the full pipeline: caption *image*, then get subject suggestions.

    Returns a 3-tuple of (image, caption, subjects-dict) so callers can
    display all three stages.
    """
    alt_text_prompt = (
        "Generate an alt-text description, which is a description for people who can't see the image. "
        "Be sure to talk about the actual contents of it, do not interpret anything. "
        "Start with a general description, then focus on details. Answer only with the "
        "alt-text description, do not include 'Here's an alt-text description', explanations or subheadings."
    )
    caption = get_caption(image, alt_text_prompt)
    return image, caption, get_subjects(caption, project_id)


with gr.Blocks(title="VLM Caption & Annif Subject Demo") as demo:
    gr.Markdown("# VLM Caption & Annif Subject Demo")
    gr.Markdown(
        """
    **How it works:**
    1. Upload or take a photo in the input section below.
    2. The image is sent to a Visual Language Model to generate a caption.
    3. Annif suggests subjects based on the caption.
    """
    )
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input")
            image_input = gr.Image(
                type="pil", label="Image Input (upload or take a photo)"
            )
            project_dropdown = gr.Dropdown(
                choices=[("YSO", "yso-en"), ("YKL", "ykl-en")],
                value="yso-en",
                label="Annif Project",
                info="Select the vocabulary from where subject suggestions are drawn",
            )
            # Disabled until an image is present (see image_input.change below).
            submit_btn = gr.Button("Submit", interactive=False)
            clear_btn = gr.Button("Clear")
        with gr.Column():
            gr.Markdown("### Output")
            caption_output = gr.Textbox(label="Caption", lines=10, interactive=False)
            subjects_output = gr.Label(label="Subject Suggestions", show_heading=False)

    def run_app(image, project_id):
        # Drop the echoed image; only caption and subjects feed the outputs.
        caption, subjects = process_image(image, project_id)[1:]
        return caption, subjects

    submit_btn.click(
        run_app,
        inputs=[image_input, project_dropdown],
        outputs=[caption_output, subjects_output],
    )
    clear_btn.click(lambda: ("", {}), outputs=[caption_output, subjects_output])

    def update_submit_btn(img):
        # Enable Submit only while an image is present.
        return gr.update(interactive=img is not None)

    # Use `change`, not `upload`: `upload` fires only for file uploads, so
    # webcam captures (advertised in the UI) would never enable the button,
    # and clearing the image would leave it enabled. `change` covers upload,
    # webcam capture, and clear (img is None on clear).
    image_input.change(update_submit_btn, inputs=image_input, outputs=submit_btn)

# Launch the Gradio server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()