Spaces:

NatLibFi
/

Caption-Annif-Demo

Running

File size: 5,202 Bytes

dea4bf5
 
 
 
 
 
 
 
5c3f18e
 
dea4bf5
 
 
 
bc4258e
 
 
 
 
 
 
 
dea4bf5
 
034f47f
dea4bf5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
034f47f
dea4bf5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c3f18e
 
dea4bf5
 
 
bc4258e
dea4bf5
bc4258e
dea4bf5
 
5c3f18e
dea4bf5
 
5c3f18e
 
dea4bf5
 
bc4258e
034f47f
0a5f395
 
 
 
 
 
034f47f
 
bc4258e
dea4bf5
 
 
e7c842f
 
0e83ca4
 
 
 
 
 
 
 
 
 
 
 
3b58ccc
0e83ca4
bc4258e
5e22b19
 
 
 
0a5f395
bc4258e
 
 
 
0e83ca4
 
 
 
 
 
bc4258e
 
0e83ca4
 
 
bc4258e
 
 
0e83ca4
 
dea4bf5
bc4258e
 
 
 
 
dea4bf5

import gradio as gr
import requests
from annif_client import AnnifClient
import os


# VLM configuration comes from the environment: the base URL is mandatory,
# the API key is optional (empty string means "send no key header").
VLM_API_BASE_URL = os.getenv("VLM_API_BASE_URL")
if not VLM_API_BASE_URL:
    raise RuntimeError("VLM_API_BASE_URL environment variable must be set.")
VLM_API_KEY = os.getenv("VLM_API_KEY", "")
# Strip trailing slashes so that a base URL such as "http://host/" does not
# produce "http://host//v1/chat/completions".
VLM_API_ENDPOINT = f"{VLM_API_BASE_URL.rstrip('/')}/v1/chat/completions"


# Annif API base URL is optional; without it AnnifClient() is constructed
# with its own default. When given, it is required to end with "v1/".
ANNIF_API_BASE_URL = os.getenv("ANNIF_API_BASE_URL")
if ANNIF_API_BASE_URL:
    if not ANNIF_API_BASE_URL.endswith("v1/"):
        raise RuntimeError("ANNIF_API_BASE_URL should end with 'v1/'")
    annif = AnnifClient(api_base=ANNIF_API_BASE_URL)
else:
    annif = AnnifClient()


def get_caption(image, prompt, timeout=(10, 120)):
    """Generate a caption for an image via the VLM chat-completions endpoint.

    The image is encoded as a base64 JPEG data URL and sent together with
    the prompt as a single user message (OpenAI chat schema).

    Args:
        image: Image object exposing ``mode``, ``convert()`` and ``save()``;
            the Gradio input in this file is configured with ``type="pil"``.
        prompt: Instruction text sent alongside the image.
        timeout: ``requests`` timeout as ``(connect, read)`` seconds, so a
            stalled backend cannot block the worker indefinitely.

    Returns:
        The caption text from ``choices[0].message.content`` of the response.

    Raises:
        gr.Error: If no image is given, or if encoding the image, calling
            the API, or parsing the response fails. The underlying error is
            printed for the admin and chained as the cause.
    """
    import base64
    import io

    if image is None:
        # Without this guard the failure would be an AttributeError traceback.
        raise gr.Error("Please provide an image first.")

    try:
        # JPEG cannot store alpha channels or palettes; convert such images
        # (e.g. RGBA or P mode PNG uploads) instead of failing in save().
        if image.mode not in ("RGB", "L"):
            image = image.convert("RGB")
        buf = io.BytesIO()
        image.save(buf, format="JPEG")
        img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

        # Prepare payload for VLM (OpenAI schema)
        payload = {
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"},
                        },
                    ],
                }
            ],
            "max_tokens": 300,
        }
        # Only send the key header when a key is configured.
        headers = {"X-API-Key": VLM_API_KEY} if VLM_API_KEY else {}

        response = requests.post(
            VLM_API_ENDPOINT, json=payload, headers=headers, timeout=timeout
        )
        response.raise_for_status()
        data = response.json()
        # Assume caption is in data['choices'][0]['message']['content']
        caption = data["choices"][0]["message"]["content"]
    except Exception as e:
        print(f"VLM API error: {e}")  # Detailed error for admin
        raise gr.Error("Sorry, there was a problem generating a caption.") from e
    return caption


def get_subjects(caption, project_id):
    """Ask Annif for subject suggestions matching a caption.

    Args:
        caption: Text passed to Annif as the document to index.
        project_id: Identifier of the Annif project to query.

    Returns:
        A dict mapping each suggestion's ``"label"`` to its ``"score"``;
        empty when Annif returns no suggestions.

    Raises:
        gr.Error: If the Annif call or reading its results fails. The
            underlying error is printed for the admin.
    """
    try:
        suggestions = annif.suggest(project_id=project_id, text=caption)
        scores_by_label = {}
        for suggestion in suggestions:
            scores_by_label[suggestion["label"]] = suggestion["score"]
    except Exception as e:
        print(f"Annif API error: {e}")  # Detailed error for admin
        raise gr.Error("Sorry, there was a problem getting subject suggestions.")
    return scores_by_label


def process_image(image, project_id):
    """Caption an image and suggest subjects for the resulting caption.

    Args:
        image: Image forwarded unchanged to ``get_caption``.
        project_id: Annif project identifier forwarded to ``get_subjects``.

    Returns:
        A tuple ``(image, caption, subjects)``: the input image as given,
        the generated caption, and the result of ``get_subjects``.
    """
    # Finnish instructions for the VLM: write alt text for people who cannot
    # see the image, describe without interpreting, go from general to
    # detailed (at least five sentences), transcribe visible text and
    # translate it to Finnish if needed, and reply with the alt text only.
    instructions = [
        'Luo vaihtoehtoinen tekstikuvaus, joka on tarkoitettu henkilöille, jotka eivät näe kuvaa. ',
        'Kuvaile kuvan todellista sisältöä, älä tulkitse mitään. ',
        'Aloita yleisellä kuvauksella ja siirry sitten yksityiskohtiin. ',
        'Kuvaile yksityiskohtia ainakin viiden lauseen verran. ',
        'Jos kuvassa näkyy tekstiä, kerro mitä siinä lukee ja jos teksti ei ole suomea, käännä se myös suomeksi. ',
        'Vastaa vain lopullisella alt-tekstillä, älä lisää "tässä on alt-teksti", selityksiä tai väliotsikoita. ',
    ]
    prompt = "".join(instructions)

    caption = get_caption(image, prompt)
    return image, caption, get_subjects(caption, project_id)


# Gradio UI: an image input and Annif project selector on the left, the
# generated caption and subject suggestions on the right.
with gr.Blocks(title="VLM Caption & Annif Demo") as demo:
    gr.Markdown("# VLM Caption & Annif Demo")
    gr.Markdown(
        """
    **How it works:**
    1. Upload or take a photo in the input section below.
    2. The image is sent to a Visual Language Model to generate a caption.
    3. Annif suggests subjects based on the caption.
    """
    )
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input")
            image_input = gr.Image(
                type="pil", label="Image Input (upload or take a photo)", mirror_webcam=False,
            )
            project_dropdown = gr.Dropdown(
                choices=[("YSO Finnish - Yleinen suomalainen ontologia", "yso-fi"),
                         ("YKL Finnish - Yleisten kirjastojen luokitusjärjestelmä ", "ykl-fi"),
                         ("KAUNO Finnish - Fiktiivisen aineiston ontologia ", "kauno-fi")
                         ],
                value="yso-fi",
                label="Annif Project",
                info="Select the vocabulary from where subject suggestions are drawn",
            )
            # Starts disabled; enabled once the image input holds a value.
            submit_btn = gr.Button("Submit", interactive=False)
            clear_btn = gr.Button("Clear")
        with gr.Column():
            gr.Markdown("### Output")
            caption_output = gr.Textbox(label="Caption", lines=10, interactive=False)
            subjects_output = gr.Label(label="Subject Suggestions", show_heading=False)

    def run_app(image, project_id):
        """Run the pipeline and return only the values for the output widgets."""
        # process_image also echoes the input image; the outputs don't need it.
        caption, subjects = process_image(image, project_id)[1:]
        return caption, subjects

    submit_btn.click(
        run_app,
        inputs=[image_input, project_dropdown],
        outputs=[caption_output, subjects_output],
    )
    # Clear resets only the outputs; the input image is left in place.
    clear_btn.click(lambda: ("", {}), outputs=[caption_output, subjects_output])

    def update_submit_btn(img):
        """Make the submit button interactive only while an image is present."""
        return gr.update(interactive=img is not None)

    # Listen to `change` rather than `upload`: the button must also follow
    # webcam captures and go back to disabled when the image is cleared,
    # whereas `upload` is documented as firing on file uploads only.
    image_input.change(update_submit_btn, inputs=image_input, outputs=submit_btn)

demo.launch()