# VLM Caption & Annif Subject Demo — Gradio app for a Hugging Face Space.
import gradio as gr
import requests
from annif_client import AnnifClient
import os
# --- Configuration, read once at import time ---

# Base URL of the Visual Language Model API (required).
VLM_API_BASE_URL = os.getenv("VLM_API_BASE_URL")
if not VLM_API_BASE_URL:
    raise RuntimeError("VLM_API_BASE_URL environment variable must be set.")

# Optional API key; an empty value means no auth header is sent.
VLM_API_KEY = os.getenv("VLM_API_KEY", "")
VLM_API_ENDPOINT = f"{VLM_API_BASE_URL}/v1/chat/completions"

# Annif REST API base URL; when unset, the client uses its built-in default.
ANNIF_API_BASE_URL = os.getenv("ANNIF_API_BASE_URL")
if ANNIF_API_BASE_URL and not ANNIF_API_BASE_URL.endswith("v1/"):
    raise RuntimeError("ANNIF_API_BASE_URL should end with 'v1/'")
annif = (
    AnnifClient(api_base=ANNIF_API_BASE_URL)
    if ANNIF_API_BASE_URL
    else AnnifClient()
)
def get_caption(image, prompt):
    """Ask the VLM to describe *image* according to *prompt*.

    Args:
        image: PIL image (as delivered by the Gradio Image component).
        prompt: Text instruction sent alongside the image.

    Returns:
        The caption string extracted from the model response.

    Raises:
        gr.Error: User-facing error when the VLM request or response
            parsing fails for any reason (details are printed for the admin).
    """
    import base64
    import io

    # Serialize the image as a base64-encoded JPEG for the data-URL payload.
    buf = io.BytesIO()
    image.save(buf, format="JPEG")
    img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

    # OpenAI chat-completions schema with an inline image_url content part.
    payload = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"},
                    },
                ],
            }
        ],
        "max_tokens": 300,
    }
    headers = {"X-API-Key": VLM_API_KEY} if VLM_API_KEY else {}
    try:
        # BUG FIX: a timeout is required — without one, a stalled VLM backend
        # would hang this worker (and the UI) indefinitely.
        response = requests.post(
            VLM_API_ENDPOINT, json=payload, headers=headers, timeout=120
        )
        response.raise_for_status()
        data = response.json()
        # OpenAI schema: caption lives at choices[0].message.content.
        caption = data["choices"][0]["message"]["content"]
    except Exception as e:
        print(f"VLM API error: {e}")  # Detailed error for admin
        raise gr.Error("Sorry, there was a problem generating a caption.")
    return caption
def get_subjects(caption, project_id):
    """Return Annif subject suggestions for *caption*.

    Args:
        caption: Text to send to Annif for subject indexing.
        project_id: Annif project/vocabulary identifier (e.g. "yso-en").

    Returns:
        Mapping of subject label -> score; empty when nothing is suggested.

    Raises:
        gr.Error: User-facing error when the Annif request fails
            (details are printed for the admin).
    """
    try:
        suggestions = annif.suggest(project_id=project_id, text=caption)
        return {entry["label"]: entry["score"] for entry in suggestions}
    except Exception as exc:
        print(f"Annif API error: {exc}")  # Detailed error for admin
        raise gr.Error("Sorry, there was a problem getting subject suggestions.")
def process_image(image, project_id):
    """Run the full pipeline: caption the image, then suggest subjects.

    Args:
        image: PIL image to process.
        project_id: Annif project identifier for subject suggestions.

    Returns:
        Tuple of (image, caption, subjects) for downstream display.
    """
    alt_text_prompt = (
        "Generate an alt-text description, which is a description for people who can't see the image. "
        "Be sure to talk about the actual contents of it, do not interpret anything. "
        "Start with a general description, then focus on details. Answer only with the "
        "alt-text description, do not include 'Here's an alt-text description', explanations or subheadings."
    )
    generated_caption = get_caption(image, alt_text_prompt)
    return image, generated_caption, get_subjects(generated_caption, project_id)
# --- Gradio UI: two-column layout, image + project in, caption + labels out ---
with gr.Blocks(title="VLM Caption & Annif Subject Demo") as demo:
    gr.Markdown("# VLM Caption & Annif Subject Demo")
    gr.Markdown(
        """
    **How it works:**
    1. Upload or take a photo in the input section below.
    2. The image is sent to a Visual Language Model to generate a caption.
    3. Annif suggests subjects based on the caption.
    """
    )
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input")
            image_input = gr.Image(
                type="pil", label="Image Input (upload or take a photo)"
            )
            project_dropdown = gr.Dropdown(
                choices=[("YSO", "yso-en"), ("YKL", "ykl-en")],
                value="yso-en",
                label="Annif Project",
                info="Select the vocabulary from where subject suggestions are drawn",
            )
            # Disabled until an image is present; see image_input.change below.
            submit_btn = gr.Button("Submit", interactive=False)
            clear_btn = gr.Button("Clear")
        with gr.Column():
            gr.Markdown("### Output")
            caption_output = gr.Textbox(label="Caption", lines=10, interactive=False)
            subjects_output = gr.Label(label="Subject Suggestions", show_heading=False)

    def run_app(image, project_id):
        # Drop the echoed image from the pipeline result; only the caption
        # and subject suggestions are shown in the output column.
        caption, subjects = process_image(image, project_id)[1:]
        return caption, subjects

    submit_btn.click(
        run_app,
        inputs=[image_input, project_dropdown],
        outputs=[caption_output, subjects_output],
    )
    clear_btn.click(lambda: ("", {}), outputs=[caption_output, subjects_output])

    def update_submit_btn(img):
        # Enable Submit only while an image is present.
        return gr.update(interactive=img is not None)

    # BUG FIX: use .change instead of .upload. The .upload event fires only
    # for file uploads, so webcam captures never enabled the Submit button,
    # and clearing the image left it enabled. The .change event fires on
    # upload, webcam capture, and clearing alike.
    image_input.change(update_submit_btn, inputs=image_input, outputs=submit_btn)

demo.launch()