Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import requests | |
| from annif_client import AnnifClient | |
| import os | |
| # Get VLM API base URL and API key from environment variables | |
| VLM_API_BASE_URL = os.getenv("VLM_API_BASE_URL") | |
| if not VLM_API_BASE_URL: | |
| raise RuntimeError("VLM_API_BASE_URL environment variable must be set.") | |
| VLM_API_KEY = os.getenv("VLM_API_KEY", "") | |
| VLM_API_ENDPOINT = f"{VLM_API_BASE_URL}/v1/chat/completions" | |
| # Get Annif API base URL from environment variable, fallback to default | |
| ANNIF_API_BASE_URL = os.getenv("ANNIF_API_BASE_URL") | |
| if ANNIF_API_BASE_URL: | |
| if not ANNIF_API_BASE_URL.endswith("v1/"): | |
| raise RuntimeError("ANNIF_API_BASE_URL should end with 'v1/'") | |
| annif = AnnifClient(api_base=ANNIF_API_BASE_URL) | |
| else: | |
| annif = AnnifClient() | |
| def get_caption(image, prompt): | |
| # Convert image to base64 JPEG | |
| import io | |
| import base64 | |
| buf = io.BytesIO() | |
| image.save(buf, format="JPEG") | |
| img_bytes = buf.getvalue() | |
| img_b64 = base64.b64encode(img_bytes).decode("utf-8") | |
| # Prepare payload for VLM (OpenAI schema) | |
| payload = { | |
| "model": "gemma3", | |
| "messages": [ | |
| { | |
| "role": "user", | |
| "content": [ | |
| {"type": "text", "text": prompt}, | |
| { | |
| "type": "image_url", | |
| "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}, | |
| }, | |
| ], | |
| } | |
| ], | |
| "max_tokens": 300, | |
| } | |
| headers = {"X-API-Key": VLM_API_KEY} if VLM_API_KEY else {} | |
| try: | |
| response = requests.post(VLM_API_ENDPOINT, json=payload, headers=headers) | |
| response.raise_for_status() | |
| data = response.json() | |
| # Assume caption is in data['choices'][0]['message']['content'] | |
| caption = data["choices"][0]["message"]["content"] | |
| except Exception as e: | |
| print(f"VLM API error: {e}") # Detailed error for admin | |
| raise gr.Error("Sorry, there was a problem generating a caption.") | |
| return caption | |
| def get_subjects(caption, project_id): | |
| try: | |
| results = annif.suggest(project_id=project_id, text=caption) | |
| label_scores = {result["label"]: result["score"] for result in results} | |
| if not label_scores: | |
| return {} | |
| return label_scores | |
| except Exception as e: | |
| print(f"Annif API error: {e}") # Detailed error for admin | |
| raise gr.Error("Sorry, there was a problem getting subject suggestions.") | |
| def process_image(image, project_id): | |
| prompt = ( | |
| "Luo vaihtoehtoinen tekstikuvaus, joka on tarkoitettu henkilöille, jotka eivät näe kuvaa. " | |
| "Kuvaile kuvan todellista sisältöä, älä tulkitse mitään. " | |
| "Aloita yleisellä kuvauksella ja siirry sitten yksityiskohtiin. " | |
| "Kuvaile yksityiskohtia ainakin viiden lauseen verran. " | |
| "Jos kuvassa näkyy tekstiä, kerro mitä siinä lukee ja jos teksti ei ole suomea, käännä se myös suomeksi. " | |
| 'Vastaa vain lopullisella alt-tekstillä, älä lisää "tässä on alt-teksti", selityksiä tai väliotsikoita. ' | |
| ) | |
| caption = get_caption(image, prompt) | |
| subjects = get_subjects(caption, project_id) | |
| return image, caption, subjects | |
| with gr.Blocks(title="VLM Caption & Annif Demo") as demo: | |
| gr.Markdown("# VLM Caption & Annif Demo") | |
| gr.Markdown( | |
| """ | |
| **How it works:** | |
| 1. Upload or take a photo in the input section below. | |
| 2. The image is sent to a Visual Language Model to generate a caption. | |
| 3. [Annif](https://github.com/NatLibFi/Annif) suggests subjects based on the caption via the API of [Finto AI](https://ai.finto.fi). | |
| """ | |
| ) | |
| with gr.Row(): | |
| with gr.Column(): | |
| gr.Markdown("### Input") | |
| image_input = gr.Image( | |
| type="pil", | |
| label="Image Input (upload or take a photo)", | |
| mirror_webcam=False, | |
| ) | |
| language_dropdown = gr.Dropdown( | |
| choices=[("Finnish", "fi"), ("Swedish", "sv"), ("English", "en")], | |
| value="fi", | |
| label="Output Language", | |
| info="Select the output language for caption and subject suggestions", | |
| ) | |
| project_dropdown = gr.Dropdown( | |
| choices=[ | |
| ("YSO - General Finnish Ontology", "yso"), | |
| ("YKL - Finnish Public Library Classification ", "ykl"), | |
| ("KAUNO - Ontology for Fiction (for Finnish only)", "kauno"), | |
| ], | |
| value="yso", | |
| label="Annif Project", | |
| info="Select the vocabulary from where subject suggestions are drawn "\ | |
| "([YSO](https://finto.fi/yso/), [YKL](https://finto.fi/ykl/), [KAUNO](https://finto.fi/kauno/))", | |
| ) | |
| submit_btn = gr.Button("Submit", interactive=False) | |
| clear_btn = gr.Button("Clear") | |
| with gr.Column(): | |
| gr.Markdown("### Output") | |
| caption_output = gr.Textbox(label="Caption", lines=10, interactive=False) | |
| subjects_output = gr.Label(label="Subject Suggestions", show_heading=False) | |
| # Translated prompts for VLM | |
| VLM_PROMPTS = { | |
| "fi": ( | |
| "Luo vaihtoehtoinen tekstikuvaus, joka on tarkoitettu henkilöille, jotka eivät näe kuvaa. " | |
| "Kuvaile kuvan todellista sisältöä, älä tulkitse mitään. " | |
| "Aloita yleisellä kuvauksella ja siirry sitten yksityiskohtiin. " | |
| "Kuvaile yksityiskohtia ainakin viiden lauseen verran. " | |
| "Jos kuvassa näkyy tekstiä, kerro mitä siinä lukee ja jos teksti ei ole suomea, käännä se myös suomeksi. " | |
| 'Vastaa vain lopullisella alt-tekstillä, älä lisää "tässä on alt-teksti", selityksiä tai väliotsikoita. ' | |
| ), | |
| "en": ( | |
| "Create an alternative text description for people who cannot see the image. " | |
| "Describe the actual content of the image, do not interpret anything. " | |
| "Start with a general description and then move to details. " | |
| "Describe details in at least five sentences. " | |
| "If there is text in the image, state what it says and translate it into English if it is not in English. " | |
| "Respond only with the final alt text, do not add explanations or headings." | |
| ), | |
| "sv": ( | |
| "Skapa en alternativ textbeskrivning för personer som inte kan se bilden. " | |
| "Beskriv bildens faktiska innehåll, tolka ingenting. " | |
| "Börja med en allmän beskrivning och gå sedan vidare till detaljer. " | |
| "Beskriv detaljerna med minst fem meningar. " | |
| "Om det finns text i bilden, ange vad det står och översätt det till svenska om det inte är på svenska. " | |
| "Svara endast med den slutliga alt-texten, lägg inte till förklaringar eller rubriker." | |
| ), | |
| } | |
| def run_app(image, language, project): | |
| prompt = VLM_PROMPTS.get(language, VLM_PROMPTS["fi"]) | |
| # Compose Annif project identifier | |
| project_id = f"{project}-{language}" | |
| caption = get_caption(image, prompt) | |
| try: | |
| subjects = get_subjects(caption, project_id) | |
| return caption, subjects | |
| except gr.Error: | |
| gr.Warning("Sorry, there was a problem getting subject suggestions.") | |
| return caption, {} | |
| submit_btn.click( | |
| run_app, | |
| inputs=[image_input, language_dropdown, project_dropdown], | |
| outputs=[caption_output, subjects_output], | |
| ) | |
| clear_btn.click(lambda: ("", {}), outputs=[caption_output, subjects_output]) | |
| def update_submit_btn(img): | |
| return gr.update(interactive=img is not None) | |
| image_input.upload(update_submit_btn, inputs=image_input, outputs=submit_btn) | |
| demo.launch() | |