File size: 2,939 Bytes
38abc6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
659bfbb
 
 
 
 
 
38abc6a
 
 
 
 
 
 
659bfbb
 
 
ee8a4e6
659bfbb
ee8a4e6
659bfbb
 
ee8a4e6
 
 
659bfbb
 
 
 
 
 
 
 
 
 
 
 
 
 
38abc6a
 
 
 
 
 
 
659bfbb
 
 
 
 
 
 
 
 
 
38abc6a
 
 
 
 
1d0a556
b609efe
38abc6a
 
 
 
 
 
 
 
 
659bfbb
38abc6a
659bfbb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from dotenv import load_dotenv, find_dotenv
import os
import io
from PIL import Image
import requests
import warnings
import gradio as gr

# Silence the transformers warning about the model-agnostic default
# `max_length`; it is emitted by the remote model wrapper and is not
# actionable from this client.
warnings.filterwarnings("ignore", message=".*Using the model-agnostic default `max_length`.*")

# Load environment variables from the nearest .env file (find_dotenv walks
# up the directory tree), then read the two settings this app requires.
load_dotenv(find_dotenv())
hf_api_key = os.getenv('HF_API_KEY')       # Hugging Face API bearer token
endpoint_url = os.getenv('HF_API_ITT_BASE')  # image-to-text inference endpoint URL

# Fail fast at import time with a clear message rather than failing later
# on the first request with an opaque HTTP error.
if not hf_api_key:
    raise ValueError("HF_API_KEY is not set in the .env file.")
if not endpoint_url:
    raise ValueError("HF_API_ITT_BASE is not set in the .env file.")


def generate_caption(image):
    """
    Send an image to the Hugging Face Inference Endpoint and return a caption.

    :param image: An image in PIL format (any mode; converted to RGB).
    :return: The generated caption string on success, otherwise a
        human-readable error message. This function never raises; all
        failures are folded into the returned string for display in the UI.
    """
    try:
        headers = {"Authorization": f"Bearer {hf_api_key}"}

        # Re-encode as JPEG in memory. Convert to RGB first because JPEG
        # cannot store alpha/palette modes (e.g. RGBA or P from PNG uploads).
        buffered = io.BytesIO()
        image = image.convert("RGB")
        image.save(buffered, format="JPEG")
        buffered.seek(0)

        # Multipart form-data payload expected by the endpoint.
        files = {"file": ("image.jpg", buffered, "image/jpeg")}

        # A timeout is essential here: without one, a stalled endpoint would
        # hang the Gradio worker indefinitely.
        response = requests.post(endpoint_url, headers=headers, files=files, timeout=30)

        if response.status_code == 200:
            # NOTE(review): assumes the endpoint returns a JSON object with a
            # "generated_text" key; stock HF inference handlers often return a
            # list of such objects — confirm against the deployed handler.
            return response.json().get("generated_text", "No caption generated.")
        else:
            # Surface status and body for debugging, but do NOT echo the
            # request headers: they contain the bearer API token, and this
            # string is rendered in the user-facing textbox.
            return (
                f"Error: {response.status_code} - {response.text}\n"
                f"Endpoint: {endpoint_url}"
            )
    except requests.exceptions.Timeout:
        return f"An error occurred: request to {endpoint_url} timed out."
    except Exception as e:
        return f"An error occurred: {str(e)}"


# Predefined sample images
def get_sample_images(sample_dir="CreatureCaptures"):
    """
    Return a sorted list of sample image paths found in *sample_dir*.

    :param sample_dir: Directory to scan for samples. Defaults to
        "CreatureCaptures" for backward compatibility with existing callers.
    :return: Sorted list of paths whose names end in .png/.jpg/.jpeg
        (case-insensitive); an empty list if the directory does not exist.
    """
    try:
        # sorted() makes the example ordering deterministic — os.listdir
        # returns entries in arbitrary, platform-dependent order.
        return sorted(
            os.path.join(sample_dir, name)
            for name in os.listdir(sample_dir)
            if name.lower().endswith((".png", ".jpg", ".jpeg"))
        )
    except FileNotFoundError:
        # A missing samples directory is non-fatal: the app simply
        # renders without example images.
        return []


# Gradio interface
# Built at import time so `gradio app.py` / hosted runners can pick up `demo`.
sample_images = get_sample_images()  # Load predefined sample images
demo = gr.Interface(
    fn=generate_caption,
    # PIL mode matches generate_caption, which expects a PIL image.
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=gr.Textbox(label="Generated Caption"),
    # May be an empty list if the samples directory is missing.
    examples=sample_images,
    title="Image Captioning App",
    description=(
        "Upload an image or use one of the predefined samples to generate a caption. "
        "This app uses a Hugging Face Inference Endpoint for the `Salesforce/blip-image-captioning-base` model."
    ),
)

if __name__ == "__main__":
    # Launch the Gradio demo (blocking) only when run as a script,
    # not when the module is imported.
    demo.launch()