ChatGPT-ImageCaptioner2

Build error

File size: 4,864 Bytes

8fbb31d
8b45a4d
 
 
 
 
4de593d
6a29a0c
4de593d
6a29a0c
4de593d
80af8f2
4de593d
6a29a0c
 
 
4de593d
 
6a29a0c
 
 
4de593d
9a28f36
6a29a0c
cfc8e52
6a949be
cfc8e52
6a949be
9a28f36
6b4347b
4de593d
 
6a29a0c
4de593d
 
6a29a0c
4de593d
 
 
 
 
 
6a29a0c
 
7c280f9
 
6a29a0c
b30f1b3
6a29a0c

import os
import importlib
import subprocess
import sys

# Bootstrap: make sure detectron2 is importable before the imports below run.
try:
    import detectron2
except ImportError:
    # Install with the interpreter that is running this script so the package
    # lands in the same environment. check=False keeps this best-effort: if the
    # install fails, the later `import detectron2` raises the real error.
    subprocess.run(
        [
            sys.executable,
            "-m",
            "pip",
            "install",
            "git+https://github.com/facebookresearch/detectron2.git",
        ],
        check=False,
    )
    # Packages installed while the process is running may be missed by the
    # import system's cached directory listings.
    importlib.invalidate_caches()
import gradio as gr
import torch
import cv2
import numpy as np
from PIL import Image
import detectron2
from detectron2.utils.logger import setup_logger
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog
from langchain.llms import OpenAI, OpenAIChat
from centernet.config import add_centernet_config
from detic.config import add_detic_config
from detic.modeling.utils import reset_cls_test
from pydantic import BaseModel, Field, PydanticUserError
#cmd1 = "pip -m pip install 'torch'"

# cmd0 = "pip -m pip install 'https://github.com/facebookresearch/detectron2.git@5aeb252b194b93dc2879b4ac34bc51a31b5aee13'"
#cmd0 = "python -m pip install 'git+https://github.com/facebookresearch/detectron2.git@5aeb252b194b93dc2879b4ac34bc51a31b5aee13'"
# cmd0 = "python -m pip install 'https://github.com/facebookresearch/detectron2.git'"
#os.system(cmd0)
#os.system(cmd1)

# Pydantic model that declares a single float field ``a`` with no default.
class BaseModelWithA(BaseModel):
    a: float

# Subclass that adds nothing of its own; it only inherits from BaseModelWithA.
class Foo(BaseModelWithA):
    pass

# Define ``Bar`` on top of ``Foo``, adding ``x`` and re-declaring ``a`` with a
# new default. If the class definition raises PydanticUserError, the handler
# only asserts that the error code is 'model-field-overridden'.
# NOTE(review): this looks adapted from pydantic's usage-error example;
# presumably that error is only raised for an un-annotated override, so with
# the annotation on ``a`` the handler is probably never reached -- confirm.
# Nothing else in the visible part of this file references ``Bar``.
try:
    class Bar(Foo):
        x: float = 12.3
        a: float = 123.0  # annotated, so this re-declares the inherited field
except PydanticUserError as exc_info:
    assert exc_info.code == 'model-field-overridden'

def generate_caption(object_list_str, api_key, temperature):
    """Ask ``gpt-3.5-turbo`` for a short caption describing detected objects.

    Args:
        object_list_str: Either a ready-made string, or an iterable of
            per-object description strings. An iterable is joined with
            newlines so the prompt lists one object per line instead of
            embedding a Python list repr.
        api_key: OpenAI API key passed to ``OpenAIChat``.
        temperature: Sampling temperature passed to ``OpenAIChat``.

    Returns:
        The model's reply with surrounding whitespace stripped, or a fixed
        apology string if the request raises an exception.
    """
    if not isinstance(object_list_str, str):
        object_list_str = "\n".join(str(item) for item in object_list_str)

    query = (
        "You are an intelligent image captioner. I will hand you the objects "
        "and their position, and you should give me a detailed description "
        "that IS BOTH SUPER CONCISE AND SHORT for the photo. In this photo we "
        f"have the following objects\n{object_list_str}"
    )

    llm = OpenAIChat(
        model_name="gpt-3.5-turbo", openai_api_key=api_key, temperature=temperature
    )

    try:
        caption = llm(query).strip()
    except Exception:
        # Best-effort: any request failure is turned into a user-visible
        # message rather than propagated. Catching Exception (not a bare
        # except) lets KeyboardInterrupt/SystemExit through.
        caption = "Sorry, something went wrong!"

    return caption

def inference(img, vocabulary, api_key, temperature):
    """Detect objects in an image and caption them with ChatGPT.

    Relies on the module-level names ``predictor``, ``BUILDIN_METADATA_PATH``
    and ``BUILDIN_CLASSIFIER``.
    NOTE(review): none of these are defined in the visible part of this file;
    confirm they are set up before the UI is launched.

    Args:
        img: Path of the image file to analyse (read with ``cv2.imread``).
        vocabulary: Key into ``BUILDIN_METADATA_PATH`` and
            ``BUILDIN_CLASSIFIER``.
        api_key: OpenAI API key. ``None``, an empty string or a
            whitespace-only string skips captioning.
        temperature: Sampling temperature forwarded to ``generate_caption``.

    Returns:
        A ``(image, text)`` tuple: the visualisation as an RGB PIL image and
        the caption (or a hint to supply an API key). If no image was given
        or it cannot be read, ``(None, message)`` is returned instead.
    """
    # Validate the input before touching the model so a missing upload gives
    # a readable message instead of a crash.
    if not img:
        return None, "Please upload an image first."

    im = cv2.imread(img)
    if im is None:  # cv2.imread signals an unreadable file by returning None
        return None, "Could not read the uploaded image."

    metadata = MetadataCatalog.get(BUILDIN_METADATA_PATH[vocabulary])
    classifier = BUILDIN_CLASSIFIER[vocabulary]
    num_classes = len(metadata.thing_classes)
    reset_cls_test(predictor.model, classifier, num_classes)

    outputs = predictor(im)
    # Move the predictions to the CPU once; they are used both for drawing
    # and for building the text description.
    instances = outputs["instances"].to("cpu")

    # im is reversed along the channel axis before being handed to Visualizer.
    v = Visualizer(im[:, :, ::-1], metadata)
    out = v.draw_instance_predictions(instances)

    boxes = instances.pred_boxes.tensor.numpy()
    class_ids = instances.pred_classes.tolist()

    descriptions = []
    for (x0, y0, x1, y1), class_id in zip(boxes, class_ids):
        predicted_label = metadata.thing_classes[class_id]
        width = x1 - x0
        height = y1 - y0
        descriptions.append(
            f"{predicted_label} - X:({int(x0)} Y: {int(y0)} Width {int(width)} Height: {int(height)})"
        )
    # One object per line, passed as a single string.
    object_list_str = "\n".join(descriptions)

    # Treat None, empty and whitespace-only keys alike as "no key given".
    api_key = (api_key or "").strip()
    if api_key:
        gpt_response = generate_caption(object_list_str, api_key, temperature)
    else:
        gpt_response = "Please paste your OpenAI key to use"

    return (
        Image.fromarray(np.uint8(out.get_image())).convert("RGB"),
        gpt_response,
    )

# Gradio UI: a header, an input column (image, API key, temperature, Detic
# vocabulary, run button) and an output column (caption text, visualisation).
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown("# Image Captioning using Detic and ChatGPT with LangChain 🦜️🔗")
        gr.Markdown(
            "Use Detic to detect objects in an image and then use `gpt-3.5-turbo` to describe the image."
        )

    with gr.Row():
        # Inputs.
        with gr.Column():
            # type="filepath": the callback receives a path, not pixel data.
            inp = gr.Image(label="Input Image", type="filepath")
            with gr.Column():
                openai_api_key_textbox = gr.Textbox(
                    placeholder="Paste your OpenAI API key (sk-...)",
                    show_label=False,
                    lines=1,
                    type="password",
                )
                temperature = gr.Slider(
                    minimum=0, maximum=1, value=0.1, label="Temperature"
                )
                vocab = gr.Dropdown(
                    ["lvis", "objects365", "openimages", "coco"],
                    label="Detic Vocabulary",
                    value="lvis",
                )

            btn_detic = gr.Button("Run Detic and ChatGPT")
        # Outputs.
        with gr.Column():
            output_desc = gr.Textbox(label="Description", lines=5)
            outviz = gr.Image(label="Visualization", type="pil")

    # inference returns (image, text), so the outputs are listed in that order.
    btn_detic.click(
        fn=inference,
        inputs=[inp, vocab, openai_api_key_textbox, temperature],
        outputs=[outviz, output_desc],
    )

demo.launch(debug=False)