Spaces:

Akshayram1
/

palligemma_experiments

Sleeping

App Files Files Community

Akshayram1 committed on Mar 3

Commit

57c929f

verified ·

1 Parent(s): 900613f

Update app.py

Browse files

Files changed (1) hide show

app.py +113 -39

app.py CHANGED Viewed

@@ -36,19 +36,113 @@ def infer(image: PIL.Image.Image, text: str, max_new_tokens: int) -> str:
     result = processor.batch_decode(generated_ids, skip_special_tokens=True)
     return result[0][len(text):].lstrip("\n")
-# Image Captioning
-def generate_caption(image: PIL.Image.Image) -> str:
-    return infer(image, "caption", max_new_tokens=50)
-# Object Detection
-def detect_objects(image: PIL.Image.Image) -> str:
-    return infer(image, "detect objects", max_new_tokens=200)
-# Visual Question Answering (VQA)
-def vqa(image: PIL.Image.Image, question: str) -> str:
-    return infer(image, f"Q: {question} A:", max_new_tokens=50)
-# Gradio App
 with gr.Blocks() as demo:
     gr.Markdown("# PaliGemma Multi-Modal App")
     gr.Markdown("Upload an image and explore its features using the PaliGemma model!")
@@ -59,43 +153,23 @@ with gr.Blocks() as demo:
             with gr.Row():
                 with gr.Column():
                     caption_image = gr.Image(type="pil", label="Upload Image", width=512, height=512)
                     caption_btn = gr.Button("Generate Caption")
                 with gr.Column():
                     caption_output = gr.Text(label="Generated Caption")
-            caption_btn.click(fn=generate_caption, inputs=[caption_image], outputs=[caption_output])
-        # Tab 2: Object Detection
-        with gr.Tab("Object Detection"):
             with gr.Row():
                 with gr.Column():
                     detect_image = gr.Image(type="pil", label="Upload Image", width=512, height=512)
-                    detect_btn = gr.Button("Detect Objects")
-                with gr.Column():
-                    detect_output = gr.Text(label="Detected Objects")
-            detect_btn.click(fn=detect_objects, inputs=[detect_image], outputs=[detect_output])
-        # Tab 3: Visual Question Answering (VQA)
-        with gr.Tab("Visual Question Answering"):
-            with gr.Row():
-                with gr.Column():
-                    vqa_image = gr.Image(type="pil", label="Upload Image", width=512, height=512)
-                    vqa_question = gr.Text(label="Ask a Question", placeholder="What is in the image?")
-                    vqa_btn = gr.Button("Ask")
-                with gr.Column():
-                    vqa_output = gr.Text(label="Answer")
-            vqa_btn.click(fn=vqa, inputs=[vqa_image, vqa_question], outputs=[vqa_output])
-        # Tab 4: Text Generation (Original Feature)
-        with gr.Tab("Text Generation"):
-            with gr.Row():
-                with gr.Column():
-                    text_image = gr.Image(type="pil", label="Upload Image", width=512, height=512)
-                    text_input = gr.Text(label="Input Text", placeholder="Describe the image...")
-                    text_btn = gr.Button("Generate Text")
                 with gr.Column():
-                    text_output = gr.Text(label="Generated Text")
-            text_btn.click(fn=infer, inputs=[text_image, text_input, gr.Slider(10, 200, value=50)], outputs=[text_output])
 # Launch the App
 if __name__ == "__main__":
-    demo.queue(max_size=10).launch(debug=True)

     result = processor.batch_decode(generated_ids, skip_special_tokens=True)
     return result[0][len(text):].lstrip("\n")
+# Image Captioning (with user input for improvement)
+def generate_caption(image: PIL.Image.Image, caption_improvement: str) -> str:
+    """Caption ``image`` with a prompt that embeds the user's hint text.
+
+    The prompt handed to ``infer`` is ``"caption: "`` followed by
+    ``caption_improvement``; generation is capped at 50 new tokens.
+    """
+    prompt = f"caption: {caption_improvement}"
+    return infer(image, prompt, max_new_tokens=50)
+# Object Detection/Segmentation
+def parse_segmentation(input_image, input_text):
+    """Run a detect/segment prompt and build the value for ``gr.AnnotatedImage``.
+
+    Args:
+        input_image: PIL image. Its ``size`` (width, height) is forwarded to
+            ``extract_objs`` so parsed boxes/masks are scaled to the image.
+        input_text: Prompt passed to ``infer`` unchanged.
+
+    Returns:
+        A tuple ``(input_image, annotations)`` where ``annotations`` is a list
+        of ``(mask_or_xyxy, label)`` pairs, one per parsed object. The mask is
+        used when present, otherwise the ``xyxy`` box.
+    """
+    out = infer(input_image, input_text, max_new_tokens=200)
+    width, height = input_image.size
+    objs = extract_objs(out.lstrip("\n"), width, height, unique_labels=True)
+    annotations = []
+    for obj in objs:
+        # Entries that only carry free text ('content') have no region to draw.
+        region = obj.get('mask')
+        if region is None:
+            region = obj.get('xyxy')
+        if region is None:
+            continue
+        annotations.append((region, obj.get('name') or ''))
+    return input_image, annotations
+# Helper functions for object detection/segmentation
+def _get_params(checkpoint):
+    """Re-key a flat checkpoint mapping into a nested parameter dict.
+
+    For every layer ``name`` this reads ``checkpoint[name + '.bias']`` and
+    ``checkpoint[name + '.weight']``; the weight is transposed with axes
+    ``(2, 3, 1, 0)`` and stored under ``'kernel'``.
+    NOTE(review): the transpose presumably converts a channels-first conv
+    kernel layout to a channels-last one -- confirm against the checkpoint.
+
+    Returns:
+        Dict with ``'_embeddings'`` plus one entry per decoder layer
+        (``Conv_*``, ``ResBlock_*``, ``ConvTranspose_*``).
+    """
+    def conv(name):
+        bias = checkpoint[name + '.bias']
+        weight = checkpoint[name + '.weight']
+        return {'bias': bias, 'kernel': np.transpose(weight, (2, 3, 1, 0))}
+
+    def resblock(name):
+        sub_layers = (('Conv_0', '.0'), ('Conv_1', '.2'), ('Conv_2', '.4'))
+        return {key: conv(name + suffix) for key, suffix in sub_layers}
+
+    # (output key, builder, checkpoint prefix) in the order the dict is filled.
+    layout = (
+        ('Conv_0', conv, 'decoder.0'),
+        ('ResBlock_0', resblock, 'decoder.2.net'),
+        ('ResBlock_1', resblock, 'decoder.3.net'),
+        ('ConvTranspose_0', conv, 'decoder.4'),
+        ('ConvTranspose_1', conv, 'decoder.6'),
+        ('ConvTranspose_2', conv, 'decoder.8'),
+        ('ConvTranspose_3', conv, 'decoder.10'),
+        ('Conv_1', conv, 'decoder.12'),
+    )
+    params = {'_embeddings': checkpoint['_vq_vae._embedding']}
+    for key, build, prefix in layout:
+        params[key] = build(prefix)
+    return params
+def _quantized_values_from_codebook_indices(codebook_indices, embeddings):
+    """Look up codebook rows for each index and arrange them on a 4x4 grid.
+
+    Args:
+        codebook_indices: 2-D array of shape ``(batch, 16)``; any other token
+            count trips the assertion below.
+        embeddings: 2-D array of shape ``(num_embeddings, embedding_dim)``.
+
+    Returns:
+        Array of shape ``(batch, 4, 4, embedding_dim)``.
+    """
+    batch_size, num_tokens = codebook_indices.shape
+    assert num_tokens == 16, codebook_indices.shape
+    _, embedding_dim = embeddings.shape
+    flat_indices = codebook_indices.reshape(-1)
+    gathered = jnp.take(embeddings, flat_indices, axis=0)
+    return gathered.reshape((batch_size, 4, 4, embedding_dim))
+def extract_objs(text, width, height, unique_labels=False):
+    """Parse model output text into a list of object/text entries.
+
+    The text is consumed from the front by repeatedly matching
+    ``_SEGMENT_DETECT_RE`` (defined elsewhere in the file). Parsing stops at
+    the first position where the pattern does not match.
+
+    Args:
+        text: Model output to parse.
+        width: Image width in pixels, used to scale x coordinates.
+        height: Image height in pixels, used to scale y coordinates.
+        unique_labels: When true, a repeated label gets ``'`` appended until
+            it no longer collides with a label already seen.
+
+    Returns:
+        A list of dicts. Object entries have ``content``, ``xyxy``, ``mask``
+        (an array of shape ``(height, width)`` or ``None``) and ``name``;
+        plain-text entries have only ``content``.
+    """
+    objs = []
+    seen = set()
+    while text:
+        m = _SEGMENT_DETECT_RE.match(text)
+        if not m:
+            break
+        gs = list(m.groups())
+        # First group is the text preceding the object, last group its label.
+        before = gs.pop(0)
+        name = gs.pop()
+        # Next four groups are box coordinates on a 0..1024 scale, in
+        # (y1, x1, y2, x2) order; rescale them to pixel coordinates.
+        y1, x1, y2, x2 = [int(x) / 1024 for x in gs[:4]]
+        y1, x1, y2, x2 = map(round, (y1*height, x1*width, y2*height, x2*width))
+        # The following 16 groups are segmentation codebook indices; they are
+        # None when the match carries a box only.
+        seg_indices = gs[4:20]
+        if seg_indices[0] is None:
+            mask = None
+        else:
+            seg_indices = np.array([int(x) for x in seg_indices], dtype=np.int32)
+            # Batch of one: decode the indices and unpack the single result.
+            # NOTE(review): the 64x64 size implied by the name `m64` is not
+            # visible here -- confirm against _get_reconstruct_masks.
+            m64, = _get_reconstruct_masks()(seg_indices[None])[..., 0]
+            # Map decoder output to [0, 1] (x * 0.5 + 0.5, clipped), then to
+            # an 8-bit image so PIL can resize it.
+            m64 = np.clip(np.array(m64) * 0.5 + 0.5, 0, 1)
+            m64 = PIL.Image.fromarray((m64 * 255).astype('uint8'))
+            mask = np.zeros([height, width])
+            # Paste the resized mask into the box; an empty box leaves the
+            # mask all zeros.
+            if y2 > y1 and x2 > x1:
+                mask[y1:y2, x1:x2] = np.array(m64.resize([x2 - x1, y2 - y1])) / 255.0
+        content = m.group()
+        if before:
+            # Emit the leading text as its own entry and drop it from content.
+            objs.append(dict(content=before))
+            content = content[len(before):]
+        while unique_labels and name in seen:
+            name = (name or '') + "'"
+        seen.add(name)
+        objs.append(dict(
+            content=content, xyxy=(x1, y1, x2, y2), mask=mask, name=name))
+        # Advance past everything this match consumed.
+        text = text[len(before) + len(content):]
+    # Whatever could not be matched is kept as a trailing plain-text entry.
+    if text:
+        objs.append(dict(content=text))
+    return objs
+# Gradio Interface
 with gr.Blocks() as demo:
     gr.Markdown("# PaliGemma Multi-Modal App")
     gr.Markdown("Upload an image and explore its features using the PaliGemma model!")
             with gr.Row():
                 with gr.Column():
                     caption_image = gr.Image(type="pil", label="Upload Image", width=512, height=512)
+                    caption_improvement_input = gr.Textbox(label="Improvement Input", placeholder="Enter description to improve caption")
                     caption_btn = gr.Button("Generate Caption")
                 with gr.Column():
                     caption_output = gr.Text(label="Generated Caption")
+            caption_btn.click(fn=generate_caption, inputs=[caption_image, caption_improvement_input], outputs=[caption_output])
+        # Tab 2: Segment/Detect
+        with gr.Tab("Segment/Detect"):
             with gr.Row():
                 with gr.Column():
                     detect_image = gr.Image(type="pil", label="Upload Image", width=512, height=512)
+                    detect_text = gr.Textbox(label="Entities to Detect", placeholder="List entities to segment/detect")
+                    detect_btn = gr.Button("Detect/Segment")
                 with gr.Column():
+                    detect_output = gr.AnnotatedImage(label="Annotated Image")
+            detect_btn.click(fn=parse_segmentation, inputs=[detect_image, detect_text], outputs=[detect_output])
 # Launch the App
 if __name__ == "__main__":
+    demo.queue(max_size=10).launch(debug=True)