nasim-raj-laskar committed on
Commit ·
aeba2d0
1
Parent(s): 1330d31
initial deploy
Browse files- .gitignore +11 -0
- .python-version +1 -0
- app.py +90 -0
- pyproject.toml +13 -0
- requirements.txt +5 -0
- src/__init__.py +0 -0
- src/captioning.py +14 -0
- src/detection.py +18 -0
- src/reasoning.py +53 -0
- src/utils.py +9 -0
- uv.lock +0 -0
.gitignore
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python-generated files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[oc]
|
| 4 |
+
build/
|
| 5 |
+
dist/
|
| 6 |
+
wheels/
|
| 7 |
+
*.egg-info
|
| 8 |
+
|
| 9 |
+
# Virtual environments
|
| 10 |
+
.venv
|
| 11 |
+
.vscode/
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.12
|
app.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from PIL import ImageDraw, ImageFont
|
| 3 |
+
from src.utils import run_pipeline
|
| 4 |
+
|
| 5 |
+
# Palette of hex colours that draw_boxes cycles through for the box outlines
# and label tags.
COLORS = [
    "#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4",
    "#FFEAA7", "#DDA0DD", "#98D8C8", "#F7DC6F",
]
|
| 9 |
+
|
| 10 |
+
def draw_boxes(image, boxes):
    """Return a copy of ``image`` with a labelled rectangle per detection.

    Args:
        image: PIL image to annotate. It is copied, never modified in place.
        boxes: Iterable of dicts with the keys ``"label"``, ``"score"`` and
            ``"box"``; ``"box"`` is itself a dict with ``xmin``, ``ymin``,
            ``xmax`` and ``ymax``.

    Returns:
        The annotated copy of ``image``.
    """
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)
    label_color = {}
    for det in boxes:
        label = det["label"]
        # Take the next palette entry only when a label is seen for the first
        # time. Indexing by the detection's position could hand two different
        # labels the same colour while leaving other palette entries unused.
        color = label_color.setdefault(label, COLORS[len(label_color) % len(COLORS)])
        b = det["box"]
        draw.rectangle([b["xmin"], b["ymin"], b["xmax"], b["ymax"]], outline=color, width=3)
        text = f"{label} {det['score']:.0%}"
        # The tag normally sits just above the box. When the box touches the
        # top edge there is no room above it, so draw the tag inside the box
        # instead of letting it fall off the canvas.
        tag_top = b["ymin"] - 18 if b["ymin"] >= 18 else b["ymin"]
        draw.rectangle([b["xmin"], tag_top, b["xmin"] + len(text) * 7, tag_top + 18], fill=color)
        draw.text((b["xmin"] + 2, tag_top + 2), text, fill="white")
    return annotated
|
| 23 |
+
|
| 24 |
+
def make_chips(labels):
    """Render detected labels as a row of pill-shaped HTML chips.

    Args:
        labels: Iterable of label strings; may be empty or ``None``.

    Returns:
        An HTML fragment: a grey "No objects detected" paragraph when there
        are no labels, otherwise a ``<div>`` holding one ``<span>`` chip per
        label in sorted order.
    """
    from html import escape  # local import keeps this block self-contained

    if not labels:
        return "<p style='color:#888'>No objects detected</p>"
    chips = "".join(
        "<span style='background:#2d2d2d;border:1px solid #555;color:#e0e0e0;"
        "padding:4px 10px;border-radius:20px;margin:3px;display:inline-block;"
        # Labels come from a model config, not from this code, so escape them
        # before dropping them into markup.
        f"font-size:13px'>{escape(str(label))}</span>"
        for label in sorted(labels)
    )
    return f"<div style='line-height:2'>{chips}</div>"
|
| 34 |
+
|
| 35 |
+
def process(image, question):
    """Run the full analysis for one request and shape the results for the UI.

    Args:
        image: PIL image from the upload widget, or ``None`` if nothing was
            uploaded.
        question: Free-text question about the image; may be blank.

    Returns:
        A 4-tuple ``(annotated_image, chips_html, caption, answer)``. When no
        image is supplied the tuple carries ``None``, an error paragraph and
        two empty strings.
    """
    if image is None:
        return None, "<p style='color:#f66'>No image provided</p>", "", ""

    # A blank question would leave an empty "Question:" line in the prompt.
    # Fall back to the question the textbox placeholder suggests.
    question = (question or "").strip() or "What is happening in this image?"

    caption, labels, boxes, answer = run_pipeline(image, question)
    annotated = draw_boxes(image, boxes)
    chips = make_chips(labels)
    return annotated, chips, caption, answer
|
| 43 |
+
|
| 44 |
+
CSS = """
|
| 45 |
+
body, .gradio-container { background:#1a1a2e !important; color:#e0e0e0 !important; font-family:'Segoe UI',sans-serif; }
|
| 46 |
+
.gr-button-primary { background:linear-gradient(135deg,#667eea,#764ba2) !important; border:none !important; color:#fff !important; font-weight:600 !important; }
|
| 47 |
+
.gr-button-primary:hover { opacity:.9 !important; transform:translateY(-1px); }
|
| 48 |
+
.gr-box, .gr-form, .gr-panel { background:#16213e !important; border:1px solid #2d2d5e !important; border-radius:12px !important; }
|
| 49 |
+
label { color:#a0a8c0 !important; font-size:12px !important; text-transform:uppercase; letter-spacing:.5px; }
|
| 50 |
+
textarea, input[type=text] { background:#0f3460 !important; color:#e0e0e0 !important; border:1px solid #2d2d5e !important; border-radius:8px !important; }
|
| 51 |
+
.output-card { background:#16213e; border:1px solid #2d2d5e; border-radius:12px; padding:16px; margin-top:8px; }
|
| 52 |
+
"""
|
| 53 |
+
|
| 54 |
+
# Layout: inputs and the annotated image on the first row, the three textual
# results on the second row, and a single button wiring them to process().
with gr.Blocks() as demo:
    gr.Markdown(
        "<h1 style='text-align:center;background:linear-gradient(135deg,#667eea,#764ba2);"
        "-webkit-background-clip:text;-webkit-text-fill-color:transparent;margin-bottom:4px'>"
        "🧠 Visual Reasoning Engine</h1>"
        "<p style='text-align:center;color:#888;margin-top:0'>Object detection · Captioning · Visual Q&A</p>"
    )

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="📷 Upload Image")
            question_input = gr.Textbox(
                label="❓ Ask a question",
                placeholder="What is happening in this image?",
                lines=2,
            )
            submit_btn = gr.Button("▶ Run Analysis", variant="primary")

        with gr.Column(scale=1):
            annotated_output = gr.Image(label="🔍 Detected Objects", type="pil")

    with gr.Row():
        with gr.Column():
            gr.Markdown("<p style='color:#a0a8c0;font-size:12px;text-transform:uppercase;letter-spacing:.5px'>🧱 Detected Objects</p>")
            objects_output = gr.HTML()
        with gr.Column():
            caption_output = gr.Textbox(label="🧾 Caption", interactive=False)
        with gr.Column():
            answer_output = gr.Textbox(label="🧠 Reasoned Answer", interactive=False)

    submit_btn.click(
        fn=process,
        inputs=[image_input, question_input],
        outputs=[annotated_output, objects_output, caption_output, answer_output],
    )

# Gradio 6 (the project requires gradio>=6.14.0) moved the app-level `css` and
# `theme` options from the gr.Blocks() constructor to launch(), so they are
# passed here rather than to gr.Blocks().
demo.launch(css=CSS, theme=gr.themes.Base())
|
pyproject.toml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "vre"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Visual Reasoning Engine: object detection, image captioning, and visual Q&A demo"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.12"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"gradio>=6.14.0",
|
| 9 |
+
"pillow>=12.2.0",
|
| 10 |
+
"timm>=1.0.26",
|
| 11 |
+
"torch>=2.11.0",
|
| 12 |
+
"transformers>=5.7.0",
|
| 13 |
+
]
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=6.14.0
|
| 2 |
+
pillow>=12.2.0
|
| 3 |
+
timm>=1.0.26
|
| 4 |
+
torch>=2.11.0
|
| 5 |
+
transformers>=5.7.0
|
src/__init__.py
ADDED
|
File without changes
|
src/captioning.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import BlipProcessor, BlipForConditionalGeneration
|
| 2 |
+
import torch
|
| 3 |
+
|
| 4 |
+
# Both objects are created at import time from the same checkpoint id.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)
|
| 6 |
+
|
| 7 |
+
def generate_caption(image):
    """Produce a caption for ``image`` with the module-level BLIP model.

    The image is turned into PyTorch tensors by ``processor``, decoded with
    beam search (5 beams, ``max_length`` 50, no repeated 2-grams) under
    ``torch.no_grad()``, and the first sequence is returned as plain text
    without special tokens.
    """
    encoded = processor(image, return_tensors="pt")
    generation_kwargs = {
        "max_length": 50,
        "num_beams": 5,
        "early_stopping": True,
        "no_repeat_ngram_size": 2,
    }

    with torch.no_grad():
        token_ids = model.generate(**encoded, **generation_kwargs)

    return processor.decode(token_ids[0], skip_special_tokens=True)
|
src/detection.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import pipeline
|
| 2 |
+
|
| 3 |
+
# Built once at import time and reused by every detect_objects call.
detector = pipeline(
    task="object-detection",
    model="facebook/detr-resnet-50",
    device=-1,  # CPU
)
|
| 9 |
+
|
| 10 |
+
def detect_objects(image, threshold=0.7):
    """Detect objects in ``image`` and keep those scoring at least ``threshold``.

    Args:
        image: Image accepted by the module-level ``detector`` pipeline.
        threshold: Minimum confidence score a detection must reach.

    Returns:
        A ``(labels, boxes)`` tuple. ``labels`` lists each detected label once,
        in first-seen order. ``boxes`` has one dict per kept detection with the
        keys ``"label"``, ``"score"`` (rounded to 2 decimals) and ``"box"``.
    """
    # Pass the threshold to the pipeline as well. The pipeline applies its own
    # score cut-off before returning, so without this a caller asking for a
    # value below that default would silently get no extra detections.
    results = detector(image, threshold=threshold)
    filtered = [r for r in results if r["score"] >= threshold]
    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
    # let the label order, and the prompt built from it, vary between runs.
    labels = list(dict.fromkeys(r["label"] for r in filtered))
    boxes = [
        {"label": r["label"], "score": round(r["score"], 2), "box": r["box"]}
        for r in filtered
    ]
    return labels, boxes
|
src/reasoning.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 2 |
+
import torch
|
| 3 |
+
|
| 4 |
+
# Tokenizer and model are created at import time from the same checkpoint id.
_FLAN_T5_CHECKPOINT = "google/flan-t5-small"

tokenizer = AutoTokenizer.from_pretrained(_FLAN_T5_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_FLAN_T5_CHECKPOINT)
|
| 6 |
+
|
| 7 |
+
def reason(objects, caption, question):
    """Answer ``question`` from the detected objects and the scene caption.

    A few-shot prompt is filled with ``objects``, ``caption`` and ``question``,
    tokenized, and passed to the module-level seq2seq ``model`` (at most 40
    new tokens, gradients disabled). The returned string is the first line of
    whatever follows the last "Answer:" marker in the decoded output.
    """
    prompt = f"""
You are a visual reasoning system.

Use ONLY the given objects and scene.
Do NOT invent new events or actions.

If an action is visible, describe it.
If no clear action is visible, describe the scene simply.

Example:
Objects: person, dog
Scene: a man walking a dog on a path
Question: What is happening in this image?
Answer: A person is walking a dog outdoors.

Objects: car
Scene: a car on a race track
Question: What is happening in this image?
Answer: A car is driving on a race track.

Now answer:

Objects: {objects}
Scene: {caption}
Question: {question}

Answer:
"""

    encoded = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        generated = model.generate(**encoded, max_new_tokens=40)

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Keep only the text after the last "Answer:" marker (the whole string if
    # the marker is absent), then drop anything past the first line.
    after_marker = decoded.rpartition("Answer:")[2].strip()
    first_line, _, _ = after_marker.partition("\n")
    return first_line
|
src/utils.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .captioning import generate_caption
|
| 2 |
+
from .detection import detect_objects
|
| 3 |
+
from .reasoning import reason
|
| 4 |
+
|
| 5 |
+
def run_pipeline(image, question):
    """Caption the image, detect objects, then answer ``question`` about it.

    Returns:
        A ``(caption, labels, boxes, answer)`` tuple. The reasoning step
        receives the labels as a single comma-separated string.
    """
    scene_caption = generate_caption(image)
    detected_labels, detected_boxes = detect_objects(image)
    objects_text = ", ".join(detected_labels)
    reply = reason(objects_text, scene_caption, question)
    return scene_caption, detected_labels, detected_boxes, reply
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|