nasim-raj-laskar committed on
Commit
aeba2d0
·
1 Parent(s): 1330d31

initial deploy

Browse files
Files changed (11) hide show
  1. .gitignore +11 -0
  2. .python-version +1 -0
  3. app.py +90 -0
  4. pyproject.toml +13 -0
  5. requirements.txt +5 -0
  6. src/__init__.py +0 -0
  7. src/captioning.py +14 -0
  8. src/detection.py +18 -0
  9. src/reasoning.py +53 -0
  10. src/utils.py +9 -0
  11. uv.lock +0 -0
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+ .vscode/
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
app.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from PIL import ImageDraw, ImageFont
3
+ from src.utils import run_pipeline
4
+
5
# Palette cycled through by draw_boxes when colouring detections.
COLORS = [
    "#FF6B6B",
    "#4ECDC4",
    "#45B7D1",
    "#96CEB4",
    "#FFEAA7",
    "#DDA0DD",
    "#98D8C8",
    "#F7DC6F",
]
9
+
10
def draw_boxes(image, boxes):
    """Return a copy of ``image`` with one outlined, labelled box per detection.

    Args:
        image: Image object exposing ``copy()`` that ``ImageDraw.Draw`` can
            draw on (the app passes the uploaded PIL image).
        boxes: Iterable of dicts with keys ``"label"``, ``"score"`` and
            ``"box"``; ``"box"`` is itself a dict with ``"xmin"``, ``"ymin"``,
            ``"xmax"`` and ``"ymax"``.

    Returns:
        The annotated copy; the input image is left untouched.
    """
    tag_height = 18  # pixel height of the filled strip behind the label text
    img = image.copy()
    draw = ImageDraw.Draw(img)
    label_color = {}
    for det in boxes:
        label = det["label"]
        # Index the palette by the number of distinct labels seen so far, not
        # by the detection index: otherwise two different labels can end up
        # with the same colour even though unused palette entries remain.
        if label not in label_color:
            label_color[label] = COLORS[len(label_color) % len(COLORS)]
        color = label_color[label]

        b = det["box"]
        xmin, ymin = b["xmin"], b["ymin"]
        draw.rectangle([xmin, ymin, b["xmax"], b["ymax"]], outline=color, width=3)

        text = f"{label} {det['score']:.0%}"
        # Normally the tag sits just above the box. When the box is too close
        # to the top edge that position would be off-canvas, so the tag is
        # drawn inside the box instead of being clipped away.
        tag_top = ymin - tag_height if ymin >= tag_height else ymin
        draw.rectangle(
            [xmin, tag_top, xmin + len(text) * 7, tag_top + tag_height],
            fill=color,
        )
        draw.text((xmin + 2, tag_top + 2), text, fill="white")
    return img
23
+
24
def make_chips(labels):
    """Render detected labels as a row of pill-shaped HTML "chips".

    Args:
        labels: Collection of label strings; may be empty or ``None``.

    Returns:
        An HTML snippet: a grey "No objects detected" paragraph when there are
        no labels, otherwise a ``<div>`` containing one ``<span>`` per label in
        sorted order.
    """
    # Function-scope import keeps this block self-contained.
    import html

    if not labels:
        return "<p style='color:#888'>No objects detected</p>"

    chip_style = (
        "background:#2d2d2d;border:1px solid #555;color:#e0e0e0;"
        "padding:4px 10px;border-radius:20px;margin:3px;display:inline-block;"
        "font-size:13px"
    )
    # Escape each label so text containing markup characters is shown
    # literally instead of being interpreted as HTML by the browser.
    chips = "".join(
        f"<span style='{chip_style}'>{html.escape(label)}</span>"
        for label in sorted(labels)
    )
    return f"<div style='line-height:2'>{chips}</div>"
34
+
35
def process(image, question):
    """Gradio callback: run the full pipeline and shape its results for the UI.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
        question: Free-text question typed by the user; may be blank.

    Returns:
        A 4-tuple ``(annotated_image, chips_html, caption, answer)`` matching
        the order of the output components wired to the submit button.
    """
    if image is None:
        return None, "<p style='color:#f66'>No image provided</p>", "", ""

    # The question box advertises this text as its placeholder; use it when the
    # user submits without typing so the reasoning prompt is never left blank.
    question = (question or "").strip() or "What is happening in this image?"

    caption, labels, boxes, answer = run_pipeline(image, question)
    annotated = draw_boxes(image, boxes)
    chips = make_chips(labels)
    return annotated, chips, caption, answer
43
+
44
# Dark-theme overrides applied to the whole Gradio app.
CSS = """
body, .gradio-container { background:#1a1a2e !important; color:#e0e0e0 !important; font-family:'Segoe UI',sans-serif; }
.gr-button-primary { background:linear-gradient(135deg,#667eea,#764ba2) !important; border:none !important; color:#fff !important; font-weight:600 !important; }
.gr-button-primary:hover { opacity:.9 !important; transform:translateY(-1px); }
.gr-box, .gr-form, .gr-panel { background:#16213e !important; border:1px solid #2d2d5e !important; border-radius:12px !important; }
label { color:#a0a8c0 !important; font-size:12px !important; text-transform:uppercase; letter-spacing:.5px; }
textarea, input[type=text] { background:#0f3460 !important; color:#e0e0e0 !important; border:1px solid #2d2d5e !important; border-radius:8px !important; }
.output-card { background:#16213e; border:1px solid #2d2d5e; border-radius:12px; padding:16px; margin-top:8px; }
"""

# Layout: inputs and the annotated image side by side, then a second row with
# the detected-object chips, the caption and the reasoned answer.
with gr.Blocks() as demo:
    gr.Markdown(
        "<h1 style='text-align:center;background:linear-gradient(135deg,#667eea,#764ba2);"
        "-webkit-background-clip:text;-webkit-text-fill-color:transparent;margin-bottom:4px'>"
        "🧠 Visual Reasoning Engine</h1>"
        "<p style='text-align:center;color:#888;margin-top:0'>Object detection · Captioning · Visual Q&A</p>"
    )

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="📷 Upload Image")
            question_input = gr.Textbox(
                label="❓ Ask a question",
                placeholder="What is happening in this image?",
                lines=2,
            )
            submit_btn = gr.Button("▶ Run Analysis", variant="primary")

        with gr.Column(scale=1):
            annotated_output = gr.Image(label="🔍 Detected Objects", type="pil")

    with gr.Row():
        with gr.Column():
            gr.Markdown("<p style='color:#a0a8c0;font-size:12px;text-transform:uppercase;letter-spacing:.5px'>🧱 Detected Objects</p>")
            objects_output = gr.HTML()
        with gr.Column():
            caption_output = gr.Textbox(label="🧾 Caption", interactive=False)
        with gr.Column():
            answer_output = gr.Textbox(label="🧠 Reasoned Answer", interactive=False)

    # Output order must match the tuple returned by process().
    submit_btn.click(
        fn=process,
        inputs=[image_input, question_input],
        outputs=[annotated_output, objects_output, caption_output, answer_output],
    )

# Only start the server when executed as a script, so importing this module
# (tests, tooling) does not launch the app as a side effect.
if __name__ == "__main__":
    # Gradio 6 (the project pins gradio>=6.14.0) moved the app-level `css` and
    # `theme` options from the gr.Blocks() constructor to launch().
    demo.launch(css=CSS, theme=gr.themes.Base())
pyproject.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "vre"
3
+ version = "0.1.0"
4
+ description = "Visual Reasoning Engine: object detection, image captioning and visual Q&A in a Gradio app"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "gradio>=6.14.0",
9
+ "pillow>=12.2.0",
10
+ "timm>=1.0.26",
11
+ "torch>=2.11.0",
12
+ "transformers>=5.7.0",
13
+ ]
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=6.14.0
2
+ pillow>=12.2.0
3
+ timm>=1.0.26
4
+ torch>=2.11.0
5
+ transformers>=5.7.0
src/__init__.py ADDED
File without changes
src/captioning.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BlipProcessor, BlipForConditionalGeneration
2
+ import torch
3
+
4
# Load the BLIP processor and captioning model once, at import time; both come
# from the same checkpoint, so the name is kept in one place.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)
6
+
7
def generate_caption(image):
    """Return a caption for ``image`` generated by the module-level BLIP model.

    Args:
        image: Image accepted by the BLIP processor (the app supplies the
            uploaded PIL image).

    Returns:
        The decoded caption string with special tokens removed.
    """
    # Beam search, capped at 50 tokens, with repeated bigrams forbidden.
    generation_options = {
        "max_length": 50,
        "num_beams": 5,
        "early_stopping": True,
        "no_repeat_ngram_size": 2,
    }
    encoded = processor(image, return_tensors="pt")

    # Inference only: gradients are not needed.
    with torch.no_grad():
        token_ids = model.generate(**encoded, **generation_options)

    return processor.decode(token_ids[0], skip_special_tokens=True)
src/detection.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+
3
# Build the DETR object-detection pipeline a single time at import so every
# call to detect_objects reuses the already-loaded model.
detector = pipeline(
    task="object-detection",
    model="facebook/detr-resnet-50",
    device=-1,  # CPU
)
9
+
10
def detect_objects(image, threshold=0.7):
    """Detect objects in ``image`` and keep those scoring at least ``threshold``.

    Args:
        image: Image accepted by the module-level ``detector`` pipeline.
        threshold: Minimum confidence score for a detection to be kept.

    Returns:
        A ``(labels, boxes)`` tuple. ``labels`` is the alphabetically sorted
        list of distinct labels among the kept detections. ``boxes`` is a list
        of ``{"label", "score", "box"}`` dicts, one per kept detection, with
        the score rounded to two decimals.
    """
    # Forward the cut-off to the pipeline as well: it applies its own default
    # threshold (documented as 0.5) before returning results, so without this
    # a caller asking for a lower threshold would never see those detections.
    results = detector(image, threshold=threshold)
    kept = [r for r in results if r["score"] >= threshold]

    # Sort the distinct labels: plain set iteration order for strings can vary
    # between interpreter runs, which made the downstream prompt unstable.
    labels = sorted({r["label"] for r in kept})
    boxes = [
        {"label": r["label"], "score": round(r["score"], 2), "box": r["box"]}
        for r in kept
    ]
    return labels, boxes
src/reasoning.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
2
+ import torch
3
+
4
# Load the FLAN-T5 tokenizer and seq2seq model once, at import time; both come
# from the same checkpoint, so the name is kept in one place.
_FLAN_T5_CHECKPOINT = "google/flan-t5-small"

tokenizer = AutoTokenizer.from_pretrained(_FLAN_T5_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_FLAN_T5_CHECKPOINT)
6
+
7
def reason(objects, caption, question):
    """Answer ``question`` about a scene using the module-level seq2seq model.

    Args:
        objects: Comma-separated string of detected object labels.
        caption: Caption describing the scene.
        question: The user's question; blank or ``None`` falls back to the
            question used in the prompt's own examples.

    Returns:
        The first line of the model's answer, stripped of surrounding
        whitespace.
    """
    # A blank question would leave "Question:" empty (or render "None") in the
    # prompt; fall back to the question the few-shot examples already use.
    question = (question or "").strip() or "What is happening in this image?"

    prompt = f"""
You are a visual reasoning system.

Use ONLY the given objects and scene.
Do NOT invent new events or actions.

If an action is visible, describe it.
If no clear action is visible, describe the scene simply.

Example:
Objects: person, dog
Scene: a man walking a dog on a path
Question: What is happening in this image?
Answer: A person is walking a dog outdoors.

Objects: car
Scene: a car on a race track
Question: What is happening in this image?
Answer: A car is driving on a race track.

Now answer:

Objects: {objects}
Scene: {caption}
Question: {question}

Answer:
"""

    inputs = tokenizer(prompt, return_tensors="pt")

    # Inference only: gradients are not needed.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=40)

    raw = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Keep only the text after the last "Answer:" marker (in case the model
    # echoes one), then drop anything past the first line.
    answer = raw.rpartition("Answer:")[2].strip()
    return answer.partition("\n")[0].strip()
src/utils.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from .captioning import generate_caption
2
+ from .detection import detect_objects
3
+ from .reasoning import reason
4
+
5
def run_pipeline(image, question):
    """Caption the image, detect its objects and answer the question.

    Args:
        image: Image handed to both the captioning and detection steps.
        question: Question forwarded to the reasoning step.

    Returns:
        A ``(caption, labels, boxes, answer)`` tuple.
    """
    scene_caption = generate_caption(image)
    found_labels, found_boxes = detect_objects(image)

    # The reasoning step takes the labels as one comma-separated string.
    object_list = ", ".join(found_labels)
    reply = reason(object_list, scene_caption, question)

    return scene_caption, found_labels, found_boxes, reply
uv.lock ADDED
The diff for this file is too large to render. See raw diff