Spaces:

Nasim435
/

VRE

Sleeping

App Files Files Community

nasim-raj-laskar committed 25 days ago

Commit

aeba2d0

1 Parent(s): 1330d31

initial deploy

Browse files

Files changed (11) hide show

.gitignore +11 -0
.python-version +1 -0
app.py +90 -0
pyproject.toml +13 -0
requirements.txt +5 -0
src/__init__.py +0 -0
src/captioning.py +14 -0
src/detection.py +18 -0
src/reasoning.py +53 -0
src/utils.py +9 -0
uv.lock +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,11 @@

+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+# Virtual environments
+.venv
+.vscode/

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.12

app.py ADDED Viewed

	@@ -0,0 +1,90 @@

+import gradio as gr
+from PIL import ImageDraw, ImageFont
+from src.utils import run_pipeline
# Outline/tag colours, handed out one per distinct label (cycled when exhausted).
COLORS = [
    "#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4",
    "#FFEAA7", "#DDA0DD", "#98D8C8", "#F7DC6F",
]


def draw_boxes(image, boxes):
    """Return a copy of *image* with one outlined, labelled box per detection.

    Args:
        image: Object providing ``copy()`` that ``ImageDraw.Draw`` can draw on
            (the app passes the PIL image uploaded by the user).
        boxes: Iterable of dicts with keys ``"label"``, ``"score"`` and
            ``"box"``; ``"box"`` is itself a dict with ``"xmin"``, ``"ymin"``,
            ``"xmax"`` and ``"ymax"`` pixel coordinates.

    Returns:
        The annotated copy; the input image is left untouched.
    """
    tag_height = 18  # pixel height of the filled strip behind the caption text
    img = image.copy()
    draw = ImageDraw.Draw(img)
    label_color = {}
    for det in boxes:
        label = det["label"]
        # Pick the next palette entry per *distinct* label. Indexing by the
        # detection's position could give two labels the same colour while
        # other palette entries were still unused.
        if label not in label_color:
            label_color[label] = COLORS[len(label_color) % len(COLORS)]
        color = label_color[label]

        b = det["box"]
        draw.rectangle(
            [b["xmin"], b["ymin"], b["xmax"], b["ymax"]], outline=color, width=3
        )

        text = f"{label} {det['score']:.0%}"
        # Put the tag above the box when there is room; otherwise tuck it just
        # inside the top edge so it is not drawn off-canvas.
        tag_top = b["ymin"] - tag_height if b["ymin"] >= tag_height else b["ymin"]
        # Measure the rendered text instead of assuming a fixed glyph width.
        tag_right = b["xmin"] + draw.textlength(text) + 4
        draw.rectangle(
            [b["xmin"], tag_top, tag_right, tag_top + tag_height], fill=color
        )
        draw.text((b["xmin"] + 2, tag_top + 2), text, fill="white")
    return img
def make_chips(labels):
    """Render *labels* as a row of rounded HTML "chip" badges.

    Args:
        labels: Iterable of label values; may be empty or ``None``.

    Returns:
        An HTML string: a placeholder paragraph when there are no labels,
        otherwise a ``<div>`` holding one ``<span>`` per label in sorted order.
        Label text is HTML-escaped before being embedded.
    """
    # Local import keeps this block self-contained; only the stdlib is used.
    from html import escape

    if not labels:
        return "<p style='color:#888'>No objects detected</p>"
    chip_style = (
        "background:#2d2d2d;border:1px solid #555;color:#e0e0e0;"
        "padding:4px 10px;border-radius:20px;margin:3px;display:inline-block;"
        "font-size:13px"
    )
    chips = "".join(
        # Escape so a label containing <, >, & or quotes cannot alter the markup.
        f"<span style='{chip_style}'>{escape(str(label))}</span>"
        for label in sorted(labels)
    )
    return f"<div style='line-height:2'>{chips}</div>"
def process(image, question):
    """Gradio click handler: analyse *image* and answer *question* about it.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
        question: Free-text question from the UI; may be empty.

    Returns:
        A 4-tuple ``(annotated_image, chips_html, caption, answer)``. When no
        image is supplied the tuple carries an error message in the HTML slot
        and empty values elsewhere, and the models are not invoked.
    """
    if image is None:
        return None, "<p style='color:#f66'>No image provided</p>", "", ""
    # A blank question box would otherwise send an empty "Question:" to the
    # pipeline; fall back to the question the UI advertises as its placeholder.
    question = (question or "").strip() or "What is happening in this image?"
    caption, labels, boxes, answer = run_pipeline(image, question)
    return draw_boxes(image, boxes), make_chips(labels), caption, answer
# Dark-theme overrides applied to the whole app.
# NOTE(review): selectors such as `.gr-button-primary` / `.gr-box` look like
# class names from older Gradio releases — confirm they still match the DOM.
CSS = """
body, .gradio-container { background:#1a1a2e !important; color:#e0e0e0 !important; font-family:'Segoe UI',sans-serif; }
.gr-button-primary { background:linear-gradient(135deg,#667eea,#764ba2) !important; border:none !important; color:#fff !important; font-weight:600 !important; }
.gr-button-primary:hover { opacity:.9 !important; transform:translateY(-1px); }
.gr-box, .gr-form, .gr-panel { background:#16213e !important; border:1px solid #2d2d5e !important; border-radius:12px !important; }
label { color:#a0a8c0 !important; font-size:12px !important; text-transform:uppercase; letter-spacing:.5px; }
textarea, input[type=text] { background:#0f3460 !important; color:#e0e0e0 !important; border:1px solid #2d2d5e !important; border-radius:8px !important; }
.output-card { background:#16213e; border:1px solid #2d2d5e; border-radius:12px; padding:16px; margin-top:8px; }
"""

# App-level styling (`css`, `theme`) is passed to `launch()` below rather than
# to `gr.Blocks()`: the project pins gradio>=6, where these options belong to
# `launch()`.
with gr.Blocks() as demo:
    # Page header.
    gr.Markdown(
        "<h1 style='text-align:center;background:linear-gradient(135deg,#667eea,#764ba2);"
        "-webkit-background-clip:text;-webkit-text-fill-color:transparent;margin-bottom:4px'>"
        "🧠 Visual Reasoning Engine</h1>"
        "<p style='text-align:center;color:#888;margin-top:0'>Object detection · Captioning · Visual Q&A</p>"
    )
    # Top row: inputs on the left, annotated image on the right.
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="📷 Upload Image")
            question_input = gr.Textbox(
                label="❓ Ask a question",
                placeholder="What is happening in this image?",
                lines=2,
            )
            submit_btn = gr.Button("▶ Run Analysis", variant="primary")
        with gr.Column(scale=1):
            annotated_output = gr.Image(label="🔍 Detected Objects", type="pil")
    # Bottom row: label chips, caption and reasoned answer side by side.
    with gr.Row():
        with gr.Column():
            gr.Markdown("<p style='color:#a0a8c0;font-size:12px;text-transform:uppercase;letter-spacing:.5px'>🧱 Detected Objects</p>")
            objects_output = gr.HTML()
        with gr.Column():
            caption_output = gr.Textbox(label="🧾 Caption", interactive=False)
        with gr.Column():
            answer_output = gr.Textbox(label="🧠 Reasoned Answer", interactive=False)
    # The output order must match the tuple returned by `process`.
    submit_btn.click(
        fn=process,
        inputs=[image_input, question_input],
        outputs=[annotated_output, objects_output, caption_output, answer_output],
    )

# Only start the server when run as a script, so importing this module
# (e.g. for tests or tooling) does not block on a running web server.
if __name__ == "__main__":
    demo.launch(css=CSS, theme=gr.themes.Base())

pyproject.toml ADDED Viewed

	@@ -0,0 +1,13 @@

+[project]
+name = "vre"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "gradio>=6.14.0",
+    "pillow>=12.2.0",
+    "timm>=1.0.26",
+    "torch>=2.11.0",
+    "transformers>=5.7.0",
+]

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio>=6.14.0
+pillow>=12.2.0
+timm>=1.0.26
+torch>=2.11.0
+transformers>=5.7.0

src/__init__.py ADDED Viewed

File without changes

src/captioning.py ADDED Viewed

	@@ -0,0 +1,14 @@

+from transformers import BlipProcessor, BlipForConditionalGeneration
+import torch
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# Instantiated at import time so repeated calls share one processor/model pair.
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)


def generate_caption(image):
    """Return a caption string for *image* generated by the BLIP model.

    The image is encoded with the module-level ``processor``, decoded with
    beam search (5 beams, at most 50 tokens, no repeated bigrams) and the
    first generated sequence is returned with special tokens removed.
    """
    generation_options = {
        "max_length": 50,
        "num_beams": 5,
        "early_stopping": True,
        "no_repeat_ngram_size": 2,
    }
    encoded = processor(image, return_tensors="pt")
    # Inference only: gradients are not needed.
    with torch.no_grad():
        token_ids = model.generate(**encoded, **generation_options)
    return processor.decode(token_ids[0], skip_special_tokens=True)

src/detection.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from transformers import pipeline
# Load once at import time so every call reuses the same pipeline.
detector = pipeline(
    "object-detection",
    model="facebook/detr-resnet-50",
    device=-1,  # CPU
)


def detect_objects(image, threshold=0.7):
    """Detect objects in *image* and keep those scoring at least *threshold*.

    Args:
        image: Image accepted by the module-level ``detector`` pipeline.
        threshold: Minimum confidence score for a detection to be kept.

    Returns:
        ``(labels, boxes)`` where ``labels`` lists each distinct label once,
        in first-seen order, and ``boxes`` is a list of dicts with keys
        ``"label"``, ``"score"`` (rounded to 2 decimals) and ``"box"``.
    """
    # Forward the threshold so the pipeline's own cut-off does not silently
    # discard detections the caller asked to keep.
    results = detector(image, threshold=threshold)
    filtered = [r for r in results if r["score"] >= threshold]
    # dict.fromkeys de-duplicates while keeping a deterministic order; a plain
    # set would make the label order vary between runs.
    labels = list(dict.fromkeys(r["label"] for r in filtered))
    boxes = [
        {"label": r["label"], "score": round(r["score"], 2), "box": r["box"]}
        for r in filtered
    ]
    return labels, boxes

src/reasoning.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import torch
# Loaded once at import time and shared by every call to `reason`.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")


def _one_line(value, fallback):
    """Collapse *value* onto a single line, or return *fallback* when blank."""
    text = "" if value is None else " ".join(str(value).split())
    return text or fallback


def reason(objects, caption, question):
    """Answer *question* about a scene described by *objects* and *caption*.

    Args:
        objects: Comma-separated object labels; may be empty.
        caption: Text description of the scene.
        question: The user's question; may be empty.

    Returns:
        The first line of the model's decoded output, with any leading
        ``Answer:`` prefix removed.
    """
    # The prompt is line-oriented ("Objects:", "Scene:", ...), so each field
    # must stay on one line, and blank fields get an explicit stand-in rather
    # than leaving the model an empty slot.
    objects = _one_line(objects, "none")
    caption = _one_line(caption, "unknown")
    question = _one_line(question, "What is happening in this image?")
    prompt = f"""
You are a visual reasoning system.
Use ONLY the given objects and scene.
Do NOT invent new events or actions.
If an action is visible, describe it.
If no clear action is visible, describe the scene simply.
Example:
Objects: person, dog
Scene: a man walking a dog on a path
Question: What is happening in this image?
Answer: A person is walking a dog outdoors.
Objects: car
Scene: a car on a race track
Question: What is happening in this image?
Answer: A car is driving on a race track.
Now answer:
Objects: {objects}
Scene: {caption}
Question: {question}
Answer:
"""
    inputs = tokenizer(prompt, return_tensors="pt")
    # Inference only: gradients are not needed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=40,
        )
    raw = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only what follows the last "Answer:" marker, if the model echoed one.
    answer = raw.split("Answer:")[-1].strip()
    # Drop anything after the first line (accidental extra output).
    return answer.partition("\n")[0].strip()

src/utils.py ADDED Viewed

	@@ -0,0 +1,9 @@

+from .captioning import generate_caption
+from .detection import detect_objects
+from .reasoning import reason
def run_pipeline(image, question):
    """Run captioning, object detection and reasoning for one image.

    Returns:
        ``(caption, labels, boxes, answer)``: the generated caption, the
        detected labels, the per-detection box records, and the answer that
        ``reason`` produced from the comma-joined labels, the caption and
        *question*.
    """
    scene_caption = generate_caption(image)
    detected_labels, detected_boxes = detect_objects(image)
    object_list = ", ".join(detected_labels)
    reply = reason(object_list, scene_caption, question)
    return scene_caption, detected_labels, detected_boxes, reply

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff