ProfRom committed (verified)
Commit f2f22f7 · 1 Parent(s): 7ddee74

Gailey - Sanity Check 3

Files changed (1)
  1. app.py +317 -67
app.py CHANGED
@@ -1,72 +1,322 @@

+# app.py — Lazy Loaded Multimodal AI System
+#
+# Models load ONLY when needed to avoid memory overflow
+# Works on Hugging Face free CPU Spaces
+
+import torch
import gradio as gr
-from transformers import pipeline
-
-# BLIP captioning
-caption_pipeline = pipeline(
-    task="image-to-text",
-    model="Salesforce/blip-image-captioning-base"
-)
-
-# BLIP VQA
-vqa_pipeline = pipeline(
-    task="visual-question-answering",
-    model="Salesforce/blip-vqa-base"
-)
-
-# CLIP zero-shot classification
-clip_pipeline = pipeline(
-    task="zero-shot-image-classification",
-    model="openai/clip-vit-base-patch32"
-)
-
-def process_image(image, question, labels):
-    # Caption
-    caption_result = caption_pipeline(image)
-    caption = caption_result[0]["generated_text"]
-
-    # VQA
-    if question and question.strip():
-        vqa_result = vqa_pipeline(image=image, question=question)
-        vqa_answer = vqa_result[0]["answer"]
-    else:
-        vqa_answer = "No question provided."
-
-    # CLIP Classification
-    if labels and labels.strip():
-        candidate_labels = [l.strip() for l in labels.split(",") if l.strip()]
-        if candidate_labels:
-            # NOTE: use 'images=' or positional arg
-            clip_result = clip_pipeline(images=image, candidate_labels=candidate_labels)
-            clip_output = "\n".join(
-                f"{item['label']}: {round(item['score'] * 100, 1)}%"
-                for item in clip_result
+
+device = torch.device("cpu")
+
+
+# ---------------------------------------------------------
+# LAZY MODEL LOADERS
+# ---------------------------------------------------------
+
+def load_caption_model():
+    from transformers import BlipProcessor, BlipForConditionalGeneration
+    model_name = "Salesforce/blip-image-captioning-base"
+    processor = BlipProcessor.from_pretrained(model_name)
+    model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)
+    return processor, model
+
+
+def load_sentiment_model():
+    from transformers import pipeline
+    return pipeline(
+        "sentiment-analysis",
+        model="distilbert-base-uncased-finetuned-sst-2-english"
+    )
+
+
+def load_vqa_model():
+    from transformers import BlipProcessor, BlipForQuestionAnswering
+    model_name = "Salesforce/blip-vqa-base"
+    processor = BlipProcessor.from_pretrained(model_name)
+    model = BlipForQuestionAnswering.from_pretrained(model_name).to(device)
+    return processor, model
+
+
+def load_detr_model():
+    from transformers import DetrImageProcessor, DetrForObjectDetection
+    processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
+    model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(device)
+    return processor, model
+
+
+def load_vit_model():
+    from transformers import ViTImageProcessor, ViTForImageClassification
+    model_name = "google/vit-base-patch16-224"
+    processor = ViTImageProcessor.from_pretrained(model_name)
+    model = ViTForImageClassification.from_pretrained(model_name).to(device)
+    return processor, model
+
+
+# NEW — more verbose, less repetitive rewrite model
+def load_llm():
+    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+    name = "google/flan-t5-large"
+    tokenizer = AutoTokenizer.from_pretrained(name)
+    model = AutoModelForSeq2SeqLM.from_pretrained(name).to(device)
+    return tokenizer, model
+
+
+# ---------------------------------------------------------
+# TASKS
+# ---------------------------------------------------------
+
+def generate_caption(image):
+    processor, model = load_caption_model()
+    inputs = processor(images=image, return_tensors="pt").to(device)
+    with torch.no_grad():
+        out_ids = model.generate(**inputs, max_new_tokens=30)
+    return processor.decode(out_ids[0], skip_special_tokens=True)
+
+
+def analyze_sentiment(text):
+    sentiment = load_sentiment_model()
+    out = sentiment(text)[0]
+    return out["label"], round(out["score"] * 100, 2)
+
+
+def vqa_answer(image, question):
+    processor, model = load_vqa_model()
+    inputs = processor(images=image, text=question, return_tensors="pt").to(device)
+    with torch.no_grad():
+        out = model.generate(**inputs)
+    return processor.decode(out[0], skip_special_tokens=True)
+
+
+def detect_objects(image):
+    processor, model = load_detr_model()
+    inputs = processor(images=image, return_tensors="pt").to(device)
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    target_sizes = torch.tensor([image.size[::-1]])
+    results = processor.post_process_object_detection(outputs, target_sizes=target_sizes)[0]
+
+    detections = []
+    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+        if score > 0.3:
+            detections.append(
+                f"{model.config.id2label[label.item()]} (score {round(score.item(), 2)})"
            )
-        else:
-            clip_output = "No valid labels provided."
+    if len(detections) == 0:
+        return ["No high-confidence objects detected"]
+    return detections
+
+
+def classify_scene(image):
+    processor, model = load_vit_model()
+    inputs = processor(images=image, return_tensors="pt").to(device)
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    label = logits.argmax(-1).item()
+    return model.config.id2label[label]
+
+
+# ---------------------------------------------------------
+# REWRITE CAPTIONS (8 STYLE SYSTEM + LENGTH SLIDER)
+# ---------------------------------------------------------
+
+def _build_style_prompt(caption, style):
+    base = (
+        "Rewrite the following image caption. "
+        "Keep the original meaning and important details, "
+        "but change the wording significantly and avoid repeating sentences verbatim. "
+        "Do not just copy the original text.\n\n"
+        f"Original caption:\n{caption}\n\n"
+    )
+
+    if style == "Short":
+        return (
+            base
+            + "Now produce a shorter, compact version in one or two sentences."
+        )
+    elif style == "Creative":
+        return (
+            base
+            + "Rewrite it in a colorful, imaginative, and richly descriptive style."
+        )
+    elif style == "Technical":
+        return (
+            base
+            + "Rewrite it in a highly technical, analytical style using precise visual terminology."
+        )
+    elif style == "Humorous":
+        return (
+            base
+            + "Rewrite it with a fun, humorous, witty tone while keeping the meaning."
+        )
+    elif style == "Poetic":
+        return (
+            base
+            + "Rewrite it in a poetic, rhythmic, metaphorical style using sensory language."
+        )
+    elif style == "Cinematic":
+        return (
+            base
+            + "Rewrite it as if describing an epic cinematic movie scene with dramatic, vivid imagery."
+        )
+    elif style == "Journalistic":
+        return (
+            base
+            + "Rewrite it in a factual, neutral, journalistic news-reporting style."
+        )
+    elif style == "Academic":
+        return (
+            base
+            + "Rewrite it in a formal, academic style with clear, analytical phrasing."
+        )
    else:
-        clip_output = "No labels provided."
-
-    return caption, vqa_answer, clip_output
-
-
-demo = gr.Interface(
-    fn=process_image,
-    inputs=[
-        gr.Image(type="pil", label="Upload an image"),
-        gr.Textbox(label="Ask a question about the image (optional)"),
-        gr.Textbox(
-            label="Enter CLIP classification labels (comma-separated)",
-            placeholder="e.g., man, boy, park, snow, happiness",
-        ),
-    ],
-    outputs=[
-        gr.Textbox(label="Generated Caption"),
-        gr.Textbox(label="VQA Answer"),
-        gr.Textbox(label="CLIP Classification Scores"),
-    ],
-    title="Multimodal AI — Captioning + VQA + Zero-Shot Classification",
-)
-
-demo.launch()
+        # Fallback: treat unknown style as creative rewrite
+        return (
+            base
+            + "Rewrite it in a natural, descriptive style."
+        )
+
+
+def rewrite_caption(caption, style, length):
+    tokenizer, model = load_llm()
+
+    prompt = _build_style_prompt(caption, style)
+
+    # Tokenize
+    inputs = tokenizer(prompt, return_tensors="pt").to(device)
+
+    # First pass: normal creative decoding
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=length,
+            do_sample=True,
+            temperature=0.9,
+            top_p=0.9,
+            no_repeat_ngram_size=3,
+            repetition_penalty=1.2,
+        )
+
+    rewritten = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
+
+    # If the model basically echoed the caption, try a second, more forceful pass.
+    if rewritten.lower().strip() == caption.lower().strip():
+        strong_prompt = (
+            "Paraphrase and expand the following caption. "
+            "Use different wording and add extra detail, but keep the meaning. "
+            "Do not repeat the original sentence exactly.\n\n"
+            f"Original caption:\n{caption}"
+        )
+        strong_inputs = tokenizer(strong_prompt, return_tensors="pt").to(device)
+
+        with torch.no_grad():
+            outputs2 = model.generate(
+                **strong_inputs,
+                max_new_tokens=length,
+                do_sample=True,
+                temperature=1.0,
+                top_p=0.95,
+                no_repeat_ngram_size=3,
+                repetition_penalty=1.3,
+            )
+        rewritten2 = tokenizer.decode(outputs2[0], skip_special_tokens=True).strip()
+
+        # Only replace if it actually changed something
+        if rewritten2 and rewritten2.lower().strip() != caption.lower().strip():
+            rewritten = rewritten2
+
+    return rewritten
+
+
+def extract_metadata(image):
+    width, height = image.size
+    meta = f"Dimensions: {width} x {height}\n"
+    meta += "EXIF data detected\n" if "exif" in image.info else "No EXIF data available\n"
+    return meta
+
+
+# ---------------------------------------------------------
+# MAIN LOOP
+# ---------------------------------------------------------
+
+def process_all(image, question, style, length):
+    if image is None:
+        return ["No image"] * 8
+
+    caption = generate_caption(image)
+    sentiment_label, sentiment_score = analyze_sentiment(caption)
+    vqa = vqa_answer(image, question) if question else "No question asked"
+    objects = detect_objects(image)
+    scene = classify_scene(image)
+    rewritten = rewrite_caption(caption, style, length)
+    metadata = extract_metadata(image)
+
+    return caption, sentiment_label, sentiment_score, vqa, objects, scene, rewritten, metadata
+
+
+# ---------------------------------------------------------
+# GRADIO UI
+# ---------------------------------------------------------
+
+with gr.Blocks(title="Multimodal AI System (Lazy Loaded)") as demo:
+    gr.Markdown("# **Multimodal AI System**")
+
+    with gr.Row():
+        image_input = gr.Image(type="pil", label="Upload Image")
+        question_input = gr.Textbox(label="Ask a Question")
+
+    style_input = gr.Dropdown(
+        [
+            "Short",
+            "Creative",
+            "Technical",
+            "Humorous",
+            "Poetic",
+            "Cinematic",
+            "Journalistic",
+            "Academic"
+        ],
+        label="Rewrite Style"
+    )
+
+    # New: length slider
+    length_slider = gr.Slider(
+        minimum=20,
+        maximum=200,
+        value=80,
+        step=10,
+        label="Rewrite Length (Max Tokens)"
+    )
+
+    run_btn = gr.Button("Run All Tools")
+
+    caption = gr.Textbox(label="Generated Caption")
+    sentiment_label = gr.Textbox(label="Sentiment Label")
+    sentiment_score = gr.Number(label="Sentiment Score")
+    vqa_output = gr.Textbox(label="VQA Answer")
+    objects_output = gr.JSON(label="Detected Objects")
+    scene_output = gr.Textbox(label="Scene Classification")
+    rewritten_output = gr.Textbox(label="Rewritten Caption")
+    metadata_output = gr.Textbox(label="Image Metadata")
+
+    run_btn.click(
+        process_all,
+        [image_input, question_input, style_input, length_slider],
+        [
+            caption,
+            sentiment_label,
+            sentiment_score,
+            vqa_output,
+            objects_output,
+            scene_output,
+            rewritten_output,
+            metadata_output
+        ]
+    )
+
+
+if __name__ == "__main__":
+    demo.launch()

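
For reviewers who want to sanity-check this commit locally, a minimal smoke test is sketched below. It is not part of the commit: it assumes app.py sits in the working directory and that a sample image exists at the hypothetical path "test.jpg". Importing app builds the Gradio Blocks without starting the server, since demo.launch() is guarded by `if __name__ == "__main__"`.

# Hypothetical smoke test (not in this commit): run process_all once
# without the Gradio UI. "test.jpg" is a placeholder path.
from PIL import Image

from app import process_all

image = Image.open("test.jpg").convert("RGB")
results = process_all(image, "What is happening?", "Creative", 80)

names = [
    "caption", "sentiment_label", "sentiment_score", "vqa_answer",
    "detected_objects", "scene", "rewritten_caption", "metadata",
]
for name, value in zip(names, results):
    print(f"{name}: {value}")

Note the trade-off behind the lazy loaders: each call to a load_* function rebuilds its model from the local Hugging Face cache, which keeps peak memory low on a free CPU Space but makes repeated runs slow. If memory allows, wrapping the loaders in functools.lru_cache(maxsize=1) would preserve the lazy first load while reusing the instantiated models on later calls.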