ProfRom committed (verified)
Commit 75e18d7 · 1 Parent(s): 6539c99

Test Gailey changes

Files changed (1):
  app.py +199 -9
app.py CHANGED
@@ -1,13 +1,203 @@
  import gradio as gr

- def greet(name, intensity):
-     return "Hello, " + name + "!" * int(intensity)

- demo = gr.Interface(
-     fn=greet,
-     inputs=["text", "slider"],
-     outputs=["text"],
- )

- print("Demo is launching.")
- demo.launch()
 
+ # app.py — Lazy Loaded Multimodal AI System
+ #
+ # Models load ONLY when needed to avoid memory overflow
+ # Works on Hugging Face free CPU Spaces
+
+ import torch
  import gradio as gr

+ device = torch.device("cpu")
+
+
+ # ---------------------------------------------------------
+ # LAZY MODEL LOADERS
+ # ---------------------------------------------------------
+
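+ # NOTE: each loader below rebuilds its model on every request, so the
+ # first run of a task is slow but peak memory stays low; caching the
+ # returned objects in module-level variables would trade RAM for speed.
+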
+ def load_caption_model():
+     from transformers import BlipProcessor, BlipForConditionalGeneration
+     model_name = "Salesforce/blip-image-captioning-base"
+     processor = BlipProcessor.from_pretrained(model_name)
+     model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)
+     return processor, model
+
+
+ def load_sentiment_model():
+     from transformers import pipeline
+     return pipeline(
+         "sentiment-analysis",
+         model="distilbert-base-uncased-finetuned-sst-2-english"
+     )
+
+
+ def load_vqa_model():
+     from transformers import BlipProcessor, BlipForQuestionAnswering
+     model_name = "Salesforce/blip-vqa-base"
+     processor = BlipProcessor.from_pretrained(model_name)
+     model = BlipForQuestionAnswering.from_pretrained(model_name).to(device)
+     return processor, model
+
+
+ def load_detr_model():
+     from transformers import DetrImageProcessor, DetrForObjectDetection
+     processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
+     model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(device)
+     return processor, model
+
+
+ def load_vit_model():
+     from transformers import ViTImageProcessor, ViTForImageClassification
+     model_name = "google/vit-base-patch16-224"
+     processor = ViTImageProcessor.from_pretrained(model_name)
+     model = ViTForImageClassification.from_pretrained(model_name).to(device)
+     return processor, model
+
+
+ def load_llm():
+     from transformers import AutoTokenizer, AutoModelForCausalLM
+     name = "gpt2"
+     tokenizer = AutoTokenizer.from_pretrained(name)
+     model = AutoModelForCausalLM.from_pretrained(name).to(device)
+     return tokenizer, model
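+
+ # NOTE: gpt2 (~124M parameters) is small enough to generate on a free CPU
+ # Space; any other AutoModelForCausalLM checkpoint could be swapped in here.
+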
+ # ---------------------------------------------------------
+ # TASK FUNCTIONS
+ # ---------------------------------------------------------
+
+ def generate_caption(image):
+     processor, model = load_caption_model()
+     inputs = processor(images=image, return_tensors="pt").to(device)
+     with torch.no_grad():
+         out_ids = model.generate(**inputs, max_new_tokens=30)
+     return processor.decode(out_ids[0], skip_special_tokens=True)
+
+
+ def analyze_sentiment(text):
+     sentiment = load_sentiment_model()
+     out = sentiment(text)[0]
+     return out["label"], round(out["score"] * 100, 2)
+
+
+ def vqa_answer(image, question):
+     processor, model = load_vqa_model()
+     inputs = processor(images=image, text=question, return_tensors="pt").to(device)
+     with torch.no_grad():
+         out = model.generate(**inputs)
+     return processor.decode(out[0], skip_special_tokens=True)
+
+
+ def detect_objects(image):
+     processor, model = load_detr_model()
+     inputs = processor(images=image, return_tensors="pt").to(device)
+
+     with torch.no_grad():
+         outputs = model(**inputs)
+
+     target_sizes = torch.tensor([image.size[::-1]])
+     # Pass threshold=0.3 explicitly: post_process_object_detection defaults
+     # to 0.5, which would silently override the looser 0.3 filter below.
+     results = processor.post_process_object_detection(
+         outputs, threshold=0.3, target_sizes=target_sizes
+     )[0]
+
+     detections = []
+     for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+         if score > 0.3:
+             detections.append(
+                 f"{model.config.id2label[label.item()]} (score {round(score.item(), 2)})"
+             )
+     if len(detections) == 0:
+         return ["No high-confidence objects detected"]
+     return detections
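+
+ # NOTE: the `box` tensors are unpacked but unused above; after
+ # post_process_object_detection they hold absolute (xmin, ymin, xmax, ymax)
+ # pixel coordinates, should bounding-box overlays ever be wanted.
+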
+ def classify_scene(image):
+     processor, model = load_vit_model()
+     inputs = processor(images=image, return_tensors="pt").to(device)
+     with torch.no_grad():
+         logits = model(**inputs).logits
+     label = logits.argmax(-1).item()
+     return model.config.id2label[label]
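+
+ # NOTE: google/vit-base-patch16-224 predicts one of the 1,000 ImageNet-1k
+ # classes, so the "scene" label is really an object-level category.
+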
+ def rewrite_caption(caption, style):
+     tokenizer, model = load_llm()
+
+     if style == "Short":
+         prompt = f"Summarize: {caption}"
+     elif style == "Creative":
+         prompt = f"Rewrite creatively: {caption}"
+     elif style == "Technical":
+         prompt = f"Rewrite in technical detail: {caption}"
+     else:
+         prompt = caption
+
+     inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
+     with torch.no_grad():
+         outputs = model.generate(inputs, max_new_tokens=60)
+     return tokenizer.decode(outputs[0], skip_special_tokens=True)
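+
+ # NOTE: decode() returns the prompt plus the continuation, so the rewritten
+ # caption comes back prefixed with "Summarize: ..." etc.; slicing off
+ # len(prompt) characters would leave only the newly generated text.
+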
+ def extract_metadata(image):
+     width, height = image.size
+     meta = f"Dimensions: {width} x {height}\n"
+     meta += "EXIF data detected\n" if "exif" in image.info else "No EXIF data available\n"
+     return meta
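+
+ # NOTE: Pillow exposes raw EXIF bytes as image.info["exif"] for JPEGs;
+ # PNG uploads usually lack the key, so "No EXIF data" is expected there.
+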
+ # ---------------------------------------------------------
+ # MAIN LOGIC
+ # ---------------------------------------------------------
+
+ def process_all(image, question, style):
+     if image is None:
+         # NOTE: "No image" also lands in the gr.Number output below;
+         # returning 0.0 in that slot would be safer.
+         return ["No image"] * 8
+
+     caption = generate_caption(image)
+     # sentiment is scored on the generated caption, not on the image itself
+     sentiment_label, sentiment_score = analyze_sentiment(caption)
+     vqa = vqa_answer(image, question) if question else "No question asked"
+     objects = detect_objects(image)
+     scene = classify_scene(image)
+     rewritten = rewrite_caption(caption, style)
+     metadata = extract_metadata(image)
+
+     return caption, sentiment_label, sentiment_score, vqa, objects, scene, rewritten, metadata
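+
+ # NOTE: the return order above must match the outputs list wired into
+ # run_btn.click below.
+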
+ # ---------------------------------------------------------
+ # GRADIO UI - BLOCKS
+ # ---------------------------------------------------------
+
+ with gr.Blocks(title="Multimodal AI System (Lazy Loaded)") as demo:
+     gr.Markdown("# **Multimodal AI System (Emotion Removed)**")
+
+     with gr.Row():
+         image_input = gr.Image(type="pil", label="Upload Image")
+         question_input = gr.Textbox(label="Ask a Question")
+         style_input = gr.Dropdown(["Short", "Creative", "Technical"], label="Caption Style")
+
+     run_btn = gr.Button("Run All AI Tools")
+
+     caption = gr.Textbox(label="Generated Caption")
+     sentiment_label = gr.Textbox(label="Sentiment Label")
+     sentiment_score = gr.Number(label="Sentiment Score")
+     vqa_output = gr.Textbox(label="VQA Answer")
+     objects_output = gr.JSON(label="Detected Objects")
+     scene_output = gr.Textbox(label="Scene Classification")
+     rewritten_output = gr.Textbox(label="Rewritten Caption")
+     metadata_output = gr.Textbox(label="Image Metadata")
+
+     run_btn.click(
+         process_all,
+         [image_input, question_input, style_input],
+         [
+             caption,
+             sentiment_label,
+             sentiment_score,
+             vqa_output,
+             objects_output,
+             scene_output,
+             rewritten_output,
+             metadata_output
+         ]
+     )
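+
+ # NOTE: because everything is lazy-loaded, the first click downloads and
+ # initializes six separate models, so a cold run can take minutes on CPU.
+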
+ if __name__ == "__main__":
+     # the guard lets app.py be imported without immediately launching the UI
+     demo.launch()