ProfRom committed on
Commit
c39a86b
·
verified ·
1 Parent(s): 75e18d7

Poojary Sanity Check 1

Browse files
Files changed (1) hide show
  1. app.py +35 -202
app.py CHANGED
@@ -1,203 +1,36 @@
1
- # app.py — Lazy Loaded Multimodal AI System
2
- #
3
- # Models load ONLY when needed to avoid memory overflow
4
- # Works on Hugging Face free CPU Spaces
5
 
6
- import torch
7
- import gradio as gr
8
-
9
- device = torch.device("cpu")
10
-
11
-
12
- # ---------------------------------------------------------
13
- # LAZY MODEL LOADERS
14
- # ---------------------------------------------------------
15
-
16
- def load_caption_model():
17
- from transformers import BlipProcessor, BlipForConditionalGeneration
18
- model_name = "Salesforce/blip-image-captioning-base"
19
- processor = BlipProcessor.from_pretrained(model_name)
20
- model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)
21
- return processor, model
22
-
23
-
24
- def load_sentiment_model():
25
- from transformers import pipeline
26
- return pipeline(
27
- "sentiment-analysis",
28
- model="distilbert-base-uncased-finetuned-sst-2-english"
29
- )
30
-
31
-
32
- def load_vqa_model():
33
- from transformers import BlipProcessor, BlipForQuestionAnswering
34
- model_name = "Salesforce/blip-vqa-base"
35
- processor = BlipProcessor.from_pretrained(model_name)
36
- model = BlipForQuestionAnswering.from_pretrained(model_name).to(device)
37
- return processor, model
38
-
39
-
40
- def load_detr_model():
41
- from transformers import DetrImageProcessor, DetrForObjectDetection
42
- processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
43
- model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(device)
44
- return processor, model
45
-
46
-
47
- def load_vit_model():
48
- from transformers import ViTImageProcessor, ViTForImageClassification
49
- model_name = "google/vit-base-patch16-224"
50
- processor = ViTImageProcessor.from_pretrained(model_name)
51
- model = ViTForImageClassification.from_pretrained(model_name).to(device)
52
- return processor, model
53
-
54
-
55
- def load_llm():
56
- from transformers import AutoTokenizer, AutoModelForCausalLM
57
- name = "gpt2"
58
- tokenizer = AutoTokenizer.from_pretrained(name)
59
- model = AutoModelForCausalLM.from_pretrained(name).to(device)
60
- return tokenizer, model
61
-
62
-
63
- # ---------------------------------------------------------
64
- # TASK FUNCTIONS
65
- # ---------------------------------------------------------
66
-
67
- def generate_caption(image):
68
- processor, model = load_caption_model()
69
- inputs = processor(images=image, return_tensors="pt").to(device)
70
- with torch.no_grad():
71
- out_ids = model.generate(**inputs, max_new_tokens=30)
72
- return processor.decode(out_ids[0], skip_special_tokens=True)
73
-
74
-
75
- def analyze_sentiment(text):
76
- sentiment = load_sentiment_model()
77
- out = sentiment(text)[0]
78
- return out["label"], round(out["score"] * 100, 2)
79
-
80
-
81
- def vqa_answer(image, question):
82
- processor, model = load_vqa_model()
83
- inputs = processor(images=image, text=question, return_tensors="pt").to(device)
84
- with torch.no_grad():
85
- out = model.generate(**inputs)
86
- return processor.decode(out[0], skip_special_tokens=True)
87
-
88
-
89
- def detect_objects(image):
90
- processor, model = load_detr_model()
91
- inputs = processor(images=image, return_tensors="pt").to(device)
92
-
93
- with torch.no_grad():
94
- outputs = model(**inputs)
95
-
96
- target_sizes = torch.tensor([image.size[::-1]])
97
- results = processor.post_process_object_detection(outputs, target_sizes=target_sizes)[0]
98
-
99
- detections = []
100
- for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
101
- if score > 0.3:
102
- detections.append(
103
- f"{model.config.id2label[label.item()]} (score {round(score.item(), 2)})"
104
- )
105
- if len(detections) == 0:
106
- return ["No high-confidence objects detected"]
107
- return detections
108
-
109
-
110
- def classify_scene(image):
111
- processor, model = load_vit_model()
112
- inputs = processor(images=image, return_tensors="pt").to(device)
113
- with torch.no_grad():
114
- logits = model(**inputs).logits
115
- label = logits.argmax(-1).item()
116
- return model.config.id2label[label]
117
-
118
-
119
- def rewrite_caption(caption, style):
120
- tokenizer, model = load_llm()
121
-
122
- if style == "Short":
123
- prompt = f"Summarize: {caption}"
124
- elif style == "Creative":
125
- prompt = f"Rewrite creatively: {caption}"
126
- elif style == "Technical":
127
- prompt = f"Rewrite in technical detail: {caption}"
128
- else:
129
- prompt = caption
130
-
131
- inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
132
- with torch.no_grad():
133
- outputs = model.generate(inputs, max_new_tokens=60)
134
- return tokenizer.decode(outputs[0], skip_special_tokens=True)
135
-
136
-
137
- def extract_metadata(image):
138
- width, height = image.size
139
- meta = f"Dimensions: {width} x {height}\n"
140
- meta += "EXIF data detected\n" if "exif" in image.info else "No EXIF data available\n"
141
- return meta
142
-
143
-
144
- # ---------------------------------------------------------
145
- # MAIN LOGIC
146
- # ---------------------------------------------------------
147
-
148
- def process_all(image, question, style):
149
- if image is None:
150
- return ["No image"] * 8
151
-
152
- caption = generate_caption(image)
153
- sentiment_label, sentiment_score = analyze_sentiment(caption)
154
- vqa = vqa_answer(image, question) if question else "No question asked"
155
- objects = detect_objects(image)
156
- scene = classify_scene(image)
157
- rewritten = rewrite_caption(caption, style)
158
- metadata = extract_metadata(image)
159
-
160
- return caption, sentiment_label, sentiment_score, vqa, objects, scene, rewritten, metadata
161
-
162
-
163
- # ---------------------------------------------------------
164
- # GRADIO UI - BLOCKS
165
- # ---------------------------------------------------------
166
-
167
- with gr.Blocks(title="Multimodal AI System (Lazy Loaded)") as demo:
168
- gr.Markdown("# **Multimodal AI System (Emotion Removed)**")
169
-
170
- with gr.Row():
171
- image_input = gr.Image(type="pil", label="Upload Image")
172
- question_input = gr.Textbox(label="Ask a Question")
173
- style_input = gr.Dropdown(["Short", "Creative", "Technical"], label="Caption Style")
174
-
175
- run_btn = gr.Button("Run All AI Tools")
176
-
177
- caption = gr.Textbox(label="Generated Caption")
178
- sentiment_label = gr.Textbox(label="Sentiment Label")
179
- sentiment_score = gr.Number(label="Sentiment Score")
180
- vqa_output = gr.Textbox(label="VQA Answer")
181
- objects_output = gr.JSON(label="Detected Objects")
182
- scene_output = gr.Textbox(label="Scene Classification")
183
- rewritten_output = gr.Textbox(label="Rewritten Caption")
184
- metadata_output = gr.Textbox(label="Image Metadata")
185
-
186
- run_btn.click(
187
- process_all,
188
- [image_input, question_input, style_input],
189
- [
190
- caption,
191
- sentiment_label,
192
- sentiment_score,
193
- vqa_output,
194
- objects_output,
195
- scene_output,
196
- rewritten_output,
197
- metadata_output
198
- ]
199
- )
200
-
201
-
202
- if __name__ == "__main__":
203
- demo.launch()
 
 
 
 
 
1
 
2
+ #define model and processor
3
+ processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
4
+ model = AutoModelForVisualQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
5
+ device = infer_device()
6
+
7
+ # Define inference function
8
+ def process_image(image, prompt):
9
+ # Process the image and prompt using the processor
10
+ inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16)
11
+
12
+ try:
13
+ # Generate output from the model
14
+ output = model.generate(**inputs, max_new_tokens=10)
15
+
16
+ # Decode and return the output
17
+ decoded_output = processor.batch_decode(output, skip_special_tokens=True)[0].strip()
18
+
19
+ #remove prompt from output
20
+ if decoded_output.startswith(prompt):
21
+ return decoded_output[len(prompt):].strip()
22
+ return decoded_output
23
+ except IndexError as e:
24
+ print(f"IndexError: {e}")
25
+ return "An error occurred during processing."
26
+
27
+ # Define the Gradio interface
28
+ inputs = [
29
+ gr.Image(type="pil"),
30
+ gr.Textbox(label="Prompt", placeholder="Enter your question")
31
+ ]
32
+ outputs = gr.Textbox(label="Answer")
33
+ # Create the Gradio app
34
+ demo = gr.Interface(fn=process_image, inputs=inputs, outputs=outputs, title="Visual Question Answering", description="Upload an image and ask questions to get answers.")
35
+ # Launch the app
36
+ demo.launch()