Spaces:

ProfRom
/

TestSpace

Sleeping

App Files Files Community

ProfRom committed 18 days ago

Commit

a2b25d5

verified ·

1 Parent(s): da5e151

Diaz - Final submission

Browse files

Files changed (1) hide show

app.py +34 -209

app.py CHANGED Viewed

@@ -1,217 +1,42 @@
-import numpy as np
-import librosa
 import torch
 import gradio as gr
 from PIL import Image
-import requests
-from io import BytesIO
-from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
-# Device configuration
-torch_device = "cuda" if torch.cuda.is_available() else "cpu"
-pipeline_device = 0 if torch.cuda.is_available() else -1
-# ---------- LABEL DEFINITIONS ----------
-CANONICAL_LABELS = ["anger", "happiness", "neutral", "sadness"]
-TEXT_MODEL_LABEL_MAP = {
-    "anger": "anger",
-    "joy": "happiness",
-    "neutral": "neutral",
-    "sadness": "sadness",
-    "disgust": None,
-    "fear": None,
-    "surprise": None
-}
-AUDIO_MODEL_LABEL_MAP = {
-    "ang": "anger",
-    "hap": "happiness",
-    "neu": "neutral",
-    "sad": "sadness",
-    "anger": "anger",
-    "happy": "happiness",
-    "neutral": "neutral",
-    "sadness": "sadness"
-}
-TEXT_WEIGHT = 0.40
-AUDIO_WEIGHT = 0.60
-# ---------- LOAD MODELS ----------
-text_classifier = pipeline(
-    "text-classification",
-    model="j-hartmann/emotion-english-distilroberta-base",
-    top_k=None,
-    device=pipeline_device
-)
-audio_classifier = pipeline(
-    "audio-classification",
-    model="superb/wav2vec2-base-superb-er",
-    device=pipeline_device
-)
-image_classifier = pipeline(
-    "image-classification",
-    model="google/vit-base-patch16-224",
-    device=pipeline_device
-)
-image_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-image_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(torch_device)
-# ---------- HELPER FUNCTIONS ----------
-def initialize_score_dict():
-    return {label: 0.0 for label in CANONICAL_LABELS}
-def normalize_text_label(label):
-    return TEXT_MODEL_LABEL_MAP.get(str(label).lower(), None)
-def normalize_audio_label(label):
-    return AUDIO_MODEL_LABEL_MAP.get(str(label).lower(), None)
-def format_top_predictions(predictions, top_k=3):
-    return "\n".join([f"{p['label']} ({p['score']:.4f})" for p in predictions[:top_k]])
-# ---------- TEXT MODEL ----------
-def predict_text_emotion(transcript):
-    if not transcript or transcript.strip() == "":
-        return [], initialize_score_dict()
-    preds = text_classifier(transcript)
-    if isinstance(preds, list) and isinstance(preds[0], list):
-        preds = preds[0]
-    scores = initialize_score_dict()
-    normalized = []
-    for item in preds:
-        mapped = normalize_text_label(item["label"])
-        if mapped:
-            scores[mapped] += item["score"]
-            normalized.append({"label": mapped, "score": item["score"]})
-    return sorted(normalized, key=lambda x: x["score"], reverse=True), scores
-# ---------- AUDIO MODEL ----------
-def predict_audio_emotion(audio):
-    array = audio["array"]
-    sr = audio["sampling_rate"]
-    if sr != 16000:
-        array = librosa.resample(array, orig_sr=sr, target_sr=16000)
-        sr = 16000
-    preds = audio_classifier({"array": array, "sampling_rate": sr}, top_k=4)
-    scores = initialize_score_dict()
-    normalized = []
-    for item in preds:
-        mapped = normalize_audio_label(item["label"])
-        if mapped:
-            scores[mapped] += item["score"]
-            normalized.append({"label": mapped, "score": item["score"]})
-    return sorted(normalized, key=lambda x: x["score"], reverse=True), scores
-# ---------- FUSION ----------
-def fuse_scores(text_scores, audio_scores):
-    fused_scores = {}
-    for label in CANONICAL_LABELS:
-        fused_scores[label] = (
-            TEXT_WEIGHT * text_scores.get(label, 0.0) +
-            AUDIO_WEIGHT * audio_scores.get(label, 0.0)
-        )
-    best_label = max(fused_scores, key=fused_scores.get)
-    return best_label, fused_scores[best_label]
-# ---------- IMAGE ----------
-def run_image(image):
     if image is None:
-        return "No image.", "No classification.", "No evaluation."
-    inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
-    with torch.no_grad():
-        output = image_model.generate(**inputs, max_new_tokens=30)
-    caption = image_processor.decode(output[0], skip_special_tokens=True)
-    preds = image_classifier(image)[:3]
-    classification = "\n".join([f"{p['label']} ({p['score']:.4f})" for p in preds])
-    return caption, classification, "Completed"
-# ---------- MAIN MULTIMODAL ----------
-def run_audio_text(audio_input, transcript):
-    if audio_input is None:
-        return "No audio provided.", "", "", ""
-    sr, audio_array = audio_input
-    audio = {
-        "array": np.asarray(audio_array, dtype=np.float32),
-        "sampling_rate": int(sr)
-    }
-    text_preds, text_scores = predict_text_emotion(transcript)
-    audio_preds, audio_scores = predict_audio_emotion(audio)
-    fused_label, fused_score = fuse_scores(text_scores, audio_scores)
-    return (
-        transcript if transcript else "No transcript",
-        format_top_predictions(text_preds),
-        format_top_predictions(audio_preds),
-        f"{fused_label.upper()} (confidence: {fused_score:.4f})"
-    )
-# ---------- UI ----------
-with gr.Blocks() as demo:
-    gr.Markdown("# Multimodal AI System")
-    with gr.Tabs():
-        with gr.Tab("Audio + Text"):
-            audio = gr.Audio(type="numpy")
-            text = gr.Textbox()
-            out1 = gr.Textbox(label="Transcript")
-            out2 = gr.Textbox(label="Text Prediction")
-            out3 = gr.Textbox(label="Audio Prediction")
-            out4 = gr.Textbox(label="Fused Result")
-            btn = gr.Button("Run")
-            btn.click(run_audio_text, [audio, text], [out1, out2, out3, out4])
-        with gr.Tab("Image Analysis"):
-            image = gr.Image(type="pil")
-            cap = gr.Textbox(label="Caption")
-            cls = gr.Textbox(label="Classification")
-            eval = gr.Textbox(label="Status")
-            btn2 = gr.Button("Run Image")
-            btn2.click(run_image, image, [cap, cls, eval])
-demo.launch()

 import torch
+from transformers import BlipProcessor, BlipForQuestionAnswering
 import gradio as gr
 from PIL import Image
+# Select the compute device first: CUDA when PyTorch can see a GPU, otherwise CPU
+# (the original author notes that the free Spaces tier is CPU-only).
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load the BLIP VQA processor and model from the same checkpoint; the model is moved to `device`
+processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
+def answer_question(image, question):
+    """Answer `question` about `image` with the BLIP VQA model, or return a prompt if either input is missing."""
     if image is None:
+        return "Please upload an image."
+    # A whitespace-only question carries no content, so treat it like an empty one.
+    if not question or not question.strip():
+        return "Please enter a question."
+    # Preprocess the image and tokenize the question into tensors on the model's device.
+    inputs = processor(image, question, return_tensors="pt").to(device)
+    # Generate answer token ids and decode the first sequence, dropping special tokens.
+    return processor.decode(model.generate(**inputs)[0], skip_special_tokens=True)
+# Gradio UI: an image upload and a question box feed answer_question; the answer is shown as text
+image_input = gr.Image(type="pil", label="Upload an image")
+question_input = gr.Textbox(label="Question", placeholder="Example: What is in this image?")
+demo = gr.Interface(
+    fn=answer_question,
+    inputs=[image_input, question_input],
+    outputs=gr.Textbox(label="Answer"),
+    title="BLIP Visual Question Answering",
+    description="Upload an image and ask a question about it using a multimodal AI model.",
+)
+# Start the app only when this file is executed as a script
+if __name__ == "__main__":
+    demo.launch()