Spaces:

atharvasc27112001
/

Capstone_Project

Build error

App Files Files Community

atharvasc27112001 committed on Apr 6, 2025

Commit

43d8873

verified ·

1 Parent(s): 3180216

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -17

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import torch
-from transformers import CLIPProcessor, CLIPModel, WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
 import soundfile as sf
@@ -15,15 +15,16 @@ print("Loading Whisper model...")
 whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
 whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
-print("Loading GPT-2 model (placeholder for your text model)...")
-tokenizer = AutoTokenizer.from_pretrained("gpt2")
-text_model = AutoModelForCausalLM.from_pretrained("gpt2")
 # ------------------------------
 # Define Projection Layers
 # ------------------------------
 print("Initializing image projection layer...")
-# Project CLIP's 512-dimensional image embeddings to GPT-2's 768-dimensional space.
 image_projection = torch.nn.Linear(512, 768)
 # ------------------------------
@@ -33,11 +34,11 @@ image_projection = torch.nn.Linear(512, 768)
 def multimodal_inference(text_input, image_input, audio_input):
     """
     Processes text, image, and audio inputs:
-      - Text: used directly.
-      - Image: processed via CLIP and projected (here, we append a placeholder tag).
-      - Audio: transcribed using Whisper.
-    The final prompt is fed to the text model (GPT-2) to generate a response.
     """
     prompt = ""
@@ -54,7 +55,7 @@ def multimodal_inference(text_input, image_input, audio_input):
             # Normalize and project image features
             image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
             projected_image = image_projection(image_features)
-            # For demo purposes, we append a placeholder tag.
             prompt += " [IMAGE_EMBEDDING]"
         except Exception as e:
             print("Error processing image:", e)
@@ -79,16 +80,16 @@ def multimodal_inference(text_input, image_input, audio_input):
     print("Final fused prompt:", prompt)
-    # Generate text response using the text model with advanced decoding parameters
     inputs = tokenizer(prompt, return_tensors="pt")
     with torch.no_grad():
         generated_ids = text_model.generate(
             **inputs,
             max_length=200,
-            temperature=0.7,       # Controls randomness (0=deterministic, 1=more random)
-            top_p=0.9,             # Limits sampling to the top 90% probability mass
-            repetition_penalty=1.2,# Penalizes repeated phrases
-            do_sample=True         # Enables sampling (instead of greedy decoding)
         )
     generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
@@ -106,8 +107,8 @@ iface = gr.Interface(
         gr.Audio(type="filepath", label="Audio Input (Optional)")
     ],
     outputs="text",
-    title="Multi-Modal LLM Demo",
-    description="This demo accepts text, image, and audio inputs, processes each modality, and produces a text response."
 )
 if __name__ == "__main__":

 import torch
+from transformers import CLIPProcessor, CLIPModel, WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
 import gradio as gr
 import soundfile as sf
 whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
 whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+print("Loading Flan-T5 model (instruction-tuned for better responses)...")
+tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
+text_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
 # ------------------------------
 # Define Projection Layers
 # ------------------------------
 print("Initializing image projection layer...")
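+# This linear layer projects CLIP's 512-dimensional image embeddings to 768 dimensions (GPT-2's hidden size; note that google/flan-t5-large uses d_model=1024, so this does not yet match Flan-T5).
+# (For a real system, you would set the output size to the text model's hidden size and fine-tune this layer.)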
 image_projection = torch.nn.Linear(512, 768)
 # ------------------------------
 def multimodal_inference(text_input, image_input, audio_input):
     """
     Processes text, image, and audio inputs:
+      - Text: is used directly.
+      - Image: is processed via CLIP; its embedding is projected and a placeholder is appended.
+      - Audio: is transcribed using Whisper.
+    The combined prompt is then fed into Flan-T5 to generate a text response.
     """
     prompt = ""
             # Normalize and project image features
             image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
             projected_image = image_projection(image_features)
+            # For this demo, we append a placeholder tag to indicate image information.
             prompt += " [IMAGE_EMBEDDING]"
         except Exception as e:
             print("Error processing image:", e)
     print("Final fused prompt:", prompt)
+    # Tokenize and generate text using Flan-T5
     inputs = tokenizer(prompt, return_tensors="pt")
     with torch.no_grad():
         generated_ids = text_model.generate(
             **inputs,
             max_length=200,
+            temperature=0.7,       # Moderate randomness
+            top_p=0.9,             # Nucleus sampling to limit token choices
+            repetition_penalty=1.2,# Penalize repeated tokens
+            do_sample=True         # Enable sampling for more varied responses
         )
     generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
         gr.Audio(type="filepath", label="Audio Input (Optional)")
     ],
     outputs="text",
+    title="Multi-Modal LLM Demo with Flan-T5",
+    description="This demo accepts text, image, and audio inputs, processes each modality, and produces a text response using an instruction-tuned model."
 )
 if __name__ == "__main__":