Spaces:

James040
/

Image-to-Text-BLIP-Short

Sleeping

James040 committed on Mar 25

Commit

4fbafed

verified ·

1 Parent(s): 3c880ef

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,29 +1,30 @@
 import gradio as gr
-from transformers import pipeline
 from PIL import Image
# Build the image-captioning pipeline once, at import time, pinned to the CPU.
# (The original author's note says device="cpu" was chosen for faster boot
# times; that is their claim and is not verified here.)
print("Loading AI Model into memory...")
get_prompt = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device="cpu",
)
print("Model loaded successfully!")
 def generate_prompt(input_img):
     """Caption an uploaded image using the module-level ``get_prompt`` pipeline.

     Args:
         input_img: The uploaded image, or ``None`` when nothing was uploaded.
             It must provide ``convert('RGB')`` (for example a PIL image).

     Returns:
         The generated caption; ``"Please upload an image."`` when
         ``input_img`` is ``None``; or a ``"System Error: ..."`` message when
         anything in the captioning path raises.
     """
     if input_img is None:
         return "Please upload an image."
     try:
         # Normalise to RGB first; per the original author's note, images with
         # an alpha channel (transparent PNGs) otherwise fail in the model.
         clean_image = input_img.convert('RGB')
         # max_new_tokens is an upper bound on the caption length; it does not
         # by itself make the caption longer.
         result = get_prompt(clean_image, max_new_tokens=75)
         return result[0]['generated_text']
     except Exception as e:
         # UI boundary: log the failure and report it as the result text
         # instead of letting the exception propagate to the caller.
         print(f"Error processing image: {e}")
         return f"System Error: {e}"

 import gradio as gr
+from transformers import BlipProcessor, BlipForConditionalGeneration
 from PIL import Image
# Checkpoint shared by the processor and the model, so the two cannot drift.
MODEL_ID = "Salesforce/blip-image-captioning-base"

print("Loading BLIP Processor and Model...")
# Load the processor and the model classes directly rather than through
# pipeline(); the commit's stated reason is to avoid problems with the
# pipeline task names.
processor = BlipProcessor.from_pretrained(MODEL_ID)
model = BlipForConditionalGeneration.from_pretrained(MODEL_ID)
print("Model loaded successfully!")
 def generate_prompt(input_img):
     """Caption an uploaded image using the module-level BLIP processor and model.

     Args:
         input_img: The uploaded image, or ``None`` when nothing was uploaded.
             It must provide ``convert('RGB')`` (for example a PIL image).

     Returns:
         The decoded caption; ``"Please upload an image."`` when
         ``input_img`` is ``None``; or a ``"System Error: ..."`` message when
         anything in the captioning path raises.
     """
     if input_img is None:
         return "Please upload an image."
     try:
         # Normalise to RGB first; per the original author's note, this avoids
         # failures on transparent PNGs.
         clean_image = input_img.convert('RGB')
         # Turn the image into the tensor inputs the model expects.
         inputs = processor(clean_image, return_tensors="pt")
         # max_new_tokens is an upper bound on the caption length; it does not
         # by itself force a longer or more detailed description.
         output = model.generate(**inputs, max_new_tokens=75)
         # Decode the first generated sequence back to text, dropping special
         # tokens.
         return processor.decode(output[0], skip_special_tokens=True)
     except Exception as e:
         # UI boundary: log the failure and report it as the result text
         # instead of letting the exception propagate to the caller.
         print(f"Error processing image: {e}")
         return f"System Error: {e}"