manaskhan commited on
Commit
3abbd00
·
verified ·
1 Parent(s): b4acfa1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -0
app.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration
4
+ from PIL import Image
5
+
6
# Configure the Streamlit page: browser-tab title, tab icon, and wide layout.
st.set_page_config(page_title="BLIP-2 Image Captioning", page_icon="📸", layout="wide")
12
+
13
# --- Model Loading (using caching for efficiency) ---
# @st.cache_resource ensures the model and processor are loaded only once per
# process, which is crucial for a performant Streamlit app on Hugging Face Spaces.
@st.cache_resource
def load_model():
    """
    Load the BLIP-2 processor and model from the Hugging Face Hub.

    Returns:
        tuple: ``(processor, model, device)`` on success, or
        ``(None, None, None)`` if loading fails (the error is surfaced in the
        Streamlit UI rather than raised).

    Notes:
        On CUDA machines the model is loaded with 8-bit weights (bitsandbytes)
        and fp16 compute to reduce memory usage. On CPU we fall back to a
        plain full-precision load: 8-bit quantization requires a CUDA GPU and
        fp16 ops are poorly supported on CPU, so the previous unconditional
        8-bit/fp16 load could never succeed without a GPU.
    """
    # Check if a CUDA-enabled GPU is available. If not, use CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    try:
        # The processor handles image preprocessing and token decoding.
        processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")

        if device == "cuda":
            # 8-bit weights + fp16 activations: important for deployment on
            # memory-constrained platforms like Hugging Face Spaces.
            model = Blip2ForConditionalGeneration.from_pretrained(
                "Salesforce/blip2-opt-2.7b",
                device_map="auto",
                load_in_8bit=True,
                torch_dtype=torch.float16,
            )
        else:
            # CPU fallback: bitsandbytes 8-bit kernels need CUDA, so load in
            # the default float32 precision instead.
            model = Blip2ForConditionalGeneration.from_pretrained(
                "Salesforce/blip2-opt-2.7b"
            )

        return processor, model, device
    except Exception as e:
        st.error(f"Error loading the model: {e}")
        st.info("The model is very large and may require a GPU with at least 15GB of VRAM. "
                "If you're seeing this error, the free tier of Hugging Face Spaces might not be enough.")
        return None, None, None
46
+
47
# --- Main App Interface ---
st.title("📸 BLIP-2 Image Captioning AI")
st.write(
    "Upload an image, and this application will generate a descriptive caption using the powerful "
    "[BLIP-2 model](https://huggingface.co/Salesforce/blip2-opt-2.7b) from Hugging Face."
)

# Load the cached model and processor; all three are None if loading failed.
processor, model, device = load_model()

if model and processor:
    # File uploader limited to common raster formats the PIL loader handles.
    uploaded_file = st.file_uploader(
        "Choose an image...",
        type=["jpg", "jpeg", "png", "bmp"],
        help="Upload an image file to get a caption."
    )

    if uploaded_file is not None:
        # Show the user's image before (and while) generating the caption.
        st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)
        st.write("")
        st.info("Generating caption...")

        try:
            # UploadedFile is file-like; normalize to RGB so grayscale or
            # RGBA uploads don't break the processor.
            raw_image = Image.open(uploaded_file).convert("RGB")

            # Preprocess the image. Cast pixel values to fp16 only on CUDA:
            # the previous unconditional fp16 cast fails on CPU, where
            # half-precision ops are largely unsupported.
            inputs = processor(images=raw_image, return_tensors="pt").to(device)
            if device == "cuda":
                inputs = inputs.to(torch.float16)

            # Generate up to 50 new tokens for a reasonably detailed caption.
            out = model.generate(**inputs, max_new_tokens=50)

            # Decode the generated token ids back to plain text.
            caption = processor.decode(out[0], skip_special_tokens=True).strip()

            st.success(f"**Caption:** {caption}")

        except Exception as e:
            st.error(f"An error occurred during caption generation: {e}")

else:
    st.warning("The application could not be initialized. Please check the logs for details.")