gopalagra committed
Commit 880b908 (verified) · Parent(s): 28cca05

Update app.py

Files changed (1)
  app.py +98 -57
app.py CHANGED
@@ -1,68 +1,109 @@
- # app.py
- import gradio as gr
- from transformers import BlipProcessor, BlipForConditionalGeneration
- from gtts import gTTS
- import io
- from PIL import Image
-
- # -------------------------------
- # Load BLIP-base model (lighter version)
- # -------------------------------
- processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
- model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-
- # -------------------------------
- # Generate caption function
- # -------------------------------
  # def generate_caption_tts(image):
- #     caption = generate_caption(model, processor, image)
- #     audio_file = text_to_audio_file(caption)
- #     return caption, audio_file  # return file path, not BytesIO
-
-
- # -------------------------------
- # Convert text to speech using gTTS
- # -------------------------------
- import tempfile
- import pyttsx3
-
- def text_to_audio_file(text):
-     # Create a temporary file
-     tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
-     tmp_path = tmp_file.name
-     tmp_file.close()
-
-     engine = pyttsx3.init()
-     engine.save_to_file(text, tmp_path)
-     engine.runAndWait()
-
-     return tmp_path
-
- def generate_caption_from_image(model, processor, image):
-     # image: PIL.Image
-     inputs = processor(images=image, return_tensors="pt")
-     out = model.generate(**inputs)
-     caption = processor.decode(out[0], skip_special_tokens=True)
-     return caption
- # -------------------------------
- # Gradio interface: Caption + Audio
- # -------------------------------
- def generate_caption_tts(image):
-     caption = generate_caption_from_image(model, processor, image)  # uses global model/processor
-     # audio_file = text_to_audio_file(caption)
-     return caption



- interface = gr.Interface(
-     fn=generate_caption_tts,
-     inputs=gr.Image(type="numpy"),
-     outputs=[gr.Textbox(label="Generated Caption")],
-     title="Image Captioning for Visually Impaired",
-     description="Upload an image, get a caption and audio description."
  )


  interface.launch()
- # demo.launch(share=True)
 
+ # # app.py
+ # import gradio as gr
+ # from transformers import BlipProcessor, BlipForConditionalGeneration
+ # from gtts import gTTS
+ # import io
+ # from PIL import Image
+
+ # # -------------------------------
+ # # Load BLIP-base model (lighter version)
+ # # -------------------------------
+ # processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+ # model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+ # # -------------------------------
+ # # Generate caption function
+ # # -------------------------------
+ # # def generate_caption_tts(image):
+ # #     caption = generate_caption(model, processor, image)
+ # #     audio_file = text_to_audio_file(caption)
+ # #     return caption, audio_file  # return file path, not BytesIO
+
+
+ # # -------------------------------
+ # # Convert text to speech using gTTS
+ # # -------------------------------
+ # import tempfile
+ # import pyttsx3

+ # def text_to_audio_file(text):
+ #     # Create a temporary file
+ #     tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
+ #     tmp_path = tmp_file.name
+ #     tmp_file.close()

+ #     engine = pyttsx3.init()
+ #     engine.save_to_file(text, tmp_path)
+ #     engine.runAndWait()
+
+ #     return tmp_path
+
+ # def generate_caption_from_image(model, processor, image):
+ #     # image: PIL.Image
+ #     inputs = processor(images=image, return_tensors="pt")
+ #     out = model.generate(**inputs)
+ #     caption = processor.decode(out[0], skip_special_tokens=True)
+ #     return caption
+ # # -------------------------------
+ # # Gradio interface: Caption + Audio
+ # # -------------------------------
  # def generate_caption_tts(image):
+ #     caption = generate_caption_from_image(model, processor, image)  # uses global model/processor
+ #     # audio_file = text_to_audio_file(caption)
+ #     return caption



+ # interface = gr.Interface(
+ #     fn=generate_caption_tts,
+ #     inputs=gr.Image(type="numpy"),
+ #     outputs=[gr.Textbox(label="Generated Caption")],
+ #     title="Image Captioning for Visually Impaired",
+ #     description="Upload an image, get a caption and audio description."
+ # )
+
+
+ # interface.launch()
+ # # demo.launch(share=True)
+
+
+
+ import gradio as gr
+ from transformers import AutoProcessor, AutoModelForCausalLM
+ import torch
+ from PIL import Image
+
+ # Load small LLaVA model
+ processor = AutoProcessor.from_pretrained("LLaVA/LLaVA-7B-small")
+ model = AutoModelForCausalLM.from_pretrained(
+     "LLaVA/LLaVA-7B-small",
+     torch_dtype=torch.float16,
+     device_map="auto"  # Automatically use GPU if available
  )

+ def generate_caption(image):
+     # Convert to PIL if needed
+     if isinstance(image, str):
+         image = Image.open(image).convert("RGB")
+
+     # Prepare inputs
+     inputs = processor(images=image, return_tensors="pt").to(model.device)
+
+     # Generate output
+     outputs = model.generate(**inputs, max_new_tokens=50)
+
+     # Decode result
+     caption = processor.decode(outputs[0], skip_special_tokens=True)
+     return caption
+
+ # Gradio Interface
+ interface = gr.Interface(
+     fn=generate_caption,
+     inputs=gr.Image(type="pil"),
+     outputs=gr.Textbox(label="Generated Caption"),
+     title="LLaVA Image Captioning"
+ )

  interface.launch()
+
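
Note on the TTS helper in the removed file: the section header reads "Convert text to speech using gTTS", but the body calls pyttsx3, whose save_to_file produces platform-native audio (typically WAV/AIFF), so writing to a .mp3-suffixed path is misleading. A minimal sketch of a gTTS-backed text_to_audio_file matching the header's stated intent; the lang parameter and "en" default are assumptions not in the committed code:

import tempfile
from gtts import gTTS

def text_to_audio_file(text, lang="en"):  # lang is an assumed parameter
    # Create a temporary file; gTTS writes genuine MP3 data, so the suffix fits.
    tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    tmp_path = tmp_file.name
    tmp_file.close()

    # gTTS calls Google's TTS endpoint, so this needs network access at runtime.
    gTTS(text=text, lang=lang).save(tmp_path)
    return tmp_path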
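Note on the new LLaVA code: "LLaVA/LLaVA-7B-small" is not a published Hugging Face repo ID, and LLaVA checkpoints expect a text prompt containing an <image> placeholder alongside the pixels, so processor(images=image) plus AutoModelForCausalLM.generate will not produce a caption on its own. A sketch against the public llava-hf/llava-1.5-7b-hf checkpoint (an assumed stand-in for whichever model was intended), using LlavaForConditionalGeneration:

import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

MODEL_ID = "llava-hf/llava-1.5-7b-hf"  # assumed substitute for "LLaVA/LLaVA-7B-small"

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",  # use a GPU automatically when available
)

def generate_caption(image):
    # Convert to PIL if needed
    if isinstance(image, str):
        image = Image.open(image).convert("RGB")

    # LLaVA-1.5 expects a chat-style prompt with an <image> placeholder.
    prompt = "USER: <image>\nDescribe this image in one sentence. ASSISTANT:"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(
        model.device, torch.float16
    )

    outputs = model.generate(**inputs, max_new_tokens=50)
    # Decode only the newly generated tokens so the prompt does not leak into the caption.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return processor.decode(new_tokens, skip_special_tokens=True).strip()

The prompt wording and max_new_tokens=50 mirror the committed code's intent; both are illustrative choices, not requirements of the model.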
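Note on the interface: the original title and description promise "a caption and audio description", yet both versions of the app return only the caption. A sketch wiring the caption plus a spoken version back into Gradio, assuming the hypothetical generate_caption and text_to_audio_file helpers from the sketches above:

import gradio as gr

def caption_and_speak(image):
    caption = generate_caption(image)           # hypothetical helper from the LLaVA sketch
    audio_path = text_to_audio_file(caption)    # hypothetical helper from the gTTS sketch
    return caption, audio_path

interface = gr.Interface(
    fn=caption_and_speak,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.Audio(label="Spoken Caption", type="filepath"),
    ],
    title="Image Captioning for Visually Impaired",
    description="Upload an image, get a caption and an audio description.",
)

interface.launch()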