Spaces:

gopalagra
/

blind-image-captioning

Sleeping

App Files Files Community

gopalagra committed on Sep 3

Commit

c9f8fb0

verified ·

1 Parent(s): 0576f19

Update app.py

Browse files

Files changed (1) hide show

app.py +89 -89

app.py CHANGED Viewed

@@ -1,110 +1,110 @@
-# # app.py
-# import gradio as gr
-# from transformers import BlipProcessor, BlipForConditionalGeneration
-# from gtts import gTTS
-# import io
-# from PIL import Image
-# # -------------------------------
-# # Load BLIP-base model (lighter version)
-# # -------------------------------
-# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-# # -------------------------------
-# # Generate caption function
-# # -------------------------------
-# # def generate_caption_tts(image):
-# #     caption = generate_caption(model, processor, image)
-# #     audio_file = text_to_audio_file(caption)
-# #     return caption, audio_file  # return file path, not BytesIO
-# # -------------------------------
-# # Convert text to speech using gTTS
-# # -------------------------------
-# import tempfile
-# import pyttsx3
-# def text_to_audio_file(text):
-#     # Create a temporary file
-#     tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
-#     tmp_path = tmp_file.name
-#     tmp_file.close()
-#     engine = pyttsx3.init()
-#     engine.save_to_file(text, tmp_path)
-#     engine.runAndWait()
-#     return tmp_path
-# def generate_caption_from_image(model, processor, image):
-#     # image: PIL.Image
-#     inputs = processor(images=image, return_tensors="pt")
-#     out = model.generate(**inputs)
-#     caption = processor.decode(out[0], skip_special_tokens=True)
-#     return caption
-# # -------------------------------
-# # Gradio interface: Caption + Audio
-# # -------------------------------
 # def generate_caption_tts(image):
-#     caption = generate_caption_from_image(model, processor, image)  # uses global model/processor
-#     # audio_file = text_to_audio_file(caption)
-#     return caption
-# interface = gr.Interface(
-#     fn=generate_caption_tts,
-#     inputs=gr.Image(type="numpy"),
-#     outputs=[gr.Textbox(label="Generated Caption")],
-#     title="Image Captioning for Visually Impaired",
-#     description="Upload an image, get a caption and audio description."
-# )
-# interface.launch()
 # # demo.launch(share=True)
-import gradio as gr
-from transformers import AutoProcessor, AutoModelForCausalLM
-import torch
-from PIL import Image
-# Load small LLaVA model
-processor = AutoProcessor.from_pretrained("llava/LLaVA-7B-llm-small")
-model = AutoModelForCausalLM.from_pretrained(
-    "llava/LLaVA-7B-llm-small",
-    torch_dtype=torch.float16,
-    device_map="auto"  # Automatically use GPU if available
-)
-def generate_caption(image):
-    # Convert to PIL if needed
-    if isinstance(image, str):
-        image = Image.open(image).convert("RGB")
-    # Prepare inputs
-    inputs = processor(images=image, return_tensors="pt").to(model.device)
-    # Generate output
-    outputs = model.generate(**inputs, max_new_tokens=50)
-    # Decode result
-    caption = processor.decode(outputs[0], skip_special_tokens=True)
-    return caption
-# Gradio Interface
-interface = gr.Interface(
-    fn=generate_caption,
-    inputs=gr.Image(type="pil"),
-    outputs=gr.Textbox(label="Generated Caption"),
-    title="LLaVA Image Captioning"
-)
-interface.launch()

+# app.py
+import gradio as gr
+from transformers import BlipProcessor, BlipForConditionalGeneration
+from gtts import gTTS
+import io
+from PIL import Image
+# -------------------------------
+# Load the BLIP image-captioning model (large checkpoint)
+# -------------------------------
+# Single checkpoint id so the processor and the model can never drift apart.
+BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"
+processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
+model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
+# -------------------------------
+# Generate caption function
+# -------------------------------
 # def generate_caption_tts(image):
+#     caption = generate_caption(model, processor, image)
+#     audio_file = text_to_audio_file(caption)
+#     return caption, audio_file  # return file path, not BytesIO
+# -------------------------------
+# Convert text to speech with pyttsx3 and save it to a temporary file
+# -------------------------------
+import tempfile
+import pyttsx3
+def text_to_audio_file(text):
+    """Render ``text`` to speech with pyttsx3 and return the output file path.
+
+    The audio is written to a fresh temporary file with an ``.mp3`` suffix.
+    The file is created with ``delete=False`` and nothing in this function
+    removes it afterwards.
+    """
+    # Only the path is needed: reserve a unique name, then release the handle
+    # before the speech engine writes to that path.
+    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as handle:
+        audio_path = handle.name
+    speech_engine = pyttsx3.init()
+    speech_engine.save_to_file(text, audio_path)
+    # save_to_file only queues the request; runAndWait processes the queue.
+    speech_engine.runAndWait()
+    return audio_path
+def generate_caption_from_image(model, processor, image):
+    """Return the caption that ``model`` generates for ``image``.
+
+    ``processor`` converts the image into PyTorch tensors
+    (``return_tensors="pt"``) and afterwards decodes the first generated
+    sequence back to text with special tokens stripped.
+
+    NOTE(review): the Gradio input in this file is declared with
+    ``type="numpy"``, so ``image`` presumably arrives as a NumPy array rather
+    than a PIL.Image -- confirm which input types are intended.
+    """
+    model_inputs = processor(images=image, return_tensors="pt")
+    generated = model.generate(**model_inputs)
+    return processor.decode(generated[0], skip_special_tokens=True)
+# -------------------------------
+# Gradio interface: Caption (audio output currently disabled)
+# -------------------------------
+def generate_caption_tts(image):
+    """Gradio callback: return a text caption for the submitted image.
+
+    Uses the module-level ``model`` and ``processor``. Audio synthesis via
+    ``text_to_audio_file`` is currently disabled, so only the caption string
+    is returned.
+    """
+    # Gradio passes None when the form is submitted without an image; tell the
+    # user instead of letting the captioning call fail.
+    if image is None:
+        return "No image provided. Please upload an image."
+    caption = generate_caption_from_image(model, processor, image)  # uses global model/processor
+    # audio_file = text_to_audio_file(caption)
+    return caption
+# Build the UI components first, then wire them into a single Interface.
+image_input = gr.Image(type="numpy")
+caption_output = gr.Textbox(label="Generated Caption")
+# NOTE(review): the description below promises an audio description, but only
+# a caption textbox is wired as output -- confirm whether audio should return.
+interface = gr.Interface(
+    title="Image Captioning for Visually Impaired",
+    description="Upload an image, get a caption and audio description.",
+    fn=generate_caption_tts,
+    inputs=image_input,
+    outputs=[caption_output],
+)
+interface.launch()
 # # demo.launch(share=True)
+# import gradio as gr
+# from transformers import AutoProcessor, AutoModelForCausalLM
+# import torch
+# from PIL import Image
+# # Load small LLaVA model
+# processor = AutoProcessor.from_pretrained("llava/LLaVA-7B-llm-small")
+# model = AutoModelForCausalLM.from_pretrained(
+#     "llava/LLaVA-7B-llm-small",
+#     torch_dtype=torch.float16,
+#     device_map="auto"  # Automatically use GPU if available
+# )
+# def generate_caption(image):
+#     # Convert to PIL if needed
+#     if isinstance(image, str):
+#         image = Image.open(image).convert("RGB")
+#     # Prepare inputs
+#     inputs = processor(images=image, return_tensors="pt").to(model.device)
+#     # Generate output
+#     outputs = model.generate(**inputs, max_new_tokens=50)
+#     # Decode result
+#     caption = processor.decode(outputs[0], skip_special_tokens=True)
+#     return caption
+# # Gradio Interface
+# interface = gr.Interface(
+#     fn=generate_caption,
+#     inputs=gr.Image(type="pil"),
+#     outputs=gr.Textbox(label="Generated Caption"),
+#     title="LLaVA Image Captioning"
+# )
+# interface.launch()