gopalagra committed on
Commit f62268d · verified · 1 Parent(s): bb2f9ea

Update app.py

Files changed (1)
  1. app.py +182 -428
app.py CHANGED
@@ -1,217 +1,217 @@
- # app.py
- # import gradio as gr
- # from transformers import BlipProcessor, BlipForConditionalGeneration
- # from gtts import gTTS
- # import io
- # from PIL import Image
-
- # # -------------------------------
- # # Load BLIP-base model (lighter version)
- # # -------------------------------
- # processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
- # model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
-
- # # -------------------------------
- # # Generate caption function
- # # -------------------------------
- # # def generate_caption_tts(image):
- # # caption = generate_caption(model, processor, image)
- # # audio_file = text_to_audio_file(caption)
- # # return caption, audio_file # return file path, not BytesIO

- # # -------------------------------
- # # Convert text to speech using gTTS
- # # -------------------------------
- # import tempfile
- # import pyttsx3
-
- # def text_to_audio_file(text):
- # # Create a temporary file
- # tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
- # tmp_path = tmp_file.name
- # tmp_file.close()
-
- # engine = pyttsx3.init()
- # engine.save_to_file(text, tmp_path)
- # engine.runAndWait()
-
- # return tmp_path
-
- # def generate_caption_from_image(model, processor, image):
- # # image: PIL.Image
- # inputs = processor(images=image, return_tensors="pt")
- # out = model.generate(**inputs)
- # caption = processor.decode(out[0], skip_special_tokens=True)
- # return caption
- # # -------------------------------
- # # Gradio interface: Caption + Audio
- # # -------------------------------
  # def generate_caption_tts(image):
- # caption = generate_caption_from_image(model, processor, image) # uses global model/processor
- # # audio_file = text_to_audio_file(caption)
- # return caption
-

- # interface = gr.Interface(
- # fn=generate_caption_tts,
- # inputs=gr.Image(type="numpy"),
- # outputs=[gr.Textbox(label="Generated Caption")],
- # title="Image Captioning for Visually Impaired",
- # description="Upload an image, get a caption and audio description."
- # )

- # interface.launch()
- # # demo.launch(share=True)

- # import gradio as gr
- # from transformers import (
- # BlipProcessor,
- # BlipForConditionalGeneration,
- # BlipForQuestionAnswering,
- # pipeline
- # )
- # moderation_model = pipeline(
- # "text-classification",
- # model="Vrandan/Comment-Moderation",
- # return_all_scores=True
- # )

- # from PIL import Image
- # import torch
- # from gtts import gTTS
- # import tempfile

- # # ----------------------
- # # Device setup
- # # ----------------------
- # device = "cuda" if torch.cuda.is_available() else "cpu"

- # # ----------------------
- # # Load Models Once
- # # ----------------------
- # print("🔄 Loading models...")

- # # Captioning
- # caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
- # caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)

- # # VQA
- # vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
- # vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)

- # # Translation
- # translation_models = {
- # "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
- # "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
- # "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
- # }

- # # Safety Moderation Pipeline
- # moderation_model = pipeline("text-classification", model="unitary/toxic-bert")

- # print("✅ All models loaded!")

- # # ----------------------
- # # Safety Filter Function
- # # ----------------------
- # def is_caption_safe(caption):
- # try:
- # votes = moderation_model(caption)
- # # If return_all_scores=True, it's [[{label, score}, ...]]
- # if isinstance(votes, list) and isinstance(votes[0], list):
- # votes = votes[0]
- # # Now safe to loop
- # for item in votes:
- # if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
- # return False
- # except Exception as e:
- # print("⚠️ Moderation failed:", e)

- # # Fallback keywords
- # unsafe_keywords = [
- # "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
- # "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
- # "terrorist", "assault", "stab", "shoot", "pistol", "rifle", "shotgun",
- # "grenade", "horror", "beheaded", "torture", "hostage", "rape",
- # "war", "massacre", "chainsaw", "poison", "strangle", "hang", "drown"
- # ]
- # if any(word in caption.lower() for word in unsafe_keywords):
- # return False
- # return True

- # # ----------------------
- # # Caption + Translate + Speak
- # # ----------------------
- # def generate_caption_translate_speak(image, target_lang):
- # # Step 1: Caption
- # inputs = caption_processor(images=image, return_tensors="pt").to(device)
- # with torch.no_grad():
- # out = caption_model.generate(**inputs, max_new_tokens=50)
- # english_caption = caption_processor.decode(out[0], skip_special_tokens=True)

- # # Step 1.5: Safety Check
- # if not is_caption_safe(english_caption):
- # return "⚠️ Warning: Unsafe or inappropriate content detected!", "", None

- # # Step 2: Translate
- # if target_lang in translation_models:
- # translated = translation_models[target_lang](english_caption)[0]['translation_text']
- # else:
- # translated = "Translation not available"

- # # Step 3: Generate Speech (English caption for now)
- # tts = gTTS(english_caption, lang="en")
- # tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
- # tts.save(tmp_file.name)

- # return english_caption, translated, tmp_file.name

- # # ----------------------
- # # VQA
- # # ----------------------
- # def vqa_answer(image, question):
- # inputs = vqa_processor(image, question, return_tensors="pt").to(device)
- # with torch.no_grad():
- # out = vqa_model.generate(**inputs, max_new_tokens=50)
- # answer = vqa_processor.decode(out[0], skip_special_tokens=True)

- # # Run safety filter on answers too
- # if not is_caption_safe(answer):
- # return "⚠️ Warning: Unsafe or inappropriate content detected!"

- # return answer

- # # ----------------------
- # # Gradio UI
- # # ----------------------
- # with gr.Blocks(title="BLIP Vision App") as demo:
- # gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (with Safety Filter)")

- # with gr.Tab("Caption + Translate + Speak"):
- # with gr.Row():
- # img_in = gr.Image(type="pil", label="Upload Image")
- # lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
- # eng_out = gr.Textbox(label="English Caption")
- # trans_out = gr.Textbox(label="Translated Caption")
- # audio_out = gr.Audio(label="Spoken Caption", type="filepath")
- # btn1 = gr.Button("Generate Caption, Translate & Speak")
- # btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])

- # with gr.Tab("Visual Question Answering (VQA)"):
- # with gr.Row():
- # img_vqa = gr.Image(type="pil", label="Upload Image")
- # q_in = gr.Textbox(label="Ask a Question about the Image")
- # ans_out = gr.Textbox(label="Answer")
- # btn2 = gr.Button("Ask")
- # btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=ans_out)

- # demo.launch()
@@ -432,250 +432,4 @@
-
- import gradio as gr
- from transformers import (
-     BlipProcessor,
-     BlipForConditionalGeneration,
-     BlipForQuestionAnswering,
-     pipeline,
-     SpeechT5Processor,        # <--- NEW
-     SpeechT5ForTextToSpeech,  # <--- NEW
-     set_seed                  # <--- NEW
- )
- from datasets import load_dataset  # <--- NEW for speaker embedding
- from PIL import Image
- import torch
- # from gtts import gTTS  # <--- REMOVED
- import tempfile
- import numpy as np
- import soundfile as sf
- import librosa
- import tempfile
- import time  # <--- Added for potential cleanup, but mostly for future use
-
- # Set seed for reproducibility in TTS generation
- set_seed(42)
-
- def combine_audio(beep_path, speech_path):
-     """Combine beep + speech audio into one clip."""
-     # ... (Keep this function as is)
-     beep, sr1 = sf.read(beep_path)
-     speech, sr2 = sf.read(speech_path)
-
-     # Resample beep if needed
-     if sr1 != sr2:
-         beep = librosa.resample(y=beep, orig_sr=sr1, target_sr=sr2)
-         sr1 = sr2
-
-     # Convert multi-channel to mono
-     if len(beep.shape) > 1:
-         beep = beep.mean(axis=1)
-     if len(speech.shape) > 1:
-         # Check if speech is stereo (channels > 1) and has data
-         if speech.ndim > 1:
-             speech = speech.mean(axis=1)
-         # Ensure speech is treated as a 1D array even if it was originally mono 2D
-         # For single channel (mono) soundfile output, it might be 2D with shape (N, 1)
-
-     # Concatenate beep + speech
-     combined = np.concatenate((beep, speech))
-
-     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-     # SpeechT5 generates 16000Hz WAV, so we use sr1 (which is 16000) for the output
-     sf.write(tmp_file.name, combined, sr1)
-     return tmp_file.name
-
- # ----------------------
- # Device setup
- # ----------------------
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # ----------------------
- # Load Models Once
- # ----------------------
- print("🔄 Loading models...")
-
- # Captioning
- caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
- caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
-
- # VQA
- vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
- vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
-
- # Translation
- translation_models = {
-     "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
-     "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
-     "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
- }
-
- # Text-to-Speech (TTS) Models  # <--- NEW/MODIFIED
- print(" Loading SpeechT5 TTS model...")
- tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
- tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
-
- # Load a speaker embedding (required for SpeechT5 to define a voice)
- # Using a sample speaker from the VCTK dataset
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
-
- # Safety Moderation Pipeline
- moderation_model = pipeline("text-classification", model="unitary/toxic-bert")
- print("✅ All models loaded!")
-
- # ----------------------
- # Utility: Generate Local Speech (TTS)  # <--- NEW FUNCTION
- # ----------------------
- def synthesize_speech_local(text, tts_processor, tts_model, speaker_embeddings):
-     """Generates speech using local HuggingFace SpeechT5 model."""
-     inputs = tts_processor(text=text, return_tensors="pt").to(device)
-
-     # Generate speech with the loaded model and speaker embedding
-     speech = tts_model.generate_speech(
-         inputs["input_ids"],
-         speaker_embeddings,
-         do_sample=True  # Use sampling for more natural tone
-     )
-
-     # Convert the Tensor to a NumPy array
-     speech_np = speech.cpu().numpy()
-
-     # Create a temporary WAV file to save the audio
-     # SpeechT5's default sampling rate is 16000Hz
-     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-     sf.write(tmp_file.name, speech_np, samplerate=16000)
-
-     return tmp_file.name
-
- # ----------------------
- # Utility: Generate a Beep Sound
- # ----------------------
- def make_beep_sound(duration=0.5, freq=1000):
-     """Generate a short beep tone and save as temporary .wav file."""
-     # We use 16000Hz to match SpeechT5's output for combining audio later
-     samplerate = 16000
-     t = np.linspace(0, duration, int(samplerate * duration), endpoint=False)
-     wave = 0.5 * np.sin(2 * np.pi * freq * t)
-     tmp_beep = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-     sf.write(tmp_beep.name, wave, samplerate)
-     return tmp_beep.name
-
- # ----------------------
- # Safety Filter Function (Keep as is)
- # ----------------------
- def is_caption_safe(caption):
-     # ... (Keep this function as is)
-     try:
-         votes = moderation_model(caption)
-         if isinstance(votes, list) and isinstance(votes[0], list):
-             votes = votes[0]
-         for item in votes:
-             # Checking for 'V' or 'V2' (Violent, etc.) labels with high confidence
-             if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
-                 return False
-     except Exception as e:
-         print("⚠️ Moderation failed:", e)
-
-     unsafe_keywords = [
-         "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
-         "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
-         "terrorist", "assault", "stab", "shoot", "pistol", "rifle", "shotgun",
-         "grenade", "horror", "beheaded", "torture", "hostage", "rape",
-         "war", "massacre", "chainsaw", "poison", "strangle", "hang", "drown"
-     ]
-     if any(word in caption.lower() for word in unsafe_keywords):
-         return False
-     return True
-
- # ----------------------
- # Caption + Translate + Speak  # <--- MODIFIED
- # ----------------------
- def generate_caption_translate_speak(image, target_lang):
-     # Step 1: Caption
-     inputs = caption_processor(images=image, return_tensors="pt").to(device)
-     with torch.no_grad():
-         out = caption_model.generate(**inputs, max_new_tokens=50)
-     english_caption = caption_processor.decode(out[0], skip_special_tokens=True)
-
-     # Step 1.5: Safety Check (MODIFIED TTS)
-     if not is_caption_safe(english_caption):
-         # Generate beep (16000Hz WAV)
-         beep = make_beep_sound()
-
-         # Generate warning speech (16000Hz WAV)
-         warning_text = "Warning! Unsafe or inappropriate content detected."
-         speech_tmp_name = synthesize_speech_local(warning_text, tts_processor, tts_model, speaker_embeddings)
-
-         # Combine beep + speech
-         combined_audio = combine_audio(beep, speech_tmp_name)
-
-         # Return combined audio automatically
-         return "⚠️ Warning: Unsafe or inappropriate content detected!", "", combined_audio
-
-     # Step 2: Translate
-     if target_lang in translation_models:
-         translated = translation_models[target_lang](english_caption)[0]['translation_text']
-     else:
-         translated = "Translation not available"
-
-     # Step 3: Generate Speech (English caption) (MODIFIED TTS)
-     tmp_file_name = synthesize_speech_local(english_caption, tts_processor, tts_model, speaker_embeddings)
-
-     # The output is a .wav file now, but Gradio's Audio component is flexible
-     return english_caption, translated, tmp_file_name
-
- # ----------------------
- # VQA  # <--- MODIFIED
- # ----------------------
- def vqa_answer(image, question):
-     inputs = vqa_processor(image, question, return_tensors="pt").to(device)
-     with torch.no_grad():
-         out = vqa_model.generate(**inputs, max_new_tokens=50)
-     answer = vqa_processor.decode(out[0], skip_special_tokens=True)
-
-     if not is_caption_safe(answer):
-         # Generate beep (16000Hz WAV)
-         beep = make_beep_sound()
-
-         # Generate warning speech (16000Hz WAV)
-         warning_text = "Warning! Unsafe or inappropriate content detected."
-         speech_tmp_name = synthesize_speech_local(warning_text, tts_processor, tts_model, speaker_embeddings)
-
-         combined = combine_audio(beep, speech_tmp_name)
-         return "⚠️ Warning: Unsafe or inappropriate content detected!", combined
-
-     return answer, None
-
- # ----------------------
- # Gradio UI (Keep as is)
- # ----------------------
- with gr.Blocks(title="BLIP Vision App") as demo:
-     gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (Auto-Play TTS + Safety Beep)")
-     gr.Markdown("### Note: Text-to-Speech now uses a local HuggingFace model to prevent 'Too Many Requests' (429) errors.")
-
-     # --- Caption + Translate + Speak ---
-     with gr.Tab("Caption + Translate + Speak"):
-         with gr.Row():
-             img_in = gr.Image(type="pil", label="Upload Image")
-             lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
-         eng_out = gr.Textbox(label="English Caption")
-         trans_out = gr.Textbox(label="Translated Caption")
-         # Note: We changed the output to WAV but Gradio handles it fine.
-         audio_out = gr.Audio(label="Speech Output (WAV format)", type="filepath", autoplay=True)
-         btn1 = gr.Button("Generate Caption, Translate & Speak")
-         btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in],
-                    outputs=[eng_out, trans_out, audio_out])
-
-     # --- Visual Question Answering (VQA) ---
-     with gr.Tab("Visual Question Answering (VQA)"):
-         with gr.Row():
-             img_vqa = gr.Image(type="pil", label="Upload Image")
-             q_in = gr.Textbox(label="Ask a Question about the Image")
-         ans_out = gr.Textbox(label="Answer")
-         beep_out = gr.Audio(label="Alert Sound (WAV format)", type="filepath", autoplay=True)
-         btn2 = gr.Button("Ask")
-         btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=[ans_out, beep_out])
-
- demo.launch()
+ app.py
+ import gradio as gr
+ from transformers import BlipProcessor, BlipForConditionalGeneration
+ from gtts import gTTS
+ import io
+ from PIL import Image

+ # -------------------------------
+ # Load BLIP-base model (lighter version)
+ # -------------------------------
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

+ # -------------------------------
+ # Generate caption function
+ # -------------------------------
  # def generate_caption_tts(image):
+ #     caption = generate_caption(model, processor, image)
+ #     audio_file = text_to_audio_file(caption)
+ #     return caption, audio_file  # return file path, not BytesIO

+ # -------------------------------
+ # Convert text to speech using gTTS
+ # -------------------------------
+ import tempfile
+ import pyttsx3
+
+ def text_to_audio_file(text):
+     # Create a temporary file
+     tmp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
+     tmp_path = tmp_file.name
+     tmp_file.close()
+
+     engine = pyttsx3.init()
+     engine.save_to_file(text, tmp_path)
+     engine.runAndWait()
+
+     return tmp_path
+
+ def generate_caption_from_image(model, processor, image):
+     # image: PIL.Image
+     inputs = processor(images=image, return_tensors="pt")
+     out = model.generate(**inputs)
+     caption = processor.decode(out[0], skip_special_tokens=True)
+     return caption
+ # -------------------------------
+ # Gradio interface: Caption + Audio
+ # -------------------------------
+ def generate_caption_tts(image):
+     caption = generate_caption_from_image(model, processor, image)  # uses global model/processor
+     # audio_file = text_to_audio_file(caption)
+     return caption
+
+
+ interface = gr.Interface(
+     fn=generate_caption_tts,
+     inputs=gr.Image(type="numpy"),
+     outputs=[gr.Textbox(label="Generated Caption")],
+     title="Image Captioning for Visually Impaired",
+     description="Upload an image, get a caption and audio description."
+ )

+ interface.launch()
+ # demo.launch(share=True)

+ import gradio as gr
+ from transformers import (
+     BlipProcessor,
+     BlipForConditionalGeneration,
+     BlipForQuestionAnswering,
+     pipeline
+ )
+ moderation_model = pipeline(
+     "text-classification",
+     model="Vrandan/Comment-Moderation",
+     return_all_scores=True
+ )

+ from PIL import Image
+ import torch
+ from gtts import gTTS
+ import tempfile

+ # ----------------------
+ # Device setup
+ # ----------------------
+ device = "cuda" if torch.cuda.is_available() else "cpu"

+ # ----------------------
+ # Load Models Once
+ # ----------------------
+ print("🔄 Loading models...")

+ # Captioning
+ caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+ caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)

+ # VQA
+ vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+ vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)

+ # Translation
+ translation_models = {
+     "Hindi": pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi"),
+     "French": pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr"),
+     "Spanish": pipeline("translation", model="Helsinki-NLP/opus-mt-en-es"),
+ }

+ # Safety Moderation Pipeline
+ moderation_model = pipeline("text-classification", model="unitary/toxic-bert")

+ print("✅ All models loaded!")

+ # ----------------------
+ # Safety Filter Function
+ # ----------------------
+ def is_caption_safe(caption):
+     try:
+         votes = moderation_model(caption)
+         # If return_all_scores=True, it's [[{label, score}, ...]]
+         if isinstance(votes, list) and isinstance(votes[0], list):
+             votes = votes[0]
+         # Now safe to loop
+         for item in votes:
+             if isinstance(item, dict) and item.get("label") in ["V", "V2"] and item.get("score", 0) > 0.5:
+                 return False
+     except Exception as e:
+         print("⚠️ Moderation failed:", e)

+     # Fallback keywords
+     unsafe_keywords = [
+         "gun", "blood", "skull", "kill", "corpse", "gore", "knife", "weapon",
+         "fire", "murder", "dead", "death", "suicide", "bomb", "explosion",
+         "terrorist", "assault", "stab", "shoot", "pistol", "rifle", "shotgun",
+         "grenade", "horror", "beheaded", "torture", "hostage", "rape",
+         "war", "massacre", "chainsaw", "poison", "strangle", "hang", "drown"
+     ]
+     if any(word in caption.lower() for word in unsafe_keywords):
+         return False
+     return True

+ # ----------------------
+ # Caption + Translate + Speak
+ # ----------------------
+ def generate_caption_translate_speak(image, target_lang):
+     # Step 1: Caption
+     inputs = caption_processor(images=image, return_tensors="pt").to(device)
+     with torch.no_grad():
+         out = caption_model.generate(**inputs, max_new_tokens=50)
+     english_caption = caption_processor.decode(out[0], skip_special_tokens=True)

+     # Step 1.5: Safety Check
+     if not is_caption_safe(english_caption):
+         return "⚠️ Warning: Unsafe or inappropriate content detected!", "", None

+     # Step 2: Translate
+     if target_lang in translation_models:
+         translated = translation_models[target_lang](english_caption)[0]['translation_text']
+     else:
+         translated = "Translation not available"

+     # Step 3: Generate Speech (English caption for now)
+     tts = gTTS(english_caption, lang="en")
+     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+     tts.save(tmp_file.name)

+     return english_caption, translated, tmp_file.name

+ # ----------------------
+ # VQA
+ # ----------------------
+ def vqa_answer(image, question):
+     inputs = vqa_processor(image, question, return_tensors="pt").to(device)
+     with torch.no_grad():
+         out = vqa_model.generate(**inputs, max_new_tokens=50)
+     answer = vqa_processor.decode(out[0], skip_special_tokens=True)

+     # Run safety filter on answers too
+     if not is_caption_safe(answer):
+         return "⚠️ Warning: Unsafe or inappropriate content detected!"

+     return answer

+ # ----------------------
+ # Gradio UI
+ # ----------------------
+ with gr.Blocks(title="BLIP Vision App") as demo:
+     gr.Markdown("## 🖼️ BLIP: Image Captioning + Translation + Speech + VQA (with Safety Filter)")

+     with gr.Tab("Caption + Translate + Speak"):
+         with gr.Row():
+             img_in = gr.Image(type="pil", label="Upload Image")
+             lang_in = gr.Dropdown(["Hindi", "French", "Spanish"], label="Translate To", value="Hindi")
+         eng_out = gr.Textbox(label="English Caption")
+         trans_out = gr.Textbox(label="Translated Caption")
+         audio_out = gr.Audio(label="Spoken Caption", type="filepath")
+         btn1 = gr.Button("Generate Caption, Translate & Speak")
+         btn1.click(generate_caption_translate_speak, inputs=[img_in, lang_in], outputs=[eng_out, trans_out, audio_out])

+     with gr.Tab("Visual Question Answering (VQA)"):
+         with gr.Row():
+             img_vqa = gr.Image(type="pil", label="Upload Image")
+             q_in = gr.Textbox(label="Ask a Question about the Image")
+         ans_out = gr.Textbox(label="Answer")
+         btn2 = gr.Button("Ask")
+         btn2.click(vqa_answer, inputs=[img_vqa, q_in], outputs=ans_out)

+ demo.launch()
 
+
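
Note on the restored version above: generate_caption_tts now returns only the caption, so text_to_audio_file (the pyttsx3 helper defined in the same file) is never reached and the interface exposes no audio output. A minimal sketch of one way the two pieces could be reconnected, reusing the globals defined above (model, processor); this wiring is illustrative only and is not part of the commit:

def generate_caption_tts(image):
    # Caption with the globally loaded BLIP model/processor
    caption = generate_caption_from_image(model, processor, image)
    # Speak the caption with the pyttsx3 helper; it returns a temp-file path
    audio_path = text_to_audio_file(caption)
    return caption, audio_path

interface = gr.Interface(
    fn=generate_caption_tts,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Textbox(label="Generated Caption"),
             gr.Audio(label="Spoken Caption", type="filepath")],
    title="Image Captioning for Visually Impaired",
    description="Upload an image, get a caption and audio description."
)

interface.launch()

pyttsx3 depends on a system speech engine (eSpeak, SAPI5, or NSSpeechSynthesizer), which a headless Space may not provide; the gTTS import that is already present may be a more reliable backend in that environment.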