Spaces:
Build error
Upload 2 files
- .gitattributes +1 -0
- app.py +105 -91
- img.png +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+img.png filter=lfs diff=lfs merge=lfs -text
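This appended line is exactly the pattern `git lfs track "img.png"` writes into .gitattributes, so the binary added at the bottom of this commit is stored as an LFS pointer rather than committed directly.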
app.py
CHANGED
@@ -1,129 +1,143 @@
-# app.py
 import torch
-from transformers import pipeline, BitsAndBytesConfig
 import whisper
 import gradio as gr
-import warnings
-import os
 from gtts import gTTS
 from PIL import Image
 import nltk
-from nltk import sent_tokenize
 import re
-import …
 
-# …
-nltk.download('punkt_tab')
 
-# …
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.float16
-)
 
-# …
-model_id = "llava-hf/llava-1.5-7b-hf"
-pipe = pipeline("image-to-text",
-                model=model_id,
-                model_kwargs={"quantization_config": quantization_config})
-
-# Load Whisper model
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-
-# …
-logfile = setup_logging()
-
-def writehistory(text):
-    with open(logfile, 'a', encoding='utf-8') as f:
-        f.write(text)
-        f.write('\n')
-
-# Core functions
 def img2txt(input_text, input_image):
-    image …
-    …
-        match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"])
-        reply = match.group(1) if match else "No response found."
-    else:
-        reply = "No response generated."
-
-    return reply
 
 def transcribe(audio):
-    …
         return ''
 
-    …
-def …
     try:
-        …
         else:
             chatgpt_output = "No image provided."
-
         audio_output = text_to_speech(chatgpt_output)
         return speech_to_text_output, chatgpt_output, audio_output
-
     except Exception as e:
-
         return str(e), str(e), None
 
 # Create Gradio interface
-…
     fn=process_inputs,
     inputs=[
-        gr.Audio(sources=["microphone"], type="filepath"),
-        gr.Image(type="…
     ],
     outputs=[
         gr.Textbox(label="Speech to Text"),
-        gr.Textbox(label="…
-        gr.Audio()
     ],
-    title="…
-    description="Upload an image and ask questions using your voice. The AI will analyze the image and respond with …
 )
 
-# Launch the app
 if __name__ == "__main__":
-    …
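The `ASSISTANT:` regex in the deleted img2txt only makes sense against llava-hf's chat template, where the image-to-text pipeline echoes the whole prompt back in generated_text. A minimal sketch of that deleted pattern, with the prompt wording and token cap assumed rather than recovered from this page:

    # Hypothetical reconstruction of the old LLaVA call; exact wording assumed.
    prompt = f"USER: <image>\n{input_text}\nASSISTANT:"
    outputs = pipe(input_image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
    # generated_text comes back as "USER: … ASSISTANT: <reply>", hence the regex.

The replacement below sidesteps the template entirely: microsoft/git-base is a plain captioning model, so its generated_text can be used as-is.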
 import torch
+from transformers import pipeline
 import whisper
 import gradio as gr
 from gtts import gTTS
 from PIL import Image
 import nltk
 import re
+import tempfile
+import os
+import multiprocessing
 
+# Enable multiprocessing for MacOS
+multiprocessing.freeze_support()
 
+# Download NLTK data
+nltk.download('punkt', quiet=True)
 
+# Configure device
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {DEVICE}")
+
+# Initialize a smaller vision model
+model_id = "microsoft/git-base"  # Using a more stable model
+print("Loading image captioning model...")
+pipe = None  # We'll initialize this later to avoid multiprocessing issues
+
+# Initialize Whisper model
+print("Loading Whisper model...")
+audio_model = None  # We'll initialize this later
+
+def initialize_models():
+    """Initialize models safely"""
+    global pipe, audio_model
+    if pipe is None:
+        pipe = pipeline("image-to-text", model=model_id)
+    if audio_model is None:
+        audio_model = whisper.load_model("tiny", device=DEVICE)
+    return pipe, audio_model
 
 def img2txt(input_text, input_image):
+    """Process image with the vision model"""
+    global pipe
+    if pipe is None:
+        pipe, _ = initialize_models()
 
+    try:
+        # Generate basic caption
+        outputs = pipe(input_image)
+        caption = outputs[0]['generated_text']
+
+        # If there's a specific question, append it to the response
+        if input_text and input_text.strip():
+            response = f"Based on the image which shows {caption}, "
+            response += f"addressing your question: {input_text}\n"
+            return response
+
+        return caption
+    except Exception as e:
+        print(f"Error in image processing: {str(e)}")
+        return "Sorry, I couldn't process the image properly."
 
 def transcribe(audio):
+    """Transcribe audio using Whisper"""
+    global audio_model
+    if audio_model is None:
+        _, audio_model = initialize_models()
+
+    if audio is None:
         return ''
 
+    try:
+        audio = whisper.load_audio(audio)
+        audio = whisper.pad_or_trim(audio)
+
+        mel = whisper.log_mel_spectrogram(audio).to(audio_model.device)
+        result = whisper.decode(audio_model, mel, whisper.DecodingOptions())
+
+        return result.text
+    except Exception as e:
+        print(f"Error in transcription: {str(e)}")
+        return "Sorry, I couldn't transcribe the audio properly."
 
+def text_to_speech(text):
+    """Convert text to speech using gTTS"""
     try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
+            tts = gTTS(text=text, lang='en', slow=False)
+            tts.save(fp.name)
+            return fp.name
+    except Exception as e:
+        print(f"Error in text-to-speech: {str(e)}")
+        return None
 
+def process_inputs(audio, image):
+    """Main processing function"""
+    try:
+        # Process speech to text
+        speech_to_text_output = transcribe(audio) if audio is not None else ""
+
+        # Process image and generate response
+        if image is not None:
+            query = speech_to_text_output if speech_to_text_output else "Describe this image in detail"
+            chatgpt_output = img2txt(query, image)
         else:
             chatgpt_output = "No image provided."
+
+        # Generate audio response
         audio_output = text_to_speech(chatgpt_output)
+
         return speech_to_text_output, chatgpt_output, audio_output
+
     except Exception as e:
+        print(f"Error in process_inputs: {str(e)}")
         return str(e), str(e), None
 
 # Create Gradio interface
+demo = gr.Interface(
     fn=process_inputs,
     inputs=[
+        gr.Audio(sources=["microphone"], type="filepath", label="Voice Input"),  # Fixed Audio component syntax
+        gr.Image(type="pil", label="Image Input")  # Specified image type
     ],
     outputs=[
         gr.Textbox(label="Speech to Text"),
+        gr.Textbox(label="Image Analysis"),
+        gr.Audio(label="AI Response", type="filepath")
     ],
+    title="Image Analysis with Voice Interface",
+    description="Upload an image and ask questions using your voice. The AI will analyze the image and respond with both text and speech."
 )
 
 if __name__ == "__main__":
+    print("Starting Gradio interface...")
+    # Initialize models before launching the interface
+    initialize_models()
+    # Launch with minimal GPU memory usage
+    demo.launch(
+        share=True,
+        debug=True,
+        server_name="0.0.0.0",
+        server_port=7860,
+    )
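For a quick sanity check of the refactored pipeline without launching the UI, the functions can be driven directly. A minimal sketch, assuming app.py is importable from the repo root and using the repo's own img.png; importing the module runs its top-level setup but not demo.launch(), which is guarded by the __name__ check (note that text_to_speech needs network access, since gTTS calls Google's TTS endpoint):

    from PIL import Image
    import app  # runs top-level setup (NLTK download, device selection) only

    app.initialize_models()  # loads microsoft/git-base and Whisper "tiny" once
    caption = app.img2txt("What is shown here?", Image.open("img.png"))
    print(caption)
    print(app.text_to_speech(caption))  # path to a temp .mp3, or None on failure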
img.png
ADDED
Git LFS Details