Dua Rajper committed on
Create app.py
app.py
ADDED
@@ -0,0 +1,197 @@
import os
import streamlit as st
from groq import Groq, APIConnectionError, AuthenticationError
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    AutoProcessor,
    AutoModelForSpeechSeq2Seq,
)
from espnet2.bin.tts_inference import Text2Speech
from PIL import Image
import easyocr
import soundfile as sf
from pydub import AudioSegment
import io
from streamlit_webrtc import webrtc_streamer, WebRtcMode, AudioProcessorBase
import av
import numpy as np

# Load Groq API key from environment variables
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("Groq API key not found. Please add it to the Hugging Face Space Secrets.")
    st.stop()

# Initialize Groq client
groq_client = Groq(api_key=GROQ_API_KEY)

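# On a Hugging Face Space this key is set in the Space's Settings under "Variables
# and secrets"; for a local run it can simply be exported as an environment
# variable before the app starts.
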
# OCR Function
def extract_text_from_image(image):
    reader = easyocr.Reader(['en'])
    # EasyOCR expects a file path, bytes, or NumPy array, so convert the PIL image first
    result = reader.readtext(np.array(image))
    extracted_text = " ".join([detection[1] for detection in result])
    return extracted_text

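# The EasyOCR reader downloads its detection/recognition weights the first time it
# is constructed, so building it inside extract_text_from_image repeats that work
# on every upload. A cached loader along these lines (same 'en' language list)
# would avoid that:
#
#     @st.cache_resource
#     def load_ocr_reader():
#         return easyocr.Reader(['en'])
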
# Question Answering Function (DistilBERT)
@st.cache_resource
def load_qa_model():
    model_name = "distilbert/distilbert-base-cased-distilled-squad"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)
    return nlp

def answer_question(context, question, qa_model):
    result = qa_model({'question': question, 'context': context})
    return result['answer']

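# Usage sketch for the QA helper: given context extracted from an image, e.g.
#
#     qa = load_qa_model()
#     answer_question("The invoice is due on 12 March 2025.", "When is the invoice due?", qa)
#
# the SQuAD-tuned DistilBERT model should return a short span such as "12 March 2025".
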
# Load models for voice chatbot
@st.cache_resource
def load_voice_models():
    # Speech-to-Text
    processor = AutoProcessor.from_pretrained("openai/whisper-small")
    stt_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")
    stt_pipe = pipeline(
        "automatic-speech-recognition",
        model=stt_model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        return_timestamps=True,  # Enable timestamps for long-form audio
    )
    # Text-to-Speech
    tts_model = Text2Speech.from_pretrained("espnet/espnet_tts_vctk_espnet_spk_voxceleb12_rawnet")
    return stt_pipe, tts_model

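# load_voice_models() is decorated with st.cache_resource, so the Whisper pipeline
# and the ESPnet TTS model are loaded once per process and reused across Streamlit
# reruns. Whisper's feature extractor works on 16 kHz mono audio; the pipeline can
# resample other rates when a sampling_rate is supplied along with the array.
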
# Groq API Function
def groq_chat(prompt):
    try:
        chat_completion = groq_client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="llama-3.3-70b-versatile",
        )
        return chat_completion.choices[0].message.content
    except APIConnectionError as e:
        return f"Groq API Connection Error: {e}"
    except AuthenticationError as e:
        return f"Groq API Authentication Error: {e}"
    except Exception as e:
        return f"General Groq API Error: {e}"

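# groq_chat() returns either the model's reply or a readable error string, so the
# UI below can display its result directly without a separate error branch, e.g.:
#
#     st.write(groq_chat("Summarize the extracted text in one sentence."))
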
# Streamlit App
def main():
    st.title("Multi-Modal Chatbot: Image Text & Voice")

    # Sidebar for mode selection
    mode = st.sidebar.radio("Select Mode", ["Image Text & QA", "Voice Chatbot"])

    if mode == "Image Text & QA":
        # Image Text Extraction & QA
        st.header("Image Text Extraction & Question Answering")
        uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

        if uploaded_file is not None:
            image = Image.open(uploaded_file)
            st.image(image, caption="Uploaded Image", use_container_width=True)

            if st.button("Extract Text and Enable Question Answering"):
                with st.spinner("Extracting text..."):
                    # Keep the text in session state so it survives the rerun
                    # triggered by the "Answer" button below
                    st.session_state["extracted_text"] = extract_text_from_image(image)

            if "extracted_text" in st.session_state:
                extracted_text = st.session_state["extracted_text"]
                st.write("Extracted Text:")
                st.write(extracted_text)

                qa_model = load_qa_model()

                question = st.text_input("Ask a question about the image text:")
                if st.button("Answer"):
                    if question:
                        with st.spinner("Answering..."):
                            answer = answer_question(extracted_text, question, qa_model)
                        st.write("Answer:", answer)
                    else:
                        st.warning("Please enter a question.")

    elif mode == "Voice Chatbot":
        # Voice Chatbot
        st.header("Voice-Enabled Chatbot")

        # Load the cached speech-to-text pipeline and TTS model
        stt_pipe, tts_model = load_voice_models()

        # Audio recorder
        st.write("Record your voice:")
        webrtc_ctx = webrtc_streamer(
            key="audio-recorder",
            mode=WebRtcMode.SENDONLY,
            audio_processor_factory=AudioRecorder,
            media_stream_constraints={"audio": True, "video": False},
        )

        if webrtc_ctx.audio_processor:
            st.write("Recording... Press 'Stop' to finish recording.")
            # Save recorded audio to a WAV file
            if st.button("Stop and Process Recording"):
                recorder = webrtc_ctx.audio_processor
                audio_frames = recorder.audio_frames
                if audio_frames:
                    # Combine audio frames into a single array
                    # (assumes packed/interleaved int16 frames, which is what streamlit-webrtc normally delivers)
                    audio_data = np.concatenate(audio_frames, axis=1)
                    # De-interleave to (samples, channels) and average down to mono
                    audio_data = audio_data.reshape(-1, recorder.channels).mean(axis=1).astype(np.int16)
                    # Save as WAV file at the rate the browser actually delivered (usually 48 kHz)
                    sample_rate = recorder.sample_rate or 48000
                    sf.write("recorded_audio.wav", audio_data, samplerate=sample_rate)
                    st.success("Recording saved as recorded_audio.wav")
                    # Process the recorded audio
                    speech, sr = sf.read("recorded_audio.wav", dtype="float32")
                    # Passing the sampling rate lets the pipeline resample to Whisper's
                    # expected 16 kHz (this needs torchaudio to be available)
                    output = stt_pipe({"array": speech, "sampling_rate": sr})  # Transcribe with timestamps
                    # Debug: Print the transcribed text
                    st.write("Transcribed Text:", output['text'])
                    # Display the text with timestamps (optional)
                    if 'chunks' in output:
                        st.write("Transcribed Text with Timestamps:")
                        for chunk in output['chunks']:
                            st.write(f"{chunk['timestamp'][0]:.2f} - {chunk['timestamp'][1]:.2f}: {chunk['text']}")
                    # Generate response using Groq API
                    try:
                        # Debug: Print the input text
                        st.write("Input Text:", output['text'])
                        chat_completion = groq_client.chat.completions.create(
                            messages=[{"role": "user", "content": output['text']}],
                            model="mixtral-8x7b-32768",
                            temperature=0.5,
                            max_tokens=1024,
                        )
                        # Debug: Print the API response
                        st.write("API Response:", chat_completion)
                        # Extract the generated response
                        response = chat_completion.choices[0].message.content
                        st.write("Generated Response:", response)
                        # Convert response to speech (recent espnet2 versions return a dict with a
                        # "wav" tensor; a multi-speaker VCTK model may also need a speaker embedding)
                        speech = tts_model(response)["wav"].numpy()
                        # Debug: Print the TTS output
                        st.write("TTS Output:", speech)
                        # Save and play the speech
                        sf.write("response.wav", speech, tts_model.fs)
                        st.audio("response.wav")
                    except Exception as e:
                        st.error(f"Error generating response: {e}")
                else:
                    st.error("No audio recorded. Please try again.")

    # Groq Chat Section (Common for both modes)
    st.subheader("General Chat (Powered by Groq)")
    groq_prompt = st.text_input("Enter your message:")
    if st.button("Send"):
        if groq_prompt:
            with st.spinner("Generating response..."):
                groq_response = groq_chat(groq_prompt)
            st.write("Response:", groq_response)
        else:
            st.warning("Please enter a message.")

# Audio recorder class
class AudioRecorder(AudioProcessorBase):
    def __init__(self):
        self.audio_frames = []
        self.sample_rate = None
        self.channels = 1

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # Remember the stream format so the recording can be written back correctly
        nd = frame.to_ndarray()
        self.sample_rate = frame.sample_rate
        self.channels = nd.size // frame.samples  # values per sample position = channel count
        self.audio_frames.append(nd)
        return frame

if __name__ == "__main__":
    main()
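
# To try the app locally (assuming the imports above are installed and the
# GROQ_API_KEY environment variable is set):
#
#     streamlit run app.py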