zsolnai commited on
Commit
60cffca
Β·
1 Parent(s): 3944a6c

Add tab for texting the LLM

Browse files
Files changed (1) hide show
  1. app.py +112 -58
app.py CHANGED
@@ -1,27 +1,26 @@
1
  import os
 
2
 
3
  import gradio as gr
4
- import numpy as np
5
- import soundfile as sf
6
  import torch
7
 
8
  # --- Device Setup (Explicitly set to CPU) ---
9
  device = "cpu"
10
 
11
  # --- STT Setup (using Hugging Face's transformers pipeline for Whisper) ---
12
- from transformers import pipeline
13
 
14
  STT_MODEL_NAME = "openai/whisper-tiny.en"
15
- # Pass device="cpu" to the pipeline
16
  stt_pipe = pipeline("automatic-speech-recognition", model=STT_MODEL_NAME, device=device)
17
 
 
 
 
 
18
  # --- TTS Setup (using coqui-ai/TTS) ---
19
  from TTS.api import TTS
20
 
21
  TTS_MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DDC"
22
- OUTPUT_WAV_FILE = "output.wav"
23
-
24
- # Initialize the TTS model on CPU
25
  tts_model = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
26
 
27
 
@@ -29,7 +28,6 @@ def speech_to_text(audio_file_path):
29
  """Performs Speech-to-Text using the Whisper model."""
30
  if audio_file_path is None:
31
  return "Please upload an audio file or record your voice."
32
-
33
  try:
34
  result = stt_pipe(audio_file_path)
35
  return result["text"]
@@ -41,72 +39,128 @@ def text_to_speech(text):
41
  """Performs Text-to-Speech using the Coqui TTS model."""
42
  if not text:
43
  return None, "Please enter text for synthesis."
44
-
45
  try:
 
 
 
 
 
46
  # Generate the speech (slow on CPU)
47
  tts_model.tts_to_file(
48
  text=text,
49
- file_path=OUTPUT_WAV_FILE,
50
  )
51
- return OUTPUT_WAV_FILE, "Speech synthesis complete. (Completed slowly on CPU)"
52
  except Exception as e:
53
  return None, f"Error during TTS: {e}"
54
 
55
 
56
- # --- Gradio Interface ---
 
 
 
57
 
58
- # 1. Define the CSS here
59
- custom_css = "#status {font-weight: bold;}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
- # 2. Initialize Blocks without the 'css' argument
62
- with gr.Blocks() as demo:
63
- gr.Markdown("# πŸ—£οΈ STT & TTS App (CPU Only)")
 
 
 
 
 
 
 
 
 
 
 
64
  gr.Markdown(
65
- "**NOTE:** This app is running on CPU-only hardware. Speech-to-Text (Whisper) is fast, but **Text-to-Speech (Coqui TTS) will be very slow**."
66
  )
67
 
68
- gr.HTML("<hr>")
69
-
70
- # 1. STT Block
71
- with gr.Row():
72
- with gr.Column():
73
- gr.Markdown("## 🎀 Speech-to-Text (STT)")
74
- audio_input = gr.Audio(
75
- sources=["microphone", "upload"],
76
- type="filepath",
77
- label="Input Audio (Mic or Upload)",
78
  )
79
- stt_button = gr.Button("Convert Speech to Text")
80
-
81
- with gr.Column():
82
- stt_output = gr.Textbox(label="Transcribed Text", lines=3)
83
-
84
- stt_button.click(fn=speech_to_text, inputs=audio_input, outputs=stt_output)
85
 
86
- gr.HTML("<hr>")
87
-
88
- # 2. TTS Block
89
- with gr.Row():
90
- with gr.Column():
91
- gr.Markdown("## πŸ”Š Text-to-Speech (TTS)")
92
- text_input = gr.Textbox(
93
- label="Text to Synthesize",
94
- lines=3,
95
- value="Hello there, this is a demonstration of the text to speech model.",
96
  )
97
- tts_button = gr.Button("Synthesize Speech (Will be slow)")
98
-
99
- with gr.Column():
100
- audio_output = gr.Audio(label="Synthesized Audio")
101
- # The id="status" is still correct for applying CSS later
102
- tts_status = gr.Textbox(elem_id="status", label="Status")
103
 
104
- tts_button.click(
105
- fn=text_to_speech, inputs=text_input, outputs=[audio_output, tts_status]
106
- )
107
-
108
- # 3. Pass the 'css' argument to launch()
109
- demo.launch(css=custom_css)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
- if os.path.exists(OUTPUT_WAV_FILE):
112
- os.remove(OUTPUT_WAV_FILE)
 
import os
import tempfile

import gradio as gr
import torch

# --- Device Setup (Explicitly set to CPU) ---
# Every pipeline below is pinned to CPU; no CUDA availability check is made.
device = "cpu"

# --- STT Setup (using Hugging Face's transformers pipeline for Whisper) ---
# NOTE(review): `Conversation` and the "conversational" pipeline task used
# below were deprecated and later removed from transformers -- confirm the
# installed version still provides them, or pin transformers accordingly.
from transformers import Conversation, pipeline

# Tiny English-only Whisper checkpoint.
STT_MODEL_NAME = "openai/whisper-tiny.en"
stt_pipe = pipeline("automatic-speech-recognition", model=STT_MODEL_NAME, device=device)

# --- LLM Setup (using Hugging Face's transformers for text generation) ---
LLM_MODEL_NAME = "microsoft/DialoGPT-medium"
chatbot_pipe = pipeline("conversational", model=LLM_MODEL_NAME, device=device)

# --- TTS Setup (using coqui-ai/TTS) ---
from TTS.api import TTS

TTS_MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DDC"
tts_model = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
 
28
  """Performs Speech-to-Text using the Whisper model."""
29
  if audio_file_path is None:
30
  return "Please upload an audio file or record your voice."
 
31
  try:
32
  result = stt_pipe(audio_file_path)
33
  return result["text"]
 
def text_to_speech(text):
    """Performs Text-to-Speech using the Coqui TTS model."""
    # Guard clause: nothing to synthesize.
    if not text:
        return None, "Please enter text for synthesis."
    try:
        # Reserve a unique .wav path per request; delete=False keeps the
        # file on disk after the handle closes so Gradio can serve it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as handle:
            wav_path = handle.name

        # Generate the speech (slow on CPU)
        tts_model.tts_to_file(
            text=text,
            file_path=wav_path,
        )
        return wav_path, "Speech synthesis complete. (Completed slowly on CPU)"
    except Exception as e:
        return None, f"Error during TTS: {e}"
56
 
57
 
58
def chat_with_bot(message, history):
    """Generate a chatbot reply and return the updated chat history.

    Args:
        message: The new user message (str).
        history: Prior (user, bot) message pairs as held by the gr.Chatbot
            component; may be None on the first call.

    Returns:
        The updated history as a list of (user, bot) tuples. A new list is
        returned so the caller's list is never mutated in place.
    """
    # Normalize: the original returned None history unchanged; always hand
    # Gradio a real list, and work on a copy instead of mutating the input.
    history = list(history) if history else []

    # Ignore empty and whitespace-only submissions.
    if not message or not message.strip():
        return history

    try:
        # Rebuild the conversation so the model sees the full context.
        # NOTE(review): Conversation / the "conversational" pipeline were
        # removed in recent transformers releases -- confirm the pinned version.
        conversation = Conversation()
        for user_msg, bot_msg in history:
            conversation.add_user_input(user_msg)
            if bot_msg:
                conversation.append_response(bot_msg)

        # Add the new user message
        conversation.add_user_input(message)

        # Get response from the model
        result = chatbot_pipe(conversation)
        response = result.generated_responses[-1]

        history.append((message, response))
        return history
    except Exception as e:
        # Surface the failure in the chat window instead of crashing the UI.
        history.append((message, f"Error: {e}"))
        return history
84
 
85
+
86
# --- Gradio Interface ---
# CSS hooks: #status targets the TTS status textbox (elem_id below);
# .chatbot pins the chat window height (elem_classes below).
custom_css = """
#status {
    font-weight: bold;
    color: #2563eb;
}
.chatbot {
    height: 400px;
}
"""

with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# πŸ—£οΈ STT, TTS & Chat App (CPU Only)")
    gr.Markdown(
        "**NOTE:** This app is running on CPU-only hardware. Speech-to-Text (Whisper) is fast, but **Text-to-Speech (Coqui TTS) and Chat will be slow**."
    )

    # Create tabs for different features
    with gr.Tabs():
        # Tab 1: Chat Interface
        with gr.TabItem("πŸ’¬ Chat"):
            gr.Markdown("## Chat with AI Assistant")
            gr.Markdown(
                "Have a conversation with the DialoGPT model. It remembers context from your conversation!"
            )

            chatbot = gr.Chatbot(label="Conversation", elem_classes=["chatbot"])
            msg = gr.Textbox(
                label="Your Message",
                placeholder="Type your message here and press Enter...",
                lines=2,
            )
            with gr.Row():
                submit_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear Chat")

            # Chat functionality: both Enter and the Send button call
            # chat_with_bot, then the .then() callback clears the input box.
            msg.submit(chat_with_bot, inputs=[msg, chatbot], outputs=chatbot).then(
                lambda: "", None, msg
            )
            submit_btn.click(
                chat_with_bot, inputs=[msg, chatbot], outputs=chatbot
            ).then(lambda: "", None, msg)
            # Clearing resets the Chatbot component to an empty history.
            clear_btn.click(lambda: [], None, chatbot)

        # Tab 2: STT
        with gr.TabItem("🎀 Speech-to-Text"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("## 🎀 Speech-to-Text (STT)")
                    # filepath type: speech_to_text receives a path string.
                    audio_input = gr.Audio(
                        sources=["microphone", "upload"],
                        type="filepath",
                        label="Input Audio (Mic or Upload)",
                    )
                    stt_button = gr.Button("Convert Speech to Text")
                with gr.Column():
                    stt_output = gr.Textbox(label="Transcribed Text", lines=3)

            stt_button.click(fn=speech_to_text, inputs=audio_input, outputs=stt_output)

        # Tab 3: TTS
        with gr.TabItem("πŸ”Š Text-to-Speech"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("## πŸ”Š Text-to-Speech (TTS)")
                    text_input = gr.Textbox(
                        label="Text to Synthesize",
                        lines=3,
                        value="Hello there, this is a demonstration of the text to speech model.",
                    )
                    tts_button = gr.Button("Synthesize Speech (Will be slow)")
                with gr.Column():
                    audio_output = gr.Audio(label="Synthesized Audio")
                    # elem_id="status" matches the #status CSS rule above.
                    tts_status = gr.Textbox(elem_id="status", label="Status")

            # text_to_speech returns (wav_path_or_None, status_message).
            tts_button.click(
                fn=text_to_speech, inputs=text_input, outputs=[audio_output, tts_status]
            )

demo.launch()