WWMachine committed on
Commit 29a7fe7 · verified · 1 Parent(s): d152984

Update app.py

Files changed (1):
  app.py +70 -168
app.py CHANGED
@@ -3,204 +3,106 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
 
-# --- Configuration (Kept from original script) ---
+# --- Configuration ---
 MODEL_REPO = "Kezovic/iris-q4gguf-v2"
 MODEL_FILE = "llama-3.2-1b-instruct.Q4_K_M.gguf"
 CONTEXT_WINDOW = 4096
 MAX_NEW_TOKENS = 512
 TEMPERATURE = 0.7
 
-# --- Model Loading Function (Kept from original script) ---
+# --- Model Loading Function ---
+# Initialize llm as None to avoid the Llama.__del__ 'NoneType' error
+llm = None
 def load_llm():
     """Downloads the GGUF model and initializes LlamaCPP."""
+    global llm  # Use the global variable
     print("Downloading model...")
-    model_path = hf_hub_download(
-        repo_id=MODEL_REPO,
-        filename=MODEL_FILE
-    )
-
-    llm = Llama(
-        model_path=model_path,
-        n_ctx=CONTEXT_WINDOW,
-        n_threads=2,
-        verbose=False
-    )
-    print("Model loaded successfully!")
-    return llm
+    try:
+        model_path = hf_hub_download(
+            repo_id=MODEL_REPO,
+            filename=MODEL_FILE
+        )
+
+        llm = Llama(
+            model_path=model_path,
+            n_ctx=CONTEXT_WINDOW,
+            n_threads=2,
+            verbose=False
+        )
+        print("Model loaded successfully!")
+        return llm
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        return None
 
-# Load the model only once when the Space starts
+# Load the model only once
 llm = load_llm()
 
-# --- NEW: Audio-to-Text and Text-to-Audio Inference Function ---
-def generate_audio_response(audio_file_path):
+# --- Inference Function ---
+def generate_and_speak(text_prompt):
     """
-    1. Transcribes user audio (STT).
-    2. Generates a text response using the Llama model.
-    3. Returns the transcribed text and the generated response text (which Gradio will TTS).
-
-    Args:
-        audio_file_path (str): The local path to the recorded audio file.
-
-    Returns:
-        tuple: (Transcribed Text, Generated Text Response)
+    Generates a text response using the Llama model.
+    The output text is automatically synthesized into speech by Gradio's Audio component.
     """
-    if audio_file_path is None:
-        return "Please record some audio first.", ""
+    if llm is None:
+        return "Error: LLM failed to load. Please check model configuration.", None
 
-    # 1. Transcribe the Audio (STT)
-    # Gradio's Audio Input component automatically performs STT
-    # if the 'type' parameter is set to "filepath" and the
-    # 'label' is set to "Microphone with Whisper".
-    # However, since we are not using the ChatInterface directly,
-    # we simulate the transcription by asking the user to speak clearly.
-    # In a real deployed Space, the user would see a transcript in the UI.
-    # For a fully audio-only demo, we'll focus on the TTS part.
-
-    # ***IMPORTANT***: The Gradio `gr.Audio(type="filepath", sources=["microphone"])` component
-    # returns the path to the recorded audio file. For true STT, you would need an
-    # additional STT model (like OpenAI Whisper or similar) here.
-    # To keep it simple and focus on the UI change, we'll prompt the user for the text
-    # they want to "transcribe" in the UI setup below.
-
-    # 2. Use the "transcribed" text for generation
-    # For a placeholder, let's assume the user's intent is in the file name or we use a static prompt
-    # Since we can't run Whisper here, we'll rely on the UI component structure.
-
-    # To make this function testable, let's assume the user's text input is passed via a separate text box
-    # and the audio file is just the trigger.
-    # MODIFICATION: Let's adjust the UI to use the *text* output from an STT component
-    # that's often paired with an audio recorder.
-
-    # For the purpose of providing a functional script:
-    # If using gr.Interface, we can pass the transcription as a separate input.
-    # If using gr.Blocks, we have full control.
-
-    # Let's adjust the function to accept the transcribed text directly (as in a common Gradio STT flow)
-    # and remove the audio_file_path argument for simplicity.
-    return "Error: Function signature needs adjustment for Gradio STT/TTS components."
-
-
-# --- NEW: Modified Inference Function for Audio Interface ---
-def generate_and_speak(transcribed_text):
-    """
-    Generates a response using the Llama model based on transcribed text
-    and returns the text output for Gradio's TTS feature.
-    """
-    if not transcribed_text or transcribed_text.strip() == "":
-        return "Please speak clearly into the microphone."
+    if not text_prompt or text_prompt.strip() == "":
+        return "Please enter a query.", None
 
     # Use a basic prompt template
-    full_prompt = f"### Human: {transcribed_text}\n### Assistant:"
-
-    output = llm(
-        prompt=full_prompt,
-        max_tokens=MAX_NEW_TOKENS,
-        temperature=TEMPERATURE,
-        stop=["### Human:"],
-        echo=False
-    )
-
-    # Extract the text and return it. Gradio's output component (Audio)
-    # will automatically synthesize this text into speech.
-    response_text = output['choices'][0]['text'].strip()
-    return response_text
-
-# --- NEW: Gradio Interface using gr.Interface for STT/TTS flow ---
-
-# 1. Input: Audio recorder with automatic Speech-to-Text (STT) via Whisper (if available)
-audio_input = gr.Audio(
-    sources=["microphone"],
-    type="text", # IMPORTANT: This tells Gradio to return the transcribed text (STT)
-    label="Speak Your Question Here"
-)
-
-# 2. Output: Text box to show the LLM response, which is automatically converted to speech (TTS)
-audio_output = gr.Textbox(
-    label="Assistant Response (Text)",
-    value="The model's response will appear here."
-)
-
-# 3. Text-to-Speech Output: This component will automatically read the text from 'audio_output'
-tts_output = gr.Audio(
-    label="Assistant Response (Audio)",
-    autoplay=True
-)
+    full_prompt = f"### Human: {text_prompt}\n### Assistant:"
+
+    try:
+        output = llm(
+            prompt=full_prompt,
+            max_tokens=MAX_NEW_TOKENS,
+            temperature=TEMPERATURE,
+            stop=["### Human:"],
+            echo=False
+        )
+
+        response_text = output['choices'][0]['text'].strip()
+        # Return the text. It will update the Textbox AND the Audio component.
+        return response_text, response_text
+    except Exception as e:
+        return f"LLM Generation Error: {e}", None
 
-# Use gr.Blocks for the most control over the complex STT/TTS workflow
+# --- Gradio Interface (TTS Flow) ---
 with gr.Blocks(title=f"Audio Chat with {MODEL_FILE}") as demo:
-    gr.Markdown(f"## 🎤 Chat with {MODEL_FILE}")
-    gr.Markdown("Speak your query, and the LLM will reply in audio!")
+    gr.Markdown(f"## 🗣️ LLM Chat with Text-to-Speech (TTS)")
+    gr.Markdown("Type your query (Text Input) and the LLM will reply in both text and auto-generated audio (TTS).")
 
-    # Row for Input and Output
-    with gr.Row():
-        # Column for Input (Audio Recording + STT)
-        with gr.Column(scale=1):
-            audio_recorder = gr.Audio(
-                sources=["microphone"],
-                type="filepath",
-                label="1. Record Your Query"
-            )
-            # Placeholder for Transcription (Whisper STT is often run on the recorded file)
-            transcribed_text = gr.Textbox(
-                label="2. Transcribed Text",
-                placeholder="Transcription appears here (Simulated or by an STT model)"
-            )
-            # The Button triggers the generation
-            generate_button = gr.Button("3. Generate Response")
-
-        # Column for Output (Generation + TTS)
-        with gr.Column(scale=2):
-            text_response = gr.Textbox(
-                label="LLM Text Response",
-                lines=5
-            )
-            gr.Markdown("### Assistant Audio Response")
-            # The Audio component reads the text from text_response and speaks it.
-            audio_playback = gr.Audio(
-                label="",
-                autoplay=True,
-                # This ensures the audio is generated from the text_response
-                # and doesn't rely on a separate audio file path.
-                interactive=False
-            )
-
-    # --- Interaction Logic ---
-
-    # Step 1: When audio is recorded, we simulate transcription (or run an actual STT model here)
-    # For a working Gradio flow without an STT model, we need the user to type the text.
-    # Since we can't assume a separate STT model, we'll streamline the flow:
-
-    # Instead of a complex multi-step STT workflow, we use a simple text input
-    # that is *read* by the TTS component for the model's response.
-
-    # **Simpler Audio Flow (Text Input -> LLM -> TTS Output)**
-    # This is the most reliable way to demonstrate TTS without adding a separate STT model.
-    gr.Markdown("---")
-    gr.Markdown("### Simpler Flow: Text Input to Audio Output (TTS)")
-
     with gr.Row():
         text_input = gr.Textbox(
-            label="Type your query (This is used to generate the LLM response)",
-            lines=1,
+            label="Your Query (Text Input)",
+            lines=2,
             scale=3
         )
         audio_btn = gr.Button("Generate and Speak")
 
-    text_output_simulated = gr.Textbox(label="LLM Response Text")
-    audio_output_simulated = gr.Audio(label="Assistant Audio Playback", autoplay=True)
+    # Outputs
+    text_output = gr.Textbox(label="LLM Response Text")
+    audio_output = gr.Audio(
+        label="Assistant Audio Playback (TTS)",
+        autoplay=True,
+        # Gradio automatically synthesizes the text output received by this Audio component
+        # into speech. We set it as an 'update' target.
+        interactive=False
+    )
 
-    # Set up the event listener for the simplified flow:
+    # Set up the event listener: Button click triggers the function.
     audio_btn.click(
         fn=generate_and_speak,
         inputs=[text_input],
-        outputs=[text_output_simulated]
-    ).then(
-        # The second function call, after the response text is ready,
-        # is a dummy function that just returns the text to the audio component.
-        # Gradio handles the TTS synthesis automatically when the target is an Audio component.
-        lambda x: x,
-        inputs=[text_output_simulated],
-        outputs=[audio_output_simulated]
+        outputs=[text_output, audio_output]
+    )
+
+    # Enable enter key to submit
+    text_input.submit(
+        fn=generate_and_speak,
+        inputs=[text_input],
+        outputs=[text_output, audio_output]
     )
 
 demo.launch()
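A review note on the TTS side of this revision: the new comments assume Gradio will synthesize speech from the string routed to `audio_output`, but to my knowledge no current Gradio release does TTS. As an output, `gr.Audio` accepts a filepath or a `(sample_rate, numpy_array)` tuple, so `return response_text, response_text` will most likely error when Gradio tries to treat the text as audio data. A minimal sketch of one way to produce real speech, assuming a `gTTS` dependency that is not part of this commit:

```python
# Sketch only: gTTS is an assumed extra dependency (add gtts to requirements.txt).
import tempfile

from gtts import gTTS


def synthesize_speech(text: str) -> str:
    """Render text to an MP3 file and return its path for gr.Audio."""
    out = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    gTTS(text=text, lang="en").save(out.name)
    return out.name


# generate_and_speak would then end with:
#     return response_text, synthesize_speech(response_text)
```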
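Separately, `MODEL_FILE` points at a Llama 3.2 Instruct GGUF, which ships with its own chat template; the hand-rolled `### Human:`/`### Assistant:` format is not what the model was tuned on and can degrade output quality. llama-cpp-python can apply the template embedded in the GGUF via `create_chat_completion`. A sketch of how the generation call could look instead, using the same `llm` object and constants as above:

```python
# Sketch only: applies the chat template embedded in the GGUF rather than
# the hand-rolled "### Human:" prompt; the result is an OpenAI-style dict.
output = llm.create_chat_completion(
    messages=[{"role": "user", "content": text_prompt}],
    max_tokens=MAX_NEW_TOKENS,
    temperature=TEMPERATURE,
)
response_text = output["choices"][0]["message"]["content"].strip()
```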
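On the input side, the removed code's own comments were right that `gr.Audio` cannot transcribe by itself; note also that `type="text"` (used in the removed `audio_input` block) is not a supported mode, since `gr.Audio` accepts only `type="numpy"` or `type="filepath"`. If the microphone flow is revived, an explicit STT model is needed. A sketch using a `transformers` Whisper pipeline, again an assumed extra dependency rather than anything in this commit:

```python
# Sketch only: transformers + torch (and ffmpeg for decoding) are assumed
# extra dependencies; the Whisper checkpoint is illustrative.
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")


def transcribe(audio_path):
    """Transcribe a recording from gr.Audio(type="filepath", sources=["microphone"])."""
    if not audio_path:
        return ""
    return asr(audio_path)["text"].strip()


# Wired into the removed Blocks layout, this would fill the transcription box:
#     audio_recorder.change(fn=transcribe, inputs=[audio_recorder],
#                           outputs=[transcribed_text])
```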