arymandeshwal commited on
Commit
1fd66a4
·
1 Parent(s): c72d0a8

fix: Integrate speech-to-text and text-to-speech with the UI

Browse files
.env.example CHANGED
@@ -1,2 +1,4 @@
1
  GEMINI_API_KEY=your_api_key_here
2
  GROQ_API_KEY=your_api_key_here
 
 
 
1
  GEMINI_API_KEY=your_api_key_here
2
  GROQ_API_KEY=your_api_key_here
3
+ TEXT_TO_AUDIO=your_hugging_face_api_key_here
4
+ STREAMLIT_WATCHER_TYPE=none
.gitignore CHANGED
@@ -1,4 +1,7 @@
1
  ### Python ###
 
 
 
2
  # Byte-compiled / optimized / DLL files
3
  __pycache__/
4
  *.py[cod]
 
1
  ### Python ###
2
+ audio/
3
+ models/
4
+
5
  # Byte-compiled / optimized / DLL files
6
  __pycache__/
7
  *.py[cod]
.streamlit/config.toml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ [theme]
2
+ base="dark"
3
+ primaryColor = "#4B9EFF"
4
+
app.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  import streamlit as st
2
  import json
3
  from datetime import datetime
@@ -8,7 +12,7 @@ from core.answering_competitor import Answering_competitor
8
  from core.response_evaluator import scorer
9
  from core.summary_utils import custom_css, generate_text_summary, clean_json_response
10
  from core.generate_summary import generate_summary_content
11
-
12
 
13
  # Page configuration
14
  st.set_page_config(
@@ -17,6 +21,8 @@ st.set_page_config(
17
  layout="wide"
18
  )
19
 
 
 
20
  if 'questions_generated' not in st.session_state:
21
  st.session_state.questions_generated = []
22
 
@@ -69,16 +75,6 @@ if st.session_state.page_stack[-1] == "Loading":
69
 
70
  if st.session_state.page_stack[-1] == "Summary":
71
  st.markdown(custom_css, unsafe_allow_html=True)
72
-
73
-
74
- # with st.container(border=True):
75
-
76
- # if "improvement_summary" not in st.session_state or not st.session_state["improvement_summary"]:
77
- # st.session_state["improvement_summary"] = improvement_summary(least_scores(st.session_state.track_score))
78
-
79
- # for i in st.session_state["improvement_summary"].split("\n"):
80
- # st.markdown(i)
81
-
82
  st.session_state.user_answers = [st.session_state[f"user_answer_{ques}"] for ques in range(len(st.session_state.questions))]
83
  st.session_state.ai_answers = [st.session_state[f"llm_answer_{ques}"] for ques in range(len(st.session_state.questions))]
84
  st.session_state.scores = [
@@ -301,6 +297,7 @@ if st.session_state.page_stack[-1].startswith("Ques_"):
301
  with user_ans:
302
  st.markdown("**👤 Your Answer:**")
303
  if f"submitted_ans_{ques}" in st.session_state and st.session_state[f"submitted_ans_{ques}"]:
 
304
  st.text_area(
305
  "Type your answer here",
306
  value=st.session_state[f"user_answer_{ques}"],
@@ -308,19 +305,40 @@ if st.session_state.page_stack[-1].startswith("Ques_"):
308
  label_visibility="collapsed",
309
  disabled=True
310
  )
 
 
311
  else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  user_answer = st.text_area(
313
  "Type your answer here",
314
- value="",
315
  height=150,
316
  label_visibility="collapsed",
317
  )
318
 
 
 
319
 
320
  with llm_ans:
321
  st.markdown("**🤖 Rival's Answer:**")
322
  if f"submitted_ans_{ques}" in st.session_state and st.session_state[f"submitted_ans_{ques}"]:
323
  st.text_area("Competitor Response",st.session_state[f"llm_answer_{ques}"], height=150, label_visibility="collapsed")
 
 
324
  else:
325
  # st.text_area("Competitor Response",value="Hidden values", height=150, label_visibility="collapsed")
326
  st.markdown(f"""
@@ -346,12 +364,20 @@ if st.session_state.page_stack[-1].startswith("Ques_"):
346
  {st.session_state[f"llm_answer_{ques}"]}
347
  </div>
348
  """, unsafe_allow_html=True)
 
 
 
 
 
 
349
 
350
- _,submit_area,_ = st.columns([1,1,1])
 
351
  with submit_area:
352
- if st.button("Submit Answer", use_container_width=True):
353
  st.session_state[f"submitted_ans_{ques}"] = True
354
  st.session_state[f"user_answer_{ques}"] = user_answer
 
355
  st.rerun()
356
 
357
 
@@ -399,7 +425,7 @@ if st.session_state.page_stack[-1].startswith("Ques_"):
399
 
400
  with next:
401
  if ques < len(st.session_state.questions)-1:
402
- if st.button("➡️ Next Question"):
403
  st.session_state.page_stack.append(f"Ques_{ques+1}")
404
  st.rerun()
405
  if ques == len(st.session_state.questions) -1:
 
1
+ import os
2
+ os.environ["STREAMLIT_WATCHER_TYPE"] = "none"
3
+ os.environ["PYTORCH_DISABLE_WIN_FIX"] = "1"
4
+
5
  import streamlit as st
6
  import json
7
  from datetime import datetime
 
12
  from core.response_evaluator import scorer
13
  from core.summary_utils import custom_css, generate_text_summary, clean_json_response
14
  from core.generate_summary import generate_summary_content
15
+ from core.speech_converter import text_to_audio, load_model
16
 
17
  # Page configuration
18
  st.set_page_config(
 
21
  layout="wide"
22
  )
23
 
24
+ whisper_model = load_model()
25
+
26
  if 'questions_generated' not in st.session_state:
27
  st.session_state.questions_generated = []
28
 
 
75
 
76
  if st.session_state.page_stack[-1] == "Summary":
77
  st.markdown(custom_css, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
78
  st.session_state.user_answers = [st.session_state[f"user_answer_{ques}"] for ques in range(len(st.session_state.questions))]
79
  st.session_state.ai_answers = [st.session_state[f"llm_answer_{ques}"] for ques in range(len(st.session_state.questions))]
80
  st.session_state.scores = [
 
297
  with user_ans:
298
  st.markdown("**👤 Your Answer:**")
299
  if f"submitted_ans_{ques}" in st.session_state and st.session_state[f"submitted_ans_{ques}"]:
300
+ st.audio_input(label="Record Audio",disabled=True, label_visibility="collapsed")
301
  st.text_area(
302
  "Type your answer here",
303
  value=st.session_state[f"user_answer_{ques}"],
 
305
  label_visibility="collapsed",
306
  disabled=True
307
  )
308
+
309
+
310
  else:
311
+ audio_file = st.audio_input(label="Record Audio",key=f"audio_ip_{ques}" ,label_visibility="collapsed")
312
+ transcribed_text = ""
313
+ if audio_file:
314
+ file_path = os.path.join("audio",f"user_answer_{ques}.wav")
315
+ try:
316
+ with open(file_path, "wb") as f:
317
+ audio_file.seek(0)
318
+ f.write(audio_file.read())
319
+
320
+ transcribed_text = whisper_model.transcribe(file_path)["text"]
321
+ # print(st.session_state[f"transcribed_{ques}"])
322
+ # st.rerun()
323
+ except Exception as e:
324
+ st.error(f"Error occured while transcribing {e}")
325
+
326
  user_answer = st.text_area(
327
  "Type your answer here",
328
+ value= transcribed_text,
329
  height=150,
330
  label_visibility="collapsed",
331
  )
332
 
333
+
334
+
335
 
336
  with llm_ans:
337
  st.markdown("**🤖 Rival's Answer:**")
338
  if f"submitted_ans_{ques}" in st.session_state and st.session_state[f"submitted_ans_{ques}"]:
339
  st.text_area("Competitor Response",st.session_state[f"llm_answer_{ques}"], height=150, label_visibility="collapsed")
340
+ st.audio(data=os.path.join("audio",f"llm_answer_{ques}.wav"))
341
+
342
  else:
343
  # st.text_area("Competitor Response",value="Hidden values", height=150, label_visibility="collapsed")
344
  st.markdown(f"""
 
364
  {st.session_state[f"llm_answer_{ques}"]}
365
  </div>
366
  """, unsafe_allow_html=True)
367
+ try:
368
+ os.makedirs("audio",exist_ok=True)
369
+ text_to_audio(st.session_state[f"llm_answer_{ques}"], os.path.join("audio",f"llm_answer_{ques}.wav"))
370
+ except Exception as e:
371
+ print(e)
372
+ st.error(f"An error occurred {e}")
373
 
374
+ left_area,_ = st.columns([1,1])
375
+ _, submit_area, _ = left_area.columns([1,3,1])
376
  with submit_area:
377
+ if st.button("🚀 Submit & Compare Answers", use_container_width=True, type="primary"):
378
  st.session_state[f"submitted_ans_{ques}"] = True
379
  st.session_state[f"user_answer_{ques}"] = user_answer
380
+ print("User answer\n",st.session_state[f"user_answer_{ques}"])
381
  st.rerun()
382
 
383
 
 
425
 
426
  with next:
427
  if ques < len(st.session_state.questions)-1:
428
+ if st.button("Next Question ➡️"):
429
  st.session_state.page_stack.append(f"Ques_{ques+1}")
430
  st.rerun()
431
  if ques == len(st.session_state.questions) -1:
core/input_comp_gen.py CHANGED
@@ -1,12 +1,10 @@
1
  import streamlit as st
2
- import json
3
- from typing import Dict, Any, List, Tuple
4
  from model import generate_response
5
  from utils import FileProcessor
6
- from speech_converter import audio_to_text, text_to_audio, load_model
7
  import tempfile
8
  import os
9
- import torch
10
 
11
  # Initialize the model through speech_converter
12
  whisper_model = load_model()
 
1
  import streamlit as st
2
+ import json
 
3
  from model import generate_response
4
  from utils import FileProcessor
5
+ from speech_converter import text_to_audio, load_model
6
  import tempfile
7
  import os
 
8
 
9
  # Initialize the model through speech_converter
10
  whisper_model = load_model()
core/response_evaluator.py CHANGED
@@ -1,7 +1,6 @@
1
  from core.model import generate_response
2
  import json
3
  from typing import List
4
- from pydantic import BaseModel
5
  from core.utils import trim_backticks, Collect_score
6
 
7
 
 
1
  from core.model import generate_response
2
  import json
3
  from typing import List
 
4
  from core.utils import trim_backticks, Collect_score
5
 
6
 
core/speech_converter.py CHANGED
@@ -1,12 +1,7 @@
1
  import whisper
2
  import streamlit as st
3
  import pyttsx3
4
- import os
5
- import torch
6
 
7
- # Set PyTorch settings to avoid thread/loop errors
8
- torch.set_num_threads(1)
9
- torch.set_num_interop_threads(1)
10
 
11
  @st.cache_resource(show_spinner="Loading speech recognition model...")
12
  def load_model():
@@ -39,14 +34,31 @@ def audio_to_text(audio_file_path=None):
39
  print(f"Audio processing error: {e}")
40
  return None
41
 
42
- def text_to_audio(text):
43
  """Converts text to speech using pyttsx3"""
44
  engine = pyttsx3.init()
45
  voices = engine.getProperty("voices")
46
- engine.setProperty("rate", 125)
47
- engine.setProperty("voice", voices[1].id)
48
- engine.say(text)
 
49
  engine.runAndWait()
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  if __name__ == "__main__":
52
  text_to_audio("Test speech conversion")
 
1
  import whisper
2
  import streamlit as st
3
  import pyttsx3
 
 
4
 
 
 
 
5
 
6
  @st.cache_resource(show_spinner="Loading speech recognition model...")
7
  def load_model():
 
34
  print(f"Audio processing error: {e}")
35
  return None
36
 
37
def text_to_audio(text, filepath):
    """Convert *text* to speech and save the result as an audio file.

    Args:
        text: The string to synthesize.
        filepath: Destination path for the generated audio (e.g. a .wav file).

    Uses pyttsx3's offline TTS engine; blocks until synthesis completes.
    """
    engine = pyttsx3.init()
    voices = engine.getProperty("voices")
    engine.setProperty("rate", 125)  # slightly slower than default for clarity
    # Guard: some systems expose no installed voices; indexing voices[0]
    # unconditionally would raise IndexError. Fall back to the engine default.
    if voices:
        engine.setProperty("voice", voices[0].id)
    engine.save_to_file(text, filepath)
    engine.runAndWait()  # blocks until the file has been written
46
 
47
+ # def text_to_audio(text=None, filepath=None):
48
+ # client = InferenceClient(
49
+ # provider="fal-ai",
50
+ # api_key=os.getenv("TEXT_TO_AUDIO"),
51
+ # )
52
+ # # audio is returned as bytes
53
+ # audio_bytes=client.text_to_speech(
54
+ # text,
55
+ # model="hexgrad/Kokoro-82M",
56
+ # )
57
+
58
+ # with open(filepath, "wb") as f:
59
+ # f.write(audio_bytes)
60
+
61
+
62
+
63
if __name__ == "__main__":
    # Smoke test: text_to_audio now requires a destination filepath
    # (signature is text_to_audio(text, filepath)); calling it with a single
    # argument raises TypeError, so pass an explicit output file.
    text_to_audio("Test speech conversion", "test_speech.wav")
requirements.txt CHANGED
@@ -1,6 +1,7 @@
 
 
 
1
  openai>=1.0.0
2
  python-dotenv>=1.0.0
3
- SpeechRecognition==3.14.3
4
  pyttsx3==2.98
5
- wavio==0.0.9
6
  groq==0.26.0
 
1
+ openai-whisper
2
+ streamlit
3
+ PyPDF2
4
  openai>=1.0.0
5
  python-dotenv>=1.0.0
 
6
  pyttsx3==2.98
 
7
  groq==0.26.0