arymandeshwal committed
Commit · 1fd66a4 · Parent(s): c72d0a8
fix: Integration speech to text and text to speech with UI

Files changed:
- .env.example +2 -0
- .gitignore +3 -0
- .streamlit/config.toml +4 -0
- app.py +41 -15
- core/input_comp_gen.py +2 -4
- core/response_evaluator.py +0 -1
- core/speech_converter.py +21 -9
- requirements.txt +3 -2
.env.example CHANGED
@@ -1,2 +1,4 @@
 GEMINI_API_KEY=your_api_key_here
 GROQ_API_KEY=your_api_key_here
+TEXT_TO_AUDIO=your_hugging_face_api_key_here
+STREAMLIT_WATCHER_TYPE=none
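Both new keys are consumed as plain environment variables elsewhere in this commit: app.py sets STREAMLIT_WATCHER_TYPE itself before importing Streamlit, and the commented-out Hugging Face TTS path in core/speech_converter.py reads TEXT_TO_AUDIO. A minimal sketch of loading them from .env, assuming python-dotenv (already listed in requirements.txt); the variable names come from this file, everything else is illustrative:

```python
# Illustrative only: read the new keys the way the app's optional
# hosted-TTS path would. Assumes python-dotenv from requirements.txt.
import os
from dotenv import load_dotenv

load_dotenv()  # populate os.environ from .env

hf_token = os.getenv("TEXT_TO_AUDIO")            # Hugging Face token for hosted text-to-speech
watcher = os.getenv("STREAMLIT_WATCHER_TYPE")    # "none" disables Streamlit's file watcher
print(bool(hf_token), watcher)
```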
.gitignore CHANGED
@@ -1,4 +1,7 @@
 ### Python ###
+audio/
+models/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
.streamlit/config.toml ADDED
@@ -0,0 +1,4 @@
+[theme]
+base="dark"
+primaryColor = "#4B9EFF"
+
app.py CHANGED
@@ -1,3 +1,7 @@
+import os
+os.environ["STREAMLIT_WATCHER_TYPE"] = "none"
+os.environ["PYTORCH_DISABLE_WIN_FIX"] = "1"
+
 import streamlit as st
 import json
 from datetime import datetime
@@ -8,7 +12,7 @@ from core.answering_competitor import Answering_competitor
 from core.response_evaluator import scorer
 from core.summary_utils import custom_css, generate_text_summary, clean_json_response
 from core.generate_summary import generate_summary_content
-
+from core.speech_converter import text_to_audio, load_model
 
 # Page configuration
 st.set_page_config(
@@ -17,6 +21,8 @@ st.set_page_config(
     layout="wide"
 )
 
+whisper_model = load_model()
+
 if 'questions_generated' not in st.session_state:
     st.session_state.questions_generated = []
 
@@ -69,16 +75,6 @@ if st.session_state.page_stack[-1] == "Loading":
 
 if st.session_state.page_stack[-1] == "Summary":
     st.markdown(custom_css, unsafe_allow_html=True)
-
-
-    # with st.container(border=True):
-
-    #     if "improvement_summary" not in st.session_state or not st.session_state["improvement_summary"]:
-    #         st.session_state["improvement_summary"] = improvement_summary(least_scores(st.session_state.track_score))
-
-    #     for i in st.session_state["improvement_summary"].split("\n"):
-    #         st.markdown(i)
-
     st.session_state.user_answers = [st.session_state[f"user_answer_{ques}"] for ques in range(len(st.session_state.questions))]
     st.session_state.ai_answers = [st.session_state[f"llm_answer_{ques}"] for ques in range(len(st.session_state.questions))]
     st.session_state.scores = [
@@ -301,6 +297,7 @@ if st.session_state.page_stack[-1].startswith("Ques_"):
         with user_ans:
             st.markdown("**👤 Your Answer:**")
             if f"submitted_ans_{ques}" in st.session_state and st.session_state[f"submitted_ans_{ques}"]:
+                st.audio_input(label="Record Audio",disabled=True, label_visibility="collapsed")
                 st.text_area(
                     "Type your answer here",
                     value=st.session_state[f"user_answer_{ques}"],
@@ -308,19 +305,40 @@ if st.session_state.page_stack[-1].startswith("Ques_"):
                     label_visibility="collapsed",
                     disabled=True
                 )
+
+
             else:
+                audio_file = st.audio_input(label="Record Audio",key=f"audio_ip_{ques}" ,label_visibility="collapsed")
+                transcribed_text = ""
+                if audio_file:
+                    file_path = os.path.join("audio",f"user_answer_{ques}.wav")
+                    try:
+                        with open(file_path, "wb") as f:
+                            audio_file.seek(0)
+                            f.write(audio_file.read())
+
+                        transcribed_text = whisper_model.transcribe(file_path)["text"]
+                        # print(st.session_state[f"transcribed_{ques}"])
+                        # st.rerun()
+                    except Exception as e:
+                        st.error(f"Error occured while transcribing {e}")
+
                 user_answer = st.text_area(
                     "Type your answer here",
-                    value=
+                    value= transcribed_text,
                     height=150,
                     label_visibility="collapsed",
                 )
 
+
+
 
         with llm_ans:
             st.markdown("**🤖 Rival's Answer:**")
             if f"submitted_ans_{ques}" in st.session_state and st.session_state[f"submitted_ans_{ques}"]:
                 st.text_area("Competitor Response",st.session_state[f"llm_answer_{ques}"], height=150, label_visibility="collapsed")
+                st.audio(data=os.path.join("audio",f"llm_answer_{ques}.wav"))
+
             else:
                 # st.text_area("Competitor Response",value="Hidden values", height=150, label_visibility="collapsed")
                 st.markdown(f"""
@@ -346,12 +364,20 @@ if st.session_state.page_stack[-1].startswith("Ques_"):
                     {st.session_state[f"llm_answer_{ques}"]}
                     </div>
                 """, unsafe_allow_html=True)
+                try:
+                    os.makedirs("audio",exist_ok=True)
+                    text_to_audio(st.session_state[f"llm_answer_{ques}"], os.path.join("audio",f"llm_answer_{ques}.wav"))
+                except Exception as e:
+                    print(e)
+                    st.error(f"An error occurred {e}")
 
-
+        left_area,_ = st.columns([1,1])
+        _, submit_area, _ = left_area.columns([1,3,1])
         with submit_area:
-            if st.button("Submit
+            if st.button("🚀 Submit & Compare Answers", use_container_width=True, type="primary"):
                 st.session_state[f"submitted_ans_{ques}"] = True
                 st.session_state[f"user_answer_{ques}"] = user_answer
+                print("User answer\n",st.session_state[f"user_answer_{ques}"])
                 st.rerun()
 
 
@@ -399,7 +425,7 @@ if st.session_state.page_stack[-1].startswith("Ques_"):
 
     with next:
         if ques < len(st.session_state.questions)-1:
-            if st.button("
+            if st.button("Next Question ➡️"):
                 st.session_state.page_stack.append(f"Ques_{ques+1}")
                 st.rerun()
         if ques == len(st.session_state.questions) -1:
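Taken together, the app.py changes add one speech round trip per question: the recording from st.audio_input is written under audio/, transcribed with the cached Whisper model to prefill the answer box, and the rival's answer is synthesized to a WAV that st.audio plays back after submission. A condensed sketch of that flow, not the literal commit code; ques is a hypothetical question index and core/speech_converter.py is the module updated in this commit:

```python
# Condensed, illustrative sketch of the speech flow wired into app.py.
import os
import streamlit as st
from core.speech_converter import load_model, text_to_audio

whisper_model = load_model()              # cached with @st.cache_resource
ques = 0                                  # hypothetical question index
os.makedirs("audio", exist_ok=True)

# Speech -> text: record, persist under audio/, transcribe, prefill the answer box.
audio_file = st.audio_input("Record Audio", key=f"audio_ip_{ques}")
transcribed_text = ""
if audio_file:
    wav_path = os.path.join("audio", f"user_answer_{ques}.wav")
    with open(wav_path, "wb") as f:
        audio_file.seek(0)
        f.write(audio_file.read())
    transcribed_text = whisper_model.transcribe(wav_path)["text"]
user_answer = st.text_area("Type your answer here", value=transcribed_text, height=150)

# Text -> speech: synthesize the rival's answer to a WAV and play it back.
llm_answer = st.session_state.get(f"llm_answer_{ques}", "Sample rival answer")
llm_wav = os.path.join("audio", f"llm_answer_{ques}.wav")
text_to_audio(llm_answer, llm_wav)
st.audio(data=llm_wav)
```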
core/input_comp_gen.py CHANGED
@@ -1,12 +1,10 @@
 import streamlit as st
-import json
-from typing import Dict, Any, List, Tuple
+import json
 from model import generate_response
 from utils import FileProcessor
-from speech_converter import
+from speech_converter import text_to_audio, load_model
 import tempfile
 import os
-import torch
 
 # Initialize the model through speech_converter
 whisper_model = load_model()
core/response_evaluator.py CHANGED
@@ -1,7 +1,6 @@
 from core.model import generate_response
 import json
 from typing import List
-from pydantic import BaseModel
 from core.utils import trim_backticks, Collect_score
 
 
core/speech_converter.py CHANGED
@@ -1,12 +1,7 @@
 import whisper
 import streamlit as st
 import pyttsx3
-import os
-import torch
 
-# Set PyTorch settings to avoid thread/loop errors
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
 
 @st.cache_resource(show_spinner="Loading speech recognition model...")
 def load_model():
@@ -39,14 +34,31 @@ def audio_to_text(audio_file_path=None):
         print(f"Audio processing error: {e}")
         return None
 
-def text_to_audio(text):
+def text_to_audio(text, filepath):
     """Converts text to speech using pyttsx3"""
     engine = pyttsx3.init()
     voices = engine.getProperty("voices")
-    engine.setProperty("rate", 125)
-    engine.setProperty("voice", voices[
-    engine.say(text)
+    engine.setProperty("rate", 125)
+    engine.setProperty("voice", voices[0].id)
+    # engine.say(text)
+    engine.save_to_file(text, filepath)
     engine.runAndWait()
 
+# def text_to_audio(text=None, filepath=None):
+#     client = InferenceClient(
+#         provider="fal-ai",
+#         api_key=os.getenv("TEXT_TO_AUDIO"),
+#     )
+#     # audio is returned as bytes
+#     audio_bytes=client.text_to_speech(
+#         text,
+#         model="hexgrad/Kokoro-82M",
+#     )
+
+#     with open(filepath, "wb") as f:
+#         f.write(audio_bytes)
+
+
+
 if __name__ == "__main__":
     text_to_audio("Test speech conversion")
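The module now exposes two entry points: load_model(), which returns a Whisper model cached by @st.cache_resource, and text_to_audio(text, filepath), which renders speech to a file via pyttsx3's save_to_file instead of speaking through the speakers. A minimal usage sketch with illustrative paths; note that the unchanged __main__ block still calls text_to_audio with a single argument, which would now raise a TypeError under the new signature.

```python
# Minimal usage sketch for the updated core/speech_converter.py API
# (file paths are illustrative).
import os
from core.speech_converter import load_model, text_to_audio

os.makedirs("audio", exist_ok=True)

# Text -> speech: writes a WAV instead of playing audio aloud.
text_to_audio("This answer will be narrated.", os.path.join("audio", "llm_answer_0.wav"))

# Speech -> text: the cached Whisper model transcribes a recorded answer.
model = load_model()
result = model.transcribe(os.path.join("audio", "user_answer_0.wav"))
print(result["text"])
```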
requirements.txt CHANGED
@@ -1,6 +1,7 @@
+openai-whisper
+streamlit
+PyPDF2
 openai>=1.0.0
 python-dotenv>=1.0.0
-SpeechRecognition==3.14.3
 pyttsx3==2.98
-wavio==0.0.9
 groq==0.26.0