# sop-audio-analyzer / generate_demos.py
# daasime's picture
# Add 3 new demo scenarios + Coming Soon badge + lazy generation
# ebe2991
"""
Generate demo audio samples for the Audio Analyzer PoC.
Run during Docker build: python generate_demos.py
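Produces 6 WAV files (16kHz mono) in DATA_DIR/samples/:
1. demo_clean_exam.wav - natural exam answer, ~35s, LOW risk
2. demo_reading_fraud.wav - monotone reading + long pause, ~35s, MEDIUM-HIGH risk
3. demo_coaching_fraud.wav - main speaker + whispered coach, ~35s, triggers whisper detection
4. demo_wake_word.wav - exam answer interrupted by a quiet "Alexa" request and a device beep
5. demo_synthetic_voice.wav - slow, robotic-sounding TTS answer framed by beep tones
6. demo_two_speakers.wav - two speakers taking turns (impersonation/proxy)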
"""
import os
import io
from gtts import gTTS
from pydub import AudioSegment
# Directory the demo WAV files are written to: "<DATA_DIR>/samples", where
# DATA_DIR is read from the environment and defaults to "data".
OUTPUT_DIR = os.path.join(os.environ.get("DATA_DIR", "data"), "samples")
def _tts(text, lang="en", slow=False):
    """Synthesize *text* with gTTS and return it as a 16kHz mono AudioSegment.

    Args:
        text: Sentence to speak.
        lang: Language code handed to gTTS.
        slow: Handed to gTTS as its ``slow`` flag.

    Returns:
        The decoded speech, resampled to 16000 Hz and reduced to one channel.
    """
    mp3_buffer = io.BytesIO()
    # Keep the synthesized MP3 in memory rather than going through a temp file.
    gTTS(text=text, lang=lang, slow=slow).write_to_fp(mp3_buffer)
    mp3_buffer.seek(0)  # rewind so the decoder reads from the first byte

    decoded = AudioSegment.from_mp3(mp3_buffer)
    resampled = decoded.set_frame_rate(16000)
    return resampled.set_channels(1)
def _silence(ms):
    """Return *ms* milliseconds of silence as a 16kHz AudioSegment."""
    pause = AudioSegment.silent(duration=ms, frame_rate=16000)
    return pause
def generate_clean_exam():
    """Scenario 1: clean exam answer - natural speech with fillers and uneven pauses.

    Returns:
        One AudioSegment: five spoken sentences separated by pauses of
        differing length.
    """
    # (sentence, pause in ms that follows it); None means no trailing pause.
    script = [
        ("Well, I think the main reason people travel is, you know, to experience different cultures.", 800),
        ("Um, personally, I've always found that visiting new places really broadens your perspective.", 1200),
        ("Like, for example, when I visited Japan last year, the food and the customs were completely different from what I was used to.", 600),
        ("And I think that's what makes it so valuable, right? You learn things you can't learn from books.", 900),
        ("So yeah, I believe travel is one of the best ways to learn about the world and about yourself.", None),
    ]

    audio = None
    for sentence, pause_ms in script:
        speech = _tts(sentence)
        audio = speech if audio is None else audio + speech
        if pause_ms is not None:
            audio = audio + _silence(pause_ms)
    return audio
def generate_reading_fraud():
    """Scenario 2: reading fraud - slow, even delivery with one 7s suspicious pause.

    Returns:
        One AudioSegment: four sentences synthesized with ``slow=True`` and
        joined by short gaps, except for a single 7-second silence after the
        second sentence.
    """
    sentences = (
        "The advantages of public transportation include reduced carbon emissions and lower costs for commuters.",
        "Furthermore, public transit systems decrease traffic congestion in urban areas and improve air quality.",
        "In addition, public transportation provides mobility options for people who cannot drive.",
        "In conclusion, governments should invest more in public transportation infrastructure to benefit both the environment and the economy.",
    )
    # Gap (ms) inserted before sentences 2, 3 and 4; 7000 is the suspicious pause.
    gaps_ms = (500, 7000, 400)

    audio = _tts(sentences[0], slow=True)
    for gap_ms, sentence in zip(gaps_ms, sentences[1:]):
        audio = audio + _silence(gap_ms) + _tts(sentence, slow=True)
    return audio
def generate_coaching_fraud():
    """Scenario 3: coaching - main answer interleaved with quieter prompt clips.

    Returns:
        One AudioSegment in which two attenuated "coach" prompts are placed
        between the main speaker's sentences.
    """
    answer_lines = (
        "I believe that technology has had a very positive impact on education overall.",
        "Oh right, and also online platforms provide personalized learning experiences for students everywhere.",
        "Yes, and another important point is that technology makes education accessible to people in remote areas.",
        "So in summary, technology has transformed education for the better in many ways.",
    )
    opening, after_first_prompt, after_second_prompt, closing = [
        _tts(line) for line in answer_lines
    ]

    # Coach prompts: same TTS, attenuated to simulate background whispering.
    prompt_platforms = _tts("Talk about online platforms", lang="en") - 18  # -18dB
    prompt_access = _tts("Mention accessibility", lang="en") - 20  # -20dB

    timeline = [
        opening,
        _silence(1500),
        prompt_platforms,
        _silence(800),
        after_first_prompt,
        _silence(2000),
        prompt_access,
        _silence(600),
        after_second_prompt,
        _silence(800),
        closing,
    ]
    mix = timeline[0]
    for clip in timeline[1:]:
        mix = mix + clip
    return mix
def generate_wake_word():
    """Scenario 4: wake word - the speaker asks Alexa a question mid-exam.

    Returns:
        One AudioSegment: two answer sentences, a quieter "Alexa" request, a
        short synthetic beep, then two more answer sentences.
    """
    import numpy as np

    answer_lines = (
        "I think the most important factor in choosing a career is job satisfaction.",
        "Because if you enjoy what you do, you'll be more productive and motivated.",
        "Sorry, what was I saying? Oh yes, career satisfaction is crucial for long term happiness.",
        "And that's why I believe people should follow their passion when choosing a career.",
    )
    first, second, resumed, closing = [_tts(line) for line in answer_lines]

    # The wake-word request is attenuated by 12 dB so it is quieter than the answer.
    wake_request = _tts("Alexa, what is career satisfaction", lang="en") - 12

    # Device response: a 1kHz sine, 0.3s long, rendered as 16-bit mono PCM.
    rate_hz = 16000
    timebase = np.linspace(0, 0.3, int(rate_hz * 0.3))
    sine = np.sin(2 * np.pi * 1000 * timebase)
    pcm = (sine * 0.15 * 32767).astype(np.int16)
    device_beep = AudioSegment(
        pcm.tobytes(), frame_rate=rate_hz, sample_width=2, channels=1
    )

    timeline = [
        first,
        _silence(1000),
        second,
        _silence(1500),
        wake_request,
        _silence(400),
        device_beep,
        _silence(2000),
        resumed,
        _silence(800),
        closing,
    ]
    mix = timeline[0]
    for clip in timeline[1:]:
        mix = mix + clip
    return mix
def generate_synthetic_voice():
    """Scenario 5: synthetic/robotic voice framed by beep tones.

    Returns:
        One AudioSegment: slow TTS sentences with a two-tone chime at the
        start, after the second sentence, and at the end.
    """
    import numpy as np

    # slow=True is used to make the delivery sound more robotic.
    intro, claim, sources, closing = [
        _tts(line, slow=True)
        for line in (
            "The answer to question number one is as follows.",
            "Climate change is caused by greenhouse gas emissions from human activities.",
            "The main sources include transportation, industry, and agriculture.",
            "In conclusion, reducing emissions requires global cooperation.",
        )
    ]

    rate_hz = 16000

    def tone(freq_hz, seconds):
        """Render a sine tone as a 16-bit mono AudioSegment."""
        ts = np.linspace(0, seconds, int(rate_hz * seconds))
        pcm = (np.sin(2 * np.pi * freq_hz * ts) * 0.2 * 32767).astype(np.int16)
        return AudioSegment(
            pcm.tobytes(), frame_rate=rate_hz, sample_width=2, channels=1
        )

    # High tone, 100ms gap, low tone: the synthetic-audio marker.
    chime = tone(880, 0.15) + _silence(100) + tone(440, 0.15)

    timeline = [
        chime,
        _silence(300),
        intro,
        _silence(400),
        claim,
        chime,
        _silence(500),
        sources,
        _silence(400),
        closing,
        _silence(200),
        chime,
    ]
    mix = timeline[0]
    for clip in timeline[1:]:
        mix = mix + clip
    return mix
def generate_two_speakers():
    """Scenario 6: Two distinct speakers taking turns (impersonation/proxy).

    Speaker A uses the default English voice. Speaker B is requested from the
    UK Google domain (``tld="co.uk"``) so that it gets a different accent.

    The previous implementation passed ``lang="en-uk"``; that code is
    deprecated in gTTS and falls back to plain ``en``, which makes speaker B
    sound the same as speaker A.

    Returns:
        One AudioSegment: three sentences from speaker A, a longer pause, then
        three sentences from speaker B.
    """

    def _tts_uk(text):
        """Synthesize *text* with the UK English accent as 16kHz mono audio."""
        buf = io.BytesIO()
        # In gTTS, localized accents are selected through the top-level
        # domain, not through the language code.
        gTTS(text=text, lang="en", tld="co.uk").write_to_fp(buf)
        buf.seek(0)
        seg = AudioSegment.from_mp3(buf)
        return seg.set_frame_rate(16000).set_channels(1)

    # Speaker A - normal pace, default English voice
    a1 = _tts("Hello, my name is John and I'm here to take the speaking test today.")
    a2 = _tts("I think that learning a second language is very important in today's world.")
    a3 = _tts("For example, it helps you communicate with people from different countries.")

    # Speaker B - UK English accent to sound like a different person
    b1 = _tts_uk("Right, so the next topic is about technology in education.")
    b2 = _tts_uk("Technology has completely changed the way students learn and interact with content.")
    b3 = _tts_uk("Online courses and digital tools make education more accessible to everyone.")

    return (
        a1
        + _silence(1000)
        + a2
        + _silence(800)
        + a3
        + _silence(2500)  # Longer pause as speakers switch
        + b1
        + _silence(700)
        + b2
        + _silence(600)
        + b3
    )
def main():
    """Generate every demo scenario and write each one as a WAV file.

    Creates OUTPUT_DIR if needed, runs each generator in turn, exports its
    audio to ``OUTPUT_DIR/<filename>`` as 16kHz mono WAV, and prints progress
    including the clip duration in seconds.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Output filename -> zero-argument function that builds the clip.
    demos = {
        "demo_clean_exam.wav": generate_clean_exam,
        "demo_reading_fraud.wav": generate_reading_fraud,
        "demo_coaching_fraud.wav": generate_coaching_fraud,
        "demo_wake_word.wav": generate_wake_word,
        "demo_synthetic_voice.wav": generate_synthetic_voice,
        "demo_two_speakers.wav": generate_two_speakers,
    }
    for filename, generator in demos.items():
        print(f"Generating {filename}...")
        audio = generator()
        path = os.path.join(OUTPUT_DIR, filename)
        # export() returns the open output file; close it explicitly instead
        # of leaving the handle to the garbage collector.
        exported = audio.export(
            path, format="wav", parameters=["-ar", "16000", "-ac", "1"]
        )
        exported.close()
        duration = len(audio) / 1000  # len() of an AudioSegment is in milliseconds
        print(f" Saved: {path} ({duration:.1f}s)")
    print("All demo files generated.")


if __name__ == "__main__":
    main()