Spaces:
Running
Running
| """ | |
| Generate demo audio samples for the Audio Analyzer PoC. | |
| Run during Docker build: python generate_demos.py | |
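| Produces 6 WAV files (16kHz mono) in DATA_DIR/samples/: | |
| 1. demo_clean_exam.wav - natural exam answer, ~35s, LOW risk | |
| 2. demo_reading_fraud.wav - monotone reading + long pause, ~35s, MEDIUM-HIGH risk | |
| 3. demo_coaching_fraud.wav - main speaker + whispered coach, ~35s, triggers whisper detection | |
| 4. demo_wake_word.wav - exam answer interrupted by a quiet 'Alexa' request and a device beep | |
| 5. demo_synthetic_voice.wav - slow, robotic TTS answer framed by double beep tones | |
| 6. demo_two_speakers.wav - two speakers taking turns (impersonation/proxy) | |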
| """ | |
| import os | |
| import io | |
| from gtts import gTTS | |
| from pydub import AudioSegment | |
# Demo samples are written to the "samples" folder under DATA_DIR
# (falls back to a relative "data" directory when DATA_DIR is unset).
OUTPUT_DIR = os.path.join(os.getenv("DATA_DIR", "data"), "samples")
def _tts(text, lang="en", slow=False):
    """Synthesize *text* with gTTS and return it as a 16 kHz mono AudioSegment.

    Args:
        text: Sentence to speak.
        lang: Language code forwarded to gTTS.
        slow: Forwarded to gTTS; requests the slower speaking rate.
    """
    mp3_buffer = io.BytesIO()
    gTTS(text=text, lang=lang, slow=slow).write_to_fp(mp3_buffer)
    mp3_buffer.seek(0)  # rewind so the decoder reads from the first byte
    decoded = AudioSegment.from_mp3(mp3_buffer)
    resampled = decoded.set_frame_rate(16000)
    return resampled.set_channels(1)
def _silence(ms):
    """Return *ms* milliseconds of silence as a 16 kHz AudioSegment."""
    quiet = AudioSegment.silent(duration=ms, frame_rate=16000)
    return quiet
def generate_clean_exam():
    """Scenario 1: Clean exam - natural speech with fillers and varied pace.

    Returns the spoken sentences joined by pauses of differing length.
    """
    # (sentence, pause in ms that follows it); the closing sentence has no pause.
    script = [
        ("Well, I think the main reason people travel is, you know, to experience different cultures.", 800),
        ("Um, personally, I've always found that visiting new places really broadens your perspective.", 1200),
        ("Like, for example, when I visited Japan last year, the food and the customs were completely different from what I was used to.", 600),
        ("And I think that's what makes it so valuable, right? You learn things you can't learn from books.", 900),
        ("So yeah, I believe travel is one of the best ways to learn about the world and about yourself.", None),
    ]
    track = None
    for sentence, pause_ms in script:
        spoken = _tts(sentence)
        track = spoken if track is None else track + spoken
        if pause_ms is not None:
            track = track + _silence(pause_ms)
    return track
def generate_reading_fraud():
    """Scenario 2: Reading fraud - monotone, constant pace, 7s suspicious pause.

    Every sentence is synthesized with ``slow=True``; the gap after the
    second sentence is the long 7-second pause.
    """
    sentences = (
        "The advantages of public transportation include reduced carbon emissions and lower costs for commuters.",
        "Furthermore, public transit systems decrease traffic congestion in urban areas and improve air quality.",
        "In addition, public transportation provides mobility options for people who cannot drive.",
        "In conclusion, governments should invest more in public transportation infrastructure to benefit both the environment and the economy.",
    )
    # Pause (ms) inserted after sentences 1, 2 and 3; 7000 is the suspicious one.
    gaps_ms = (500, 7000, 400)

    spoken = [_tts(sentence, slow=True) for sentence in sentences]
    track = spoken[0]
    for gap, following in zip(gaps_ms, spoken[1:]):
        track = track + _silence(gap) + following
    return track
def generate_coaching_fraud():
    """Scenario 3: Coaching - main speaker + quiet second voice whispering prompts.

    The coach lines are the same TTS voice with the gain lowered, placed in
    the pauses between the main speaker's sentences.
    """
    candidate = [
        _tts(line)
        for line in (
            "I believe that technology has had a very positive impact on education overall.",
            "Oh right, and also online platforms provide personalized learning experiences for students everywhere.",
            "Yes, and another important point is that technology makes education accessible to people in remote areas.",
            "So in summary, technology has transformed education for the better in many ways.",
        )
    ]

    # Subtracting a number from an AudioSegment lowers its gain by that many dB.
    hint_platforms = _tts("Talk about online platforms", lang="en") - 18
    hint_accessibility = _tts("Mention accessibility", lang="en") - 20

    timeline = [
        candidate[0],
        _silence(1500),
        hint_platforms,
        _silence(800),
        candidate[1],
        _silence(2000),
        hint_accessibility,
        _silence(600),
        candidate[2],
        _silence(800),
        candidate[3],
    ]
    return sum(timeline[1:], timeline[0])
def generate_wake_word():
    """Scenario 4: Wake word - someone asks Alexa mid-exam.

    Builds an exam answer that is interrupted by a quieter "Alexa" request,
    followed by a short synthetic 1 kHz beep standing in for the device's
    response, and then the speaker resuming.

    Returns:
        The assembled AudioSegment.
    """
    import numpy as np

    main1 = _tts("I think the most important factor in choosing a career is job satisfaction.")
    main2 = _tts("Because if you enjoy what you do, you'll be more productive and motivated.")
    main3 = _tts("Sorry, what was I saying? Oh yes, career satisfaction is crucial for long term happiness.")
    main4 = _tts("And that's why I believe people should follow their passion when choosing a career.")

    # "Alexa" wake word spoken quietly (gain lowered by 12 dB)
    alexa_call = _tts("Alexa, what is career satisfaction", lang="en") - 12

    # Synthetic beep tone (1kHz, 0.3s) to simulate device response
    sample_rate = 16000
    beep_seconds = 0.3
    # round() rather than int() so float error cannot drop a sample.
    n_samples = round(sample_rate * beep_seconds)
    # endpoint=False keeps the sample spacing at exactly 1/sample_rate; with
    # the default endpoint=True the spacing is dur/(n-1) and the tone comes
    # out slightly above the intended frequency.
    t = np.linspace(0, beep_seconds, n_samples, endpoint=False)
    beep = (np.sin(2 * np.pi * 1000 * t) * 0.15 * 32767).astype(np.int16)
    beep_seg = AudioSegment(beep.tobytes(), frame_rate=sample_rate, sample_width=2, channels=1)

    return (
        main1
        + _silence(1000)
        + main2
        + _silence(1500)
        + alexa_call
        + _silence(400)
        + beep_seg
        + _silence(2000)
        + main3
        + _silence(800)
        + main4
    )
def generate_synthetic_voice():
    """Scenario 5: Synthetic/robotic voice with beep tones.

    Four slow-TTS sentences are framed by a high/low double beep at the
    start, in the middle and at the end.

    Returns:
        The assembled AudioSegment.
    """
    import numpy as np

    # Use slow TTS to sound more robotic
    robo1 = _tts("The answer to question number one is as follows.", slow=True)
    robo2 = _tts("Climate change is caused by greenhouse gas emissions from human activities.", slow=True)
    robo3 = _tts("The main sources include transportation, industry, and agriculture.", slow=True)
    robo4 = _tts("In conclusion, reducing emissions requires global cooperation.", slow=True)

    # Generate beep tones between segments (synthetic indicator)
    sample_rate = 16000

    def make_beep(freq, dur):
        """Return a sine tone of *freq* Hz lasting *dur* seconds (16-bit mono)."""
        # round() rather than int() so float error cannot drop a sample.
        n_samples = round(sample_rate * dur)
        # endpoint=False keeps the sample spacing at exactly 1/sample_rate;
        # the default endpoint=True stretches it to dur/(n-1), which shifts
        # the tone slightly above *freq*.
        t = np.linspace(0, dur, n_samples, endpoint=False)
        tone = (np.sin(2 * np.pi * freq * t) * 0.2 * 32767).astype(np.int16)
        return AudioSegment(tone.tobytes(), frame_rate=sample_rate, sample_width=2, channels=1)

    beep_hi = make_beep(880, 0.15)
    beep_lo = make_beep(440, 0.15)
    double_beep = beep_hi + _silence(100) + beep_lo

    return (
        double_beep
        + _silence(300)
        + robo1
        + _silence(400)
        + robo2
        + double_beep
        + _silence(500)
        + robo3
        + _silence(400)
        + robo4
        + _silence(200)
        + double_beep
    )
def generate_two_speakers():
    """Scenario 6: Two distinct speakers taking turns (impersonation/proxy).

    Speaker A uses the default TTS language; speaker B is requested with
    ``lang="en-uk"`` so that it sounds like a different person.
    """
    # NOTE(review): newer gTTS releases reportedly treat "en-uk" as a
    # deprecated alias of "en" (accents moved to the `tld` argument), in which
    # case both speakers would sound the same - confirm against the pinned
    # gTTS version.
    speaker_a = [
        _tts(line)
        for line in (
            "Hello, my name is John and I'm here to take the speaking test today.",
            "I think that learning a second language is very important in today's world.",
            "For example, it helps you communicate with people from different countries.",
        )
    ]
    speaker_b = [
        _tts(line, lang="en-uk")
        for line in (
            "Right, so the next topic is about technology in education.",
            "Technology has completely changed the way students learn and interact with content.",
            "Online courses and digital tools make education more accessible to everyone.",
        )
    ]

    timeline = [
        speaker_a[0],
        _silence(1000),
        speaker_a[1],
        _silence(800),
        speaker_a[2],
        _silence(2500),  # longer pause while the speakers switch
        speaker_b[0],
        _silence(700),
        speaker_b[1],
        _silence(600),
        speaker_b[2],
    ]
    mixed = timeline[0]
    for piece in timeline[1:]:
        mixed = mixed + piece
    return mixed
def main():
    """Generate every demo scenario and save it as a WAV file in OUTPUT_DIR.

    Creates OUTPUT_DIR if needed, runs each generator in turn, exports the
    result as 16 kHz mono WAV and prints the saved path and duration.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    demos = {
        "demo_clean_exam.wav": generate_clean_exam,
        "demo_reading_fraud.wav": generate_reading_fraud,
        "demo_coaching_fraud.wav": generate_coaching_fraud,
        "demo_wake_word.wav": generate_wake_word,
        "demo_synthetic_voice.wav": generate_synthetic_voice,
        "demo_two_speakers.wav": generate_two_speakers,
    }
    for filename, generator in demos.items():
        print(f"Generating {filename}...")
        audio = generator()
        path = os.path.join(OUTPUT_DIR, filename)
        # export() returns the open output file; close it explicitly instead
        # of leaving the handle for the garbage collector.
        exported = audio.export(path, format="wav", parameters=["-ar", "16000", "-ac", "1"])
        exported.close()
        duration = len(audio) / 1000  # len() of an AudioSegment is in milliseconds
        print(f" Saved: {path} ({duration:.1f}s)")
    print("All demo files generated.")


if __name__ == "__main__":
    main()