Spaces:

daasime
/

sop-audio-analyzer

Running

App Files Files Community

sop-audio-analyzer / generate_demos.py

daasime

Add 3 new demo scenarios + Coming Soon badge + lazy generation

ebe2991 about 2 months ago

raw

history blame contribute delete

8.04 kB

	"""
	Generate demo audio samples for the Audio Analyzer PoC.
	Run during Docker build: python generate_demos.py

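	Produces 6 WAV files (16kHz mono) in DATA_DIR/samples/:
	1. demo_clean_exam.wav - natural exam answer, ~35s, LOW risk
	2. demo_reading_fraud.wav - monotone reading + long pause, ~35s, MEDIUM-HIGH risk
	3. demo_coaching_fraud.wav - main speaker + whispered coach, ~35s, triggers whisper detection
	4. demo_wake_word.wav - exam answer interrupted by a quiet "Alexa" request and a device beep
	5. demo_synthetic_voice.wav - slow, robotic-sounding speech framed by beep tones
	6. demo_two_speakers.wav - two speakers taking turns (impersonation/proxy)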
	"""
	import os
	import io
	from gtts import gTTS
	from pydub import AudioSegment

	# Directory that receives the generated WAV files: "<DATA_DIR>/samples",
	# where DATA_DIR comes from the environment and defaults to "data" when unset.
	OUTPUT_DIR = os.path.join(os.environ.get("DATA_DIR", "data"), "samples")


	def _tts(text, lang="en", slow=False):
	    """Synthesize *text* with gTTS and return it as a 16kHz mono AudioSegment.

	    The MP3 written by gTTS is held in an in-memory buffer, decoded with
	    pydub, and then converted to a 16000 Hz frame rate and a single channel.

	    Args:
	        text: Sentence to speak.
	        lang: Language code forwarded to gTTS.
	        slow: Forwarded to gTTS as its ``slow`` flag.
	    """
	    mp3_buffer = io.BytesIO()
	    gTTS(text=text, lang=lang, slow=slow).write_to_fp(mp3_buffer)
	    # Rewind so the decoder reads the MP3 from its first byte.
	    mp3_buffer.seek(0)
	    decoded = AudioSegment.from_mp3(mp3_buffer)
	    resampled = decoded.set_frame_rate(16000)
	    return resampled.set_channels(1)


	def _silence(ms):
	    """Return *ms* milliseconds of silence as a 16kHz AudioSegment."""
	    quiet = AudioSegment.silent(duration=ms, frame_rate=16000)
	    return quiet


	def generate_clean_exam():
	    """Scenario 1: Clean exam - natural speech with fillers and varied pace.

	    Returns the spoken sentences joined into one AudioSegment, with a pause
	    of varying length after every sentence except the last.
	    """
	    # (sentence, pause in ms that follows it); None means no trailing pause.
	    script = (
	        ("Well, I think the main reason people travel is, you know, to experience different cultures.", 800),
	        ("Um, personally, I've always found that visiting new places really broadens your perspective.", 1200),
	        ("Like, for example, when I visited Japan last year, the food and the customs were completely different from what I was used to.", 600),
	        ("And I think that's what makes it so valuable, right? You learn things you can't learn from books.", 900),
	        ("So yeah, I believe travel is one of the best ways to learn about the world and about yourself.", None),
	    )

	    track = None
	    for sentence, pause_ms in script:
	        spoken = _tts(sentence)
	        track = spoken if track is None else track + spoken
	        if pause_ms is not None:
	            track = track + _silence(pause_ms)
	    return track


	def generate_reading_fraud():
	    """Scenario 2: Reading fraud - monotone, constant pace, 7s suspicious pause.

	    Every sentence is synthesized with ``slow=True``; the pauses between
	    them are short except for one 7-second gap in the middle.
	    """
	    # (sentence, pause in ms that follows it); None means no trailing pause.
	    script = (
	        ("The advantages of public transportation include reduced carbon emissions and lower costs for commuters.", 500),
	        ("Furthermore, public transit systems decrease traffic congestion in urban areas and improve air quality.", 7000),  # 7-second suspicious pause
	        ("In addition, public transportation provides mobility options for people who cannot drive.", 400),
	        ("In conclusion, governments should invest more in public transportation infrastructure to benefit both the environment and the economy.", None),
	    )

	    track = None
	    for sentence, pause_ms in script:
	        spoken = _tts(sentence, slow=True)
	        track = spoken if track is None else track + spoken
	        if pause_ms is not None:
	            track = track + _silence(pause_ms)
	    return track


	def generate_coaching_fraud():
	    """Scenario 3: Coaching - main speaker + quiet second voice whispering prompts.

	    The "coach" lines are ordinary TTS segments attenuated by 18 dB and
	    20 dB and placed in the gaps between the main speaker's sentences.
	    """
	    answer_opening = _tts("I believe that technology has had a very positive impact on education overall.")
	    answer_platforms = _tts("Oh right, and also online platforms provide personalized learning experiences for students everywhere.")
	    answer_access = _tts("Yes, and another important point is that technology makes education accessible to people in remote areas.")
	    answer_summary = _tts("So in summary, technology has transformed education for the better in many ways.")

	    # Coach prompts: subtracting a number from a segment lowers its volume
	    # by that many dB, simulating a quiet background voice.
	    prompt_platforms = _tts("Talk about online platforms", lang="en") - 18  # -18dB
	    prompt_access = _tts("Mention accessibility", lang="en") - 20  # -20dB

	    timeline = [
	        answer_opening,
	        _silence(1500),
	        prompt_platforms,
	        _silence(800),
	        answer_platforms,
	        _silence(2000),
	        prompt_access,
	        _silence(600),
	        answer_access,
	        _silence(800),
	        answer_summary,
	    ]

	    track = timeline[0]
	    for piece in timeline[1:]:
	        track = track + piece
	    return track


	def generate_wake_word():
	    """Scenario 4: Wake word - someone asks Alexa mid-exam.

	    The answer is interrupted by an attenuated "Alexa" request followed by
	    a short synthetic 1 kHz beep, after which the speaker resumes.
	    """
	    import numpy as np

	    answer_start = _tts("I think the most important factor in choosing a career is job satisfaction.")
	    answer_reason = _tts("Because if you enjoy what you do, you'll be more productive and motivated.")
	    answer_resume = _tts("Sorry, what was I saying? Oh yes, career satisfaction is crucial for long term happiness.")
	    answer_end = _tts("And that's why I believe people should follow their passion when choosing a career.")

	    # The wake-word request, lowered by 12 dB so it is quieter than the answer.
	    wake_request = _tts("Alexa, what is career satisfaction", lang="en") - 12

	    # Device response: a 0.3 s, 1 kHz sine at 15% of int16 full scale,
	    # wrapped as 16-bit mono PCM.
	    rate_hz = 16000
	    time_axis = np.linspace(0, 0.3, int(rate_hz * 0.3))
	    pcm = (np.sin(2 * np.pi * 1000 * time_axis) * 0.15 * 32767).astype(np.int16)
	    device_beep = AudioSegment(pcm.tobytes(), frame_rate=rate_hz, sample_width=2, channels=1)

	    timeline = [
	        answer_start,
	        _silence(1000),
	        answer_reason,
	        _silence(1500),
	        wake_request,
	        _silence(400),
	        device_beep,
	        _silence(2000),
	        answer_resume,
	        _silence(800),
	        answer_end,
	    ]

	    track = timeline[0]
	    for piece in timeline[1:]:
	        track = track + piece
	    return track


	def generate_synthetic_voice():
	    """Scenario 5: Synthetic/robotic voice with beep tones.

	    Slow TTS sentences are framed by a two-tone beep (880 Hz then 440 Hz)
	    at the start, in the middle, and at the end of the clip.
	    """
	    import numpy as np

	    # slow=True is used to make the speech sound more robotic.
	    line_intro = _tts("The answer to question number one is as follows.", slow=True)
	    line_cause = _tts("Climate change is caused by greenhouse gas emissions from human activities.", slow=True)
	    line_sources = _tts("The main sources include transportation, industry, and agriculture.", slow=True)
	    line_outro = _tts("In conclusion, reducing emissions requires global cooperation.", slow=True)

	    rate_hz = 16000

	    def sine_tone(freq, dur):
	        """Return a *dur*-second sine at *freq* Hz as a 16-bit mono segment."""
	        time_axis = np.linspace(0, dur, int(rate_hz * dur))
	        pcm = (np.sin(2 * np.pi * freq * time_axis) * 0.2 * 32767).astype(np.int16)
	        return AudioSegment(pcm.tobytes(), frame_rate=rate_hz, sample_width=2, channels=1)

	    # High tone, 100 ms gap, low tone: the marker inserted between segments.
	    tone_high = sine_tone(880, 0.15)
	    tone_low = sine_tone(440, 0.15)
	    marker = tone_high + _silence(100) + tone_low

	    timeline = [
	        marker,
	        _silence(300),
	        line_intro,
	        _silence(400),
	        line_cause,
	        marker,
	        _silence(500),
	        line_sources,
	        _silence(400),
	        line_outro,
	        _silence(200),
	        marker,
	    ]

	    track = timeline[0]
	    for piece in timeline[1:]:
	        track = track + piece
	    return track


	def generate_two_speakers():
	    """Scenario 6: Two distinct speakers taking turns (impersonation/proxy).

	    Speaker A uses the default English gTTS voice. Speaker B is requested
	    with a UK accent so the second half sounds like a different person.

	    Returns:
	        One AudioSegment: three sentences by speaker A, a 2.5 s hand-over
	        pause, then three sentences by speaker B.
	    """

	    def _tts_uk(text):
	        """Synthesize *text* with a UK English accent as 16kHz mono audio."""
	        # Per the gTTS documentation, the "en-uk" language tag is deprecated
	        # and falls back to plain "en" (the same voice as speaker A); the
	        # documented way to get a regional accent is lang="en" plus the
	        # Google host suffix tld="co.uk".
	        try:
	            tts = gTTS(text=text, lang="en", tld="co.uk")
	        except TypeError:
	            # gTTS releases without the `tld` parameter: keep the legacy tag.
	            return _tts(text, lang="en-uk")
	        buf = io.BytesIO()
	        tts.write_to_fp(buf)
	        buf.seek(0)
	        seg = AudioSegment.from_mp3(buf)
	        return seg.set_frame_rate(16000).set_channels(1)

	    # Speaker A - normal pace, English
	    a1 = _tts("Hello, my name is John and I'm here to take the speaking test today.")
	    a2 = _tts("I think that learning a second language is very important in today's world.")
	    a3 = _tts("For example, it helps you communicate with people from different countries.")

	    # Speaker B - different accent (UK English) to sound like a different person
	    b1 = _tts_uk("Right, so the next topic is about technology in education.")
	    b2 = _tts_uk("Technology has completely changed the way students learn and interact with content.")
	    b3 = _tts_uk("Online courses and digital tools make education more accessible to everyone.")

	    return (
	        a1
	        + _silence(1000)
	        + a2
	        + _silence(800)
	        + a3
	        + _silence(2500)  # Longer pause as speakers switch
	        + b1
	        + _silence(700)
	        + b2
	        + _silence(600)
	        + b3
	    )


	def main():
	    """Generate every demo scenario and write each one to OUTPUT_DIR as WAV.

	    Creates OUTPUT_DIR if needed, then for each scenario builds the audio,
	    exports it with 16 kHz / mono ffmpeg parameters, and prints the saved
	    path together with the clip length in seconds.
	    """
	    os.makedirs(OUTPUT_DIR, exist_ok=True)

	    # (output file name, function that builds the clip), in generation order.
	    scenarios = (
	        ("demo_clean_exam.wav", generate_clean_exam),
	        ("demo_reading_fraud.wav", generate_reading_fraud),
	        ("demo_coaching_fraud.wav", generate_coaching_fraud),
	        ("demo_wake_word.wav", generate_wake_word),
	        ("demo_synthetic_voice.wav", generate_synthetic_voice),
	        ("demo_two_speakers.wav", generate_two_speakers),
	    )

	    for wav_name, build_clip in scenarios:
	        print(f"Generating {wav_name}...")
	        clip = build_clip()
	        target = os.path.join(OUTPUT_DIR, wav_name)
	        clip.export(target, format="wav", parameters=["-ar", "16000", "-ac", "1"])
	        # len() of a pydub segment is its length in milliseconds.
	        seconds = len(clip) / 1000
	        print(f" Saved: {target} ({seconds:.1f}s)")

	    print("All demo files generated.")


	if __name__ == "__main__":
	    main()