# app.py — Hugging Face Space: transcript → podcast summary → TTS → ASR → evaluation
import os
import gradio as gr
from openai import OpenAI
from jiwer import wer
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Connect to OpenAI API
def get_client():
    """Build an OpenAI client from the OPENAI_API_KEY environment variable.

    Raises:
        gr.Error: if the key is not configured in the Space Secrets.
    """
    key = os.getenv("OPENAI_API_KEY")
    if key:
        return OpenAI(api_key=key)
    raise gr.Error("Missing OPENAI_API_KEY. Please set it in the Space Secrets.")
# Podcast-style summary prompt
# System prompt for the summarization call in run_pipeline: asks the model to
# rewrite a lecture transcript as a ~20%-length, speech-friendly monologue
# while staying faithful to the source content.
SUMMARY_PROMPT = """
You are a skilled voice script writer. Convert the following lecture transcript into a speech-friendly, podcast-style script suitable for a 3–5 minute audio revision.
- Target audience is already familiar with the video and wants a clear, efficient recap.
- Preserve all key knowledge nodes and insights; do not omit or add content.
- Remove fillers, repetition, and references to slides or visuals.
- Use natural spoken language suitable for listening.
- Maintain a neutral, engaging tone.
- Format as a smooth podcast monologue.
Important Guidelines:
- The summary should be ~20% of the transcript length.
- Do not impersonate or claim to be a real professor or individual.
- Avoid mentioning specific universities, brands, or affiliations unless explicitly present.
- Do not fabricate facts, examples, or names not in the original transcript.
- Ensure all information remains faithful to the transcript.
"""
def run_pipeline(transcript_file):
    """Full pipeline: transcript -> podcast summary -> TTS audio -> ASR -> metrics.

    Args:
        transcript_file: uploaded .txt file from gr.File — an object with a
            ``.name`` path attribute, or a plain path string depending on the
            Gradio version.

    Returns:
        Tuple of (summary script text, audio file path, ASR transcript,
        evaluation metrics dict).

    Raises:
        gr.Error: if no file was uploaded or the transcript is empty.
    """
    if transcript_file is None:
        raise gr.Error("Please upload a .txt transcript file.")

    # Read transcript. Newer Gradio versions may hand us a plain filepath
    # string rather than a tempfile wrapper with .name — accept both.
    path = getattr(transcript_file, "name", transcript_file)
    with open(path, "r", encoding="utf-8") as f:
        transcript = f.read()
    if not transcript.strip():
        raise gr.Error("The uploaded transcript file is empty.")

    client = get_client()
    script_text = _summarize(client, transcript)
    audio_file_path = _synthesize(client, script_text)
    asr_text = _transcribe(client, audio_file_path)
    eval_dict = _evaluate(transcript, script_text, asr_text)
    return script_text, audio_file_path, asr_text, eval_dict


def _summarize(client, transcript):
    """Rewrite the transcript as a podcast script via the chat model."""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": SUMMARY_PROMPT},
            {"role": "user", "content": transcript},
        ],
    )
    return response.choices[0].message.content


def _synthesize(client, script_text, audio_file_path="summary_audio.mp3"):
    """Render the script to speech, write it to an MP3, and return its path."""
    tts_response = client.audio.speech.create(
        model="gpt-4o-mini-tts",
        voice="alloy",
        input=script_text,
    )
    with open(audio_file_path, "wb") as f:
        f.write(tts_response.read())
    return audio_file_path


def _transcribe(client, audio_file_path):
    """Run Whisper ASR on the generated audio and return the stripped text."""
    with open(audio_file_path, "rb") as f:
        asr_response = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
        )
    return asr_response.text.strip()


def _evaluate(transcript, script_text, asr_text):
    """Score the round trip and apply fixed pass/fail thresholds.

    WER compares the TTS input script against the ASR output (audio fidelity);
    ROUGE-L and TF-IDF cosine compare the original transcript against the ASR
    output (content retention).
    """
    wer_score = wer(script_text.lower(), asr_text.lower())
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l_score = scorer.score(transcript, asr_text)['rougeL'].fmeasure
    vec = TfidfVectorizer().fit_transform([transcript, asr_text])
    cos_sim = cosine_similarity(vec[0:1], vec[1:2])[0][0]

    # Empirical quality gates; all three must pass for an overall PASS.
    pass_wer = wer_score <= 0.15
    pass_rouge = rouge_l_score >= 0.20
    pass_cosine = cos_sim >= 0.35
    overall_pass = pass_wer and pass_rouge and pass_cosine
    return {
        "WER": round(wer_score, 4),
        "WER_pass": pass_wer,
        "ROUGE-L_F1": round(rouge_l_score, 4),
        "ROUGE_pass": pass_rouge,
        "TFIDF_Cosine": round(cos_sim, 4),
        "Cosine_pass": pass_cosine,
        "Overall": "PASS" if overall_pass else "FAIL",
    }
# Build Gradio UI
# Single-column layout: file upload -> run button -> four outputs
# (summary text, audio player, ASR transcript, metrics JSON).
with gr.Blocks() as demo:
    gr.Markdown("# Transcript β†’ Podcast Summary β†’ TTS β†’ ASR β†’ Evaluation")
    infile = gr.File(label="Upload Transcript (.txt)", file_types=[".txt"])
    run_btn = gr.Button("Run Pipeline")
    summary_out = gr.Textbox(label="Podcast-style Summary", lines=14)
    audio_out = gr.Audio(label="Summary Audio", type="filepath")
    asr_out = gr.Textbox(label="ASR Transcript", lines=10)
    metrics_out = gr.JSON(label="Evaluation Metrics")
    # Wire the button to the pipeline; outputs map 1:1 to run_pipeline's tuple.
    run_btn.click(run_pipeline, inputs=[infile],
                  outputs=[summary_out, audio_out, asr_out, metrics_out])
# Launch only when executed as a script (Spaces run this as __main__).
if __name__ == "__main__":
    demo.launch()