"""Gradio app: lecture transcript -> podcast-style summary -> TTS -> ASR -> evaluation."""

import os
import tempfile

import gradio as gr
from jiwer import wer
from openai import OpenAI
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Connect to OpenAI API
def get_client():
    """Build an OpenAI client from the ``OPENAI_API_KEY`` environment variable.

    Returns:
        An ``OpenAI`` client configured with the key.

    Raises:
        gr.Error: If ``OPENAI_API_KEY`` is unset or empty.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise gr.Error("Missing OPENAI_API_KEY. Please set it in the Space Secrets.")
    return OpenAI(api_key=api_key)


# Podcast-style summary prompt
SUMMARY_PROMPT = """
You are a skilled voice script writer.
Convert the following lecture transcript into a speech-friendly, podcast-style script suitable for a 3–5 minute audio revision.

- Target audience is already familiar with the video and wants a clear, efficient recap.
- Preserve all key knowledge nodes and insights; do not omit or add content.
- Remove fillers, repetition, and references to slides or visuals.
- Use natural spoken language suitable for listening.
- Maintain a neutral, engaging tone.
- Format as a smooth podcast monologue.

Important Guidelines:
- The summary should be ~20% of the transcript length.
- Do not impersonate or claim to be a real professor or individual.
- Avoid mentioning specific universities, brands, or affiliations unless explicitly present.
- Do not fabricate facts, examples, or names not in the original transcript.
- Ensure all information remains faithful to the transcript.
"""

# Pass/fail thresholds applied to the evaluation metrics.
WER_MAX = 0.15
ROUGE_L_MIN = 0.20
COSINE_MIN = 0.35


def _read_transcript(transcript_file):
    """Return the UTF-8 text of the uploaded transcript."""
    # Depending on the Gradio version/config, a File input arrives either as a
    # plain path string or as an object exposing the path via ``.name``.
    path = getattr(transcript_file, "name", transcript_file)
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError as err:
        raise gr.Error("Could not decode the transcript. Please upload a UTF-8 encoded .txt file.") from err


def _evaluate(transcript, script_text, asr_text):
    """Score the ASR text against the script (WER) and the transcript (ROUGE-L, TF-IDF cosine)."""
    wer_score = float(wer(script_text.lower(), asr_text.lower()))

    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    rouge_l_score = float(scorer.score(transcript, asr_text)["rougeL"].fmeasure)

    tfidf = TfidfVectorizer().fit_transform([transcript, asr_text])
    # Cast to built-in float so the JSON payload carries no NumPy scalars.
    cos_sim = float(cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0])

    # Thresholds
    pass_wer = wer_score <= WER_MAX
    pass_rouge = rouge_l_score >= ROUGE_L_MIN
    pass_cosine = cos_sim >= COSINE_MIN
    overall_pass = pass_wer and pass_rouge and pass_cosine

    return {
        "WER": round(wer_score, 4),
        "WER_pass": pass_wer,
        "ROUGE-L_F1": round(rouge_l_score, 4),
        "ROUGE_pass": pass_rouge,
        "TFIDF_Cosine": round(cos_sim, 4),
        "Cosine_pass": pass_cosine,
        "Overall": "PASS" if overall_pass else "FAIL",
    }


def run_pipeline(transcript_file):
    """Summarize a transcript, synthesize it to speech, transcribe it back, and score it.

    Args:
        transcript_file: The value delivered by the ``gr.File`` input (a path
            string or an object with a ``.name`` path attribute), or ``None``
            when nothing was uploaded.

    Returns:
        A 4-tuple ``(script_text, audio_file_path, asr_text, eval_dict)``
        matching the UI outputs: the generated script, the path of the MP3
        written for it, the ASR transcription of that audio, and the metrics
        dictionary.

    Raises:
        gr.Error: If no file was uploaded, the file is empty or not UTF-8,
            the API key is missing, or the model returns an empty summary.
    """
    if transcript_file is None:
        raise gr.Error("Please upload a .txt transcript file.")

    # Read transcript
    transcript = _read_transcript(transcript_file)
    if not transcript.strip():
        raise gr.Error("The uploaded transcript is empty.")

    client = get_client()

    # Summarization
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": SUMMARY_PROMPT},
            {"role": "user", "content": transcript},
        ],
    )
    script_text = response.choices[0].message.content
    if not script_text or not script_text.strip():
        raise gr.Error("The model returned an empty summary. Please try again.")

    # TTS
    tts_response = client.audio.speech.create(
        model="gpt-4o-mini-tts",
        voice="alloy",
        input=script_text,
    )
    # Write to a unique temp file so concurrent requests never overwrite each
    # other's audio. delete=False because the path is handed to the UI after
    # this function returns.
    with tempfile.NamedTemporaryFile(
        prefix="summary_audio_", suffix=".mp3", delete=False
    ) as audio_file:
        audio_file.write(tts_response.read())
        audio_file_path = audio_file.name

    # ASR
    with open(audio_file_path, "rb") as f:
        asr_response = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
        )
    asr_text = asr_response.text.strip()

    # Evaluation
    eval_dict = _evaluate(transcript, script_text, asr_text)

    return script_text, audio_file_path, asr_text, eval_dict


# Build Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Transcript → Podcast Summary → TTS → ASR → Evaluation")
    infile = gr.File(label="Upload Transcript (.txt)", file_types=[".txt"])
    run_btn = gr.Button("Run Pipeline")

    summary_out = gr.Textbox(label="Podcast-style Summary", lines=14)
    audio_out = gr.Audio(label="Summary Audio", type="filepath")
    asr_out = gr.Textbox(label="ASR Transcript", lines=10)
    metrics_out = gr.JSON(label="Evaluation Metrics")

    run_btn.click(
        run_pipeline,
        inputs=[infile],
        outputs=[summary_out, audio_out, asr_out, metrics_out],
    )

if __name__ == "__main__":
    demo.launch()