# File size: 4,031 Bytes
# c030906
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
import tempfile

import gradio as gr
from jiwer import wer
from openai import OpenAI
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Connect to OpenAI API
def get_client():
    """Build an OpenAI client from the ``OPENAI_API_KEY`` environment variable.

    Returns:
        An ``OpenAI`` client configured with the key.

    Raises:
        gr.Error: If the variable is unset or empty.
    """
    key = os.getenv("OPENAI_API_KEY")
    if key:
        return OpenAI(api_key=key)
    raise gr.Error("Missing OPENAI_API_KEY. Please set it in the Space Secrets.")

# System prompt for the summarization chat request. It instructs the model to
# rewrite a lecture transcript as a podcast-style spoken script (about 20% of
# the transcript length) while staying faithful to the transcript's content.
SUMMARY_PROMPT = """
You are a skilled voice script writer. Convert the following lecture transcript into a speech-friendly, podcast-style script suitable for a 3–5 minute audio revision.
- Target audience is already familiar with the video and wants a clear, efficient recap.
- Preserve all key knowledge nodes and insights; do not omit or add content.
- Remove fillers, repetition, and references to slides or visuals.
- Use natural spoken language suitable for listening.
- Maintain a neutral, engaging tone.
- Format as a smooth podcast monologue.
Important Guidelines:
- The summary should be ~20% of the transcript length.
- Do not impersonate or claim to be a real professor or individual.
- Avoid mentioning specific universities, brands, or affiliations unless explicitly present.
- Do not fabricate facts, examples, or names not in the original transcript.
- Ensure all information remains faithful to the transcript.
"""

def _read_transcript(transcript_file):
    """Return the text content of the uploaded transcript.

    Args:
        transcript_file: Either a filesystem path string or an object that
            exposes the path as ``.name`` (which form Gradio passes presumably
            depends on its version and the File component's ``type``).

    Raises:
        gr.Error: If the file cannot be decoded as UTF-8.
    """
    path = getattr(transcript_file, "name", transcript_file)
    try:
        # "utf-8-sig" reads plain UTF-8 too, and drops a leading BOM so it
        # does not end up inside the prompt or the metrics.
        with open(path, "r", encoding="utf-8-sig") as f:
            return f.read()
    except UnicodeDecodeError as err:
        raise gr.Error("The transcript must be a UTF-8 encoded .txt file.") from err


def _evaluate(transcript, script_text, asr_text):
    """Score the ASR round-trip and return the metrics dictionary.

    - WER compares the generated script with its ASR transcription.
    - ROUGE-L F1 and TF-IDF cosine compare the original transcript with the
      ASR transcription.
    """
    wer_score = wer(script_text.lower(), asr_text.lower())

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l_score = scorer.score(transcript, asr_text)['rougeL'].fmeasure

    try:
        vec = TfidfVectorizer().fit_transform([transcript, asr_text])
    except ValueError:
        # Raised when neither text yields a single token (empty vocabulary);
        # treat that as "no similarity" instead of crashing the request.
        cos_sim = 0.0
    else:
        # Cast the numpy scalar to a built-in float so the JSON payload holds
        # only plain Python types.
        cos_sim = float(cosine_similarity(vec[0:1], vec[1:2])[0][0])

    # Thresholds
    pass_wer = bool(wer_score <= 0.15)
    pass_rouge = bool(rouge_l_score >= 0.20)
    pass_cosine = bool(cos_sim >= 0.35)
    overall_pass = pass_wer and pass_rouge and pass_cosine

    return {
        "WER": round(float(wer_score), 4),
        "WER_pass": pass_wer,
        "ROUGE-L_F1": round(float(rouge_l_score), 4),
        "ROUGE_pass": pass_rouge,
        "TFIDF_Cosine": round(cos_sim, 4),
        "Cosine_pass": pass_cosine,
        "Overall": "PASS" if overall_pass else "FAIL",
    }


def run_pipeline(transcript_file):
    """Summarize a transcript, synthesize it, transcribe it back and score it.

    Steps: read the uploaded transcript, ask the chat model for a
    podcast-style script, convert the script to speech, run speech
    recognition on that audio, then compute WER / ROUGE-L / TF-IDF cosine.

    Args:
        transcript_file: The uploaded ``.txt`` file (path string or object
            with a ``.name`` path), or ``None`` when nothing was uploaded.

    Returns:
        A tuple ``(script_text, audio_file_path, asr_text, eval_dict)``.

    Raises:
        gr.Error: If no file was uploaded, the file is not valid UTF-8, the
            transcript is empty, the API key is missing, or the model returns
            an empty summary.
    """
    if transcript_file is None:
        raise gr.Error("Please upload a .txt transcript file.")

    # Read transcript
    transcript = _read_transcript(transcript_file)
    if not transcript.strip():
        # Nothing to summarize; fail early rather than pay for API calls and
        # then crash inside the metrics.
        raise gr.Error("The uploaded transcript is empty.")

    client = get_client()

    # Summarization
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": SUMMARY_PROMPT},
            {"role": "user", "content": transcript},
        ],
    )
    script_text = response.choices[0].message.content
    if not script_text or not script_text.strip():
        # content can be None/blank; TTS and WER both need real text.
        raise gr.Error("The model returned an empty summary. Please try again.")

    # TTS
    # NOTE(review): the speech endpoint limits input length — confirm the
    # limit for this model if long scripts start failing here.
    tts_response = client.audio.speech.create(
        model="gpt-4o-mini-tts",
        voice="alloy",
        input=script_text,
    )
    # A unique temp file per request: a fixed filename in the working
    # directory would be overwritten by concurrent requests. delete=False
    # because the path is handed back to the caller after this function ends.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
        audio_file.write(tts_response.read())
        audio_file_path = audio_file.name

    # ASR
    with open(audio_file_path, "rb") as f:
        asr_response = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
        )
    asr_text = asr_response.text.strip()

    # Evaluation
    eval_dict = _evaluate(transcript, script_text, asr_text)

    return script_text, audio_file_path, asr_text, eval_dict


# Gradio front end: one upload and one button feed run_pipeline; its four
# return values fill the four result components below.
with gr.Blocks() as demo:
    gr.Markdown(value="# Transcript → Podcast Summary → TTS → ASR → Evaluation")

    # Inputs
    infile = gr.File(file_types=[".txt"], label="Upload Transcript (.txt)")
    run_btn = gr.Button(value="Run Pipeline")

    # Outputs, declared in the same order as run_pipeline's return tuple
    summary_out = gr.Textbox(lines=14, label="Podcast-style Summary")
    audio_out = gr.Audio(type="filepath", label="Summary Audio")
    asr_out = gr.Textbox(lines=10, label="ASR Transcript")
    metrics_out = gr.JSON(label="Evaluation Metrics")

    run_btn.click(
        fn=run_pipeline,
        inputs=[infile],
        outputs=[summary_out, audio_out, asr_out, metrics_out],
    )

if __name__ == "__main__":
    demo.launch()