Spaces:

PogusTheWhisper
/

Audio-to-Blog-Summarizer

Runtime error

File size: 9,802 Bytes

import gradio as gr
import torch
from faster_whisper import WhisperModel
import yt_dlp
from openai import OpenAI
import os
import json
import time
import uuid
import socket

# Startup probe: YOUTUBE_REACHABLE records whether 'www.youtube.com' could be
# resolved via DNS when the module was loaded; the UI reads it later to decide
# whether the YouTube input is usable.
print("--- ATTEMPTING TO RESOLVE YOUTUBE.COM ---")
try:
    addr = socket.gethostbyname('www.youtube.com')
except socket.gaierror as dns_error:
    # Resolution failed: keep the app running, just without YouTube support.
    YOUTUBE_REACHABLE = False
    print(f"--- FAILED to resolve 'www.youtube.com': {dns_error}. YouTube functionality will be disabled. ---")
else:
    print(f"--- SUCCESS: 'www.youtube.com' resolved to {addr}. YouTube features enabled. ---")
    YOUTUBE_REACHABLE = True
    
# Load the faster-whisper model once at import time; `model` is then shared by
# every transcription request.
print("Initializing transcription model (faster-whisper)...")
# Use the GPU with float16 when CUDA is available, otherwise CPU with int8.
device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "int8"
model_size = "large-v3-turbo"
try:
    model = WhisperModel(model_size, device=device, compute_type=compute_type)
    print("Transcription model loaded successfully.")
except Exception as e:
    print(f"Error loading Whisper model: {e}")
    # The app cannot work without the model, so stop here. `exit()` is a helper
    # injected by the `site` module for interactive sessions and reports
    # success (status 0); raise SystemExit with a non-zero status instead so
    # the failure is visible to whatever launched the process.
    raise SystemExit(1) from e

def download_youtube_audio(url: str) -> str:
    """Download the audio of a single video with yt-dlp and convert it to MP3.

    The file is written to the current working directory under a random
    UUID-based name, so separate calls never share an output path.

    Args:
        url: Video URL handed to yt-dlp.

    Returns:
        Relative path of the resulting ``<uuid>.mp3`` file. The caller owns the
        file and is responsible for deleting it.

    Raises:
        FileNotFoundError: If yt-dlp returned without producing the MP3.
        Exception: Anything raised by yt-dlp itself is propagated unchanged.
    """
    unique_id = uuid.uuid4()
    output_template = f'{unique_id}.%(ext)s'
    final_filepath = f'{unique_id}.mp3'
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192'}],
        'outtmpl': output_template,
        'quiet': True,
        # The yt-dlp option is spelled 'overwrites'; 'overwrite' is not a
        # recognised key and was silently ignored.
        'overwrites': True,
        # A watch URL that also carries a playlist id must yield one video only:
        # every playlist entry would otherwise be written to the same filename.
        'noplaylist': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    # Fail here with a clear message rather than later inside the transcriber.
    if not os.path.exists(final_filepath):
        raise FileNotFoundError(f"yt-dlp did not produce the expected file: {final_filepath}")
    return final_filepath

def _extract_json_object(raw: str) -> dict:
    """Decode the JSON object contained in a chat-completion reply.

    Everything outside the outermost ``{`` ... ``}`` pair is discarded, so
    Markdown code fences (with or without newlines around them) and stray
    prose surrounding the object are tolerated.

    Args:
        raw: Reply text from the model; ``None`` is treated as an empty reply.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the reply contains no brace pair or the enclosed text
            is not valid JSON (``json.JSONDecodeError`` subclasses
            ``ValueError``).
    """
    text = (raw or "").strip()
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        raise ValueError("Model response does not contain a JSON object.")
    return json.loads(text[start:end + 1])


def _render_summary_markdown(data: dict) -> str:
    """Render the summary fields as the Markdown/HTML shown in the article pane.

    All values come from the language model, so they are HTML-escaped before
    being placed inside the ``<p>``/``<li>`` markup.

    Args:
        data: Decoded summary with the optional keys ``title``,
            ``key_takeaway``, ``main_ideas`` and ``conclusion``.

    Returns:
        The article as a Markdown string with embedded HTML.
    """
    import html  # local import: `html` is not among the module-level imports

    def field(key, fallback=""):
        value = data.get(key)
        return html.escape(str(fallback if value is None else value), quote=False)

    ideas = data.get("main_ideas", [])
    if isinstance(ideas, str):
        # A bare string would otherwise be iterated character by character.
        ideas = [ideas]
    elif not isinstance(ideas, list):
        ideas = []
    items = "".join(f"<li>{html.escape(str(idea), quote=False)}</li>" for idea in ideas)

    return (
        f"# {field('title', 'Title Not Found')}\n\n"
        f"<p>{field('key_takeaway')}</p>\n\n"
        f"## Key Ideas\n\n<ul>{items}</ul>\n\n"
        f"## Conclusion\n\n<p>{field('conclusion')}</p>"
    )


def transcribe_and_summarize(audio_file: str, youtube_url: str):
    """Transcribe audio and stream a blog-style summary of it.

    This is a generator: every yield is a ``(log, transcription, summary)``
    tuple holding the accumulated process log, the transcript so far and the
    current content of the article pane.

    Args:
        audio_file: Path of an uploaded audio file, or ``None``.
        youtube_url: YouTube link; when non-empty it takes precedence over
            ``audio_file`` and the audio is downloaded first.

    Yields:
        Tuples of ``(log_text, transcribed_text, summary_markdown)``.

    Raises:
        gr.Error: If neither input is provided, or if the model reply cannot
            be parsed as JSON.
    """
    log_history = ""

    def log(message):
        # Append a timestamped line and return the whole log for display.
        nonlocal log_history
        timestamp = time.strftime("%H:%M:%S")
        log_history += f"[{timestamp}] {message}\n"
        return log_history

    loading_message = "⏳ Generating summary..."
    yield log("Process started."), "", ""

    api_key = os.getenv('TYPHOON_API')
    if not api_key:
        error_msg = "## Error\n`TYPHOON_API` environment variable not set. Please configure the API key."
        yield log(error_msg.replace("\n", " ")), "", gr.Markdown(error_msg)
        return

    # A whitespace-only textbox value counts as "no link".
    youtube_url = (youtube_url or "").strip()
    if audio_file is None and not youtube_url:
        raise gr.Error("Please upload an audio file or provide a YouTube link.")

    filepath = ""
    is_downloaded = False
    try:
        if youtube_url:
            yield log("Downloading YouTube audio..."), "", ""
            filepath = download_youtube_audio(youtube_url)
            is_downloaded = True
            yield log(f"Downloaded to {filepath}"), "", ""
        else:
            filepath = audio_file

        yield log("Transcription started (autodetecting language)..."), "", ""
        segments, info = model.transcribe(filepath, beam_size=5, task="transcribe")
        yield log(f"Detected language '{info.language}' (prob={info.language_probability:.2f})"), "", ""
        transcribed_text = ""
        # Each segment is pushed to the UI as soon as it is produced.
        for segment in segments:
            line = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text.strip()}"
            transcribed_text += segment.text + " "
            yield log(line), transcribed_text, ""

        yield log("Transcription complete."), transcribed_text, ""

        # Nothing to summarize: do not spend an API call on an empty prompt.
        if not transcribed_text.strip():
            no_speech_msg = "## Error\nNo speech was detected in the audio, so there is nothing to summarize."
            yield log("No speech detected; skipping summarization."), transcribed_text, no_speech_msg
            return

        yield log("Sending to AI for summarization..."), transcribed_text, loading_message

        client = OpenAI(api_key=api_key, base_url="https://api.opentyphoon.ai/v1")
        system_prompt = f"""You are an automated system that converts transcripts into a blog post.
Your ONLY function is to output a valid JSON object. All text values in the JSON MUST be in the Thai language.
หน้าที่เดียวของคุณคือการส่งออกอ็อบเจกต์ JSON ที่ถูกต้อง โดยค่าที่เป็นข้อความทั้งหมดต้องเป็นภาษาไทยเท่านั้น
Do NOT write any explanations. The response MUST start with `{{` and end with `}}`.

The JSON object must have the following structure:
{{
  "title": "หัวข้อบทความที่น่าสนใจและเกี่ยวข้อง (เป็นภาษาไทย)",
  "key_takeaway": "สรุปใจความสำคัญของเนื้อหาทั้งหมดในหนึ่งย่อหน้า (เป็นภาษาไทย)",
  "main_ideas": [
    "ประเด็นหลักหรือใจความสำคัญ (เป็นภาษาไทย)",
    "ประเด็นหลักถัดไป...",
    "และต่อไปเรื่อยๆ..."
  ],
  "conclusion": "ย่อหน้าสรุปปิดท้าย (เป็นภาษาไทย)"
}}"""
        response = client.chat.completions.create(
            model="typhoon-v2.1-12b-instruct",
            messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": transcribed_text}],
            max_tokens=2048,
            temperature=0.7
        )

        try:
            data = _extract_json_object(response.choices[0].message.content)
        except ValueError as err:
            # Record the reason in the log, then show a readable error
            # instead of a raw JSONDecodeError traceback.
            yield log(f"Could not parse the AI response as JSON: {err}"), transcribed_text, ""
            raise gr.Error("The AI response was not valid JSON. Please try again.") from err

        summary_markdown = _render_summary_markdown(data)
        yield log("Summarization complete."), transcribed_text, summary_markdown

    finally:
        # Only files this function downloaded are deleted; uploads are left alone.
        if is_downloaded and os.path.exists(filepath):
            os.remove(filepath)

def _extract_youtube_video_id(url: str) -> str:
    """Return the video ID embedded in a YouTube URL, or ``""`` if none is found.

    Recognises ``youtube.com/shorts/<id>``, ``watch?v=<id>`` and
    ``youtu.be/<id>``, checked in that order. Only the leading run of letters,
    digits, ``-`` and ``_`` after the marker is kept, so query strings,
    fragments and any HTML-significant characters are never part of the result.
    """
    import re  # local import: `re` is not among the module-level imports

    for marker in ("youtube.com/shorts/", "watch?v=", "youtu.be/"):
        _, found, tail = url.partition(marker)
        if found:
            match = re.match(r"[A-Za-z0-9_-]+", tail)
            return match.group(0) if match else ""
    return ""


def update_video_preview(url):
    """Build the Gradio update that shows or hides the embedded video preview.

    Args:
        url: Current content of the YouTube URL textbox.

    Returns:
        A ``gr.update`` that makes the preview visible with an ``<iframe>``
        for the video when an ID can be extracted from ``url``, and hides the
        preview otherwise.
    """
    video_id = _extract_youtube_video_id(url) if url else ""
    if not video_id:
        return gr.update(value=None, visible=False)

    # video_id is restricted to [A-Za-z0-9_-], so it is safe inside the attribute.
    embed_url = f"https://www.youtube.com/embed/{video_id}"
    iframe_html = f'<iframe width="100%" height="315" src="{embed_url}" frameborder="0" allowfullscreen></iframe>'
    return gr.update(value=iframe_html, visible=True)

# Stylesheet passed to gr.Blocks: imports the Sarabun web font and styles the
# elements inside the `.blog-output` article container.
css = """
@import url('https://fonts.googleapis.com/css2?family=Sarabun:wght@400;700&display=swap');
.blog-output { font-family: 'Sarabun', sans-serif; line-height: 1.8; max-width: 800px; margin: auto; padding: 2rem; border-radius: 12px; background-color: #ffffff; border: 1px solid #e5e7eb; }
.blog-output h1 { font-size: 2.2em; font-weight: 700; border-bottom: 2px solid #f3f4f6; padding-bottom: 15px; margin-bottom: 25px; color: #111827; }
.blog-output h2 { font-size: 1.6em; font-weight: 700; margin-top: 40px; margin-bottom: 20px; color: #1f2937; }
.blog-output p { font-size: 1.1em; margin-bottom: 20px; color: #374151; }
.blog-output ul { padding-left: 25px; list-style-type: disc; }
.blog-output li { margin-bottom: 12px; padding-left: 5px; }
"""

# Build the Gradio UI. Components are created in layout order, so the nesting
# and ordering of the `with` blocks below define what appears where.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css=css) as demo:
    gr.Markdown(
        """
        # 🎙️ Audio to Blog Summarizer ✒️
        Upload an audio file (MP3, WAV) or paste a YouTube link to transcribe it to Thai text and summarize the content into blog-style article using ASR and LLM.
        """
    )
    with gr.Row():
        # First column (scale=1): inputs, submit button, video preview, process log.
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.TabItem("⬆️ Upload Audio"):
                    # type="filepath": the handler receives a path string, not raw audio data.
                    audio_file_input = gr.Audio(label="Upload Audio File", type="filepath")
                with gr.TabItem("🔗 YouTube Link"):
                    # When YOUTUBE_REACHABLE is False the textbox is labelled as
                    # disabled and made non-interactive.
                    youtube_url_input = gr.Textbox(
                        label="YouTube URL" if YOUTUBE_REACHABLE else "YouTube URL (Disabled)",
                        placeholder="Paste a YouTube link here..." if YOUTUBE_REACHABLE else "YouTube is not reachable in this environment.",
                        interactive=YOUTUBE_REACHABLE
                    )
            submit_button = gr.Button("🚀 Generate Blog Post", variant="primary")
            # Hidden until update_video_preview returns an update that makes it visible.
            video_preview = gr.HTML(visible=False)
            with gr.Accordion("📝 View Process Log", open=True):
                log_output = gr.Textbox(label="Log", interactive=False, lines=10)
        # Second column (scale=2): rendered article and the full transcription.
        with gr.Column(scale=2):
            gr.Markdown("## ✨ Article Output")
            # The "blog-output" class is the hook for the rules in the `css` string.
            blog_summary_output = gr.Markdown(elem_classes=["blog-output"])
            with gr.Accordion("📜 View Full Transcription", open=False):
                transcription_output = gr.Textbox(label="Full Text", interactive=False, lines=10)

    # transcribe_and_summarize is a generator; each tuple it yields is mapped,
    # in order, onto (log_output, transcription_output, blog_summary_output).
    submit_button.click(fn=transcribe_and_summarize,
                        inputs=[audio_file_input, youtube_url_input],
                        outputs=[log_output, transcription_output, blog_summary_output])
    
    # The preview handlers are only registered when YouTube was reachable at startup.
    if YOUTUBE_REACHABLE:
        # Refresh the preview whenever the URL textbox changes...
        youtube_url_input.change(fn=update_video_preview,
                                 inputs=youtube_url_input,
                                 outputs=video_preview)
        # ...and once when the page loads, using the textbox's current value.
        demo.load(fn=update_video_preview,
                  inputs=youtube_url_input,
                  outputs=video_preview)

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True)