Spaces:

HaiderAUT
/

PodCastIt

Build error

App Files Files Community

HaiderAUT committed on May 11, 2025

Commit

03ef672

verified ·

1 Parent(s): cf56cc8

Update app.py

Browse files

Files changed (1) hide show

app.py +89 -155

app.py CHANGED Viewed

@@ -1,179 +1,113 @@
 # =============================================================
-# Lecture → Podcast & Script Generator (English Only)
-# Two-step: 1) Gemini script  2) HF MMS-TTS audio
 # =============================================================
 import re
 import tempfile
 import textwrap
 from pathlib import Path
-from typing import List, Optional
 import gradio as gr
 from PyPDF2 import PdfReader
-from pydub import AudioSegment
-from pydub.exceptions import CouldntDecodeError
-# Google Gemini SDK
-try:
-    import google.generativeai as genai
-except ImportError:
-    raise ImportError("Please install the Google Generative AI SDK:\n"
-                      "    pip install google-generativeai")
-# Hugging Face TTS client (anonymous/public)
 from huggingface_hub import InferenceClient
-# ------------------------------------------------------------------
-# Globals & templates
-# ------------------------------------------------------------------
-PROMPT_TEMPLATE = textwrap.dedent(
-    """
-    You are producing a lively two-host educational podcast in English.
-    Summarize the following lecture content into a dialogue of approximately 300 words.
-    Make it engaging: hosts ask questions, clarify ideas with analogies, and wrap up with a concise recap.
-    Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).
-    ### Lecture Content
-    {content}
-    """
-)
-HF_TTS_MODEL = "facebook/mms-tts-eng"
-CHUNK_CHAR_LIMIT = 280
-# Initialize the HF TTS client once
-tts_client = InferenceClient()
-# ------------------------------------------------------------------
-# Helper functions
-# ------------------------------------------------------------------
 def extract_pdf_text(pdf_path: str) -> str:
     reader = PdfReader(pdf_path)
     return "\n".join(page.extract_text() or "" for page in reader.pages)
-def truncate_text(text: str, max_words: int = 8000) -> str:
-    words = text.split()
-    return " ".join(words[:max_words])
-def split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
-    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
-    chunks, current = [], ""
-    for sent in sentences:
-        if current and len(current) + len(sent) + 1 > limit:
-            chunks.append(current)
-            current = sent
         else:
-            current = f"{current} {sent}".strip() if current else sent
-    if current:
-        chunks.append(current)
     return chunks
-def synthesize_speech(script: str, model_id: str, out_dir: Path) -> str:
-    chunks = split_to_chunks(script)
-    if not chunks:
-        raise RuntimeError("No text chunks to synthesize.")
-    segments = []
-    for idx, chunk in enumerate(chunks):
-        audio_bytes = tts_client.text_to_speech(chunk, model=model_id)
-        part_path = out_dir / f"seg_{idx}.flac"
-        part_path.write_bytes(audio_bytes)
-        try:
-            seg = AudioSegment.from_file(part_path, format="flac")
-            segments.append(seg)
-        except CouldntDecodeError as e:
-            raise RuntimeError(f"Failed to decode chunk {idx}: {e}") from e
-    final_audio = sum(segments, AudioSegment.empty())
-    final_path = out_dir / "podcast_audio.flac"
-    final_audio.export(final_path, format="flac")
-    return str(final_path)
-# ------------------------------------------------------------------
-# Step 1: Generate script via Gemini
-# ------------------------------------------------------------------
-def generate_script(
-    gemini_api_key: str,
-    lecture_pdf: gr.File
-) -> List[str]:
-    if not gemini_api_key:
-        raise gr.Error("Please enter your Google AI Studio API Key.")
     if not lecture_pdf:
         raise gr.Error("Please upload a lecture PDF.")
-    # Configure Gemini
-    try:
-        genai.configure(api_key=gemini_api_key)
-        model = genai.GenerativeModel("gemini-1.5-flash-latest")
-    except Exception as e:
-        raise gr.Error(f"Gemini init/config error: {e}")
-    # Extract and truncate text
-    raw_text = extract_pdf_text(lecture_pdf.name)
-    content = truncate_text(raw_text)
-    if not content.strip():
-        raise gr.Error("No extractable text found in the PDF.")
-    # Generate dialogue script
-    prompt = PROMPT_TEMPLATE.format(content=content)
-    try:
-        response = model.generate_content(prompt)
-        script = response.text or ""
-    except Exception as e:
-        raise gr.Error(f"Gemini generation error: {e}")
-    return [script, script]  # [for Markdown display, for state storage]
-# ------------------------------------------------------------------
-# Step 2: Generate audio from provided script
-# ------------------------------------------------------------------
-def generate_audio(
-    script: str
-) -> str:
-    if not script:
-        raise gr.Error("No script available. Please generate the script first.")
-    # Create a temp dir for audio parts
-    with tempfile.TemporaryDirectory() as td:
-        out_dir = Path(td)
-        audio_path = synthesize_speech(script, HF_TTS_MODEL, out_dir)
-        return audio_path
-# ------------------------------------------------------------------
-# Gradio UI
-# ------------------------------------------------------------------
 with gr.Blocks() as demo:
-    # Shared state for the script
-    script_state = gr.State()
-    with gr.Tab("Generate Script"):
-        api_key_input = gr.Textbox(
-            label="Google Gemini API Key",
-            type="password",
-            placeholder="Enter your key"
-        )
-        pdf_input = gr.File(
-            label="Upload Lecture PDF",
-            file_types=[".pdf"]
-        )
-        script_md = gr.Markdown(
-            label="Generated Script",
-        )
-        gen_script_btn = gr.Button("Generate Script")
-        gen_script_btn.click(
-            fn=generate_script,
-            inputs=[api_key_input, pdf_input],
-            outputs=[script_md, script_state]
-        )
-    with gr.Tab("Generate Audio"):
-        gen_audio_btn = gr.Button("Generate Audio")
-        audio_out = gr.Audio(
-            label="Podcast Audio",
-            type="filepath"
-        )
-        gen_audio_btn.click(
-            fn=generate_audio,
-            inputs=[script_state],
-            outputs=[audio_out]
-        )
     demo.launch()

 # =============================================================
+# Lecture → English Podcast Generator
+# • Script: HF Inference API (Qwen/Qwen2.5-Coder-32B-Instruct)
+# • Audio: MeloTTS (English)
 # =============================================================
+import io
 import re
 import tempfile
 import textwrap
 from pathlib import Path
+from typing import List
 import gradio as gr
 from PyPDF2 import PdfReader
 from huggingface_hub import InferenceClient
+import torch
+import nltk
+nltk.download('averaged_perceptron_tagger_eng')
+from melo.api import TTS
+# ────────────────────────────────────────────────────────────────────
+# 1) Setup HF client & MeloTTS for English
+# ────────────────────────────────────────────────────────────────────
+# Shared Hugging Face inference client, constructed without a token.
+hf_client = InferenceClient()  # anonymous/public access
+# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+# English MeloTTS model, built once at module import and reused by
+# generate_podcast for every request.
+melo_en = TTS(language='EN', device=device)
+# Speaker table taken from the model's hyper-parameters
+# (presumably speaker name -> id; confirm against MeloTTS).
+speaker_ids = melo_en.hps.data.spk2id
+# The first key of the speaker table is used as the default voice.
+default_speaker = next(iter(speaker_ids.keys()))
+# ────────────────────────────────────────────────────────────────────
+# 2) Prompt template
+# ────────────────────────────────────────────────────────────────────
+# Instruction template for the script-generation model. The `{content}`
+# placeholder is filled with the extracted lecture text through
+# PROMPT.format(content=...) in generate_podcast.
+PROMPT = textwrap.dedent("""
+You are producing a lively two-host educational podcast in English.
+Summarize the following lecture content into a dialogue of approximately 300 words.
+Make it engaging: hosts ask questions, clarify ideas with analogies,
+and wrap up with a concise recap. Preserve technical accuracy.
+Use Markdown for host names (e.g., **Host 1:**).
+### Lecture Content
+{content}
+""")
+# ────────────────────────────────────────────────────────────────────
+# 3) Helpers
+# ────────────────────────────────────────────────────────────────────
 def extract_pdf_text(pdf_path: str) -> str:
+    """Return the text of every page of the PDF at *pdf_path*.
+
+    Page texts are joined with a newline. A page whose ``extract_text()``
+    result is falsy contributes an empty string instead.
+    """
     reader = PdfReader(pdf_path)
     return "\n".join(page.extract_text() or "" for page in reader.pages)
+def split_to_chunks(text: str, limit: int = 280) -> List[str]:
+    """Group the sentences of *text* into chunks of about *limit* characters.
+
+    The text is split on whitespace that follows ``.``, ``!`` or ``?``.
+    Sentences are then packed greedily, joined by single spaces, and a new
+    chunk is started whenever adding the next sentence would exceed *limit*.
+    A single sentence longer than *limit* is kept whole, so a chunk can
+    exceed the limit. Returns an empty list when *text* has no non-blank
+    sentences.
+
+    NOTE(review): generate_podcast in this revision does not call this helper.
+    """
+    # Split after sentence-ending punctuation and drop blank fragments.
+    sents = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
+    chunks, curr = [], ""
+    for sent in sents:
+        # The +1 accounts for the space that would join curr and sent.
+        if curr and len(curr) + len(sent) + 1 > limit:
+            chunks.append(curr)
+            curr = sent
         else:
+            curr = f"{curr} {sent}".strip() if curr else sent
+    # Flush the trailing, partially filled chunk.
+    if curr:
+        chunks.append(curr)
     return chunks
+# ────────────────────────────────────────────────────────────────────
+# 4) Main generate function
+# ────────────────────────────────────────────────────────────────────
+def generate_podcast(lecture_pdf: gr.File):
     if not lecture_pdf:
         raise gr.Error("Please upload a lecture PDF.")
+    # 1️⃣ Extract & prompt
+    raw = extract_pdf_text(lecture_pdf.name)
+    prompt = PROMPT.format(content=raw)
+    # 2️⃣ HF text generation
+    out = hf_client.text_generation(
+        inputs=prompt,
+        model="Qwen/Qwen2.5-Coder-32B-Instruct",
+        parameters={"max_new_tokens": 512, "temperature": 0.5}
+    )
+    # InferenceClient returns a dict or a str depending on version
+    script = out.get("generated_text") if isinstance(out, dict) else out
+    # 3️⃣ MeloTTS audio
+    tmpdir = Path(tempfile.mkdtemp())
+    bio = io.BytesIO()
+    progress = gr.Progress()
+    # use the default English speaker
+    melo_en.tts_to_file(
+        script,
+        speaker_ids[default_speaker],
+        bio,
+        speed=1.0,
+        pbar=progress.tqdm,
+        format="wav"
+    )
+    audio_bytes = bio.getvalue()
+    return script, audio_bytes
+# ────────────────────────────────────────────────────────────────────
+# 5) Gradio UI
+# ────────────────────────────────────────────────────────────────────
 with gr.Blocks() as demo:
+    gr.Markdown("## Lecture → English Podcast")
+    pdf_in = gr.File(label="Upload Lecture PDF", file_types=[".pdf"])
+    btn = gr.Button("Generate Podcast")
+    script_md = gr.Markdown(label="Podcast Script")
+    audio_out = gr.Audio(label="Podcast Audio", type="bytes")
+    btn.click(fn=generate_podcast, inputs=[pdf_in], outputs=[script_md, audio_out])
     demo.launch()