Spaces:

mlokendra
/

pdf_to_poadcast

Sleeping

App Files Files Community

mlokendra committed on Jun 18, 2025

Commit

b5e8c18

verified ·

1 Parent(s): 2436d0b

u

Browse files

Files changed (1) hide show

app.py +66 -80

app.py CHANGED Viewed

@@ -1,25 +1,49 @@
-#!pip install PyPDF2 gtts nltk gradio pydub
-from gtts import gTTS
-from pydub import AudioSegment
-from PyPDF2 import PdfReader
-import gradio as gr
 import os
 import re
 import heapq
-#from nltk.tokenize import sent_tokenize
 from transformers import pipeline
 import concurrent.futures
-# Load a dialogue-friendly LLM (you can cache it offline too)
 generator = pipeline("text-generation",
                      model="unsloth/gemma-3-1b-it",
                      device_map='cpu',
                      max_new_tokens=300)
 def split_sentences(text):
-    # Naive sentence splitter — works well for most academic text
     return re.split(r'(?<=[.!?])\s+', text.strip())
 def extract_sections_from_pdf(pdf_path):
     reader = PdfReader(pdf_path)
     full_text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
@@ -47,15 +71,8 @@ def extract_sections_from_pdf(pdf_path):
         lines = section_text.splitlines()
         limited_section_text = "\n".join(lines[:20])
         sections[name] = limited_section_text
-    print(sections)
-    return sections
-KEY_TERMS = [
-    "model", "propose", "architecture", "performance", "accuracy", "experiment",
-    "framework", "design", "method", "network", "approach", "outperform",
-    "layer", "training", "results", "learning", "evaluate", "baseline"
-]
 def summarize_section_by_heuristics(text, max_sentences=5):
     sentences = split_sentences(text)
@@ -76,52 +93,45 @@ def summarize_section_by_heuristics(text, max_sentences=5):
     top_sentences = heapq.nlargest(max_sentences, scored)
     top_sentences = [s for _, s in sorted(top_sentences, key=lambda x: sentences.index(x[1]))]
     return " ".join(top_sentences)
-host_lang = 'en' # American English
-guest_lang = 'en' # British English
-HOST_PITCH_SHIFT=-2.5
-HOST_SPEED_FACTOR=0.95
-def tts_line_by_line(script):
     lines = script.split('\n')
     segments = []
     for i, line in enumerate(lines):
         if 'Host:' in line or 'Guest:' in line:
-            print("print line ",i,line)
             speaker, content = line.split(':', 1)
-            if speaker.lower() == "host":
-              print( line.split(':', 1))
-              tts = gTTS(content.strip(), lang=host_lang,tld='ie', slow=False)
-              filename = f"segment_{i}.mp3"
-              tts.save(filename)
-            if speaker.lower() == "guest":
-              print( line.split(':', 1))
-              tts = gTTS(content.strip(), lang=guest_lang,tld='co.uk',slow=False)
-              filename = f"segment_{i}.mp3"
-              tts.save(filename)
-            segment = AudioSegment.from_mp3(filename)
-            segments.append(segment)
-    return segments
-def generate_podcast_script(section_name, section_text):
-    user_prompt = f"""You are hosting a podcast. Create a friendly, engaging conversation with maximum 10 sentence between Host and Guest discussing the {section_name} of a research paper  paper:\n\nPaper Section:
-      \"\"\"
-      {section_text}
-      \"\"\"
-      \n\nFormat:\nHost: ...\nGuest: ..."""
-    # Apply chat template formatting
-    print(user_prompt[:200])
-    messages =[{"role": "user", "content": user_prompt}]
-    response = generator(messages, max_new_tokens=300, do_sample=True, temperature=0.7)
-    print(response[0]["generated_text"])
-    return response[0]["generated_text"]
 def merge_segments(segments, output="podcast_output.mp3"):
     podcast = AudioSegment.empty()
     for segment in segments:
-        podcast += segment + AudioSegment.silent(duration=300)  # add short pause
     podcast.export(output, format="mp3")
     print(f"Podcast saved as {output}")
@@ -130,62 +140,38 @@ def process_section(section_summary_pair):
     dialogue = generate_podcast_script(section, summary)
     dialogue_content = dialogue[1]["content"]
     lines = dialogue_content.split('\n')
-    dialogue_fine = "\n".join([line for line in lines]).replace("**", "")
-    result = f"\n\n=== {section.upper()} ===\n{dialogue_fine}\n"
-    return result
 def process_pdf(pdf_file):
-    # Save the uploaded file to a temporary location
     pdf_path = "uploaded_pdf.pdf"
     with open(pdf_file.name, "rb") as infile, open(pdf_path, "wb") as outfile:
         outfile.write(infile.read())
-    # Extract text, generate conversation, and create audio
-        # Step 1: Extract & summarize sections
     sections = extract_sections_from_pdf(pdf_path)
     summarized_sections = {
         name: summarize_section_by_heuristics(content)
         for name, content in sections.items()
     }
-    # Step 2: Generate podcast script
-    final_script = ""
-    # Prepare data
     section_summary_pairs = list(summarized_sections.items())
-    # Run in parallel using threads (good for API calls)
-    final_script = ""
     with concurrent.futures.ThreadPoolExecutor() as executor:
         results = executor.map(process_section, section_summary_pairs)
-    # Combine results
     final_script = "".join(results)
-    print(final_script)
-    segments = tts_line_by_line(final_script)
     output_audio_path = "podcast_output.mp3"
     merge_segments(segments, output=output_audio_path)
-    # Clean up the temporary PDF file
     os.remove(pdf_path)
-    # Clean up individual segment files
-    #for i in range(1,len(segments)):
-    #    os.remove(f"segment_{i}.mp3")
     return output_audio_path
-# Create the Gradio interface
 iface = gr.Interface(
     fn=process_pdf,
     inputs=gr.File(label="Upload a PDF file"),
     outputs=gr.Audio(label="Generated Podcast Audio"),
     title="PDF to Podcast",
-    description="Upload a PDF and get a podcast-style audio summary."
 )
-# Launch the interface
-iface.launch(debug=True,share=True)

 import os
 import re
 import heapq
+import uuid
+import asyncio
+import edge_tts
+import gradio as gr
+import nest_asyncio
+from PyPDF2 import PdfReader
+from pydub import AudioSegment
 from transformers import pipeline
 import concurrent.futures
+# Apply nested event loop patch for Jupyter/Colab
+nest_asyncio.apply()
+# Load LLM
 generator = pipeline("text-generation",
                      model="unsloth/gemma-3-1b-it",
                      device_map='cpu',
                      max_new_tokens=300)
+voices = await VoicesManager.create()
+voices_female = await VoicesManager.create()
+voice_male = voice_male.find(Gender="Male", Language="es")
+voice_female = voice_female.find(Gender="Female", Language="es")
+MALE_VOICE = random.choice(voice_male)["Name"]
+FEMALE_VOICE = random.choice(voice_female)["Name"]
+rate_male=40
+pitch_male=40
+pitch_female=40
+rate_female=40
+rate_female_str = f"{rate_female:+d}%"
+pitch_female_str = f"{rate_female:+d}Hz"
+# Research-paper vocabulary. NOTE(review): presumably consumed by the sentence
+# scoring in summarize_section_by_heuristics -- its use is not visible here.
+KEY_TERMS = [
+    "model", "propose", "architecture", "performance", "accuracy", "experiment",
+    "framework", "design", "method", "network", "approach", "outperform",
+    "layer", "training", "results", "learning", "evaluate", "baseline"
+]
 def split_sentences(text):
     return re.split(r'(?<=[.!?])\s+', text.strip())
 def extract_sections_from_pdf(pdf_path):
     reader = PdfReader(pdf_path)
     full_text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
         lines = section_text.splitlines()
         limited_section_text = "\n".join(lines[:20])
         sections[name] = limited_section_text
+    return sections
 def summarize_section_by_heuristics(text, max_sentences=5):
     sentences = split_sentences(text)
     top_sentences = heapq.nlargest(max_sentences, scored)
     top_sentences = [s for _, s in sorted(top_sentences, key=lambda x: sentences.index(x[1]))]
     return " ".join(top_sentences)
+def generate_podcast_script(section_name, section_text):
+    """Ask the text-generation pipeline for a Host/Guest dialogue on one section.
+
+    Args:
+        section_name: Name of the paper section, interpolated into the prompt.
+        section_text: Text of that section, quoted inside the prompt.
+
+    Returns:
+        The ``generated_text`` entry of the first pipeline result.
+    """
+    user_prompt = f"""You are hosting a podcast. Create a friendly, engaging conversation with maximum 10 sentence between Host and Guest discussing the {section_name} of a research paper:
+\nPaper Section:\n\"\"\"\n{section_text}\n\"\"\"\n\nFormat:\nHost: ...\nGuest: ..."""
+    # Single-turn chat input: one user message carrying the whole prompt.
+    messages = [{"role": "user", "content": user_prompt}]
+    # Sampling is enabled, so repeated calls can yield different dialogue.
+    response = generator(messages, max_new_tokens=300, do_sample=True, temperature=0.7)
+    return response[0]["generated_text"]
+async def generate_voice_line(text, voice, filename):
+    """Synthesize ``text`` with the given edge-tts ``voice`` and save it to ``filename``."""
+    communicate = edge_tts.Communicate(text, voice)
+    await communicate.save(filename)
+async def tts_edge_line_by_line(script):
     lines = script.split('\n')
     segments = []
+    tasks = []
+    filenames = []
     for i, line in enumerate(lines):
         if 'Host:' in line or 'Guest:' in line:
             speaker, content = line.split(':', 1)
+            speaker = speaker.strip().lower()
+            voice = MALE_VOICE if speaker == 'host' else FEMALE_VOICE
+            filename = f"segment_{uuid.uuid4().hex}.mp3"
+            filenames.append(filename)
+            tasks.append(generate_voice_line(content.strip(), voice, filename))
+    await asyncio.gather(*tasks)
+    for filename in filenames:
+        segments.append(AudioSegment.from_mp3(filename))
+    return segments
 def merge_segments(segments, output="podcast_output.mp3"):
+    """Concatenate ``segments`` with a short pause after each and export as MP3.
+
+    Args:
+        segments: Iterable of ``AudioSegment`` objects, in playback order.
+        output: Path of the MP3 file to write.
+    """
     podcast = AudioSegment.empty()
     for segment in segments:
+        # duration=300: pydub expresses durations in milliseconds.
+        podcast += segment + AudioSegment.silent(duration=300)
     podcast.export(output, format="mp3")
     print(f"Podcast saved as {output}")
     dialogue = generate_podcast_script(section, summary)
     dialogue_content = dialogue[1]["content"]
     lines = dialogue_content.split('\n')
+    dialogue_fine = "\n".join([line for line in lines if 'Host:' in line or 'Guest:' in line]).replace("**", "")
+    return f"\n\n=== {section.upper()} ===\n{dialogue_fine}\n"
 def process_pdf(pdf_file):
+    """Turn an uploaded PDF into a podcast MP3 and return the audio file path.
+
+    Args:
+        pdf_file: Upload object; only its ``.name`` attribute (a path) is read.
+
+    Returns:
+        The path of the exported audio file, ``"podcast_output.mp3"``.
+    """
+    # Copy the upload to a fixed working path; it is removed before returning.
     pdf_path = "uploaded_pdf.pdf"
     with open(pdf_file.name, "rb") as infile, open(pdf_path, "wb") as outfile:
         outfile.write(infile.read())
+    # Extract the sections, then condense each one.
     sections = extract_sections_from_pdf(pdf_path)
     summarized_sections = {
         name: summarize_section_by_heuristics(content)
         for name, content in sections.items()
     }
     section_summary_pairs = list(summarized_sections.items())
+    # One dialogue per section, generated in worker threads; executor.map
+    # yields results in input order, so the script follows section order.
     with concurrent.futures.ThreadPoolExecutor() as executor:
         results = executor.map(process_section, section_summary_pairs)
     final_script = "".join(results)
+    # Run the async TTS step to completion from this synchronous handler.
+    segments = asyncio.run(tts_edge_line_by_line(final_script))
     output_audio_path = "podcast_output.mp3"
     merge_segments(segments, output=output_audio_path)
     os.remove(pdf_path)
     return output_audio_path
+# Gradio UI: one PDF upload in, one generated audio file out.
 iface = gr.Interface(
     fn=process_pdf,
     inputs=gr.File(label="Upload a PDF file"),
     outputs=gr.Audio(label="Generated Podcast Audio"),
     title="PDF to Podcast",
+    description="Upload a Research Paper PDF and get a podcast-style audio summary."
 )
+# Start the web app when the module is executed.
+iface.launch(debug=True)