# app.py — PodQuery: research-paper podcast generator (Hugging Face Space)
import os
import json
import asyncio
import nest_asyncio
import edge_tts
from dotenv import load_dotenv
from pypdf import PdfReader
import gradio as gr
from huggingface_hub import InferenceClient
from spaces import GPU
# Allow nested async event loops (Gradio runs its own loop; Edge TTS is async)
nest_asyncio.apply()

# Load environment keys from a local .env file when present
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")  # Automatically set in Spaces

# Initialize Client (Qwen 72B). Bind the name unconditionally so later
# references fail with a clear message instead of a NameError when the
# token is missing.
if HF_TOKEN:
    hf_client = InferenceClient(model="Qwen/Qwen2.5-72B-Instruct", token=HF_TOKEN)
else:
    hf_client = None
# =========================
# HELPER FUNCTIONS
# =========================
def extract_text_from_pdf(pdf):
    """Extract plain text from the first pages of a PDF.

    Only the first 5 pages are read to keep the downstream prompt within
    model token limits.

    Args:
        pdf: Path or file-like object accepted by pypdf.PdfReader.

    Returns:
        Extracted text (one newline appended per page), or an
        "Error reading PDF: ..." string if reading fails.
    """
    try:
        reader = PdfReader(pdf)
        text = ""
        # Extract first 5 pages to avoid token limits
        for page in reader.pages[:5]:
            # extract_text() may return None for image-only/scanned pages;
            # coalesce to "" instead of raising TypeError on None + "\n".
            text += (page.extract_text() or "") + "\n"
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"
async def generate_audio_file(text, voice, output_path):
    """Synthesize *text* with the given Edge TTS *voice* and save it.

    Returns *output_path* once the audio file has been written.
    """
    tts = edge_tts.Communicate(text, voice)
    await tts.save(output_path)
    return output_path
# =========================
# CORE LOGIC
# =========================
@GPU
def generate_script(pdf_file, persona_style):
    """Generate a two-host podcast script from an uploaded research paper.

    Args:
        pdf_file: Uploaded PDF from gr.File (path/file object), or None.
        persona_style: Persona dropdown choice ("Serious Academic" or
            "Gossip Columnist"); unknown values fall back to academic.

    Returns:
        (status_message, script_or_None, new_state) where new_state holds
        the parsed script, playback cursor, persona, and extracted text.
    """
    # Fresh per-generation state so any stale playback position is discarded.
    new_state = {
        "script": [],
        "current_index": 0,
        "persona": persona_style,
        "full_text": ""
    }
    if not pdf_file:
        return "⚠️ Upload a PDF first.", None, new_state
    if not HF_TOKEN:
        return "⚠️ Missing HF_TOKEN. This usually works automatically in Spaces.", None, new_state

    pdf_text = extract_text_from_pdf(pdf_file)
    new_state["full_text"] = pdf_text

    prompts = {
        "Serious Academic": "You are a serious academic professor. Tone: Intellectual, critical, and insightful.",
        "Gossip Columnist": "You are a gossip columnist host. Tone: Dramatic, sensationalist, and excited.",
    }
    # Fall back to the academic persona for unknown styles instead of
    # injecting the literal string "None" into the system prompt.
    persona_prompt = prompts.get(persona_style, prompts["Serious Academic"])

    # Qwen System Prompt
    system_instruction = f"""
{persona_prompt}
You will be given a research paper text.
Generate a 4-line dialogue script between two hosts (Host A and Host B) discussing the paper.
CRITICAL OUTPUT RULES:
1. Output MUST be valid JSON only.
2. Do not add markdown blocks like ```json.
3. Format: [ {{"speaker": "Host A", "text": "..."}}, {{"speaker": "Host B", "text": "..."}} ]
"""
    # Truncate to ~4000 chars to stay within the model's context budget.
    user_message = f"Here is the paper text:\n\n{pdf_text[:4000]}..."
    messages = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": user_message}
    ]
    try:
        # Call Qwen via HF Inference
        response = hf_client.chat_completion(
            messages=messages,
            max_tokens=1000,
            temperature=0.7
        )
        raw_content = response.choices[0].message.content
        # Strip markdown code fences the model sometimes adds despite instructions.
        clean_json = raw_content.replace("```json", "").replace("```", "").strip()
        script = json.loads(clean_json)
        new_state["script"] = script
        new_state["current_index"] = 0
        return "✅ Script ready (Qwen 2.5).", script, new_state
    except Exception as e:
        return f"Error with Qwen: {e}", None, new_state
# We use async here for Edge TTS
async def play_next_chunk(state_data):
    """Synthesize the next scripted line and advance the playback cursor.

    Returns (audio_path_or_None, transcript_or_status, state_data).
    """
    if not state_data or not state_data.get("script"):
        return None, "⚠️ No script generated yet.", state_data

    script = state_data["script"]
    idx = state_data["current_index"]
    if idx >= len(script):
        return None, "🎉 Podcast complete.", state_data

    line = script[idx]
    speaker = line["speaker"]

    # Voice selection (free Edge TTS): Host A = male, Host B = female,
    # with an alternate voice pair for the gossip persona.
    if state_data["persona"] == "Gossip Columnist":
        voice_id = "en-US-EricNeural" if speaker == "Host A" else "en-US-AnaNeural"
    else:
        voice_id = "en-US-AriaNeural" if speaker == "Host B" else "en-US-ChristopherNeural"

    try:
        save_path = f"temp_{idx}.mp3"
        await generate_audio_file(line["text"], voice_id, save_path)
        state_data["current_index"] += 1
        return save_path, f"{speaker}: {line['text']}", state_data
    except Exception as e:
        return None, f"Audio error: {e}", state_data
async def interrupt_and_ask(question, state_data):
    """Answer a listener question about the paper, then speak the answer.

    Args:
        question: The user's free-text question.
        state_data: Session state built by generate_script.

    Returns:
        (audio_path_or_None, answer_or_error_text, state_data).
    """
    if not state_data or not state_data.get("full_text"):
        return None, "Upload PDF first.", state_data
    # Mirror generate_script's guard: without it a missing token surfaces
    # as a confusing NameError/attribute error wrapped in "Qwen Error".
    if not HF_TOKEN:
        return None, "⚠️ Missing HF_TOKEN. This usually works automatically in Spaces.", state_data
    # Use Qwen for the interruption answer
    try:
        messages = [
            {"role": "system", "content": f"You are a {state_data['persona']}. Answer the question briefly based on the paper, then say 'Anyway, back to the paper...'"},
            {"role": "user", "content": f"Context: {state_data['full_text'][:2000]}\n\nUser Question: {question}"}
        ]
        response = hf_client.chat_completion(messages=messages, max_tokens=200)
        answer = response.choices[0].message.content
    except Exception as e:
        return None, f"Qwen Error: {e}", state_data
    # Synthesize the answer with a single neutral voice (not per-host).
    try:
        save_path = "interrupt.mp3"
        await generate_audio_file(answer, "en-US-ChristopherNeural", save_path)
        return save_path, answer, state_data
    except Exception as e:
        return None, f"Audio Error: {e}", state_data
# =========================
# GRADIO UI
# =========================
with gr.Blocks() as demo:
    # Per-session state dict: script lines, playback cursor, persona, paper text.
    app_state = gr.State({})
    gr.Markdown("# 🎧 PodQuery — Research Paper Podcast Generator (Powered by Qwen 2.5)")
    with gr.Row():
        with gr.Column():
            # Input side: PDF upload + persona choice + generation trigger.
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            persona = gr.Dropdown(
                ["Serious Academic", "Gossip Columnist"],
                value="Serious Academic",
                label="Persona Style"
            )
            btn_gen = gr.Button("Generate Podcast Script", variant="primary")
            status = gr.Textbox(label="Status")
        with gr.Column():
            # Output side: the parsed JSON dialogue script.
            script_display = gr.JSON(label="Generated Script")
    gr.Markdown("---")
    with gr.Row():
        # autoplay so each "Play Next Line" click starts speaking immediately
        player = gr.Audio(label="Audio Output", autoplay=True)
        transcript = gr.Textbox(label="Transcript")
    btn_play = gr.Button("▶️ Play Next Line")
    gr.Markdown("---")
    with gr.Row():
        q_input = gr.Textbox(label="Ask a Question (Interrupt)")
        btn_interrupt = gr.Button("✋ Interrupt Podcast")

    # Wiring: every handler returns a 3-tuple ending with the updated state,
    # so app_state is threaded through all three callbacks.
    btn_gen.click(
        generate_script,
        inputs=[pdf_input, persona],
        outputs=[status, script_display, app_state]
    )
    btn_play.click(
        play_next_chunk,
        inputs=[app_state],
        outputs=[player, transcript, app_state]
    )
    btn_interrupt.click(
        interrupt_and_ask,
        inputs=[q_input, app_state],
        outputs=[player, transcript, app_state]
    )

if __name__ == "__main__":
    # NOTE(review): ssr_mode=False — presumably to avoid SSR issues on Spaces; confirm
    demo.launch(ssr_mode=False)