# app.py — PodQuery: research-paper podcast generator (Hugging Face Space)
import os
import json
import asyncio
import nest_asyncio
import edge_tts
from dotenv import load_dotenv
from pypdf import PdfReader
import gradio as gr
from huggingface_hub import InferenceClient
from spaces import GPU
# Allow nested async event loops (Gradio runs its own loop; Edge TTS is async)
nest_asyncio.apply()

# Load environment keys from a local .env file when present
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")  # Automatically set in Spaces

# Initialize Client (Qwen 72B). Bind the name unconditionally so later
# references fail with a clear message instead of a NameError when the
# token is missing.
if HF_TOKEN:
    hf_client = InferenceClient(model="Qwen/Qwen2.5-72B-Instruct", token=HF_TOKEN)
else:
    hf_client = None
# =========================
# HELPER FUNCTIONS
# =========================
def extract_text_from_pdf(pdf):
    """Extract plain text from the first pages of a PDF.

    Only the first 5 pages are read to keep the downstream prompt within
    model token limits.

    Args:
        pdf: Path or file-like object accepted by pypdf.PdfReader.

    Returns:
        Extracted text (one newline appended per page), or an
        "Error reading PDF: ..." string if reading fails.
    """
    try:
        reader = PdfReader(pdf)
        text = ""
        # Extract first 5 pages to avoid token limits
        for page in reader.pages[:5]:
            # extract_text() may return None for image-only/scanned pages;
            # coalesce to "" instead of raising TypeError on None + "\n".
            text += (page.extract_text() or "") + "\n"
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"
async def generate_audio_file(text, voice, output_path):
    """Synthesize *text* with the given Edge TTS *voice* and save it.

    Returns *output_path* once the audio file has been written.
    """
    tts = edge_tts.Communicate(text, voice)
    await tts.save(output_path)
    return output_path
# =========================
# CORE LOGIC
# =========================
@GPU
def generate_script(pdf_file, persona_style):
    """Generate a two-host podcast script from an uploaded research paper.

    Args:
        pdf_file: Uploaded PDF from gr.File (path/file object), or None.
        persona_style: Persona dropdown choice ("Serious Academic" or
            "Gossip Columnist"); unknown values fall back to academic.

    Returns:
        (status_message, script_or_None, new_state) where new_state holds
        the parsed script, playback cursor, persona, and extracted text.
    """
    # Fresh per-generation state so any stale playback position is discarded.
    new_state = {
        "script": [],
        "current_index": 0,
        "persona": persona_style,
        "full_text": ""
    }
    if not pdf_file:
        return "⚠️ Upload a PDF first.", None, new_state
    if not HF_TOKEN:
        return "⚠️ Missing HF_TOKEN. This usually works automatically in Spaces.", None, new_state

    pdf_text = extract_text_from_pdf(pdf_file)
    new_state["full_text"] = pdf_text

    prompts = {
        "Serious Academic": "You are a serious academic professor. Tone: Intellectual, critical, and insightful.",
        "Gossip Columnist": "You are a gossip columnist host. Tone: Dramatic, sensationalist, and excited.",
    }
    # Fall back to the academic persona for unknown styles instead of
    # injecting the literal string "None" into the system prompt.
    persona_prompt = prompts.get(persona_style, prompts["Serious Academic"])

    # Qwen System Prompt
    system_instruction = f"""
{persona_prompt}
You will be given a research paper text.
Generate a 4-line dialogue script between two hosts (Host A and Host B) discussing the paper.
CRITICAL OUTPUT RULES:
1. Output MUST be valid JSON only.
2. Do not add markdown blocks like ```json.
3. Format: [ {{"speaker": "Host A", "text": "..."}}, {{"speaker": "Host B", "text": "..."}} ]
"""
    # Truncate to ~4000 chars to stay within the model's context budget.
    user_message = f"Here is the paper text:\n\n{pdf_text[:4000]}..."
    messages = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": user_message}
    ]
    try:
        # Call Qwen via HF Inference
        response = hf_client.chat_completion(
            messages=messages,
            max_tokens=1000,
            temperature=0.7
        )
        raw_content = response.choices[0].message.content
        # Strip markdown code fences the model sometimes adds despite instructions.
        clean_json = raw_content.replace("```json", "").replace("```", "").strip()
        script = json.loads(clean_json)
        new_state["script"] = script
        new_state["current_index"] = 0
        return "✅ Script ready (Qwen 2.5).", script, new_state
    except Exception as e:
        return f"Error with Qwen: {e}", None, new_state
# We use async here for Edge TTS
async def play_next_chunk(state_data):
    """Synthesize the next scripted line and advance the playback cursor.

    Returns (audio_path_or_None, transcript_or_status, state_data).
    """
    if not state_data or not state_data.get("script"):
        return None, "⚠️ No script generated yet.", state_data

    script = state_data["script"]
    idx = state_data["current_index"]
    if idx >= len(script):
        return None, "🎉 Podcast complete.", state_data

    line = script[idx]
    speaker = line["speaker"]

    # Voice selection (free Edge TTS): Host A = male, Host B = female,
    # with an alternate voice pair for the gossip persona.
    if state_data["persona"] == "Gossip Columnist":
        voice_id = "en-US-EricNeural" if speaker == "Host A" else "en-US-AnaNeural"
    else:
        voice_id = "en-US-AriaNeural" if speaker == "Host B" else "en-US-ChristopherNeural"

    try:
        save_path = f"temp_{idx}.mp3"
        await generate_audio_file(line["text"], voice_id, save_path)
        state_data["current_index"] += 1
        return save_path, f"{speaker}: {line['text']}", state_data
    except Exception as e:
        return None, f"Audio error: {e}", state_data
async def interrupt_and_ask(question, state_data):
    """Answer a listener question about the paper, then speak the answer.

    Args:
        question: The user's free-text question.
        state_data: Session state built by generate_script.

    Returns:
        (audio_path_or_None, answer_or_error_text, state_data).
    """
    if not state_data or not state_data.get("full_text"):
        return None, "Upload PDF first.", state_data
    # Mirror generate_script's guard: without it a missing token surfaces
    # as a confusing NameError/attribute error wrapped in "Qwen Error".
    if not HF_TOKEN:
        return None, "⚠️ Missing HF_TOKEN. This usually works automatically in Spaces.", state_data
    # Use Qwen for the interruption answer
    try:
        messages = [
            {"role": "system", "content": f"You are a {state_data['persona']}. Answer the question briefly based on the paper, then say 'Anyway, back to the paper...'"},
            {"role": "user", "content": f"Context: {state_data['full_text'][:2000]}\n\nUser Question: {question}"}
        ]
        response = hf_client.chat_completion(messages=messages, max_tokens=200)
        answer = response.choices[0].message.content
    except Exception as e:
        return None, f"Qwen Error: {e}", state_data
    # Synthesize the answer with a single neutral voice (not per-host).
    try:
        save_path = "interrupt.mp3"
        await generate_audio_file(answer, "en-US-ChristopherNeural", save_path)
        return save_path, answer, state_data
    except Exception as e:
        return None, f"Audio Error: {e}", state_data
# =========================
# GRADIO UI
# =========================
with gr.Blocks() as demo:
    # Per-session state dict: script lines, playback cursor, persona, paper text.
    app_state = gr.State({})
    gr.Markdown("# 🎧 PodQuery — Research Paper Podcast Generator (Powered by Qwen 2.5)")
    with gr.Row():
        with gr.Column():
            # Input side: PDF upload + persona choice + generation trigger.
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            persona = gr.Dropdown(
                ["Serious Academic", "Gossip Columnist"],
                value="Serious Academic",
                label="Persona Style"
            )
            btn_gen = gr.Button("Generate Podcast Script", variant="primary")
            status = gr.Textbox(label="Status")
        with gr.Column():
            # Output side: the parsed JSON dialogue script.
            script_display = gr.JSON(label="Generated Script")
    gr.Markdown("---")
    with gr.Row():
        # autoplay so each "Play Next Line" click starts speaking immediately
        player = gr.Audio(label="Audio Output", autoplay=True)
        transcript = gr.Textbox(label="Transcript")
    btn_play = gr.Button("▶️ Play Next Line")
    gr.Markdown("---")
    with gr.Row():
        q_input = gr.Textbox(label="Ask a Question (Interrupt)")
        btn_interrupt = gr.Button("✋ Interrupt Podcast")

    # Wiring: every handler returns a 3-tuple ending with the updated state,
    # so app_state is threaded through all three callbacks.
    btn_gen.click(
        generate_script,
        inputs=[pdf_input, persona],
        outputs=[status, script_display, app_state]
    )
    btn_play.click(
        play_next_chunk,
        inputs=[app_state],
        outputs=[player, transcript, app_state]
    )
    btn_interrupt.click(
        interrupt_and_ask,
        inputs=[q_input, app_state],
        outputs=[player, transcript, app_state]
    )

if __name__ == "__main__":
    # NOTE(review): ssr_mode=False — presumably to avoid SSR issues on Spaces; confirm
    demo.launch(ssr_mode=False)