Emeritus-21 committed on
Commit
331f1ed
·
verified ·
1 Parent(s): 32d3d6d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -129
app.py CHANGED
@@ -4,208 +4,176 @@ from elevenlabs.client import ElevenLabs
4
  import os
5
  import json
6
  import time
 
7
  from dotenv import load_dotenv
8
 
 
9
  load_dotenv()
10
-
11
- # --- CONFIGURATION ---
12
- # Get these keys from your .env file or Hugging Face Secrets
13
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
14
  ELEVEN_API_KEY = os.getenv("ELEVEN_API_KEY")
15
 
16
- # Configure APIs
17
- genai.configure(api_key=GEMINI_API_KEY)
18
- client = ElevenLabs(api_key=ELEVEN_API_KEY)
 
 
19
 
20
- # --- STATE MANAGEMENT ---
21
- # We use a global state to track where we are in the podcast
22
- # In a real production app, this would be per-user session state
23
  class PodcastState:
24
  def __init__(self):
25
  self.script = []
26
  self.current_index = 0
27
- self.pdf_context = ""
28
  self.persona = "Serious Academic"
 
29
 
30
  state = PodcastState()
31
 
32
- # --- HELPER FUNCTIONS ---
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  def generate_script(pdf_file, persona_style):
35
- """
36
- 1. Uploads PDF to Gemini 2.0 Flash
37
- 2. Generates a dialogue script based on the chosen Persona
38
- """
39
  if not pdf_file:
40
- return "Please upload a PDF first.", []
41
-
42
- print(f"Processing PDF... Style: {persona_style}")
43
 
44
- # Upload file to Gemini
 
 
 
 
 
 
 
45
  model = genai.GenerativeModel('gemini-2.0-flash')
46
 
47
- # Define Persona Prompts
48
  prompts = {
49
- "Serious Academic": "You are two professors discussing a paper. Tone: Intellectual, precise, slightly dry but insightful. Speaker A is the Skeptic, Speaker B is the Believer.",
50
- "Gossip Columnist": "You are two drama-loving gossip columnists reading this paper like it's a scandal. Tone: Shocked, slang-heavy, 'Tea spilling'. Speaker A is hyper, Speaker B is sarcastic.",
51
- "Explain Like I'm 5": "You are a gentle teacher and a curious student. Tone: Simple analogies, very clear, enthusiastic."
52
  }
53
 
54
- selected_prompt = prompts.get(persona_style, prompts["Serious Academic"])
55
-
56
- # This prompt forces a JSON structure for easier parsing
57
- prompt = f"""
58
- {selected_prompt}
59
 
60
- Analyze the attached PDF research paper.
61
- Generate a 6-turn dialogue script (3 turns each) summarizing the key findings.
62
 
63
- RETURN JSON ONLY in this format:
 
64
  [
65
  {{"speaker": "Host A", "text": "..."}},
66
  {{"speaker": "Host B", "text": "..."}}
67
  ]
68
  """
69
 
70
- # In a real deployment, we would use the File API.
71
- # For this hackathon demo (files < 50 pages), we can pass text or use the upload API.
72
- # Here we assume text extraction or direct PDF support if the SDK allows.
73
- # For simplicity/reliability in this snippet, we will simulate the file read:
74
- # (Note: To make this robust, use `genai.upload_file` in production)
75
-
76
- # MOCKING THE FILE READ FOR THE DEMO (Replace this with actual Gemini File API call)
77
- # We will just send the prompt to Gemini without the file content if file acts up,
78
- # but normally you do: sample_file = genai.upload_file(path=pdf_file, display_name="Paper")
79
-
80
- # Actual Call (Simulated for speed in demo code):
81
- response = model.generate_content(prompt)
82
-
83
- # Clean up JSON (Gemini sometimes adds ```json ... ```)
84
- clean_json = response.text.replace("```json", "").replace("```", "").strip()
85
-
86
  try:
 
 
87
  script_data = json.loads(clean_json)
 
88
  state.script = script_data
89
  state.current_index = 0
90
- state.persona = persona_style
91
- # Store context for Q&A later
92
- state.pdf_context = "User uploaded a paper. (Context stored)."
93
 
94
  return "✅ Script Generated! Click 'Play' to start.", script_data
95
  except Exception as e:
96
- return f"Error parsing script: {str(e)}", []
97
 
98
  def play_next_chunk():
99
- """
100
- Generates Audio for the NEXT line in the script.
101
- """
102
  if state.current_index >= len(state.script):
103
  return None, "🎉 Podcast Ended."
104
 
105
  line = state.script[state.current_index]
106
- speaker = line["speaker"]
107
- text = line["text"]
108
 
109
- # Voice Selection Logic
110
- # Voices: 'Adam' (Deep/Male), 'Nicole' (Crisp/Female), 'Mimi' (Childlike - for ELI5)
111
  voice_id = "nPczCjz82tPNOwVbpGE2" # Default Male
112
-
113
- if state.persona == "Gossip Columnist":
114
- voice_id = "nPczCjz82tPNOwVbpGE2" if speaker == "Host A" else "21m00Tcm4TlvDq8ikWAM" # Rachel
115
- elif state.persona == "Explain Like I'm 5":
116
- voice_id = "nPczCjz82tPNOwVbpGE2" if speaker == "Host A" else "MF3mGyEYCl7XYWbV9V6O" # Childlike
117
- else:
118
- # Academic
119
- voice_id = "nPczCjz82tPNOwVbpGE2" if speaker == "Host A" else "EXAVITQu4vr4xnSDxMaL"
 
 
 
120
 
121
- # Generate Audio
122
- audio_stream = client.generate(
123
- text=text,
124
- voice=voice_id,
125
- model="eleven_monolingual_v1"
126
- )
127
-
128
- # Save to temp file
129
- save_path = f"temp_{state.current_index}.mp3"
130
- with open(save_path, "wb") as f:
131
- for chunk in audio_stream:
132
- f.write(chunk)
133
-
134
- state.current_index += 1
135
-
136
- return save_path, f"🎙️ {speaker}: {text}"
137
 
138
  def interrupt_and_ask(user_question):
139
- """
140
- The 'Hero' Feature:
141
- 1. Pauses context.
142
- 2. Answers question.
143
- 3. Bridges back to the podcast.
144
- """
145
  model = genai.GenerativeModel('gemini-2.0-flash')
146
 
147
- last_line = state.script[state.current_index - 1]["text"] if state.current_index > 0 else "the start"
148
-
149
  prompt = f"""
150
  You are a podcast host ({state.persona}).
151
- You were just interrupted by a listener asking: "{user_question}"
152
-
153
- The last thing you said was: "{last_line}"
154
 
155
- 1. Answer the question directly but keep the persona.
156
- 2. Seamlessly transition back to the podcast topic.
157
  """
158
 
159
  response = model.generate_content(prompt)
160
- answer_text = response.text
161
 
162
- # Generate Answer Audio
163
  audio_stream = client.generate(
164
- text=answer_text,
165
- voice="nPczCjz82tPNOwVbpGE2", # Main Host Voice
166
  model="eleven_monolingual_v1"
167
  )
168
 
169
- save_path = "temp_interrupt.mp3"
170
  with open(save_path, "wb") as f:
171
  for chunk in audio_stream:
172
  f.write(chunk)
173
 
174
- return save_path, f"💡 Host: {answer_text}"
175
 
176
- # --- UI LAYOUT ---
 
177
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
178
  gr.Markdown("# 🎧 PodQuery: The Interactive Paper")
179
- gr.Markdown("Don't just listen. **Interrupt.** Ask questions. Learn actively.")
180
 
181
  with gr.Row():
182
- with gr.Column(scale=1):
183
- pdf_input = gr.File(label="1. Upload Research Paper (PDF)")
184
- style_dropdown = gr.Dropdown(
185
- ["Serious Academic", "Gossip Columnist", "Explain Like I'm 5"],
186
- label="2. Select Host Persona",
187
- value="Serious Academic"
188
- )
189
- gen_btn = gr.Button("🚀 Generate Podcast", variant="primary")
190
- status_box = gr.Textbox(label="Status", interactive=False)
191
-
192
- with gr.Column(scale=2):
193
- # The Player
194
- audio_player = gr.Audio(label="Podcast Stream", autoplay=True, type="filepath")
195
- transcript_box = gr.Markdown("### Transcript appears here...")
196
- next_btn = gr.Button("▶️ Play Next Segment", size="lg")
197
 
198
- # The Interrupt Interaction
199
- gr.Markdown("---")
200
- gr.Markdown("### Wait, I have a question!")
201
- with gr.Row():
202
- q_input = gr.Textbox(label="Ask the host...", placeholder="e.g., What does 'Stochastic' mean?", scale=4)
203
- ask_btn = gr.Button("Ask", scale=1)
204
-
205
- # Wiring
206
- gen_btn.click(fn=generate_script, inputs=[pdf_input, style_dropdown], outputs=[status_box, transcript_box])
207
- next_btn.click(fn=play_next_chunk, inputs=[], outputs=[audio_player, transcript_box])
208
- ask_btn.click(fn=interrupt_and_ask, inputs=[q_input], outputs=[audio_player, transcript_box])
209
 
210
  if __name__ == "__main__":
 
211
  demo.launch(mcp_server=True)
 
4
  import os
5
  import json
6
  import time
7
+ from pypdf import PdfReader
8
  from dotenv import load_dotenv
9
 
10
+ # 1. Load Keys
11
  load_dotenv()
 
 
 
12
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
13
  ELEVEN_API_KEY = os.getenv("ELEVEN_API_KEY")
14
 
15
+ # 2. Configure APIs
16
+ if GEMINI_API_KEY:
17
+ genai.configure(api_key=GEMINI_API_KEY)
18
+ if ELEVEN_API_KEY:
19
+ client = ElevenLabs(api_key=ELEVEN_API_KEY)
20
 
21
+ # 3. State Management
 
 
22
class PodcastState:
    """Global session state shared by all Gradio callbacks.

    NOTE(review): a single module-level instance is shared across every
    connected user; per-session state would be needed for multi-user use.
    """

    def __init__(self):
        # Raw text extracted from the uploaded PDF (filled by generate_script).
        self.full_text = ""
        # Dialogue turns as dicts of the form {"speaker": ..., "text": ...}.
        self.script = []
        # Index of the next line play_next_chunk will synthesize.
        self.current_index = 0
        # Active host persona; drives both prompt tone and voice selection.
        self.persona = "Serious Academic"
28
 
29
  state = PodcastState()
30
 
31
+ # 4. Helper Functions
32
+
33
def extract_text_from_pdf(pdf_path):
    """Extract plain text from the first pages of a PDF.

    Reads at most the first 5 pages to keep the downstream prompt small.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        The concatenated page text (one "\n" after each page), or an
        "Error reading PDF: ..." string if the file cannot be parsed.
    """
    try:
        reader = PdfReader(pdf_path)
        pages = []
        # Read first 5 pages max to save tokens for the demo.
        for page in reader.pages[:5]:
            # extract_text() returns None on image-only pages; treat that as
            # empty instead of raising TypeError on concatenation.
            pages.append(page.extract_text() or "")
        # join instead of repeated += (avoids quadratic string building).
        return "".join(text + "\n" for text in pages)
    except Exception as e:
        return f"Error reading PDF: {e}"
44
 
45
def generate_script(pdf_file, persona_style):
    """Turn an uploaded PDF into a short two-host dialogue script.

    Extracts text from the PDF, asks Gemini for a 4-turn JSON dialogue in
    the selected persona's tone, and stores the result in the global state.

    Args:
        pdf_file: Path of the uploaded PDF (from gr.File), or None.
        persona_style: Persona name; unknown values fall back to
            "Serious Academic".

    Returns:
        (status_message, script_list); script_list is [] on any error.
    """
    if not pdf_file:
        return "⚠️ Please upload a PDF first.", []

    if not GEMINI_API_KEY or not ELEVEN_API_KEY:
        return "⚠️ API Keys missing! Check Settings -> Secrets.", []

    # Read the PDF up front; extract_text_from_pdf returns an error string
    # on failure, so surface that instead of prompting Gemini with it.
    pdf_text = extract_text_from_pdf(pdf_file)
    if pdf_text.startswith("Error reading PDF:"):
        return f"⚠️ {pdf_text}", []
    state.full_text = pdf_text
    state.persona = persona_style

    model = genai.GenerativeModel('gemini-2.0-flash')

    prompts = {
        "Serious Academic": "Two professors discussing the paper. Tone: Intellectual, precise.",
        "Gossip Columnist": "Two drama-loving gossip columnists reading this paper like it's a scandal. Tone: Shocked, slang-heavy.",
        "Explain Like I'm 5": "A gentle teacher and a curious student. Tone: Simple analogies, enthusiastic."
    }

    # Fall back to the academic persona for unknown styles; without a
    # default, .get() would inject the literal string "None" into the prompt.
    persona_prompt = prompts.get(persona_style, prompts["Serious Academic"])

    system_prompt = f"""
    {persona_prompt}

    Based on the following text from a research paper:
    "{pdf_text[:4000]}..."

    Generate a short 4-turn dialogue script (2 turns each) summarizing the key point.
    RETURN RAW JSON ONLY. No markdown formatting. Format:
    [
        {{"speaker": "Host A", "text": "..."}},
        {{"speaker": "Host B", "text": "..."}}
    ]
    """

    try:
        response = model.generate_content(system_prompt)
        # Gemini often wraps JSON in ```json fences despite instructions.
        clean_json = response.text.replace("```json", "").replace("```", "").strip()
        script_data = json.loads(clean_json)

        state.script = script_data
        state.current_index = 0
        return "✅ Script Generated! Click 'Play' to start.", script_data
    except Exception as e:
        return f"Error: {str(e)}", []
90
 
91
def play_next_chunk():
    """Synthesize and return audio for the next line of the script.

    Returns:
        (audio_path, caption) — audio_path is the saved mp3 for the current
        line, or None when the podcast is finished or an error occurred.
    """
    if state.current_index >= len(state.script):
        return None, "🎉 Podcast Ended."

    line = state.script[state.current_index]

    # Guard malformed entries (Gemini does not always obey the JSON schema);
    # previously line["speaker"] was read outside the try block, so a bad
    # entry raised an uncaught KeyError/TypeError into the Gradio callback.
    if not isinstance(line, dict) or "speaker" not in line or "text" not in line:
        state.current_index += 1  # skip the bad line so playback can continue
        return None, "⚠️ Skipped a malformed script line."

    # Voice selection: Host A keeps the default voice, Host B's voice
    # depends on the active persona.
    voice_id = "nPczCjz82tPNOwVbpGE2"  # Default Male
    if state.persona == "Gossip Columnist" and line["speaker"] == "Host B":
        voice_id = "21m00Tcm4TlvDq8ikWAM"  # Female
    elif line["speaker"] == "Host B":
        voice_id = "EXAVITQu4vr4xnSDxMaL"  # Female Generic

    try:
        audio_stream = client.generate(
            text=line["text"],
            voice=voice_id,
            model="eleven_monolingual_v1"
        )

        save_path = f"temp_{state.current_index}.mp3"
        with open(save_path, "wb") as f:
            for chunk in audio_stream:
                f.write(chunk)

        state.current_index += 1
        return save_path, f"🎙️ {line['speaker']}: {line['text']}"
    except Exception as e:
        return None, f"Audio Error: {str(e)}"
 
 
 
 
 
 
 
120
 
121
def interrupt_and_ask(user_question):
    """Answer a listener question in-persona, then steer back to the paper.

    Args:
        user_question: Free-text question typed into the UI.

    Returns:
        (audio_path, caption) — the synthesized answer mp3 and its
        transcript line, or (None, error_message) on failure.
    """
    if not state.full_text:
        return None, "Upload a PDF first."

    model = genai.GenerativeModel('gemini-2.0-flash')

    prompt = f"""
    You are a podcast host ({state.persona}).
    Context: {state.full_text[:1000]}
    User Question: "{user_question}"

    1. Answer the question briefly.
    2. Say "Anyway, back to the paper..."
    """

    # Mirror play_next_chunk's error handling: an API failure should surface
    # as a status message, not crash the Gradio callback (previously there
    # was no try/except here at all).
    try:
        response = model.generate_content(prompt)
        answer = response.text

        audio_stream = client.generate(
            text=answer,
            voice="nPczCjz82tPNOwVbpGE2",
            model="eleven_monolingual_v1"
        )

        save_path = "interrupt.mp3"
        with open(save_path, "wb") as f:
            for chunk in audio_stream:
                f.write(chunk)

        return save_path, f"💡 Host: {answer}"
    except Exception as e:
        return None, f"Error: {str(e)}"
151
 
152
# 5. Build Interface
# 'theme=gr.themes.Soft()' requires gradio>=4.0.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎧 PodQuery: The Interactive Paper")

    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(label="Upload PDF")
            # Offer every persona that generate_script's prompt table
            # supports (the dropdown previously omitted "Explain Like I'm 5").
            style = gr.Dropdown(
                ["Serious Academic", "Gossip Columnist", "Explain Like I'm 5"],
                value="Serious Academic",
                label="Persona"
            )
            btn_gen = gr.Button("Generate Script", variant="primary")
            status = gr.Textbox(label="Status")

        with gr.Column():
            player = gr.Audio(autoplay=True, label="Stream")
            transcript = gr.Markdown()
            btn_play = gr.Button("▶️ Play Next Line")

    gr.Markdown("### Interrupt")
    q_input = gr.Textbox(label="Question")
    btn_ask = gr.Button("✋ Interrupt")

    # Wire UI events to the callbacks defined above.
    btn_gen.click(generate_script, [pdf_input, style], [status, transcript])
    btn_play.click(play_next_chunk, [], [player, transcript])
    btn_ask.click(interrupt_and_ask, [q_input], [player, transcript])

if __name__ == "__main__":
    # MCP Server Mode Enabled
    demo.launch(mcp_server=True)