mlokendra committed on
Commit
b5e8c18
·
verified ·
1 Parent(s): 2436d0b
Files changed (1) hide show
  1. app.py +66 -80
app.py CHANGED
@@ -1,25 +1,49 @@
1
-
2
-
3
- #!pip install PyPDF2 gtts nltk gradio pydub
4
- from gtts import gTTS
5
- from pydub import AudioSegment
6
- from PyPDF2 import PdfReader
7
- import gradio as gr
8
  import os
9
  import re
10
  import heapq
11
- #from nltk.tokenize import sent_tokenize
 
 
 
 
 
 
12
  from transformers import pipeline
13
  import concurrent.futures
14
 
15
- # Load a dialogue-friendly LLM (you can cache it offline too)
 
 
 
16
  generator = pipeline("text-generation",
17
  model="unsloth/gemma-3-1b-it",
18
  device_map='cpu',
19
  max_new_tokens=300)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def split_sentences(text):
21
- # Naive sentence splitter — works well for most academic text
22
  return re.split(r'(?<=[.!?])\s+', text.strip())
 
23
  def extract_sections_from_pdf(pdf_path):
24
  reader = PdfReader(pdf_path)
25
  full_text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
@@ -47,15 +71,8 @@ def extract_sections_from_pdf(pdf_path):
47
  lines = section_text.splitlines()
48
  limited_section_text = "\n".join(lines[:20])
49
  sections[name] = limited_section_text
50
- print(sections)
51
- return sections
52
 
53
-
54
- KEY_TERMS = [
55
- "model", "propose", "architecture", "performance", "accuracy", "experiment",
56
- "framework", "design", "method", "network", "approach", "outperform",
57
- "layer", "training", "results", "learning", "evaluate", "baseline"
58
- ]
59
 
60
  def summarize_section_by_heuristics(text, max_sentences=5):
61
  sentences = split_sentences(text)
@@ -76,52 +93,45 @@ def summarize_section_by_heuristics(text, max_sentences=5):
76
  top_sentences = heapq.nlargest(max_sentences, scored)
77
  top_sentences = [s for _, s in sorted(top_sentences, key=lambda x: sentences.index(x[1]))]
78
  return " ".join(top_sentences)
79
- host_lang = 'en' # American English
80
- guest_lang = 'en' # British English
81
- HOST_PITCH_SHIFT=-2.5
82
- HOST_SPEED_FACTOR=0.95
83
- def tts_line_by_line(script):
 
 
 
 
 
 
 
 
84
  lines = script.split('\n')
85
  segments = []
 
 
 
86
  for i, line in enumerate(lines):
87
  if 'Host:' in line or 'Guest:' in line:
88
- print("print line ",i,line)
89
  speaker, content = line.split(':', 1)
90
- if speaker.lower() == "host":
91
- print( line.split(':', 1))
92
- tts = gTTS(content.strip(), lang=host_lang,tld='ie', slow=False)
93
- filename = f"segment_{i}.mp3"
94
- tts.save(filename)
95
- if speaker.lower() == "guest":
96
- print( line.split(':', 1))
97
- tts = gTTS(content.strip(), lang=guest_lang,tld='co.uk',slow=False)
98
- filename = f"segment_{i}.mp3"
99
- tts.save(filename)
100
-
101
- segment = AudioSegment.from_mp3(filename)
102
- segments.append(segment)
103
- return segments
104
 
105
- def generate_podcast_script(section_name, section_text):
106
- user_prompt = f"""You are hosting a podcast. Create a friendly, engaging conversation with maximum 10 sentence between Host and Guest discussing the {section_name} of a research paper paper:\n\nPaper Section:
107
- \"\"\"
108
- {section_text}
109
- \"\"\"
110
- \n\nFormat:\nHost: ...\nGuest: ..."""
111
 
112
- # Apply chat template formatting
113
- print(user_prompt[:200])
114
- messages =[{"role": "user", "content": user_prompt}]
115
 
116
- response = generator(messages, max_new_tokens=300, do_sample=True, temperature=0.7)
117
- print(response[0]["generated_text"])
118
- return response[0]["generated_text"]
119
 
 
120
 
121
  def merge_segments(segments, output="podcast_output.mp3"):
122
  podcast = AudioSegment.empty()
123
  for segment in segments:
124
- podcast += segment + AudioSegment.silent(duration=300) # add short pause
125
  podcast.export(output, format="mp3")
126
  print(f"Podcast saved as {output}")
127
 
@@ -130,62 +140,38 @@ def process_section(section_summary_pair):
130
  dialogue = generate_podcast_script(section, summary)
131
  dialogue_content = dialogue[1]["content"]
132
  lines = dialogue_content.split('\n')
133
- dialogue_fine = "\n".join([line for line in lines]).replace("**", "")
134
- result = f"\n\n=== {section.upper()} ===\n{dialogue_fine}\n"
135
- return result
136
 
137
  def process_pdf(pdf_file):
138
- # Save the uploaded file to a temporary location
139
  pdf_path = "uploaded_pdf.pdf"
140
  with open(pdf_file.name, "rb") as infile, open(pdf_path, "wb") as outfile:
141
  outfile.write(infile.read())
142
 
143
- # Extract text, generate conversation, and create audio
144
-
145
- # Step 1: Extract & summarize sections
146
  sections = extract_sections_from_pdf(pdf_path)
147
  summarized_sections = {
148
  name: summarize_section_by_heuristics(content)
149
  for name, content in sections.items()
150
  }
151
 
152
- # Step 2: Generate podcast script
153
- final_script = ""
154
- # Prepare data
155
  section_summary_pairs = list(summarized_sections.items())
156
-
157
- # Run in parallel using threads (good for API calls)
158
- final_script = ""
159
  with concurrent.futures.ThreadPoolExecutor() as executor:
160
  results = executor.map(process_section, section_summary_pairs)
161
 
162
- # Combine results
163
  final_script = "".join(results)
164
-
165
-
166
-
167
- print(final_script)
168
- segments = tts_line_by_line(final_script)
169
  output_audio_path = "podcast_output.mp3"
170
  merge_segments(segments, output=output_audio_path)
171
 
172
- # Clean up the temporary PDF file
173
  os.remove(pdf_path)
174
- # Clean up individual segment files
175
- #for i in range(1,len(segments)):
176
- # os.remove(f"segment_{i}.mp3")
177
-
178
-
179
  return output_audio_path
180
 
181
- # Create the Gradio interface
182
  iface = gr.Interface(
183
  fn=process_pdf,
184
  inputs=gr.File(label="Upload a PDF file"),
185
  outputs=gr.Audio(label="Generated Podcast Audio"),
186
  title="PDF to Podcast",
187
- description="Upload a PDF and get a podcast-style audio summary."
188
  )
189
 
190
- # Launch the interface
191
- iface.launch(debug=True,share=True)
 
 
 
 
 
 
 
 
1
  import os
2
  import re
3
  import heapq
4
+ import uuid
5
+ import asyncio
6
+ import edge_tts
7
+ import gradio as gr
8
+ import nest_asyncio
9
+ from PyPDF2 import PdfReader
10
+ from pydub import AudioSegment
11
  from transformers import pipeline
12
  import concurrent.futures
13
 
14
# Patch asyncio so nested event loops are allowed (needed in Jupyter/Colab).
nest_asyncio.apply()

# Text-generation pipeline that writes the Host/Guest dialogue; it is built
# once at import time and shared by every request.
generator = pipeline(
    "text-generation",
    model="unsloth/gemma-3-1b-it",
    device_map='cpu',
    max_new_tokens=300,
)
22
+
23
def _pick_voice_names(language="es"):
    """Return a random ``(male_voice_name, female_voice_name)`` pair.

    The edge-tts voice catalogue is fetched once and filtered by gender and
    *language*; one voice name of each gender is then picked at random.

    Raises:
        IndexError: if the catalogue has no voice for a gender/language pair.
    """
    # Imported here so this block is self-contained: the file-level imports
    # provide neither ``random`` nor ``VoicesManager``.
    import random
    from edge_tts import VoicesManager

    # ``VoicesManager.create`` is a coroutine, and ``await`` is not allowed at
    # module top level, so drive it to completion with ``asyncio.run``.
    manager = asyncio.run(VoicesManager.create())
    male_candidates = manager.find(Gender="Male", Language=language)
    female_candidates = manager.find(Gender="Female", Language=language)
    return (
        random.choice(male_candidates)["Name"],
        random.choice(female_candidates)["Name"],
    )


# NOTE(review): voices are filtered with Language="es" although the prompt
# sent to the LLM is written in English - confirm this is intended.
MALE_VOICE, FEMALE_VOICE = _pick_voice_names()

# Prosody settings. Nothing visible in this file passes them to edge-tts yet.
rate_male = 40
pitch_male = 40
pitch_female = 40
rate_female = 40
rate_female_str = f"{rate_female:+d}%"
# Built from the pitch value (previously formatted from ``rate_female``).
pitch_female_str = f"{pitch_female:+d}Hz"

# Domain keywords for research papers.
# NOTE(review): presumably consumed by the heuristic summarizer to score
# sentences; the usage is not visible in this view - confirm.
KEY_TERMS = [
    "model", "propose", "architecture", "performance", "accuracy", "experiment",
    "framework", "design", "method", "network", "approach", "outperform",
    "layer", "training", "results", "learning", "evaluate", "baseline"
]
43
+
44
def split_sentences(text):
    """Split *text* into sentences with a naive punctuation rule.

    A sentence boundary is any run of whitespace that directly follows
    ``.``, ``!`` or ``?``. Leading and trailing whitespace is removed first.

    Args:
        text: The text to split.

    Returns:
        A list of sentence strings. Empty or whitespace-only input yields an
        empty list rather than a list holding one empty string.
    """
    stripped = text.strip()
    if not stripped:
        # re.split would return [''] here, i.e. one bogus empty sentence.
        return []
    return re.split(r'(?<=[.!?])\s+', stripped)
46
+
47
  def extract_sections_from_pdf(pdf_path):
48
  reader = PdfReader(pdf_path)
49
  full_text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
 
71
  lines = section_text.splitlines()
72
  limited_section_text = "\n".join(lines[:20])
73
  sections[name] = limited_section_text
 
 
74
 
75
+ return sections
 
 
 
 
 
76
 
77
  def summarize_section_by_heuristics(text, max_sentences=5):
78
  sentences = split_sentences(text)
 
93
  top_sentences = heapq.nlargest(max_sentences, scored)
94
  top_sentences = [s for _, s in sorted(top_sentences, key=lambda x: sentences.index(x[1]))]
95
  return " ".join(top_sentences)
96
+
97
def generate_podcast_script(section_name, section_text):
    """Ask the LLM for a short Host/Guest dialogue about one paper section.

    Args:
        section_name: Name of the section, interpolated into the prompt.
        section_text: Text of the section, quoted inside the prompt.

    Returns:
        The ``"generated_text"`` entry of the first pipeline result.
    """
    user_prompt = f"""You are hosting a podcast. Create a friendly, engaging conversation with maximum 10 sentence between Host and Guest discussing the {section_name} of a research paper:
\nPaper Section:\n\"\"\"\n{section_text}\n\"\"\"\n\nFormat:\nHost: ...\nGuest: ..."""
    # The prompt is sent as a single user turn in chat-message form.
    chat = [{"role": "user", "content": user_prompt}]
    outputs = generator(
        chat,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.7,
    )
    first_result = outputs[0]
    return first_result["generated_text"]
103
+
104
async def generate_voice_line(text, voice, filename):
    """Synthesize *text* with the edge-tts *voice* and save it to *filename*."""
    await edge_tts.Communicate(text, voice).save(filename)
107
+
108
async def tts_edge_line_by_line(script):
    """Turn every ``Host:`` / ``Guest:`` line of *script* into an audio segment.

    Each matching line is split at its first colon into a speaker and the
    spoken content. Lines whose speaker is ``host`` (case-insensitive, after
    stripping) use ``MALE_VOICE``; all other matching lines use
    ``FEMALE_VOICE``. All lines are synthesized concurrently into uniquely
    named temporary MP3 files, which are loaded in script order and deleted.

    Args:
        script: The dialogue text, one utterance per line.

    Returns:
        A list of ``AudioSegment`` objects, in the order the lines appear.
    """
    filenames = []
    tasks = []

    for line in script.split('\n'):
        if 'Host:' not in line and 'Guest:' not in line:
            continue

        speaker, content = line.split(':', 1)
        content = content.strip()
        if not content:
            # A speaker tag with no words has nothing to synthesize.
            continue

        voice = MALE_VOICE if speaker.strip().lower() == 'host' else FEMALE_VOICE

        # Random names keep concurrent runs from overwriting each other's files.
        filename = f"segment_{uuid.uuid4().hex}.mp3"
        filenames.append(filename)
        tasks.append(generate_voice_line(content, voice, filename))

    segments = []
    try:
        await asyncio.gather(*tasks)
        for filename in filenames:
            segments.append(AudioSegment.from_mp3(filename))
    finally:
        # The segment files are only intermediates; remove them even when
        # synthesis or decoding fails so they do not pile up on disk.
        for filename in filenames:
            if os.path.exists(filename):
                os.remove(filename)

    return segments
130
 
131
def merge_segments(segments, output="podcast_output.mp3"):
    """Join *segments* into one MP3 file written to *output*.

    Every segment is followed by 300 ms of silence. A confirmation message
    is printed once the file has been exported.
    """
    pause = AudioSegment.silent(duration=300)
    podcast = AudioSegment.empty()
    for clip in segments:
        podcast = podcast + (clip + pause)
    podcast.export(output, format="mp3")
    print(f"Podcast saved as {output}")
137
 
 
140
  dialogue = generate_podcast_script(section, summary)
141
  dialogue_content = dialogue[1]["content"]
142
  lines = dialogue_content.split('\n')
143
+ dialogue_fine = "\n".join([line for line in lines if 'Host:' in line or 'Guest:' in line]).replace("**", "")
144
+ return f"\n\n=== {section.upper()} ===\n{dialogue_fine}\n"
 
145
 
146
def process_pdf(pdf_file):
    """Convert an uploaded PDF into a podcast-style MP3 and return its path.

    Steps: copy the upload to a working file, extract and summarize its
    sections, generate a dialogue per section in a thread pool, synthesize
    the combined script to speech, and merge the audio into one file.

    Args:
        pdf_file: The upload handed over by the Gradio ``File`` input; either
            an object exposing the path as ``.name`` or a plain path string.

    Returns:
        Path of the generated audio file (``"podcast_output.mp3"``).
    """
    # Accept both a file wrapper with ``.name`` and a bare path.
    source_path = getattr(pdf_file, "name", pdf_file)
    pdf_path = "uploaded_pdf.pdf"
    output_audio_path = "podcast_output.mp3"

    try:
        with open(source_path, "rb") as infile, open(pdf_path, "wb") as outfile:
            outfile.write(infile.read())

        sections = extract_sections_from_pdf(pdf_path)
        summarized_sections = {
            name: summarize_section_by_heuristics(content)
            for name, content in sections.items()
        }

        section_summary_pairs = list(summarized_sections.items())
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # map() yields results in input order, so the script keeps the
            # section order of the paper.
            results = executor.map(process_section, section_summary_pairs)

        final_script = "".join(results)
        segments = asyncio.run(tts_edge_line_by_line(final_script))
        merge_segments(segments, output=output_audio_path)
    finally:
        # Remove the working copy whether or not the pipeline succeeded.
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

    return output_audio_path
168
 
 
169
# Gradio UI: one PDF upload in, one audio player out.
pdf_input = gr.File(label="Upload a PDF file")
audio_output = gr.Audio(label="Generated Podcast Audio")

iface = gr.Interface(
    fn=process_pdf,
    inputs=pdf_input,
    outputs=audio_output,
    title="PDF to Podcast",
    description="Upload a Research Paper PDF and get a podcast-style audio summary.",
)

iface.launch(debug=True)