mlokendra committed on
Commit
2436d0b
·
verified ·
1 Parent(s): 7da5350

update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -12
app.py CHANGED
@@ -10,6 +10,7 @@ import re
10
  import heapq
11
  #from nltk.tokenize import sent_tokenize
12
  from transformers import pipeline
 
13
 
14
  # Load a dialogue-friendly LLM (you can cache it offline too)
15
  generator = pipeline("text-generation",
@@ -27,8 +28,8 @@ def extract_sections_from_pdf(pdf_path):
27
  section_patterns = {
28
  "Start of podcast with first section of paper as abstract": r"\babstract\b",
29
  "second section continuing from abstract to introduction": r"\bintroduction\b",
30
- "methodology": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
31
- "conclusion": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
32
  }
33
 
34
  sections = {}
@@ -124,6 +125,14 @@ def merge_segments(segments, output="podcast_output.mp3"):
124
  podcast.export(output, format="mp3")
125
  print(f"Podcast saved as {output}")
126
 
 
 
 
 
 
 
 
 
127
 
128
  def process_pdf(pdf_file):
129
  # Save the uploaded file to a temporary location
@@ -142,15 +151,16 @@ def process_pdf(pdf_file):
142
 
143
  # Step 2: Generate podcast script
144
  final_script = ""
145
- for section, summary in summarized_sections.items():
146
- dialogue = generate_podcast_script(section, summary)
147
- dialogue_content = dialogue[1]["content"]
148
- lines = dialogue_content.split('\n')
149
- print("lines" ,lines)
150
- # Filter lines that start with "Host:" or "Guest:" and join them
151
- dialogue_fine = "\n".join([line for line in lines]).replace("**", "")
152
- print("dialogue_fine",dialogue_fine)
153
- final_script += f"\n\n=== {section.upper()} ===\n{dialogue_fine}\n"
 
154
 
155
 
156
 
@@ -178,4 +188,4 @@ iface = gr.Interface(
178
  )
179
 
180
  # Launch the interface
181
- iface.launch(debug=True)
 
10
  import heapq
11
  #from nltk.tokenize import sent_tokenize
12
  from transformers import pipeline
13
+ import concurrent.futures
14
 
15
  # Load a dialogue-friendly LLM (you can cache it offline too)
16
  generator = pipeline("text-generation",
 
28
  section_patterns = {
29
  "Start of podcast with first section of paper as abstract": r"\babstract\b",
30
  "second section continuing from abstract to introduction": r"\bintroduction\b",
31
+ "third section continuing from introduction to methodology": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
32
+ "fourth and the last section continuing from methodology to conclusion": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
33
  }
34
 
35
  sections = {}
 
125
  podcast.export(output, format="mp3")
126
  print(f"Podcast saved as {output}")
127
 
128
def process_section(section_summary_pair):
    """Build one formatted podcast-script segment for a single paper section.

    Parameters
    ----------
    section_summary_pair : tuple
        A ``(section, summary)`` pair, where ``section`` is the section
        label (str) and ``summary`` is its summarized text (str).

    Returns
    -------
    str
        The segment text: a ``=== SECTION ===`` header followed by the
        generated dialogue with markdown bold markers (``**``) stripped.
    """
    section, summary = section_summary_pair
    dialogue = generate_podcast_script(section, summary)
    # presumably a chat-style message list; index 1 holds the assistant
    # reply — TODO confirm against generate_podcast_script
    dialogue_content = dialogue[1]["content"]
    # The original split('\n') / '\n'.join round-trip was an identity
    # transformation, so stripping the markers directly is equivalent.
    # NOTE(review): an earlier revision intended to keep only lines
    # starting with "Host:"/"Guest:" but never implemented the filter;
    # preserved as-is to avoid a behavior change — confirm intent.
    dialogue_fine = dialogue_content.replace("**", "")
    return f"\n\n=== {section.upper()} ===\n{dialogue_fine}\n"
136
 
137
  def process_pdf(pdf_file):
138
  # Save the uploaded file to a temporary location
 
151
 
152
  # Step 2: Generate podcast script
153
  final_script = ""
154
+ # Prepare data
155
+ section_summary_pairs = list(summarized_sections.items())
156
+
157
+ # Run in parallel using threads (good for API calls)
158
+ final_script = ""
159
+ with concurrent.futures.ThreadPoolExecutor() as executor:
160
+ results = executor.map(process_section, section_summary_pairs)
161
+
162
+ # Combine results
163
+ final_script = "".join(results)
164
 
165
 
166
 
 
188
  )
189
 
190
  # Launch the interface
191
+ iface.launch(debug=True,share=True)