Spaces:

mlokendra
/

pdf_to_poadcast

Sleeping

App Files Files Community

mlokendra committed on Jul 1, 2025

Commit

05ffbdf

verified ·

1 Parent(s): d4bdf39

update extract

Browse files

Files changed (1) hide show

app.py +28 -6

app.py CHANGED Viewed

@@ -68,6 +68,12 @@ def extract_sections_from_pdf(pdf_path):
         "third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
         "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
     }
     sections = {}
     matches = []
@@ -85,10 +91,20 @@ def extract_sections_from_pdf(pdf_path):
         # Keep up to 4 paragraphs (based on double newline)
         paragraphs = section_text.split("\n\n")
         limited_section_text = "\n\n".join(paragraphs[:4])
-        sections[name] = limited_section_text
-    return sections
 def summarize_section_by_heuristics(text, max_sentences=5):
     sentences = split_sentences(text)
     if len(sentences) <= max_sentences:
@@ -214,15 +230,21 @@ def process_pdf(pdf_file):
     with open(pdf_file.name, "rb") as infile, open(pdf_path, "wb") as outfile:
         outfile.write(infile.read())
-    sections = extract_sections_from_pdf(pdf_path)
     print("Original text extrated \n\n\n",sections)
     summarized_sections = {
         name: summarize_section_by_heuristics(content)
         for name, content in sections.items()
     }
-    print("Summrized text . \n\n\n",sections)
-    section_summary_pairs = list(summarized_sections.items())
     with concurrent.futures.ThreadPoolExecutor() as executor:
         results = executor.map(process_section, section_summary_pairs)

         "third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
         "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
     }
+    section_patterns = {
+        "Start of podcast with first section of paper as abstract": r"^abstract\b",
+        "second section continuing from abstract to overview and no required to start introductuion between host & guest directly continue in discussion": r"^introduction\b|^overview\b",
+        "third section continuing from overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"^method(?:ology)?\b|^proposed method\b|^approach\b|^model architecture\b|^experimental setup\b|^network design\b",
+        "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"^conclusion(?:s)?\b|^summary\b|^final thought(?:s)\b|^result(?:s)\b",
+    }
     sections = {}
     matches = []
         # Keep up to 4 paragraphs (based on double newline)
         paragraphs = section_text.split("\n\n")
         limited_section_text = "\n\n".join(paragraphs[:4])
+        sections[name] = extract_paragraphs(section_text, max_paragraphs=4)
+    return sections,section_patterns
def extract_paragraphs(text, max_paragraphs=4):
    """Return the leading paragraphs of ``text``, joined by blank lines.

    Paragraphs are delimited by a double newline when the text contains
    one. Otherwise every run of four consecutive lines is treated as one
    paragraph, so that text without blank-line separators is still
    truncated to a bounded size.

    Blank (empty or whitespace-only) paragraphs are discarded before the
    limit is applied, so leading or repeated blank lines do not use up the
    ``max_paragraphs`` quota.

    Args:
        text: The section text to truncate. ``None`` or an empty string
            yields an empty result.
        max_paragraphs: Maximum number of paragraphs to keep. Values less
            than or equal to zero yield an empty result.

    Returns:
        The kept paragraphs joined with ``"\\n\\n"``.
    """
    # Number of lines grouped into one pseudo-paragraph when the text has
    # no blank-line separators.
    lines_per_paragraph = 4

    if not text or max_paragraphs <= 0:
        return ""

    if "\n\n" in text:
        candidates = text.split("\n\n")
    else:
        lines = text.splitlines()
        candidates = [
            "\n".join(lines[start:start + lines_per_paragraph])
            for start in range(0, len(lines), lines_per_paragraph)
        ]

    # Drop blank chunks first; otherwise a leading "\n\n" or a run of three
    # or more newlines would count as a paragraph and crowd out real text.
    paragraphs = [chunk for chunk in candidates if chunk.strip()]
    return "\n\n".join(paragraphs[:max_paragraphs])
 def summarize_section_by_heuristics(text, max_sentences=5):
     sentences = split_sentences(text)
     if len(sentences) <= max_sentences:
     with open(pdf_file.name, "rb") as infile, open(pdf_path, "wb") as outfile:
         outfile.write(infile.read())
+    sections,section_patterns = extract_sections_from_pdf(pdf_path)
     print("Original text extrated \n\n\n",sections)
     summarized_sections = {
         name: summarize_section_by_heuristics(content)
         for name, content in sections.items()
     }
+    reordered_summarized_sections = {}
+    for key in section_patterns:
+        if key in summarized_sections: # Ensure the key exists in data_dict
+            reordered_summarized_sections[key] = summarized_sections[key]
+    print(reordered_summarized_sections)
+    print("Summrized text . \n\n\n",reordered_summarized_sections)
+    section_summary_pairs = list(reordered_summarized_sections.items())
     with concurrent.futures.ThreadPoolExecutor() as executor:
         results = executor.map(process_section, section_summary_pairs)