Spaces:

mlokendra
/

pdf_to_poadcast

Sleeping

App Files Files Community

mlokendra committed on Jun 30, 2025

Commit

4cdc5b2

verified ·

1 Parent(s): f5f90bc

update prompt and summarization

Browse files

Files changed (1) hide show

app.py +18 -4

app.py CHANGED Viewed

@@ -46,7 +46,8 @@ pitch_male_str = f"{pitch_female:+d}Hz"
 KEY_TERMS = [
     "model", "propose", "architecture", "performance", "accuracy", "experiment",
     "framework", "design", "method", "network", "approach", "outperform",
-    "layer", "training", "results", "learning", "evaluate", "baseline"
 ]
 def split_sentences(text):
@@ -59,8 +60,8 @@ def extract_sections_from_pdf(pdf_path):
     section_patterns = {
         "Start of podcast with first section of paper as abstract": r"\babstract\b",
-        "second section continuing from abstract to introduction and no required to start introductuion between host & guest directly continue in discussion": r"\bintroduction\b",
-        "third section continuing from introduction to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
         "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
     }
@@ -90,14 +91,27 @@ def summarize_section_by_heuristics(text, max_sentences=5):
     scored = []
     for idx, sent in enumerate(sentences):
         score = 0
-        words = sent.lower().split()
         score += sum(1 for word in words if word in KEY_TERMS)
         if 10 < len(words) < 50:
             score += 1
         if idx in [0, 1]:
             score += 1
         scored.append((score, sent))
     top_sentences = heapq.nlargest(max_sentences, scored)
     top_sentences = [s for _, s in sorted(top_sentences, key=lambda x: sentences.index(x[1]))]
     return " ".join(top_sentences)

 KEY_TERMS = [
     "model", "propose", "architecture", "performance", "accuracy", "experiment",
     "framework", "design", "method", "network", "approach", "outperform",
+    "layer", "training", "results", "learning", "evaluate", "baseline",
+    "precision", "recall", "f1", "error", "metric", "loss", "time", "weight", "speed"
 ]
 def split_sentences(text):
     section_patterns = {
         "Start of podcast with first section of paper as abstract": r"\babstract\b",
+        "second section continuing from abstract to Overview and no required to start introductuion between host & guest directly continue in discussion": r"\bintroduction\b",
+        "third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
         "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
     }
     scored = []
     for idx, sent in enumerate(sentences):
         score = 0
+        lower_sent = sent.lower()
+        words = lower_sent.split()
+        # Keyword match
         score += sum(1 for word in words if word in KEY_TERMS)
+        # Give more weight to sentences with numbers (e.g. 85%, 0.97, etc.)
+        if re.search(r'\b\d+(\.\d+)?%?\b', sent):  # captures decimals, integers, percentages
+            score += 2
+        # Short, information-dense sentences
         if 10 < len(words) < 50:
             score += 1
+        # Sentence position (early sentences are usually summary-like)
         if idx in [0, 1]:
             score += 1
         scored.append((score, sent))
+    # Pick top sentences, preserving original order
     top_sentences = heapq.nlargest(max_sentences, scored)
     top_sentences = [s for _, s in sorted(top_sentences, key=lambda x: sentences.index(x[1]))]
     return " ".join(top_sentences)