mlokendra committed on
Commit
4cdc5b2
·
verified ·
1 Parent(s): f5f90bc

update prompt and summarization

Browse files
Files changed (1) hide show
  1. app.py +18 -4
app.py CHANGED
@@ -46,7 +46,8 @@ pitch_male_str = f"{pitch_female:+d}Hz"
46
  KEY_TERMS = [
47
  "model", "propose", "architecture", "performance", "accuracy", "experiment",
48
  "framework", "design", "method", "network", "approach", "outperform",
49
- "layer", "training", "results", "learning", "evaluate", "baseline"
 
50
  ]
51
 
52
  def split_sentences(text):
@@ -59,8 +60,8 @@ def extract_sections_from_pdf(pdf_path):
59
 
60
  section_patterns = {
61
  "Start of podcast with first section of paper as abstract": r"\babstract\b",
62
- "second section continuing from abstract to introduction and no required to start introductuion between host & guest directly continue in discussion": r"\bintroduction\b",
63
- "third section continuing from introduction to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
64
  "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
65
  }
66
 
@@ -90,14 +91,27 @@ def summarize_section_by_heuristics(text, max_sentences=5):
90
  scored = []
91
  for idx, sent in enumerate(sentences):
92
  score = 0
93
- words = sent.lower().split()
 
 
 
94
  score += sum(1 for word in words if word in KEY_TERMS)
 
 
 
 
 
 
95
  if 10 < len(words) < 50:
96
  score += 1
 
 
97
  if idx in [0, 1]:
98
  score += 1
 
99
  scored.append((score, sent))
100
 
 
101
  top_sentences = heapq.nlargest(max_sentences, scored)
102
  top_sentences = [s for _, s in sorted(top_sentences, key=lambda x: sentences.index(x[1]))]
103
  return " ".join(top_sentences)
 
46
  KEY_TERMS = [
47
  "model", "propose", "architecture", "performance", "accuracy", "experiment",
48
  "framework", "design", "method", "network", "approach", "outperform",
49
+ "layer", "training", "results", "learning", "evaluate", "baseline",
50
+ "precision", "recall", "f1", "error", "metric", "loss", "time", "weight", "speed"
51
  ]
52
 
53
  def split_sentences(text):
 
60
 
61
  section_patterns = {
62
  "Start of podcast with first section of paper as abstract": r"\babstract\b",
63
+ "second section continuing from abstract to Overview and no required to start introductuion between host & guest directly continue in discussion": r"\bintroduction\b",
64
+ "third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
65
  "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
66
  }
67
 
 
91
  scored = []
92
  for idx, sent in enumerate(sentences):
93
  score = 0
94
+ lower_sent = sent.lower()
95
+ words = lower_sent.split()
96
+
97
+ # Keyword match
98
  score += sum(1 for word in words if word in KEY_TERMS)
99
+
100
+ # Give more weight to sentences with numbers (e.g. 85%, 0.97, etc.)
101
+ if re.search(r'\b\d+(\.\d+)?%?\b', sent): # captures decimals, integers, percentages
102
+ score += 2
103
+
104
+ # Short, information-dense sentences
105
  if 10 < len(words) < 50:
106
  score += 1
107
+
108
+ # Sentence position (early sentences are usually summary-like)
109
  if idx in [0, 1]:
110
  score += 1
111
+
112
  scored.append((score, sent))
113
 
114
+ # Pick top sentences, preserving original order
115
  top_sentences = heapq.nlargest(max_sentences, scored)
116
  top_sentences = [s for _, s in sorted(top_sentences, key=lambda x: sentences.index(x[1]))]
117
  return " ".join(top_sentences)