Spaces:
Sleeping
Sleeping
update prompt and summarization
Browse files
app.py
CHANGED
|
@@ -46,7 +46,8 @@ pitch_male_str = f"{pitch_female:+d}Hz"
|
|
| 46 |
KEY_TERMS = [
|
| 47 |
"model", "propose", "architecture", "performance", "accuracy", "experiment",
|
| 48 |
"framework", "design", "method", "network", "approach", "outperform",
|
| 49 |
-
"layer", "training", "results", "learning", "evaluate", "baseline"
|
|
|
|
| 50 |
]
|
| 51 |
|
| 52 |
def split_sentences(text):
|
|
@@ -59,8 +60,8 @@ def extract_sections_from_pdf(pdf_path):
|
|
| 59 |
|
| 60 |
section_patterns = {
|
| 61 |
"Start of podcast with first section of paper as abstract": r"\babstract\b",
|
| 62 |
-
"second section continuing from abstract to
|
| 63 |
-
"third section continuing from
|
| 64 |
"fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
|
| 65 |
}
|
| 66 |
|
|
@@ -90,14 +91,27 @@ def summarize_section_by_heuristics(text, max_sentences=5):
|
|
| 90 |
scored = []
|
| 91 |
for idx, sent in enumerate(sentences):
|
| 92 |
score = 0
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
| 94 |
score += sum(1 for word in words if word in KEY_TERMS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
if 10 < len(words) < 50:
|
| 96 |
score += 1
|
|
|
|
|
|
|
| 97 |
if idx in [0, 1]:
|
| 98 |
score += 1
|
|
|
|
| 99 |
scored.append((score, sent))
|
| 100 |
|
|
|
|
| 101 |
top_sentences = heapq.nlargest(max_sentences, scored)
|
| 102 |
top_sentences = [s for _, s in sorted(top_sentences, key=lambda x: sentences.index(x[1]))]
|
| 103 |
return " ".join(top_sentences)
|
|
|
|
| 46 |
KEY_TERMS = [
|
| 47 |
"model", "propose", "architecture", "performance", "accuracy", "experiment",
|
| 48 |
"framework", "design", "method", "network", "approach", "outperform",
|
| 49 |
+
"layer", "training", "results", "learning", "evaluate", "baseline",
|
| 50 |
+
"precision", "recall", "f1", "error", "metric", "loss", "time", "weight", "speed"
|
| 51 |
]
|
| 52 |
|
| 53 |
def split_sentences(text):
|
|
|
|
| 60 |
|
| 61 |
section_patterns = {
|
| 62 |
"Start of podcast with first section of paper as abstract": r"\babstract\b",
|
| 63 |
+
"second section continuing from abstract to Overview and no required to start introductuion between host & guest directly continue in discussion": r"\bintroduction\b",
|
| 64 |
+
"third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
|
| 65 |
"fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
|
| 66 |
}
|
| 67 |
|
|
|
|
| 91 |
scored = []
|
| 92 |
for idx, sent in enumerate(sentences):
|
| 93 |
score = 0
|
| 94 |
+
lower_sent = sent.lower()
|
| 95 |
+
words = lower_sent.split()
|
| 96 |
+
|
| 97 |
+
# Keyword match
|
| 98 |
score += sum(1 for word in words if word in KEY_TERMS)
|
| 99 |
+
|
| 100 |
+
# Give more weight to sentences with numbers (e.g. 85%, 0.97, etc.)
|
| 101 |
+
if re.search(r'\b\d+(\.\d+)?%?\b', sent): # captures decimals, integers, percentages
|
| 102 |
+
score += 2
|
| 103 |
+
|
| 104 |
+
# Medium-length sentences (11-49 words) tend to be information-dense
|
| 105 |
if 10 < len(words) < 50:
|
| 106 |
score += 1
|
| 107 |
+
|
| 108 |
+
# Sentence position (early sentences are usually summary-like)
|
| 109 |
if idx in [0, 1]:
|
| 110 |
score += 1
|
| 111 |
+
|
| 112 |
scored.append((score, sent))
|
| 113 |
|
| 114 |
+
# Pick top sentences, preserving original order
|
| 115 |
top_sentences = heapq.nlargest(max_sentences, scored)
|
| 116 |
top_sentences = [s for _, s in sorted(top_sentences, key=lambda x: sentences.index(x[1]))]
|
| 117 |
return " ".join(top_sentences)
|