Spaces:
Sleeping
Sleeping
update extract
Browse files
app.py
CHANGED
|
@@ -68,6 +68,12 @@ def extract_sections_from_pdf(pdf_path):
|
|
| 68 |
"third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
|
| 69 |
"fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
|
| 70 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
sections = {}
|
| 73 |
matches = []
|
|
@@ -85,10 +91,20 @@ def extract_sections_from_pdf(pdf_path):
|
|
| 85 |
# Keep up to 4 paragraphs (based on double newline)
|
| 86 |
paragraphs = section_text.split("\n\n")
|
| 87 |
limited_section_text = "\n\n".join(paragraphs[:4])
|
| 88 |
-
sections[name] =
|
| 89 |
-
|
| 90 |
-
return sections
|
| 91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
def summarize_section_by_heuristics(text, max_sentences=5):
|
| 93 |
sentences = split_sentences(text)
|
| 94 |
if len(sentences) <= max_sentences:
|
|
@@ -214,15 +230,21 @@ def process_pdf(pdf_file):
|
|
| 214 |
with open(pdf_file.name, "rb") as infile, open(pdf_path, "wb") as outfile:
|
| 215 |
outfile.write(infile.read())
|
| 216 |
|
| 217 |
-
sections = extract_sections_from_pdf(pdf_path)
|
| 218 |
|
| 219 |
print("Original text extrated \n\n\n",sections)
|
| 220 |
summarized_sections = {
|
| 221 |
name: summarize_section_by_heuristics(content)
|
| 222 |
for name, content in sections.items()
|
| 223 |
}
|
| 224 |
-
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 227 |
results = executor.map(process_section, section_summary_pairs)
|
| 228 |
|
|
|
|
| 68 |
"third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
|
| 69 |
"fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
|
| 70 |
}
|
| 71 |
+
section_patterns = {
|
| 72 |
+
"Start of podcast with first section of paper as abstract": r"^abstract\b",
|
| 73 |
+
"second section continuing from abstract to overview and no required to start introductuion between host & guest directly continue in discussion": r"^introduction\b|^overview\b",
|
| 74 |
+
"third section continuing from overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"^method(?:ology)?\b|^proposed method\b|^approach\b|^model architecture\b|^experimental setup\b|^network design\b",
|
| 75 |
+
"fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"^conclusion(?:s)?\b|^summary\b|^final thought(?:s)\b|^result(?:s)\b",
|
| 76 |
+
}
|
| 77 |
|
| 78 |
sections = {}
|
| 79 |
matches = []
|
|
|
|
| 91 |
# Keep up to 4 paragraphs (based on double newline)
|
| 92 |
paragraphs = section_text.split("\n\n")
|
| 93 |
limited_section_text = "\n\n".join(paragraphs[:4])
|
| 94 |
+
sections[name] = extract_paragraphs(section_text, max_paragraphs=4)
|
|
|
|
|
|
|
| 95 |
|
| 96 |
+
return sections,section_patterns
|
| 97 |
+
|
| 98 |
+
def extract_paragraphs(text, max_paragraphs=4, *, lines_per_paragraph=4):
    """Return the first ``max_paragraphs`` non-blank paragraphs of ``text``.

    Paragraphs are delimited by a double newline when the text contains
    one. Otherwise every ``lines_per_paragraph`` consecutive lines are
    grouped into a synthetic paragraph.

    Args:
        text: Raw section text. Falsy input (``""`` or ``None``) yields ``""``.
        max_paragraphs: Maximum number of paragraphs to keep. Values below
            zero are treated as zero.
        lines_per_paragraph: Number of lines grouped into one paragraph
            when the text has no double newline. Must be at least 1.

    Returns:
        The kept paragraphs joined with a double newline.

    Raises:
        ValueError: If ``lines_per_paragraph`` is smaller than 1.
    """
    if lines_per_paragraph < 1:
        raise ValueError("lines_per_paragraph must be at least 1")
    if not text:
        return ""

    if "\n\n" in text:
        # Use double newlines if present.
        paras = text.split("\n\n")
    else:
        # No clear paragraphs: group a fixed number of lines per paragraph.
        lines = text.splitlines()
        paras = [
            "\n".join(lines[start:start + lines_per_paragraph])
            for start in range(0, len(lines), lines_per_paragraph)
        ]

    # Leading or consecutive blank lines produce empty chunks; drop them so
    # they do not consume the paragraph quota.
    paras = [para for para in paras if para.strip()]

    # Clamp the limit: a negative slice bound would silently drop paragraphs
    # from the end instead of limiting from the start.
    return "\n\n".join(paras[:max(max_paragraphs, 0)])
|
| 108 |
def summarize_section_by_heuristics(text, max_sentences=5):
|
| 109 |
sentences = split_sentences(text)
|
| 110 |
if len(sentences) <= max_sentences:
|
|
|
|
| 230 |
with open(pdf_file.name, "rb") as infile, open(pdf_path, "wb") as outfile:
|
| 231 |
outfile.write(infile.read())
|
| 232 |
|
| 233 |
+
sections,section_patterns = extract_sections_from_pdf(pdf_path)
|
| 234 |
|
| 235 |
print("Original text extrated \n\n\n",sections)
|
| 236 |
summarized_sections = {
|
| 237 |
name: summarize_section_by_heuristics(content)
|
| 238 |
for name, content in sections.items()
|
| 239 |
}
|
| 240 |
+
reordered_summarized_sections = {}
|
| 241 |
+
for key in section_patterns:
|
| 242 |
+
if key in summarized_sections: # Ensure the key exists in data_dict
|
| 243 |
+
reordered_summarized_sections[key] = summarized_sections[key]
|
| 244 |
+
|
| 245 |
+
print(reordered_summarized_sections)
|
| 246 |
+
print("Summrized text . \n\n\n",reordered_summarized_sections)
|
| 247 |
+
section_summary_pairs = list(reordered_summarized_sections.items())
|
| 248 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 249 |
results = executor.map(process_section, section_summary_pairs)
|
| 250 |
|