mlokendra committed on
Commit
05ffbdf
·
verified ·
1 Parent(s): d4bdf39

update extract

Browse files
Files changed (1) hide show
  1. app.py +28 -6
app.py CHANGED
@@ -68,6 +68,12 @@ def extract_sections_from_pdf(pdf_path):
68
  "third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
69
  "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
70
  }
 
 
 
 
 
 
71
 
72
  sections = {}
73
  matches = []
@@ -85,10 +91,20 @@ def extract_sections_from_pdf(pdf_path):
85
  # Keep up to 4 paragraphs (based on double newline)
86
  paragraphs = section_text.split("\n\n")
87
  limited_section_text = "\n\n".join(paragraphs[:4])
88
- sections[name] = limited_section_text
89
-
90
- return sections
91
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  def summarize_section_by_heuristics(text, max_sentences=5):
93
  sentences = split_sentences(text)
94
  if len(sentences) <= max_sentences:
@@ -214,15 +230,21 @@ def process_pdf(pdf_file):
214
  with open(pdf_file.name, "rb") as infile, open(pdf_path, "wb") as outfile:
215
  outfile.write(infile.read())
216
 
217
- sections = extract_sections_from_pdf(pdf_path)
218
 
219
  print("Original text extrated \n\n\n",sections)
220
  summarized_sections = {
221
  name: summarize_section_by_heuristics(content)
222
  for name, content in sections.items()
223
  }
224
- print("Summrized text . \n\n\n",sections)
225
- section_summary_pairs = list(summarized_sections.items())
 
 
 
 
 
 
226
  with concurrent.futures.ThreadPoolExecutor() as executor:
227
  results = executor.map(process_section, section_summary_pairs)
228
 
 
68
  "third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
69
  "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
70
  }
71
+ section_patterns = {
72
+ "Start of podcast with first section of paper as abstract": r"^abstract\b",
73
+ "second section continuing from abstract to overview and no required to start introductuion between host & guest directly continue in discussion": r"^introduction\b|^overview\b",
74
+ "third section continuing from overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"^method(?:ology)?\b|^proposed method\b|^approach\b|^model architecture\b|^experimental setup\b|^network design\b",
75
+ "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"^conclusion(?:s)?\b|^summary\b|^final thought(?:s)\b|^result(?:s)\b",
76
+ }
77
 
78
  sections = {}
79
  matches = []
 
91
  # Keep up to 4 paragraphs (based on double newline)
92
  paragraphs = section_text.split("\n\n")
93
  limited_section_text = "\n\n".join(paragraphs[:4])
94
+ sections[name] = extract_paragraphs(section_text, max_paragraphs=4)
 
 
95
 
96
+ return sections,section_patterns
97
+
98
def extract_paragraphs(text, max_paragraphs=4):
    """Return the first ``max_paragraphs`` paragraphs of ``text``.

    Paragraphs are delimited by double newlines when the text contains any.
    Otherwise every 4 consecutive lines are grouped into one pseudo-paragraph.

    Args:
        text: Raw section text. ``None`` or an empty string yields ``""``.
        max_paragraphs: Maximum number of paragraphs to keep. Values of zero
            or below yield ``""``.

    Returns:
        The kept paragraphs joined with a double newline.
    """
    # A negative limit would otherwise slice from the end (paras[:-1]),
    # silently returning "all but the last" instead of nothing.
    if not text or max_paragraphs <= 0:
        return ""

    if "\n\n" in text:
        # Use double newlines if present
        paras = text.split("\n\n")
    else:
        # If no clear paragraphs, group every 4 lines as one paragraph
        lines = text.splitlines()
        paras = ["\n".join(lines[i:i + 4]) for i in range(0, len(lines), 4)]

    # Leading/trailing blank lines or runs of 3+ newlines produce empty
    # chunks; drop them so they do not consume the paragraph budget.
    paras = [para for para in paras if para.strip()]

    return "\n\n".join(paras[:max_paragraphs])
108
  def summarize_section_by_heuristics(text, max_sentences=5):
109
  sentences = split_sentences(text)
110
  if len(sentences) <= max_sentences:
 
230
  with open(pdf_file.name, "rb") as infile, open(pdf_path, "wb") as outfile:
231
  outfile.write(infile.read())
232
 
233
+ sections,section_patterns = extract_sections_from_pdf(pdf_path)
234
 
235
  print("Original text extrated \n\n\n",sections)
236
  summarized_sections = {
237
  name: summarize_section_by_heuristics(content)
238
  for name, content in sections.items()
239
  }
240
+ reordered_summarized_sections = {}
241
+ for key in section_patterns:
242
+ if key in summarized_sections: # Ensure the key exists in data_dict
243
+ reordered_summarized_sections[key] = summarized_sections[key]
244
+
245
+ print(reordered_summarized_sections)
246
+ print("Summrized text . \n\n\n",reordered_summarized_sections)
247
+ section_summary_pairs = list(reordered_summarized_sections.items())
248
  with concurrent.futures.ThreadPoolExecutor() as executor:
249
  results = executor.map(process_section, section_summary_pairs)
250