Guiyom committed on
Commit
45140d2
·
verified ·
1 Parent(s): 30d9711

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -25
app.py CHANGED
@@ -122,8 +122,7 @@ def fine_tune_report(adjustmentguidelines: str, openai_api_key: str, serpapi_api
122
 
123
  # Parse the original report HTML.
124
  soup = BeautifulSoup(report_html, "html.parser")
125
- # Create a working copy of the HTML as a string for exact replacement.
126
- updated_report_html = report_html
127
 
128
  # --- Specific adjustment extraction ---
129
  if adjustmentguidelines.strip():
@@ -259,7 +258,7 @@ Generate a JSON object with exactly two keys (no extra commentary):
259
 
260
  improvements_summary = []
261
 
262
- # --- Process each chunk with refined replacement logic ---
263
  for idx, (chunk_html, guideline, token_size) in enumerate(zip(all_chunks, all_guidelines, all_token_sizes), start=1):
264
  # Enhanced chunk prompt with explicit instructions:
265
  chunk_prompt = f"""Improve the following report chunk based on these guidelines:
@@ -290,14 +289,20 @@ Please output a JSON object with exactly two keys (no extra commentary):
290
  chunk_summary = chunk_json.get("summary")
291
  if improved_chunk and chunk_summary:
292
  improvements_summary.append(f"Chunk {idx}: {chunk_summary}")
293
- # Attempt to replace the old chunk with the improved chunk.
294
- # Use string 'strip()' to remove any surrounding whitespace.
295
- chunk_html_clean = chunk_html.strip()
296
- improved_chunk_clean = improved_chunk.strip()
297
- if chunk_html_clean in updated_report_html:
298
- updated_report_html = updated_report_html.replace(chunk_html_clean, improved_chunk_clean, 1)
299
- else:
300
- logging.warning(f"Chunk {idx}: Exact snippet not found for replacement. Replacement not applied.")
 
 
 
 
 
 
301
  else:
302
  logging.error(f"Chunk {idx}: Incomplete JSON result: {chunk_result}")
303
  except Exception as e:
@@ -316,23 +321,25 @@ Report HTML:
316
  updated_references = openai_call(prompt=references_prompt, model="o3-mini", max_tokens_param=1000, temperature=0.5)
317
  updated_references = updated_references.strip().strip("```")
318
 
319
- # Instead of appending, look for a references section and replace its content.
320
- soup_updated = BeautifulSoup(updated_report_html, "html.parser")
321
- ref_heading = soup_updated.find(lambda tag: tag.name == "h1" and "Reference Summary Table" in tag.get_text())
322
- if ref_heading:
323
- # Assume that the reference table is the next sibling.
324
- next_sibling = ref_heading.find_next_sibling()
325
- if next_sibling:
326
- new_ref_html = BeautifulSoup(updated_references, "html.parser")
327
- next_sibling.replace_with(new_ref_html)
328
- updated_report_html = str(soup_updated)
 
 
 
329
  else:
330
- # No reference section found; do nothing.
331
- logging.info("No existing reference table found; skipping reference replacement.")
332
 
333
- # Do not append anything after the references.
334
- summary_text = "Summary of Fine-Tuning Improvements:\n" + "\n".join(improvements_summary)
335
  global_summary = "Combined Chunk Improvement Guidelines:\n" + "\n".join(all_guidelines)
 
336
  updated_qa = qa.strip() + "\n----------\n" + global_summary + "\n" + summary_text
337
 
338
  return updated_report_html, updated_qa
@@ -1761,6 +1768,7 @@ Important:
1761
  - Do not add a title for the Focus placeholder just before the [[...]], the content that will replace the focus placeholder - generated later on - will already include a title
1762
  - For the Table of contents: do not mention the pages, but put each item on a separate line
1763
  - The reference table at the end containing the citation details should have 4 columns: the ref number, the title of the document, the author(s), the URL (with hyperlink)
 
1764
 
1765
  // Structure of the overall report:
1766
  - Abstract
 
122
 
123
  # Parse the original report HTML.
124
  soup = BeautifulSoup(report_html, "html.parser")
125
+ updated_report_html = report_html # working copy
 
126
 
127
  # --- Specific adjustment extraction ---
128
  if adjustmentguidelines.strip():
 
258
 
259
  improvements_summary = []
260
 
261
+ # --- Process each chunk with robust DOM-based replacement ---
262
  for idx, (chunk_html, guideline, token_size) in enumerate(zip(all_chunks, all_guidelines, all_token_sizes), start=1):
263
  # Enhanced chunk prompt with explicit instructions:
264
  chunk_prompt = f"""Improve the following report chunk based on these guidelines:
 
289
  chunk_summary = chunk_json.get("summary")
290
  if improved_chunk and chunk_summary:
291
  improvements_summary.append(f"Chunk {idx}: {chunk_summary}")
292
+ try:
293
+ # Convert both the original chunk and the improved content into BeautifulSoup objects.
294
+ orig_chunk_soup = BeautifulSoup(chunk_html, "html.parser")
295
+ improved_chunk_soup = BeautifulSoup(improved_chunk, "html.parser")
296
+ original_text = orig_chunk_soup.get_text().strip()
297
+ search_text = original_text[:50] # use first 50 characters as anchor
298
+ found_tag = soup.find(lambda tag: tag.get_text() and search_text in tag.get_text())
299
+ if found_tag:
300
+ found_tag.replace_with(improved_chunk_soup)
301
+ updated_report_html = str(soup)
302
+ else:
303
+ logging.warning(f"Chunk {idx}: Unable to locate tag matching '{search_text}'. Replacement not applied.")
304
+ except Exception as rep_e:
305
+ logging.error(f"Chunk {idx}: Error during DOM-based replacement: {rep_e}")
306
  else:
307
  logging.error(f"Chunk {idx}: Incomplete JSON result: {chunk_result}")
308
  except Exception as e:
 
321
  updated_references = openai_call(prompt=references_prompt, model="o3-mini", max_tokens_param=1000, temperature=0.5)
322
  updated_references = updated_references.strip().strip("```")
323
 
324
+ # Only replace the content of the reference table if a heading exists
325
+ if updated_references:
326
+ soup_updated = BeautifulSoup(updated_report_html, "html.parser")
327
+ ref_heading = soup_updated.find(lambda tag: tag.name == "h1" and "Reference Summary Table" in tag.get_text())
328
+ if ref_heading:
329
+ next_sibling = ref_heading.find_next_sibling()
330
+ if next_sibling:
331
+ new_ref_html = BeautifulSoup(updated_references, "html.parser")
332
+ next_sibling.replace_with(new_ref_html)
333
+ # Ensure nothing is appended after the references section
334
+ updated_report_html = str(soup_updated)
335
+ else:
336
+ logging.info("No existing reference table found; reference update skipped.")
337
  else:
338
+ logging.info("Generated updated references empty; leaving original references unchanged.")
 
339
 
340
+ # Do not append any extra summary information after the references.
 
341
  global_summary = "Combined Chunk Improvement Guidelines:\n" + "\n".join(all_guidelines)
342
+ summary_text = "Summary of Fine-Tuning Improvements:\n" + "\n".join(improvements_summary)
343
  updated_qa = qa.strip() + "\n----------\n" + global_summary + "\n" + summary_text
344
 
345
  return updated_report_html, updated_qa
 
1768
  - Do not add a title for the Focus placeholder just before the [[...]], the content that will replace the focus placeholder - generated later on - will already include a title
1769
  - For the Table of contents: do not mention the pages, but put each item on a separate line
1770
  - The reference table at the end containing the citation details should have 4 columns: the ref number, the title of the document, the author(s), the URL (with hyperlink)
1771
+ - The name of the reference table should be: "Reference Summary Table"
1772
 
1773
  // Structure of the overall report:
1774
  - Abstract