Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -122,8 +122,7 @@ def fine_tune_report(adjustmentguidelines: str, openai_api_key: str, serpapi_api
|
|
| 122 |
|
| 123 |
# Parse the original report HTML.
|
| 124 |
soup = BeautifulSoup(report_html, "html.parser")
|
| 125 |
-
|
| 126 |
-
updated_report_html = report_html
|
| 127 |
|
| 128 |
# --- Specific adjustment extraction ---
|
| 129 |
if adjustmentguidelines.strip():
|
|
@@ -259,7 +258,7 @@ Generate a JSON object with exactly two keys (no extra commentary):
|
|
| 259 |
|
| 260 |
improvements_summary = []
|
| 261 |
|
| 262 |
-
# --- Process each chunk with
|
| 263 |
for idx, (chunk_html, guideline, token_size) in enumerate(zip(all_chunks, all_guidelines, all_token_sizes), start=1):
|
| 264 |
# Enhanced chunk prompt with explicit instructions:
|
| 265 |
chunk_prompt = f"""Improve the following report chunk based on these guidelines:
|
|
@@ -290,14 +289,20 @@ Please output a JSON object with exactly two keys (no extra commentary):
|
|
| 290 |
chunk_summary = chunk_json.get("summary")
|
| 291 |
if improved_chunk and chunk_summary:
|
| 292 |
improvements_summary.append(f"Chunk {idx}: {chunk_summary}")
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
else:
|
| 302 |
logging.error(f"Chunk {idx}: Incomplete JSON result: {chunk_result}")
|
| 303 |
except Exception as e:
|
|
@@ -316,23 +321,25 @@ Report HTML:
|
|
| 316 |
updated_references = openai_call(prompt=references_prompt, model="o3-mini", max_tokens_param=1000, temperature=0.5)
|
| 317 |
updated_references = updated_references.strip().strip("```")
|
| 318 |
|
| 319 |
-
#
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
|
|
|
|
|
|
|
|
|
| 329 |
else:
|
| 330 |
-
|
| 331 |
-
logging.info("No existing reference table found; skipping reference replacement.")
|
| 332 |
|
| 333 |
-
# Do not append
|
| 334 |
-
summary_text = "Summary of Fine-Tuning Improvements:\n" + "\n".join(improvements_summary)
|
| 335 |
global_summary = "Combined Chunk Improvement Guidelines:\n" + "\n".join(all_guidelines)
|
|
|
|
| 336 |
updated_qa = qa.strip() + "\n----------\n" + global_summary + "\n" + summary_text
|
| 337 |
|
| 338 |
return updated_report_html, updated_qa
|
|
@@ -1761,6 +1768,7 @@ Important:
|
|
| 1761 |
- Do not add a title for the Focus placeholder just before the [[...]], the content that will replace the focus placeholder - generated later on - will already include a title
|
| 1762 |
- For the Table of contents: do not mention the pages, but make each item on separate line
|
| 1763 |
- The reference table at the end containing the citations details should have 4 columns: the ref number, the title of the document, the author(s, the URL - with hyperlink)
|
|
|
|
| 1764 |
|
| 1765 |
// Structure of the overall report:
|
| 1766 |
- Abstract
|
|
|
|
| 122 |
|
| 123 |
# Parse the original report HTML.
|
| 124 |
soup = BeautifulSoup(report_html, "html.parser")
|
| 125 |
+
updated_report_html = report_html # working copy
|
|
|
|
| 126 |
|
| 127 |
# --- Specific adjustment extraction ---
|
| 128 |
if adjustmentguidelines.strip():
|
|
|
|
| 258 |
|
| 259 |
improvements_summary = []
|
| 260 |
|
| 261 |
+
# --- Process each chunk with robust DOM-based replacement ---
|
| 262 |
for idx, (chunk_html, guideline, token_size) in enumerate(zip(all_chunks, all_guidelines, all_token_sizes), start=1):
|
| 263 |
# Enhanced chunk prompt with explicit instructions:
|
| 264 |
chunk_prompt = f"""Improve the following report chunk based on these guidelines:
|
|
|
|
| 289 |
chunk_summary = chunk_json.get("summary")
|
| 290 |
if improved_chunk and chunk_summary:
|
| 291 |
improvements_summary.append(f"Chunk {idx}: {chunk_summary}")
|
| 292 |
+
try:
|
| 293 |
+
# Convert both the original chunk and the improved content into BeautifulSoup objects.
|
| 294 |
+
orig_chunk_soup = BeautifulSoup(chunk_html, "html.parser")
|
| 295 |
+
improved_chunk_soup = BeautifulSoup(improved_chunk, "html.parser")
|
| 296 |
+
original_text = orig_chunk_soup.get_text().strip()
|
| 297 |
+
search_text = original_text[:50] # use first 50 characters as anchor
|
| 298 |
+
found_tag = soup.find(lambda tag: tag.get_text() and search_text in tag.get_text())
|
| 299 |
+
if found_tag:
|
| 300 |
+
found_tag.replace_with(improved_chunk_soup)
|
| 301 |
+
updated_report_html = str(soup)
|
| 302 |
+
else:
|
| 303 |
+
logging.warning(f"Chunk {idx}: Unable to locate tag matching '{search_text}'. Replacement not applied.")
|
| 304 |
+
except Exception as rep_e:
|
| 305 |
+
logging.error(f"Chunk {idx}: Error during DOM-based replacement: {rep_e}")
|
| 306 |
else:
|
| 307 |
logging.error(f"Chunk {idx}: Incomplete JSON result: {chunk_result}")
|
| 308 |
except Exception as e:
|
|
|
|
| 321 |
updated_references = openai_call(prompt=references_prompt, model="o3-mini", max_tokens_param=1000, temperature=0.5)
|
| 322 |
updated_references = updated_references.strip().strip("```")
|
| 323 |
|
| 324 |
+
# Only replace the content of the reference table if a heading exists
|
| 325 |
+
if updated_references:
|
| 326 |
+
soup_updated = BeautifulSoup(updated_report_html, "html.parser")
|
| 327 |
+
ref_heading = soup_updated.find(lambda tag: tag.name == "h1" and "Reference Summary Table" in tag.get_text())
|
| 328 |
+
if ref_heading:
|
| 329 |
+
next_sibling = ref_heading.find_next_sibling()
|
| 330 |
+
if next_sibling:
|
| 331 |
+
new_ref_html = BeautifulSoup(updated_references, "html.parser")
|
| 332 |
+
next_sibling.replace_with(new_ref_html)
|
| 333 |
+
# Ensure nothing is appended after the references section
|
| 334 |
+
updated_report_html = str(soup_updated)
|
| 335 |
+
else:
|
| 336 |
+
logging.info("No existing reference table found; reference update skipped.")
|
| 337 |
else:
|
| 338 |
+
logging.info("Generated updated references empty; leaving original references unchanged.")
|
|
|
|
| 339 |
|
| 340 |
+
# Do not append any extra summary information after the references.
|
|
|
|
| 341 |
global_summary = "Combined Chunk Improvement Guidelines:\n" + "\n".join(all_guidelines)
|
| 342 |
+
summary_text = "Summary of Fine-Tuning Improvements:\n" + "\n".join(improvements_summary)
|
| 343 |
updated_qa = qa.strip() + "\n----------\n" + global_summary + "\n" + summary_text
|
| 344 |
|
| 345 |
return updated_report_html, updated_qa
|
|
|
|
| 1768 |
- Do not add a title for the Focus placeholder just before the [[...]], the content that will replace the focus placeholder - generated later on - will already include a title
|
| 1769 |
- For the Table of contents: do not mention the pages, but make each item on separate line
|
| 1770 |
- The reference table at the end containing the citations details should have 4 columns: the ref number, the title of the document, the author(s, the URL - with hyperlink)
|
| 1771 |
+
the name of the reference table should be: "Reference Summary Table"
|
| 1772 |
|
| 1773 |
// Structure of the overall report:
|
| 1774 |
- Abstract
|