Guiyom committed on
Commit
45140d2
·
verified ·
1 Parent(s): 30d9711

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -25
app.py CHANGED
@@ -122,8 +122,7 @@ def fine_tune_report(adjustmentguidelines: str, openai_api_key: str, serpapi_api
122
 
123
  # Parse the original report HTML.
124
  soup = BeautifulSoup(report_html, "html.parser")
125
- # Create a working copy of the HTML as a string for exact replacement.
126
- updated_report_html = report_html
127
 
128
  # --- Specific adjustment extraction ---
129
  if adjustmentguidelines.strip():
@@ -259,7 +258,7 @@ Generate a JSON object with exactly two keys (no extra commentary):
259
 
260
  improvements_summary = []
261
 
262
- # --- Process each chunk with refined replacement logic ---
263
  for idx, (chunk_html, guideline, token_size) in enumerate(zip(all_chunks, all_guidelines, all_token_sizes), start=1):
264
  # Enhanced chunk prompt with explicit instructions:
265
  chunk_prompt = f"""Improve the following report chunk based on these guidelines:
@@ -290,14 +289,20 @@ Please output a JSON object with exactly two keys (no extra commentary):
290
  chunk_summary = chunk_json.get("summary")
291
  if improved_chunk and chunk_summary:
292
  improvements_summary.append(f"Chunk {idx}: {chunk_summary}")
293
- # Attempt to replace the old chunk with the improved chunk.
294
- # Use string 'strip()' to remove any surrounding whitespace.
295
- chunk_html_clean = chunk_html.strip()
296
- improved_chunk_clean = improved_chunk.strip()
297
- if chunk_html_clean in updated_report_html:
298
- updated_report_html = updated_report_html.replace(chunk_html_clean, improved_chunk_clean, 1)
299
- else:
300
- logging.warning(f"Chunk {idx}: Exact snippet not found for replacement. Replacement not applied.")
 
 
 
 
 
 
301
  else:
302
  logging.error(f"Chunk {idx}: Incomplete JSON result: {chunk_result}")
303
  except Exception as e:
@@ -316,23 +321,25 @@ Report HTML:
316
  updated_references = openai_call(prompt=references_prompt, model="o3-mini", max_tokens_param=1000, temperature=0.5)
317
  updated_references = updated_references.strip().strip("```")
318
 
319
- # Instead of appending, look for a references section and replace its content.
320
- soup_updated = BeautifulSoup(updated_report_html, "html.parser")
321
- ref_heading = soup_updated.find(lambda tag: tag.name == "h1" and "Reference Summary Table" in tag.get_text())
322
- if ref_heading:
323
- # Assume that the reference table is the next sibling.
324
- next_sibling = ref_heading.find_next_sibling()
325
- if next_sibling:
326
- new_ref_html = BeautifulSoup(updated_references, "html.parser")
327
- next_sibling.replace_with(new_ref_html)
328
- updated_report_html = str(soup_updated)
 
 
 
329
  else:
330
- # No reference section found; do nothing.
331
- logging.info("No existing reference table found; skipping reference replacement.")
332
 
333
- # Do not append anything after the references.
334
- summary_text = "Summary of Fine-Tuning Improvements:\n" + "\n".join(improvements_summary)
335
  global_summary = "Combined Chunk Improvement Guidelines:\n" + "\n".join(all_guidelines)
 
336
  updated_qa = qa.strip() + "\n----------\n" + global_summary + "\n" + summary_text
337
 
338
  return updated_report_html, updated_qa
@@ -1761,6 +1768,7 @@ Important:
1761
  - Do not add a title for the Focus placeholder just before the [[...]], the content that will replace the focus placeholder - generated later on - will already include a title
1762
  - For the Table of contents: do not mention the pages, but put each item on a separate line
1763
  - The reference table at the end containing the citation details should have 4 columns: the ref number, the title of the document, the author(s), the URL (with hyperlink)
 
1764
 
1765
  // Structure of the overall report:
1766
  - Abstract
 
122
 
123
  # Parse the original report HTML.
124
  soup = BeautifulSoup(report_html, "html.parser")
125
+ updated_report_html = report_html # working copy
 
126
 
127
  # --- Specific adjustment extraction ---
128
  if adjustmentguidelines.strip():
 
258
 
259
  improvements_summary = []
260
 
261
+ # --- Process each chunk with robust DOM-based replacement ---
262
  for idx, (chunk_html, guideline, token_size) in enumerate(zip(all_chunks, all_guidelines, all_token_sizes), start=1):
263
  # Enhanced chunk prompt with explicit instructions:
264
  chunk_prompt = f"""Improve the following report chunk based on these guidelines:
 
289
  chunk_summary = chunk_json.get("summary")
290
  if improved_chunk and chunk_summary:
291
  improvements_summary.append(f"Chunk {idx}: {chunk_summary}")
292
+ try:
293
+ # Convert both the original chunk and the improved content into BeautifulSoup objects.
294
+ orig_chunk_soup = BeautifulSoup(chunk_html, "html.parser")
295
+ improved_chunk_soup = BeautifulSoup(improved_chunk, "html.parser")
296
+ original_text = orig_chunk_soup.get_text().strip()
297
+ search_text = original_text[:50] # use first 50 characters as anchor
298
+ found_tag = soup.find(lambda tag: tag.get_text() and search_text in tag.get_text())
299
+ if found_tag:
300
+ found_tag.replace_with(improved_chunk_soup)
301
+ updated_report_html = str(soup)
302
+ else:
303
+ logging.warning(f"Chunk {idx}: Unable to locate tag matching '{search_text}'. Replacement not applied.")
304
+ except Exception as rep_e:
305
+ logging.error(f"Chunk {idx}: Error during DOM-based replacement: {rep_e}")
306
  else:
307
  logging.error(f"Chunk {idx}: Incomplete JSON result: {chunk_result}")
308
  except Exception as e:
 
321
  updated_references = openai_call(prompt=references_prompt, model="o3-mini", max_tokens_param=1000, temperature=0.5)
322
  updated_references = updated_references.strip().strip("```")
323
 
324
+ # Only replace the content of the reference table if a heading exists
325
+ if updated_references:
326
+ soup_updated = BeautifulSoup(updated_report_html, "html.parser")
327
+ ref_heading = soup_updated.find(lambda tag: tag.name == "h1" and "Reference Summary Table" in tag.get_text())
328
+ if ref_heading:
329
+ next_sibling = ref_heading.find_next_sibling()
330
+ if next_sibling:
331
+ new_ref_html = BeautifulSoup(updated_references, "html.parser")
332
+ next_sibling.replace_with(new_ref_html)
333
+ # Ensure nothing is appended after the references section
334
+ updated_report_html = str(soup_updated)
335
+ else:
336
+ logging.info("No existing reference table found; reference update skipped.")
337
  else:
338
+ logging.info("Generated updated references empty; leaving original references unchanged.")
 
339
 
340
+ # Do not append any extra summary information after the references.
 
341
  global_summary = "Combined Chunk Improvement Guidelines:\n" + "\n".join(all_guidelines)
342
+ summary_text = "Summary of Fine-Tuning Improvements:\n" + "\n".join(improvements_summary)
343
  updated_qa = qa.strip() + "\n----------\n" + global_summary + "\n" + summary_text
344
 
345
  return updated_report_html, updated_qa
 
1768
  - Do not add a title for the Focus placeholder just before the [[...]], the content that will replace the focus placeholder - generated later on - will already include a title
1769
  - For the Table of contents: do not mention the pages, but put each item on a separate line
1770
  - The reference table at the end containing the citation details should have 4 columns: the ref number, the title of the document, the author(s), the URL (with hyperlink)
1771
+ - The name of the reference table should be: "Reference Summary Table"
1772
 
1773
  // Structure of the overall report:
1774
  - Abstract