Spaces:

10gen
/

deepsearchitv2

Running

App Files Files Community

Guiyom committed on Mar 9, 2025

Commit

a5f7d67

verified ·

1 Parent(s): 7241714

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -55

app.py CHANGED Viewed

@@ -451,22 +451,41 @@ def remove_text_from_html(report_html: str, text_to_remove: str) -> str:
                     child.replace_with(new_text)
     return str(soup)
-def fix_visual_after_section(report_html: str, section_title: str, instructions: str) -> str:
     soup = BeautifulSoup(report_html, "html.parser")
-    # Find the header (any h1-h4) that contains the section title (case-insensitive)
-    header = soup.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and section_title.lower() in tag.get_text(strip=True).lower())
     if header:
-        # Find the next sibling that is an iframe (assuming that is the visual)
-        iframe = header.find_next_sibling("iframe")
         if iframe and iframe.has_attr("srcdoc"):
             current_code = iframe["srcdoc"]
-            prompt = f"""Improve the following visual code (e.g., adjust parenthesis, simplify styling) based on the additional instructions below.
-Current code: {current_code}
-Additional instructions: {instructions}
-Return only the improved code (no other explanation)."""
-            improved_code = llm_call(prompt, model="o3-mini", temperature=0, max_tokens_param=2000).strip()
-            # Re-inject the improved code
-            iframe["srcdoc"] = improved_code
     return str(soup)
 def snippet_in_tag(tag: Tag, snippet: str) -> bool:
@@ -674,38 +693,36 @@ Only output valid JSON with no comments or code fences."""
     updated_report_html = str(soup)
-    # (Step 5 and Step 6 remain as before to update the reference table and the QA log)
-    prompt_refs = (f"""
-You are a technical editor.
-Review the following updated report HTML.
-If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table,
-generate an updated Reference Summary Table as valid HTML. Output only the updated reference table HTML with no explanations.
-Updated Report HTML:\n{updated_report_html}"""
-    )
-    updated_refs = llm_call(prompt=prompt_refs, model="o3-mini", temperature=0, max_tokens_param=1000)
-    updated_refs = updated_refs.strip().strip("```")
-    if updated_refs:
-        soup_updated = BeautifulSoup(updated_report_html, "html.parser")
-        ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "Reference Summary Table" in tag.get_text())
-        if ref_heading:
-            next_sibling = ref_heading.find_next_sibling()
-            if next_sibling:
-                try:
-                    new_ref_html = BeautifulSoup(updated_refs, "html.parser")
-                    next_sibling.replace_with(new_ref_html)
-                    logging.info("fine_tune_report: Reference table updated successfully.")
-                except Exception as e:
-                    logging.error("fine_tune_report: Error updating reference table: %s", e)
             else:
-                logging.info("fine_tune_report: No sibling after reference heading; skipping reference update.")
-            updated_report_html = str(soup_updated)
         else:
-            logging.info("fine_tune_report: No reference table heading found; reference update skipped.")
-    else:
-        logging.info("fine_tune_report: No updated reference table returned; leaving unchanged.")
     global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
     updated_qa = qa.strip() + "\n----------\n" + global_summary
@@ -754,18 +771,16 @@ def improve_report_from_chat(user_message: str, chat_history: list, report_text:
     adjustment_request = user_message.replace("@improve", "").strip()
     # --- CASE 1: Removal request ---
-    # e.g., "remove the following text: [[ABC]]"
-    text_removal_match = re.search(r"remove the following text:\s*\[\[([^\]]+)\]\]", adjustment_request, re.I)
-    if text_removal_match:
-        text_to_remove = text_removal_match.group(1).strip()
         updated_report = remove_text_from_html(report_text, text_to_remove)
         answer = f"Removed text: '{text_to_remove}' from the report."
         chat_history.append([user_message, answer])
         return chat_history, "", updated_report
-    # --- CASE 2: Fix visual request ---
-    # e.g., "fix visual after section XYZ: <extra instructions...>"
-    visual_fix_match = re.search(r"fix visual after section\s+([^\:]+)(?::\s*(.*))?", adjustment_request, re.I)
     if visual_fix_match:
         section_name = visual_fix_match.group(1).strip()
         extra_instructions = visual_fix_match.group(2).strip() if visual_fix_match.group(2) else ""
@@ -774,7 +789,7 @@ def improve_report_from_chat(user_message: str, chat_history: list, report_text:
         chat_history.append([user_message, answer])
         return chat_history, "", updated_report
-    # --- DEFAULT: Proceed with normal LLM-based improvement ---
     updated_report, _ = fine_tune_report(
         adjustment_request,
         os.getenv("OPENAI_API_KEY"),
@@ -938,7 +953,7 @@ You are a technical editor.
 Review the following expanded report HTML.
 If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table,
-generate an updated Reference Summary Table as valid HTML. Output only the updated reference table HTML with no explanations.
 Updated Report HTML:\n{updated_report_html}"""
     )
     updated_refs = llm_call(prompt=prompt_refs, model="o3-mini", temperature=0, max_tokens_param=1000)
@@ -946,7 +961,7 @@ Updated Report HTML:\n{updated_report_html}"""
     if updated_refs:
         soup_updated = BeautifulSoup(updated_report_html, "html.parser")
-        ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "Reference Summary Table" in tag.get_text())
         if ref_heading:
             next_sibling = ref_heading.find_next_sibling()
             if next_sibling:
@@ -2240,9 +2255,9 @@ The report must follow this writing style: {reportstyle}.
 --------------- Citations -----------
 - The report must include inline citations (e.g., [1], [2], etc.) from real sources provided in the search results below - be selective, don't put it at every sentence or every paragraph.
    Note: citations sources in-line need to be in this format: blablabla - Source [x] / "pdf" is not a source, provide the title or author
-- The name of the reference table should be: "Reference Summary Table"
 - The reference table at the end containing the citations details should have 4 columns: the ref number, the title of the document, the author(s, the URL - with hyperlink)
-- The report MUST include a reference summary table with between 10 (for a 8 page report) and 30 references (for a 40 pages report). All inline citations (e.g., [1], [2], …) present in the report and in any focus placeholders MUST have a corresponding entry in this table with its full URL.
 - For the reference citations, add systematically the urls from the Learnings (no need to put them in numbered list format since we alredy have the [x] that serves as number list)
 - Do not add any inline citations reference in the visual and graph placeholders descriptions belo, you can add them in focus though.
 - Do not make false references / citations. It has to be grounded from the sources in the rsearch results / crumbs below (no example.com/... type references!)
@@ -2401,7 +2416,7 @@ Use the following report structure with consistency:
 - Introduction
 - [Sections and sub-sections, depending on the size and relevant topic - including visual, graph and focus placeholders]
 - Conclusion
-- References summary table
 - Report ending formatting (as mentioned before)
 {{Do not add anything after - no conclusive meta comment or content}}

                     child.replace_with(new_text)
     return str(soup)
+def fix_visual_after_section(report_html: str, section_title: str, extra_instructions: str) -> str:
+    """
+    Given a report HTML (as a string), the target section name (from inside [[ ]]) and any extra instructions,
+    this function finds the first header (preferably an <h1>) that contains the section name (ignoring ones such as "Table of Contents"),
+    then finds the first <iframe> after that header, sends its srcdoc to the LLM for improvement, and reinjects the improved code.
+    """
     soup = BeautifulSoup(report_html, "html.parser")
+    header = None
+    # Look for header tags (h1–h4) that contain the target, excluding common TOC headings
+    for tag in soup.find_all(["h1", "h2", "h3", "h4"]):
+        text = tag.get_text(strip=True)
+        if section_title.lower() in text.lower() and "table of contents" not in text.lower():
+            header = tag
+            break
     if header:
+        # Look for the first <iframe> after that header
+        iframe = header.find_next("iframe")
         if iframe and iframe.has_attr("srcdoc"):
             current_code = iframe["srcdoc"]
+            prompt = (
+                f"Improve the following visual code by simplifying its formatting (e.g., adjust parenthesis, remove extra styling) "
+                f"based on these extra instructions: {extra_instructions}\n\n"
+                f"Current code:\n{current_code}\n\n"
+                "Return only the improved code (no extra commentary)."
+            )
+            improved_code = llm_call(prompt=prompt, model="o3-mini", temperature=0, max_tokens_param=1500).strip()
+            # Check that the improvement is valid and not an error message.
+            if improved_code and not improved_code.lower().startswith("error: empty response"):
+                iframe["srcdoc"] = improved_code
+            else:
+                logging.error("fix_visual_after_section: LLM returned an empty or error response.")
+        else:
+            logging.error(f"fix_visual_after_section: No iframe found after section '{section_title}'.")
+    else:
+        logging.error(f"fix_visual_after_section: Section '{section_title}' not found.")
     return str(soup)
 def snippet_in_tag(tag: Tag, snippet: str) -> bool:
     updated_report_html = str(soup)
+        # Step 5 (and 6): Update the reference table if needed.
+        prompt_refs = (
+            f"\nYou are a technical editor.\n\n"
+            "Review the following updated report HTML. If any new inline citations (e.g., [x]) have been added that are not in the original reference table,\n"
+            "generate an updated Reference Summary Table as valid HTML. Output only the updated table without any additional comments.\n\n"
+            f"Updated Report HTML:\n{updated_report_html}"
+        )
+        # Increase token limit to 1500 for this call
+        updated_refs = llm_call(prompt=prompt_refs, model="o3-mini", temperature=0, max_tokens_param=1500)
+        updated_refs = updated_refs.strip().strip("```").strip()
+        if updated_refs and not updated_refs.lower().startswith("error: empty response"):
+            soup_updated = BeautifulSoup(updated_report_html, "html.parser")
+            ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "reference summary table" in tag.get_text(strip=True).lower())
+            if ref_heading:
+                next_sibling = ref_heading.find_next_sibling()
+                if next_sibling:
+                    try:
+                        new_ref_html = BeautifulSoup(updated_refs, "html.parser")
+                        next_sibling.replace_with(new_ref_html)
+                        logging.info("fine_tune_report: Reference table updated successfully.")
+                    except Exception as e:
+                        logging.error("fine_tune_report: Error updating reference table: %s", e)
+                else:
+                    logging.info("fine_tune_report: No sibling after reference heading; skipping reference update.")
+                updated_report_html = str(soup_updated)
             else:
+                logging.info("fine_tune_report: No reference table heading found; reference update skipped.")
         else:
+            logging.info("fine_tune_report: No valid updated reference table returned; leaving unchanged.")
     global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
     updated_qa = qa.strip() + "\n----------\n" + global_summary
     adjustment_request = user_message.replace("@improve", "").strip()
     # --- CASE 1: Removal request ---
+    removal_match = re.search(r"remove the following text:\s*\[\[([^\]]+)\]\]", adjustment_request, re.I)
+    if removal_match:
+        text_to_remove = removal_match.group(1).strip()
         updated_report = remove_text_from_html(report_text, text_to_remove)
         answer = f"Removed text: '{text_to_remove}' from the report."
         chat_history.append([user_message, answer])
         return chat_history, "", updated_report
+    # --- CASE 2: Visual fix request ---
+    visual_fix_match = re.search(r"fix visual after section\s+\[\[([^\]]+)\]\](?::\s*(.*))?", adjustment_request, re.I)
     if visual_fix_match:
         section_name = visual_fix_match.group(1).strip()
         extra_instructions = visual_fix_match.group(2).strip() if visual_fix_match.group(2) else ""
         chat_history.append([user_message, answer])
         return chat_history, "", updated_report
+    # --- DEFAULT: Proceed with existing LLM-based improvement (fine_tune_report) ---
     updated_report, _ = fine_tune_report(
         adjustment_request,
         os.getenv("OPENAI_API_KEY"),
 Review the following expanded report HTML.
 If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table,
+generate an updated References Summary Table as valid HTML. Output only the updated reference table HTML with no explanations.
 Updated Report HTML:\n{updated_report_html}"""
     )
     updated_refs = llm_call(prompt=prompt_refs, model="o3-mini", temperature=0, max_tokens_param=1000)
     if updated_refs:
         soup_updated = BeautifulSoup(updated_report_html, "html.parser")
+        ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "References Summary Table" in tag.get_text())
         if ref_heading:
             next_sibling = ref_heading.find_next_sibling()
             if next_sibling:
 --------------- Citations -----------
 - The report must include inline citations (e.g., [1], [2], etc.) from real sources provided in the search results below - be selective, don't put it at every sentence or every paragraph.
    Note: citations sources in-line need to be in this format: blablabla - Source [x] / "pdf" is not a source, provide the title or author
+- The name of the reference table should be: "References Summary Table"
 - The reference table at the end containing the citations details should have 4 columns: the ref number, the title of the document, the author(s, the URL - with hyperlink)
+- The report MUST include a References Summary Table with between 10 (for a 8 page report) and 30 references (for a 40 pages report). All inline citations (e.g., [1], [2], …) present in the report and in any focus placeholders MUST have a corresponding entry in this table with its full URL.
 - For the reference citations, add systematically the urls from the Learnings (no need to put them in numbered list format since we alredy have the [x] that serves as number list)
 - Do not add any inline citations reference in the visual and graph placeholders descriptions belo, you can add them in focus though.
 - Do not make false references / citations. It has to be grounded from the sources in the rsearch results / crumbs below (no example.com/... type references!)
 - Introduction
 - [Sections and sub-sections, depending on the size and relevant topic - including visual, graph and focus placeholders]
 - Conclusion
+- References Summary Table
 - Report ending formatting (as mentioned before)
 {{Do not add anything after - no conclusive meta comment or content}}