Spaces:

10gen
/

deepsearchitv2

Running

App Files Files Community

Guiyom committed on Mar 9, 2025

Commit

0192ae5

verified ·

1 Parent(s): 4cae79c

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -21

app.py CHANGED Viewed

@@ -1004,6 +1004,27 @@ def improve_report_from_chat(user_message: str, chat_history: list, report_text:
 # ============================================================================= Expand
 def expand_report(expansion_request: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
                   initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
                   complementary_guidance: str) -> (str, str):
@@ -1055,6 +1076,13 @@ Only output valid JSON.
         logging.warning("expansion_report: No unique strings were identified for adjustment. Returning original report.")
         return report_html, qa
     # Step 2: Parse the report HTML once.
     soup = BeautifulSoup(report_html, "html.parser")
     corrections_summary = []
@@ -1084,13 +1112,19 @@ Only output valid JSON.
         logging.info("expansion_report: Found container for unique string adjustment:\n\n%s\n", original_container_html)
         # Step 3: Adjust the container by asking the LLM to expand the content.
-        # Note the explicit instruction regarding inline citations.
         prompt_adjust = (f"""
 You are a technical editor.
 Given the following HTML container (including its outer tags) extracted from a larger report and based on the user expansion request, produce an expanded version by elaborating on the content.
-Preserve all inline citations (formatted as [x]) and ensure that if you add any new citations, they are consistent with sources that must also be reflected in the final References Summary Table.
-Skip lines occasionally to improve readability.
-The expanded version will be put back in the exact same location and must maintain the outer HTML tags.
 - Overall Report HTML:
 {report_html}
@@ -1108,48 +1142,71 @@ Additional Guidance:
 - Complementary Guidance:
 {complementary_guidance}
-Ensure that any inline citation (e.g., [1], [2], etc.) within the expanded content is preserved or newly included so they can all be captured in the final References Summary Table.
-Output a JSON object with exactly two keys:
-- "expanded" (the expanded container's full HTML)
-- "summary" (a brief explanation of the changes, including citation updates if applicable)
-Only output valid JSON with no comments or code fences.
 """)
         response_adjust = llm_call(prompt=prompt_adjust, model="o3-mini", temperature=0, max_tokens_param=10000)
         logging.info("expansion_report: Raw container adjustment response: %s", response_adjust)
         try:
             response_adjust = response_adjust.strip().strip("json").strip("```").strip()
             logging.info("Cleaned container adjustment response: %s", response_adjust)
-            adjust_data = json.loads(response_adjust)
-            corrected_container = adjust_data.get("expanded", "").strip()
-            container_summary = adjust_data.get("summary", "").strip()
         except Exception as e:
             logging.error("expansion_report: Error parsing container adjustment JSON: %s", e)
             continue
-        if not corrected_container:
-            logging.warning("expansion_report: No expanded container was generated; skipping correction for this container.")
-            continue
-        corrections_summary.append(f"Container expanded: {container_summary}")
-        # Step 4: Replace the original container with the updated version.
         container_tag.replace_with(BeautifulSoup(corrected_container, "html.parser"))
         logging.info("expansion_report: Updated container re-injected.")
     updated_report_html = str(soup)
     # Step 5 (and 6): Update the References Summary Table.
     prompt_refs = (
         f"\nYou are a technical editor.\n\n"
         "Review the following updated report HTML. If any new inline citations (e.g., [x]) have been added that are not in the original reference table, "
-        "generate an updated References Summary Table as valid HTML."
-        " Every inline citation found in the report must have a corresponding entry in this table. "
         "Output only the updated table without any additional comments.\n\n"
         f"Updated Report HTML:\n{updated_report_html}"
     )
-    # Increase token limit to ensure full output.
     updated_refs = llm_call(prompt=prompt_refs, model="o3-mini", temperature=0, max_tokens_param=10000)
     updated_refs = updated_refs.strip().strip("```").strip()

 # ============================================================================= Expand
+def get_max_reference(report_html: str) -> int:
+    """
+    Return the highest reference number listed in the report's References Summary Table.
+
+    The table is taken to be the element that immediately follows the first
+    h1-h4 heading whose text contains "references summary table"
+    (case-insensitive). Two layouts are recognised inside that element:
+
+    * HTML table rows (<tr>) whose first cell holds the number, optionally
+      wrapped in square brackets (e.g. "3" or "[3]");
+    * plain-text lines that start with a number followed by a vertical bar
+      (e.g. "3 | name | author | url").
+
+    Args:
+        report_html: Full HTML of the report.
+
+    Returns:
+        The largest reference number found, or 0 when the heading, the
+        element after it, or any numbered row is missing.
+    """
+    soup_ = BeautifulSoup(report_html, "html.parser")
+    # Locate a heading that includes "references summary table" (case insensitive)
+    ref_heading = soup_.find(
+        lambda tag: tag.name in ("h1", "h2", "h3", "h4")
+        and "references summary table" in tag.get_text(strip=True).lower()
+    )
+    if ref_heading is None:
+        return 0
+    ref_block = ref_heading.find_next_sibling()
+    if ref_block is None:
+        return 0
+
+    # Start from 0 so that max() has a value when no numbered row is found.
+    numbers = [0]
+
+    # The text of an HTML table carries no "|" separators, so the number has
+    # to be read from the first cell of each row.
+    for row in ref_block.find_all("tr"):
+        first_cell = row.find(["td", "th"])
+        if first_cell is None:
+            continue
+        cell_match = re.fullmatch(r"\[?\s*(\d+)\s*\]?", first_cell.get_text(strip=True))
+        if cell_match:
+            numbers.append(int(cell_match.group(1)))
+
+    # Pipe-delimited text rows: each row starts with a number followed by a vertical bar.
+    for line in ref_block.get_text(separator="\n").splitlines():
+        line_match = re.match(r"\s*(\d+)\s*\|", line)
+        if line_match:
+            numbers.append(int(line_match.group(1)))
+
+    return max(numbers)
 def expand_report(expansion_request: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
                   initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
                   complementary_guidance: str) -> (str, str):
         logging.warning("expansion_report: No unique strings were identified for adjustment. Returning original report.")
         return report_html, qa
+    # Determine the current maximum reference number in the report.
+    current_max_ref = get_max_reference(report_html)
+    logging.info(f"expansion_report: Current max reference number is {current_max_ref}")
+    # Prepare to accumulate newly added references across all expansions
+    new_references_list = []
     # Step 2: Parse the report HTML once.
     soup = BeautifulSoup(report_html, "html.parser")
     corrections_summary = []
         logging.info("expansion_report: Found container for unique string adjustment:\n\n%s\n", original_container_html)
         # Step 3: Adjust the container by asking the LLM to expand the content.
+        # The response is expected to include two parts separated by a line of ten hyphens:
+        #   - The expanded HTML snippet (to be reintegrated)
+        #   - New reference lines in the format: "# | name | author | url" (or blank if no new references)
         prompt_adjust = (f"""
 You are a technical editor.
 Given the following HTML container (including its outer tags) extracted from a larger report and based on the user expansion request, produce an expanded version by elaborating on the content.
+Preserve all inline citations (formatted as [x]) and, if you add any new citations, output them in a separate section.
+The output should be two parts separated by a single newline containing exactly ten hyphens (i.e., "----------").
+The first part is the expanded container’s full HTML (including its outer tags) to be reinserted as-is.
+The second part is a list (one per line) of any new references in the format:
+    new reference number | name | author | url
+The new reference number should be the one you wish to assign for the new references.
+If no new references have been added, leave the second part blank.
 - Overall Report HTML:
 {report_html}
 - Complementary Guidance:
 {complementary_guidance}
+Ensure that any inline citation (e.g., [1], [2], etc.) within the expanded content is preserved or newly included so that they can all be captured in the final References Summary Table.
+Output a JSON object with exactly one key "result" whose value is a string containing the two parts as specified (the expanded container, then a newline with "----------", then new reference lines).
+Only output valid JSON with no additional commentary or code fences.
 """)
         response_adjust = llm_call(prompt=prompt_adjust, model="o3-mini", temperature=0, max_tokens_param=10000)
         logging.info("expansion_report: Raw container adjustment response: %s", response_adjust)
         try:
             response_adjust = response_adjust.strip().strip("json").strip("```").strip()
             logging.info("Cleaned container adjustment response: %s", response_adjust)
+            # Parse the response: Split into two parts using the separator line "----------"
+            parts = response_adjust.split("\n----------\n")
+            if len(parts) == 2:
+                corrected_container = parts[0].strip()
+                new_refs_str = parts[1].strip()
+            else:
+                # Fallback in case no separator was found.
+                corrected_container = response_adjust
+                new_refs_str = ""
         except Exception as e:
             logging.error("expansion_report: Error parsing container adjustment JSON: %s", e)
             continue
+        # If new references exist, process and update their reference numbers.
+        if new_refs_str:
+            for line in new_refs_str.splitlines():
+                line = line.strip()
+                if line:
+                    ref_parts = line.split("|")
+                    if len(ref_parts) >= 4:
+                        # Reassign a new reference number by incrementing current_max_ref.
+                        current_max_ref += 1
+                        ref_name = ref_parts[1].strip()
+                        ref_author = ref_parts[2].strip()
+                        ref_url = ref_parts[3].strip()
+                        new_ref_line = f"{current_max_ref} | {ref_name} | {ref_author} | {ref_url}"
+                        new_references_list.append(new_ref_line)
+                        logging.info("expansion_report: Added new reference: %s", new_ref_line)
+        else:
+            logging.info("expansion_report: No new references found for this container.")
+        corrections_summary.append("Container expanded and references updated if applicable.")
+        # Step 4: Replace the original container with the updated version (only the expanded snippet).
         container_tag.replace_with(BeautifulSoup(corrected_container, "html.parser"))
         logging.info("expansion_report: Updated container re-injected.")
     updated_report_html = str(soup)
     # Step 5 (and 6): Update the References Summary Table.
+    # Prepare new references text, if any.
+    new_refs_text = "\n".join(new_references_list) if new_references_list else ""
     prompt_refs = (
         f"\nYou are a technical editor.\n\n"
         "Review the following updated report HTML. If any new inline citations (e.g., [x]) have been added that are not in the original reference table, "
+        "generate an updated References Summary Table as valid HTML.\n"
+        "Every inline citation found in the report must have a corresponding entry in this table.\n"
+        "Use the following details for new references:\n"
+        f"Current max reference number: {current_max_ref}\n"
+        "New References (format: number | name | author | url):\n"
+        f"{new_refs_text if new_refs_text else 'None'}\n\n"
         "Output only the updated table without any additional comments.\n\n"
         f"Updated Report HTML:\n{updated_report_html}"
     )
     updated_refs = llm_call(prompt=prompt_refs, model="o3-mini", temperature=0, max_tokens_param=10000)
     updated_refs = updated_refs.strip().strip("```").strip()