Spaces:

10gen
/

deepsearchitv2

Runtime error

App Files Files Community

Guiyom committed on Feb 27, 2025

Commit

7835b45

verified ·

1 Parent(s): 7e262e3

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -75

app.py CHANGED Viewed

@@ -88,10 +88,25 @@ Your Answer:"""
     updated_history = chat_history + [[user_message, answer]]
     return updated_history, ""
 def expand_snippet_area(full_html: str, snippet: str) -> str:
     """
-    Given the full HTML and a small snippet (e.g., containing a keyword),
     find the element in which that snippet appears and traverse upward through the DOM
     until a larger container is reached. Allowed container tags include div, table, iframe, and section.
     The traversal stops when the parent is no longer in the allowed list or before reaching <body>.
@@ -162,51 +177,24 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
     Fine-tunes an HTML report based on a user’s correction request by processing complete container elements.
     Process Overview:
-      1. The function submits the full report HTML together with the user’s adjustment request to the LLM.
-         The prompt instructs the model to output a JSON object containing one or more unique plain text string(s)
-         (without HTML tags) that uniquely identify the targeted area(s) in the report.
-      2. For each returned unique string, the algorithm uses BeautifulSoup (and expand_snippet_area) to search for
-         the exact text and select the outer container (<div>, <table>, or <iframe>).
-      3. For each container, a second LLM call is made that takes in the container’s full HTML, the full report context,
-         and the user adjustment request, and outputs a corrected version.
-      4. The code then replaces the original container with the updated version in the BeautifulSoup object.
-      5. If new inline citations have been introduced (beyond those in the reference table), a final LLM call updates
-         the reference table.
-      6. A summary of all corrections is appended to the QA log.
-    Parameters:
-      adjustment_request: A string such as "the visual after 'xyz' is not displaying properly, please fix it" or
-                          "the introduction should be more detailed, adjust it" etc.
-      openai_api_key: OpenAI API Key.
-      serpapi_api_key: SERPAPI API Key.
-      report_html: A string containing the full HTML report.
-      initial_request: The original research query or request.
-      qa: Existing clarification Q&A log.
-      target_style: The stylistic guidelines the report should follow.
-      knowledge_crumbs: Aggregated source or search result content.
-      complementary_guidance: Additional instructions.
-    Returns:
-      A tuple (updated_report_html, updated_qa) with the corrected report and updated QA log.
     """
     os.environ["OPENAI_API_KEY"] = openai_api_key
     os.environ["SERPAPI_API_KEY"] = serpapi_api_key
     logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
-    # ---------------------------------------------------------------
-    # Step 1: Identify unique string(s) that are representative of the targeted area.
-    #
-    # The prompt now asks the LLM to extract one or more unique plain text strings (without HTML)
-    # that appear in the targeted area(s) identified by the user adjustment request. These strings
-    # will be used to locate the corresponding container elements.
-    # ---------------------------------------------------------------
     prompt_identify = (
         f"You are a meticulous technical editor. Below is the full report HTML together with a "
-        f"user adjustment request. Identify one or more unique text strings (without any HTML tags or formatting) "
-        f"that are representative of the area(s) targeted by the adjustment request. Return these unique strings in a JSON "
-        f"object with the key \"identified_unique_strings\" mapped to a list of strings. Ensure that these strings are exact "
-        f"as they appear in the report so that they can be used to accurately locate the relevant section(s).\n\n"
         f"Full Report HTML:\n{report_html}\n\n"
         f"User Adjustment Request:\n{adjustment_request}\n\n"
         f"Only output valid JSON."
@@ -216,7 +204,7 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
     logging.info(f"fine_tune_report: Raw unique string identification response: {response_identify}")
     try:
-        response_identify = response_identify.strip().strip("```")
         id_data = json.loads(response_identify)
         unique_strings = id_data.get("identified_unique_strings", [])
     except Exception as e:
@@ -227,39 +215,30 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
         logging.warning("fine_tune_report: No unique strings were identified for adjustment. Returning original report.")
         return report_html, qa
-    # ---------------------------------------------------------------
-    # Step 2: For each unique string, locate its corresponding container.
-    # ---------------------------------------------------------------
     soup = BeautifulSoup(report_html, "html.parser")
     corrections_summary = []
     for uniq_str in unique_strings:
         uniq_str = uniq_str.strip()
-        # Use expand_snippet_area to get the full container outer HTML that encloses the unique text.
         container_html = expand_snippet_area(report_html, uniq_str)
         if not container_html:
             logging.warning(f"fine_tune_report: Could not locate a container for unique string: {uniq_str}")
             continue
-        # Now, search the soup for a tag that includes this container HTML.
         candidate = soup.find(lambda tag: container_html in str(tag))
         if not candidate:
-            logging.warning(f"fine_tune_report: The container for the unique string was not found in the report:\n{uniq_str}")
             continue
         original_container_html = str(candidate)
         logging.info("fine_tune_report: Found container for unique string adjustment.")
-        # ---------------------------------------------------------------
         # Step 3: Call the LLM to adjust this container.
-        #
-        # Pass the entire container HTML, the full report context, and the adjustment request.
-        # The LLM should output a JSON object with the keys "improved" and "summary".
-        # ---------------------------------------------------------------
         prompt_adjust = (
             f"You are a technical editor. Given the following HTML container (with its outer tags) extracted "
             f"from a larger report and based on the user adjustment request, produce a corrected version by making "
-            f"only the necessary changes. Preserve existing inline citations, formatting, and context. Ensure the updated content "
-            f"remains consistent with the overall report style. Output your answer as a JSON object with exactly two keys: "
-            f"\"improved\" (the corrected container's full HTML) and \"summary\" (a brief explanation of the changes applied).\n\n"
             f"Overall Report HTML:\n{report_html}\n\n"
             f"Original Container to Adjust:\n{original_container_html}\n\n"
             f"User Adjustment Request:\n{adjustment_request}\n\n"
@@ -269,9 +248,8 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
         response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
         logging.info(f"fine_tune_report: Raw container adjustment response: {response_adjust}")
         try:
-            response_adjust = response_adjust.strip().strip("```")
             adjust_data = json.loads(response_adjust)
             corrected_container = adjust_data.get("improved", "").strip()
             container_summary = adjust_data.get("summary", "").strip()
@@ -280,31 +258,25 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
             continue
         if not corrected_container:
-            logging.warning("fine_tune_report: No improved container was returned by the LLM; skipping this container.")
             continue
         corrections_summary.append(f"Container corrected: {container_summary}")
-        # ---------------------------------------------------------------
-        # Step 4: Replace the original container with the corrected container in the BeautifulSoup object.
-        # ---------------------------------------------------------------
         candidate.replace_with(BeautifulSoup(corrected_container, "html.parser"))
-        logging.info("fine_tune_report: Updated container re-injected into the report.")
-    # Get the updated report HTML from the modified soup.
     updated_report_html = str(soup)
-    # ---------------------------------------------------------------
-    # Step 5: (Optional) Update the reference table if new inline citations exist.
-    # ---------------------------------------------------------------
     prompt_refs = (
         f"You are a technical editor. Review the following updated report HTML. "
-        f"If there are any new inline citations (e.g., [x]) not present in the original reference table, "
-        f"generate an updated Reference Summary Table as valid HTML containing all references. Output only the HTML code for the updated reference table with no commentary.\n\n"
         f"Updated Report HTML:\n{updated_report_html}"
     )
     updated_refs = openai_call(prompt=prompt_refs, model="o3-mini", max_tokens_param=1000, temperature=0.5)
-    updated_refs = updated_refs.strip().strip("```")
     if updated_refs:
         soup_updated = BeautifulSoup(updated_report_html, "html.parser")
@@ -317,22 +289,20 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
                     next_sibling.replace_with(new_ref_html)
                     logging.info("fine_tune_report: Reference table updated successfully.")
                 except Exception as e:
-                    logging.error(f"fine_tune_report: Error replacing the reference table: {e}")
             else:
-                logging.info("fine_tune_report: No sibling element found after reference table heading; skipping reference update.")
             updated_report_html = str(soup_updated)
         else:
-            logging.info("fine_tune_report: No reference table heading found; reference update skipped.")
     else:
         logging.info("fine_tune_report: No updated reference table returned; leaving references unchanged.")
-    # ---------------------------------------------------------------
-    # Step 6: Append a summary of corrections to the existing QA log.
-    # ---------------------------------------------------------------
     global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
     updated_qa = qa.strip() + "\n----------\n" + global_summary
-    logging.info("fine_tune_report: Fine-tuning process completed.")
     return updated_report_html, updated_qa
 def suggest_improvements(report_html: str, openai_api_key: str, serpapi_api_key: str) -> str:

     updated_history = chat_history + [[user_message, answer]]
     return updated_history, ""
def clean_llm_response(response: str) -> str:
    """
    Normalise a raw LLM response so it can be handed to a parser such as json.loads.

    The cleaning steps are:
      1. Trim surrounding whitespace.
      2. Remove a leading Markdown code fence, including an optional language tag
         on the fence line (e.g. "```json" followed by a newline).
      3. Remove a trailing Markdown code fence.
      4. Collapse every run of whitespace (newlines included) into a single space,
         because raw newlines inside JSON string values are not valid JSON.

    Parameters:
      response: The raw text returned by the LLM. None or an empty string is tolerated.
    Returns:
      The cleaned single-line string, or "" when there is nothing to clean.
    """
    # Guard against a missing/empty response instead of failing on .strip().
    if not response:
        return ""
    cleaned = response.strip()
    # Opening fence: three or more backticks, optionally followed by a language tag
    # that occupies the rest of the fence line. The tag is only dropped when it is
    # terminated by a newline, so single-line payloads such as ```{"a": 1}``` keep
    # their content intact.
    cleaned = re.sub(r'^`{3,}(?:[\w+-]*[ \t]*\r?\n)?', '', cleaned, count=1)
    # Closing fence: str.rstrip takes a set of characters, so "`" states the intent
    # (remove all trailing backticks) without implying a literal "```" suffix match.
    if cleaned.endswith("```"):
        cleaned = cleaned.rstrip("`")
    # \s+ already matches newlines, so one substitution both removes line breaks
    # and collapses repeated spaces.
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()
 def expand_snippet_area(full_html: str, snippet: str) -> str:
     """
+    Given the full HTML and a small snippet (e.g., containing a keyword such as "abc"),
     find the element in which that snippet appears and traverse upward through the DOM
     until a larger container is reached. Allowed container tags include div, table, iframe, and section.
     The traversal stops when the parent is no longer in the allowed list or before reaching <body>.
     Fine-tunes an HTML report based on a user’s correction request by processing complete container elements.
     Process Overview:
+      1. Submits full report HTML and the user adjustment request to the LLM and gets back one or more unique plain text strings.
+      2. For each unique string, uses BeautifulSoup (and expand_snippet_area) to retrieve the outer container (<div>, <table>, or <iframe>).
+      3. For each container, calls the LLM (with full report context, crumbs, and adjustment request) to output a corrected version.
+      4. Replaces the original container with the corrected version.
+      5. Optionally updates the reference table if new inline citations appear.
+      6. Appends a corrections summary to the QA log.
     """
     os.environ["OPENAI_API_KEY"] = openai_api_key
     os.environ["SERPAPI_API_KEY"] = serpapi_api_key
     logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
+    # Step 1: Identify unique plain text string(s) that pinpoint the targeted area.
     prompt_identify = (
         f"You are a meticulous technical editor. Below is the full report HTML together with a "
+        f"user adjustment request. Extract one or more unique plain-text string(s) (without any HTML tags or formatting) "
+        f"that uniquely appear in the area targeted by the adjustment request. Output them in a JSON object with the key "
+        f"\"identified_unique_strings\" mapped to a list of strings.\n\n"
         f"Full Report HTML:\n{report_html}\n\n"
         f"User Adjustment Request:\n{adjustment_request}\n\n"
         f"Only output valid JSON."
     logging.info(f"fine_tune_report: Raw unique string identification response: {response_identify}")
     try:
+        response_identify = clean_llm_response(response_identify)
         id_data = json.loads(response_identify)
         unique_strings = id_data.get("identified_unique_strings", [])
     except Exception as e:
         logging.warning("fine_tune_report: No unique strings were identified for adjustment. Returning original report.")
         return report_html, qa
+    # Step 2: For each unique string, locate the corresponding outer container.
     soup = BeautifulSoup(report_html, "html.parser")
     corrections_summary = []
     for uniq_str in unique_strings:
         uniq_str = uniq_str.strip()
         container_html = expand_snippet_area(report_html, uniq_str)
         if not container_html:
             logging.warning(f"fine_tune_report: Could not locate a container for unique string: {uniq_str}")
             continue
         candidate = soup.find(lambda tag: container_html in str(tag))
         if not candidate:
+            logging.warning(f"fine_tune_report: The container for unique string was not found: {uniq_str}")
             continue
         original_container_html = str(candidate)
         logging.info("fine_tune_report: Found container for unique string adjustment.")
         # Step 3: Call the LLM to adjust this container.
         prompt_adjust = (
             f"You are a technical editor. Given the following HTML container (with its outer tags) extracted "
             f"from a larger report and based on the user adjustment request, produce a corrected version by making "
+            f"only the necessary changes. Preserve inline citations, formatting, and context. Ensure the updated content "
+            f"remains consistent with the overall report style. Output a JSON object with two keys: "
+            f"\"improved\" (the corrected container's full HTML) and \"summary\" (a brief explanation of the changes).\n\n"
             f"Overall Report HTML:\n{report_html}\n\n"
             f"Original Container to Adjust:\n{original_container_html}\n\n"
             f"User Adjustment Request:\n{adjustment_request}\n\n"
         response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
         logging.info(f"fine_tune_report: Raw container adjustment response: {response_adjust}")
         try:
+            response_adjust = clean_llm_response(response_adjust)
             adjust_data = json.loads(response_adjust)
             corrected_container = adjust_data.get("improved", "").strip()
             container_summary = adjust_data.get("summary", "").strip()
             continue
         if not corrected_container:
+            logging.warning("fine_tune_report: No improved container was generated; skipping this container.")
             continue
         corrections_summary.append(f"Container corrected: {container_summary}")
+        # Step 4: Replace the original container with the updated container.
         candidate.replace_with(BeautifulSoup(corrected_container, "html.parser"))
+        logging.info("fine_tune_report: Updated container re-injected.")
     updated_report_html = str(soup)
+    # Step 5: (Optional) Update reference table if needed.
     prompt_refs = (
         f"You are a technical editor. Review the following updated report HTML. "
+        f"If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table, "
+        f"generate an updated Reference Summary Table as valid HTML. Output only the updated reference table HTML with no commentary.\n\n"
         f"Updated Report HTML:\n{updated_report_html}"
     )
     updated_refs = openai_call(prompt=prompt_refs, model="o3-mini", max_tokens_param=1000, temperature=0.5)
+    updated_refs = clean_llm_response(updated_refs)
     if updated_refs:
         soup_updated = BeautifulSoup(updated_report_html, "html.parser")
                     next_sibling.replace_with(new_ref_html)
                     logging.info("fine_tune_report: Reference table updated successfully.")
                 except Exception as e:
+                    logging.error(f"fine_tune_report: Error updating reference table: {e}")
             else:
+                logging.info("fine_tune_report: No sibling after reference heading; skipping update.")
             updated_report_html = str(soup_updated)
         else:
+            logging.info("fine_tune_report: No reference table heading found; skipping reference update.")
     else:
         logging.info("fine_tune_report: No updated reference table returned; leaving references unchanged.")
+    # Step 6: Append corrections summary to the QA log.
     global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
     updated_qa = qa.strip() + "\n----------\n" + global_summary
+    logging.info("fine_tune_report: Fine-tuning complete.")
     return updated_report_html, updated_qa
 def suggest_improvements(report_html: str, openai_api_key: str, serpapi_api_key: str) -> str: