Guiyom committed on
Commit
5a160ac
·
verified ·
1 Parent(s): 9f7d347

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -99
app.py CHANGED
@@ -97,157 +97,127 @@ def clean_llm_response(response: str) -> str:
97
  import logging
98
  from bs4 import BeautifulSoup
99
 
100
def expand_snippet_area(full_html: str, snippet: str) -> str:
    """
    Locate the deepest element of *full_html* whose text contains *snippet*,
    then widen the selection to a sensible outer container.

    Walking upward from the matched element (stopping at <body>), the search
    prefers, in priority order:
      1. the highest-level <iframe> ancestor found,
      2. otherwise the first <div> or <table> ancestor found,
      3. otherwise the matched element itself.

    Parameters
    ----------
    full_html : str
        The complete HTML document to search.
    snippet : str
        Plain text expected to appear verbatim in the document.

    Returns
    -------
    str
        The outer HTML of the chosen container, or *snippet* unchanged when
        no element contains it (callers cannot distinguish this from a found
        container by type — only by content).
    """
    allowed_tags = {"div", "table"}

    logging.info("Parsing full HTML with BeautifulSoup.")
    soup = BeautifulSoup(full_html, "html.parser")

    # Lazy %-style args: avoids formatting (and str()-serializing elements)
    # when the log level is disabled, unlike the previous f-strings.
    logging.info("Searching for all elements containing the snippet: '%s'", snippet)
    # Every tag whose aggregated text contains the snippet.
    candidates = soup.find_all(lambda tag: tag.get_text() and snippet in tag.get_text())
    if not candidates:
        logging.info("No element containing the snippet was found. Returning snippet.")
        return snippet

    # Deepest candidate (most ancestors) == smallest container holding the snippet.
    candidate = max(candidates, key=lambda tag: len(list(tag.parents)))
    logging.info("Candidate element selected based on depth (<%s>): %s", candidate.name, candidate)

    iframe_candidate = None
    allowed_candidate = None

    # Iterate upward from the candidate's direct parent.
    current = candidate.parent
    while current is not None and current.name.lower() != "body":
        logging.info("Evaluating parent element: <%s>", current.name)
        tag_name = current.name.lower()
        if tag_name == "iframe":
            # Keep overwriting so the HIGHEST-level iframe wins.
            iframe_candidate = current
            logging.info("Found an <iframe> container; updating iframe_candidate.")
        elif tag_name in allowed_tags and allowed_candidate is None:
            # Only the FIRST (innermost) div/table is kept.
            allowed_candidate = current
            logging.info("Found allowed container <%s>; setting allowed_candidate.", tag_name)
        current = current.parent

    # Priority: iframe > first div/table > the candidate itself.
    if iframe_candidate is not None:
        logging.info("Returning outer HTML of the iframe container.")
        return str(iframe_candidate)
    if allowed_candidate is not None:
        logging.info("No iframe found; returning outer HTML of the first allowed container (div/table).")
        return str(allowed_candidate)
    logging.info("No iframe, div, or table container found; returning candidate element's HTML.")
    return str(candidate)
153
 
154
 
 
155
  def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
156
  initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
157
  complementary_guidance: str) -> (str, str):
158
  """
159
- Fine-tunes an HTML report based on a user’s correction request by processing complete container elements.
160
-
161
- Process Overview:
162
- 1. The function submits the full report HTML together with the user’s adjustment request to the LLM.
163
- The prompt instructs the model to output a JSON object containing one or more unique plain text string(s)
164
- that are representative of the targeted area(s) in the report.
165
- 2. For each returned unique string, the algorithm uses BeautifulSoup (and expand_snippet_area)
166
- to search for that text exactly and select the outer container (<div>, <table>, or <iframe>).
167
- 3. For each container, a second LLM call is made that takes in the container’s full HTML, the full report context,
168
- and the user adjustment request, and outputs a corrected version.
169
- 4. The code then replaces the original container with the updated version in the BeautifulSoup object.
170
- 5. If new inline citations have been introduced (beyond those in the reference table), a final LLM call updates
171
- the reference table.
172
- 6. A summary of all corrections is appended to the QA log.
173
  """
 
 
 
174
  os.environ["OPENAI_API_KEY"] = openai_api_key
175
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
176
 
177
  logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
178
 
179
- # Step 1: Get unique plain text string(s) (without HTML) that identify the targeted area.
180
- prompt_identify = (f"""
181
- You are a meticulous technical editor.
182
-
183
- Below is the full report HTML and a user adjustment request.
184
- Extract one or more unique plain-text string(s) (without any HTML tags or formatting) that uniquely appear in the area(s) targeted by the adjustment request.
185
-
186
- // Examples
187
- 1) if the user request to "Add xyz in the conclusion", the unique string to identify should be specific to the conclusion
188
- 2) if the user request to "correct the graph after section 1.2", the unique string should be one of the string that appear specifically in the graph after section 1.2 (ex: the title)
189
- 3) if the user request is "Remove any mention about the car industry", the unique string(s) should be a sentence that would be in a paragraph of the report that would talk about car industry
190
- --> The unique string is what would allow to identify precisely through a search the section targeted by the user request, it has to be concise and unique.
191
-
192
- Output them in a JSON object with the key "identified_unique_strings" mapped to a list of strings.
193
- Ensure these strings exactly match the content in the report.
194
-
195
- Full Report HTML:
196
- {report_html}
197
-
198
- User Adjustment Request:
199
- {adjustment_request}
200
-
201
  Only output valid JSON."""
202
  )
203
-
204
  response_identify = openai_call(prompt=prompt_identify, model="o3-mini", max_tokens_param=5000, temperature=0)
205
- logging.info(f"fine_tune_report: Raw unique string identification response: {response_identify}")
206
-
207
  try:
208
  response_identify = clean_llm_response(response_identify.strip().strip("```"))
209
  id_data = json.loads(response_identify)
210
  unique_strings = id_data.get("identified_unique_strings", [])
211
  except Exception as e:
212
- logging.error(f"fine_tune_report: Error parsing unique strings JSON: {e}")
213
  unique_strings = []
214
 
215
  if not unique_strings:
216
  logging.warning("fine_tune_report: No unique strings were identified for adjustment. Returning original report.")
217
  return report_html, qa
218
-
219
- # Step 2: For each unique string, find the corresponding outer container.
220
  soup = BeautifulSoup(report_html, "html.parser")
221
  corrections_summary = []
 
222
  for uniq_str in unique_strings:
223
  uniq_str = uniq_str.strip()
224
- # Use expand_snippet_area to get the outer container for the unique text.
225
- container_html = expand_snippet_area(report_html, uniq_str)
226
- if not container_html:
227
- logging.warning(f"fine_tune_report: Could not locate a container for unique string: {uniq_str}")
228
- continue
229
- candidate = soup.find(lambda tag: container_html in str(tag))
230
- if not candidate:
231
- logging.warning(f"fine_tune_report: The container for unique string was not found: {uniq_str}")
232
  continue
233
-
234
- original_container_html = str(candidate)
235
- logging.info(f"fine_tune_report: Found container for unique string adjustment:\n\n{original_container_html}\n\n")
236
 
237
  # Step 3: Call the LLM to adjust this container.
238
- prompt_adjust = (f"""
239
- You are a technical editor.
240
  Given the following HTML container (with its outer tags) extracted from a larger report and based on the user adjustment request,
241
  produce a corrected version by making only the necessary changes. Preserve inline citations, formatting, and context.
242
- The updated version will put back precisely in the same location, the output should have the same outer tags.
243
 
244
- // Context
245
  - Overall Report HTML:
246
  {report_html}
247
- - Knowledge Crumbs you can use if relevant - collected from various search results:
248
  {knowledge_crumbs}
249
 
250
- // Request
251
  - Original Container to Adjust:
252
  {original_container_html}
253
 
@@ -259,7 +229,8 @@ Additional Guidance:
259
  - Complementary Guidance:
260
  {complementary_guidance}
261
 
262
- Ensure the updated content remains consistent with the overall report style. Output a JSON object with exactly two keys:
 
263
  - "improved" (the corrected container's full HTML) and
264
  - "summary" (a brief explanation of the changes)
265
 
@@ -267,30 +238,31 @@ Only output valid JSON."""
267
  )
268
 
269
  response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
270
- logging.info(f"fine_tune_report: Raw container adjustment response: {response_adjust}")
271
  try:
272
  response_adjust = clean_llm_response(response_adjust.strip().strip("```"))
273
- logging.info(f"Cleaned container adjustment response: {response_adjust}")
274
  adjust_data = json.loads(response_adjust)
275
  corrected_container = adjust_data.get("improved", "").strip()
276
  container_summary = adjust_data.get("summary", "").strip()
277
  except Exception as e:
278
- logging.error(f"fine_tune_report: Error parsing container adjustment JSON: {e}")
279
  continue
280
-
281
  if not corrected_container:
282
- logging.warning("fine_tune_report: No improved container was generated; skipping.")
283
  continue
284
-
285
  corrections_summary.append(f"Container corrected: {container_summary}")
286
 
287
- # Step 4: Replace the original container with the updated one.
288
- candidate.replace_with(BeautifulSoup(corrected_container, "html.parser"))
289
  logging.info("fine_tune_report: Updated container re-injected.")
290
 
291
  updated_report_html = str(soup)
292
 
293
- # Step 5: (Optional) Update the reference table if needed.
 
294
  prompt_refs = (
295
  f"You are a technical editor. Review the following updated report HTML. "
296
  f"If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table, "
@@ -311,16 +283,15 @@ Only output valid JSON."""
311
  next_sibling.replace_with(new_ref_html)
312
  logging.info("fine_tune_report: Reference table updated successfully.")
313
  except Exception as e:
314
- logging.error(f"fine_tune_report: Error updating reference table: {e}")
315
  else:
316
- logging.info("fine_tune_report: No sibling after reference heading; skipping update.")
317
  updated_report_html = str(soup_updated)
318
  else:
319
  logging.info("fine_tune_report: No reference table heading found; reference update skipped.")
320
  else:
321
  logging.info("fine_tune_report: No updated reference table returned; leaving unchanged.")
322
 
323
- # Step 6: Append a summary of corrections to the existing QA log.
324
  global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
325
  updated_qa = qa.strip() + "\n----------\n" + global_summary
326
 
 
97
  import logging
98
  from bs4 import BeautifulSoup
99
 
100
def expand_snippet_area(soup: BeautifulSoup, snippet: str) -> "Tag | None":
    """
    Locate the deepest element of *soup* whose text contains *snippet*,
    then widen the selection to a sensible outer container.

    Walking upward from the matched element (stopping at <body>), the search
    prefers, in priority order:
      1. the highest-level <iframe> ancestor found,
      2. otherwise the first <div> or <table> ancestor found,
      3. otherwise the matched element itself.

    Parameters
    ----------
    soup : BeautifulSoup
        An already-parsed document; the returned Tag belongs to this tree,
        so callers can mutate it in place (e.g. via ``replace_with``).
    snippet : str
        Plain text expected to appear verbatim in the document.

    Returns
    -------
    Tag | None
        The container Tag, or None when no element contains the snippet.
        NOTE: the annotation is a quoted forward reference because ``Tag``
        is not imported at module level — only BeautifulSoup is; an unquoted
        annotation would raise NameError at definition time.
    """
    allowed_tags = {"div", "table"}

    logging.info("Searching for all elements containing the snippet: '%s'", snippet)
    # Every tag whose aggregated text contains the snippet.
    candidates = soup.find_all(lambda tag: tag.get_text() and snippet in tag.get_text())
    if not candidates:
        logging.info("No element containing the snippet was found. Returning None.")
        return None

    # Deepest candidate (most ancestors) == smallest container holding the snippet.
    candidate = max(candidates, key=lambda tag: len(list(tag.parents)))
    logging.info("Candidate element selected based on depth (<%s>): %s", candidate.name, candidate)

    iframe_candidate = None
    allowed_candidate = None

    # Iterate upward from the candidate's parent.
    current = candidate.parent
    while current is not None and current.name.lower() != "body":
        logging.info("Evaluating parent element: <%s>", current.name)
        tag_name = current.name.lower()
        if tag_name == "iframe":
            # Keep overwriting so the HIGHEST-level iframe wins.
            iframe_candidate = current
            logging.info("Found an <iframe> container; updating iframe_candidate.")
        elif tag_name in allowed_tags and allowed_candidate is None:
            # Only the FIRST (innermost) div/table is kept.
            allowed_candidate = current
            logging.info("Found allowed container <%s>; setting allowed_candidate.", tag_name)
        current = current.parent

    # Priority: iframe > first div/table > the candidate itself.
    if iframe_candidate is not None:
        logging.info("Returning the iframe container.")
        return iframe_candidate
    if allowed_candidate is not None:
        logging.info("No iframe found; returning the first allowed container (div/table).")
        return allowed_candidate
    logging.info("No iframe, div, or table container found; returning candidate element.")
    return candidate
148
 
149
 
150
+ # In fine_tune_report, use the same soup instance:
151
  def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
152
  initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
153
  complementary_guidance: str) -> (str, str):
154
  """
155
+ ...
156
+ The function fine-tunes the report by:
157
+ 1. Identifying unique strings in the area to adjust.
158
+ 2. Using expand_snippet_area (which now receives a BeautifulSoup object) to locate the container.
159
+ 3. Calling an LLM to produce an improved container and then replacing the original.
160
+ 4. Optionally updating the reference table and appending a summary.
 
 
 
 
 
 
 
 
161
  """
162
+ import os
163
+ import json
164
+
165
  os.environ["OPENAI_API_KEY"] = openai_api_key
166
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
167
 
168
  logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
169
 
170
+ # Step 1: (LLM call to get unique strings) ...
171
+ # [Assume this part remains unchanged and unique_strings is obtained]
172
+
173
+ prompt_identify = (
174
+ f"""You are a meticulous technical editor.
175
+ ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  Only output valid JSON."""
177
  )
 
178
  response_identify = openai_call(prompt=prompt_identify, model="o3-mini", max_tokens_param=5000, temperature=0)
179
+ logging.info("fine_tune_report: Raw unique string identification response: %s", response_identify)
180
+
181
  try:
182
  response_identify = clean_llm_response(response_identify.strip().strip("```"))
183
  id_data = json.loads(response_identify)
184
  unique_strings = id_data.get("identified_unique_strings", [])
185
  except Exception as e:
186
+ logging.error("fine_tune_report: Error parsing unique strings JSON: %s", e)
187
  unique_strings = []
188
 
189
  if not unique_strings:
190
  logging.warning("fine_tune_report: No unique strings were identified for adjustment. Returning original report.")
191
  return report_html, qa
192
+
193
+ # Step 2: Parse the report HTML once.
194
  soup = BeautifulSoup(report_html, "html.parser")
195
  corrections_summary = []
196
+
197
  for uniq_str in unique_strings:
198
  uniq_str = uniq_str.strip()
199
+ logging.info("fine_tune_report: Processing unique string: '%s'", uniq_str)
200
+ # Use expand_snippet_area to get the container Tag directly.
201
+ container_tag = expand_snippet_area(soup, uniq_str)
202
+ if container_tag is None:
203
+ logging.warning("fine_tune_report: Could not locate a container for unique string: '%s'", uniq_str)
 
 
 
204
  continue
205
+
206
+ original_container_html = str(container_tag)
207
+ logging.info("fine_tune_report: Found container for unique string adjustment:\n\n%s\n", original_container_html)
208
 
209
  # Step 3: Call the LLM to adjust this container.
210
+ prompt_adjust = (
211
+ f"""You are a technical editor.
212
  Given the following HTML container (with its outer tags) extracted from a larger report and based on the user adjustment request,
213
  produce a corrected version by making only the necessary changes. Preserve inline citations, formatting, and context.
214
+ The updated version will be put back in the exact same location and must have the same outer tags.
215
 
 
216
  - Overall Report HTML:
217
  {report_html}
218
+ - Knowledge Crumbs:
219
  {knowledge_crumbs}
220
 
 
221
  - Original Container to Adjust:
222
  {original_container_html}
223
 
 
229
  - Complementary Guidance:
230
  {complementary_guidance}
231
 
232
+ Ensure the updated content remains consistent with the overall report style.
233
+ Output a JSON object with exactly two keys:
234
  - "improved" (the corrected container's full HTML) and
235
  - "summary" (a brief explanation of the changes)
236
 
 
238
  )
239
 
240
  response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
241
+ logging.info("fine_tune_report: Raw container adjustment response: %s", response_adjust)
242
  try:
243
  response_adjust = clean_llm_response(response_adjust.strip().strip("```"))
244
+ logging.info("Cleaned container adjustment response: %s", response_adjust)
245
  adjust_data = json.loads(response_adjust)
246
  corrected_container = adjust_data.get("improved", "").strip()
247
  container_summary = adjust_data.get("summary", "").strip()
248
  except Exception as e:
249
+ logging.error("fine_tune_report: Error parsing container adjustment JSON: %s", e)
250
  continue
251
+
252
  if not corrected_container:
253
+ logging.warning("fine_tune_report: No improved container was generated; skipping correction for this container.")
254
  continue
255
+
256
  corrections_summary.append(f"Container corrected: {container_summary}")
257
 
258
+ # Step 4: Replace the original container with the updated one in our soup.
259
+ container_tag.replace_with(BeautifulSoup(corrected_container, "html.parser"))
260
  logging.info("fine_tune_report: Updated container re-injected.")
261
 
262
  updated_report_html = str(soup)
263
 
264
+ # (Step 5 and Step 6 remain as before to update the reference table and the QA log)
265
+
266
  prompt_refs = (
267
  f"You are a technical editor. Review the following updated report HTML. "
268
  f"If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table, "
 
283
  next_sibling.replace_with(new_ref_html)
284
  logging.info("fine_tune_report: Reference table updated successfully.")
285
  except Exception as e:
286
+ logging.error("fine_tune_report: Error updating reference table: %s", e)
287
  else:
288
+ logging.info("fine_tune_report: No sibling after reference heading; skipping reference update.")
289
  updated_report_html = str(soup_updated)
290
  else:
291
  logging.info("fine_tune_report: No reference table heading found; reference update skipped.")
292
  else:
293
  logging.info("fine_tune_report: No updated reference table returned; leaving unchanged.")
294
 
 
295
  global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
296
  updated_qa = qa.strip() + "\n----------\n" + global_summary
297