Spaces:

10gen
/

deepsearchitv2

Running

App Files Community

Guiyom committed on Feb 26, 2025

Commit

ecebfd6

verified ·

1 Parent(s): 8fdb5c7

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -65

app.py CHANGED Viewed

@@ -10,6 +10,7 @@ import openai
 import PyPDF2
 import tempfile
 import logging
 import markdown
 import unicodedata
 import pdfkit
@@ -88,9 +89,6 @@ Your Answer:"""
     updated_history = chat_history + [[user_message, answer]]
     return updated_history, ""
-import difflib
-from bs4 import BeautifulSoup
 def expand_snippet_area(full_html: str, snippet: str) -> str:
     """
     Given the full HTML and a small snippet (e.g., containing a keyword such as "abc"),
@@ -131,7 +129,6 @@ def find_best_matching_snippet(chunk_html: str, report_html: str) -> str:
         if similarity > best_similarity:
             best_similarity = similarity
             best_snippet = str(tag)
-    # Accept if similarity is reasonably high; threshold can be adjusted.
     if best_similarity > 0.6:
         return best_snippet
     return ""
@@ -165,17 +162,17 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
     Fine-tunes an HTML report based on a user’s correction request by processing complete container elements.
     Process Overview:
-      1. The function submits the full report HTML along with the user’s adjustment request to the LLM.
-         The prompt instructs the model to output a JSON object containing the minimal unique container(s)
-         (including their outer HTML—e.g. <iframe>, <div>, or <table>) that correspond to the content that needs
-         fixing.
-      2. The identified container snippet(s) are then located in the report using BeautifulSoup.
-      3. For each container, a second LLM call is made to generate a corrected version that integrates the user
-         instructions while maintaining context, citations, and overall style.
-      4. The old container markup is replaced by the corrected version directly in the BeautifulSoup object.
-      5. Finally, if new inline citations have been introduced (beyond those in the reference table), a final LLM
-         call updates the reference table.
-      6. A summary of the corrections is appended to the QA log.
     Parameters:
       adjustment_request: A string such as "the visual after 'xyz' is not displaying properly, please fix it" or
@@ -186,88 +183,83 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
       initial_request: The original research query or request.
       qa: Existing clarification Q&A log.
       target_style: The stylistic guidelines the report should follow.
-      knowledge_crumbs: Aggregated source/search result content.
       complementary_guidance: Additional instructions.
     Returns:
       A tuple (updated_report_html, updated_qa) with the corrected report and updated QA log.
     """
-    import os
-    import json
-    import logging
-    from bs4 import BeautifulSoup
-    # Set API keys as environment variables for downstream calls.
     os.environ["OPENAI_API_KEY"] = openai_api_key
     os.environ["SERPAPI_API_KEY"] = serpapi_api_key
     logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
     # ---------------------------------------------------------------
-    # Step 1: Identify container snippet(s) needing adjustment.
     #
-    # The prompt instructs the LLM to scan the full report and output a JSON object
-    # with a key "identified_snippets" that contains complete HTML container elements
-    # (including their outer tags) that uniquely correspond to the section(s) which
-    # should be adjusted per the user request.
     # ---------------------------------------------------------------
     prompt_identify = (
         f"You are a meticulous technical editor. Below is the full report HTML together with a "
-        f"user adjustment request. Identify the minimal, unique container(s) that capture the key content "
-        f"relevant to the user instruction. The containers may be complete HTML elements such as a <div>, <iframe>, "
-        f"<table>, etc. Output a JSON object with the key \"identified_snippets\" that maps to a list of these container "
-        f"HTML snippets ONLY (include the outer tags). No commentary or additional text should be present.\n\n"
         f"Full Report HTML:\n{report_html}\n\n"
         f"User Adjustment Request:\n{adjustment_request}\n\n"
         f"Only output valid JSON."
     )
     response_identify = openai_call(prompt=prompt_identify, model="o3-mini", max_tokens_param=1500, temperature=0)
-    logging.info(f"fine_tune_report: Raw snippet identification response: {response_identify}")
     try:
         response_identify = response_identify.strip().strip("```")
         id_data = json.loads(response_identify)
-        identified_snippets = id_data.get("identified_snippets", [])
     except Exception as e:
-        logging.error(f"fine_tune_report: Error parsing identified snippets JSON: {e}")
-        identified_snippets = []
-    if not identified_snippets:
-        logging.warning("fine_tune_report: No specific container snippets were identified for adjustment. Returning original report.")
         return report_html, qa
     # ---------------------------------------------------------------
-    # Step 2: For each identified container snippet, find it in the report.
     # ---------------------------------------------------------------
     soup = BeautifulSoup(report_html, "html.parser")
-    updated_report_html = report_html
     corrections_summary = []
-    for snippet in identified_snippets:
-        snippet = snippet.strip()
-        # Use BeautifulSoup to search for a tag whose complete outer HTML contains the snippet.
-        candidate = soup.find(lambda tag: snippet in str(tag))
         if not candidate:
-            logging.warning(f"fine_tune_report: The snippet could not be uniquely located in the report:\n{snippet}")
             continue
         original_container_html = str(candidate)
-        logging.info("fine_tune_report: Found container snippet for adjustment.")
         # ---------------------------------------------------------------
-        # Step 3: Send a prompt to the LLM to correct this container.
         #
-        # Here the LLM is given the entire current container (the extracted full HTML)
-        # and the full report context (and other guidance) and is asked to produce a corrected
-        # version that applies the adjustment request.
         # ---------------------------------------------------------------
         prompt_adjust = (
-            f"You are a technical editor. Given the following HTML container (with its outer tags) "
-            f"extracted from a larger report and based on the user adjustment request, produce a corrected "
-            f"version by making only the changes required. Preserve existing inline citations, formatting, and contextual details. "
-            f"Ensure the updated content remains consistent with the overall report style. Output your answer as a JSON object "
-            f"with exactly two keys: \"improved\" (the corrected container's full HTML) and \"summary\" (a brief explanation of the changes applied).\n\n"
             f"Overall Report HTML:\n{report_html}\n\n"
             f"Original Container to Adjust:\n{original_container_html}\n\n"
             f"User Adjustment Request:\n{adjustment_request}\n\n"
@@ -276,34 +268,34 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
         )
         response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
-        logging.info(f"fine_tune_report: Raw adjustment response: {response_adjust}")
         try:
             response_adjust = response_adjust.strip().strip("```")
             adjust_data = json.loads(response_adjust)
             corrected_container = adjust_data.get("improved", "").strip()
-            snippet_summary = adjust_data.get("summary", "").strip()
         except Exception as e:
-            logging.error(f"fine_tune_report: Error parsing snippet adjustment JSON: {e}")
             continue
         if not corrected_container:
-            logging.warning("fine_tune_report: No improved container was returned by the LLM; skipping this snippet.")
             continue
-        corrections_summary.append(f"Container corrected: {snippet_summary}")
         # ---------------------------------------------------------------
-        # Step 4: Replace the original container in the BeautifulSoup object.
         # ---------------------------------------------------------------
         candidate.replace_with(BeautifulSoup(corrected_container, "html.parser"))
-        logging.info("fine_tune_report: Container snippet replaced.")
     # Get the updated report HTML from the modified soup.
     updated_report_html = str(soup)
     # ---------------------------------------------------------------
-    # Step 5: Update the reference table if any new inline citations exist.
     # ---------------------------------------------------------------
     prompt_refs = (
         f"You are a technical editor. Review the following updated report HTML. "
@@ -316,7 +308,6 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
     if updated_refs:
         soup_updated = BeautifulSoup(updated_report_html, "html.parser")
-        # Look for a heading that includes something like "Reference Summary Table"
         ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "Reference Summary Table" in tag.get_text())
         if ref_heading:
             next_sibling = ref_heading.find_next_sibling()
@@ -328,7 +319,7 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
                 except Exception as e:
                     logging.error(f"fine_tune_report: Error replacing the reference table: {e}")
             else:
-                logging.info("fine_tune_report: No sibling element found after reference heading; skipping reference update.")
             updated_report_html = str(soup_updated)
         else:
             logging.info("fine_tune_report: No reference table heading found; reference update skipped.")

 import PyPDF2
 import tempfile
 import logging
+import difflib
 import markdown
 import unicodedata
 import pdfkit
     updated_history = chat_history + [[user_message, answer]]
     return updated_history, ""
 def expand_snippet_area(full_html: str, snippet: str) -> str:
     """
     Given the full HTML and a small snippet (e.g., containing a keyword such as "abc"),
         if similarity > best_similarity:
             best_similarity = similarity
             best_snippet = str(tag)
     if best_similarity > 0.6:
         return best_snippet
     return ""
     Fine-tunes an HTML report based on a user’s correction request by processing complete container elements.
     Process Overview:
+      1. The function submits the full report HTML together with the user’s adjustment request to the LLM.
+         The prompt instructs the model to output a JSON object containing one or more unique plain text string(s)
+         (without HTML tags) that uniquely identify the targeted area(s) in the report.
+      2. For each returned unique string, the algorithm uses BeautifulSoup (and expand_snippet_area) to search for
+         the exact text and select the outer container (<div>, <table>, or <iframe>).
+      3. For each container, a second LLM call is made that takes in the container’s full HTML, the full report context,
+         and the user adjustment request, and outputs a corrected version.
+      4. The code then replaces the original container with the updated version in the BeautifulSoup object.
+      5. If new inline citations have been introduced (beyond those in the reference table), a final LLM call updates
+         the reference table.
+      6. A summary of all corrections is appended to the QA log.
     Parameters:
       adjustment_request: A string such as "the visual after 'xyz' is not displaying properly, please fix it" or
       initial_request: The original research query or request.
       qa: Existing clarification Q&A log.
       target_style: The stylistic guidelines the report should follow.
+      knowledge_crumbs: Aggregated source or search result content.
       complementary_guidance: Additional instructions.
     Returns:
       A tuple (updated_report_html, updated_qa) with the corrected report and updated QA log.
     """
     os.environ["OPENAI_API_KEY"] = openai_api_key
     os.environ["SERPAPI_API_KEY"] = serpapi_api_key
     logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
     # ---------------------------------------------------------------
+    # Step 1: Identify unique string(s) that are representative of the targeted area.
     #
+    # The prompt now asks the LLM to extract one or more unique plain text strings (without HTML)
+    # that appear in the targeted area(s) identified by the user adjustment request. These strings
+    # will be used to locate the corresponding container elements.
     # ---------------------------------------------------------------
     prompt_identify = (
         f"You are a meticulous technical editor. Below is the full report HTML together with a "
+        f"user adjustment request. Identify one or more unique text strings (without any HTML tags or formatting) "
+        f"that are representative of the area(s) targeted by the adjustment request. Return these unique strings in a JSON "
+        f"object with the key \"identified_unique_strings\" mapped to a list of strings. Ensure that these strings are exact "
+        f"as they appear in the report so that they can be used to accurately locate the relevant section(s).\n\n"
         f"Full Report HTML:\n{report_html}\n\n"
         f"User Adjustment Request:\n{adjustment_request}\n\n"
         f"Only output valid JSON."
     )
     response_identify = openai_call(prompt=prompt_identify, model="o3-mini", max_tokens_param=1500, temperature=0)
+    logging.info(f"fine_tune_report: Raw unique string identification response: {response_identify}")
     try:
         response_identify = response_identify.strip().strip("```")
         id_data = json.loads(response_identify)
+        unique_strings = id_data.get("identified_unique_strings", [])
     except Exception as e:
+        logging.error(f"fine_tune_report: Error parsing unique strings JSON: {e}")
+        unique_strings = []
+    if not unique_strings:
+        logging.warning("fine_tune_report: No unique strings were identified for adjustment. Returning original report.")
         return report_html, qa
     # ---------------------------------------------------------------
+    # Step 2: For each unique string, locate its corresponding container.
     # ---------------------------------------------------------------
     soup = BeautifulSoup(report_html, "html.parser")
     corrections_summary = []
+    for uniq_str in unique_strings:
+        uniq_str = uniq_str.strip()
+        # Use expand_snippet_area to get the full container outer HTML that encloses the unique text.
+        container_html = expand_snippet_area(report_html, uniq_str)
+        if not container_html:
+            logging.warning(f"fine_tune_report: Could not locate a container for unique string: {uniq_str}")
+            continue
+        # Now, search the soup for a tag that includes this container HTML.
+        candidate = soup.find(lambda tag: container_html in str(tag))
         if not candidate:
+            logging.warning(f"fine_tune_report: The container for the unique string was not found in the report:\n{uniq_str}")
             continue
         original_container_html = str(candidate)
+        logging.info("fine_tune_report: Found container for unique string adjustment.")
         # ---------------------------------------------------------------
+        # Step 3: Call the LLM to adjust this container.
         #
+        # Pass the entire container HTML, the full report context, and the adjustment request.
+        # The LLM should output a JSON object with the keys "improved" and "summary".
         # ---------------------------------------------------------------
         prompt_adjust = (
+            f"You are a technical editor. Given the following HTML container (with its outer tags) extracted "
+            f"from a larger report and based on the user adjustment request, produce a corrected version by making "
+            f"only the necessary changes. Preserve existing inline citations, formatting, and context. Ensure the updated content "
+            f"remains consistent with the overall report style. Output your answer as a JSON object with exactly two keys: "
+            f"\"improved\" (the corrected container's full HTML) and \"summary\" (a brief explanation of the changes applied).\n\n"
             f"Overall Report HTML:\n{report_html}\n\n"
             f"Original Container to Adjust:\n{original_container_html}\n\n"
             f"User Adjustment Request:\n{adjustment_request}\n\n"
         )
         response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
+        logging.info(f"fine_tune_report: Raw container adjustment response: {response_adjust}")
         try:
             response_adjust = response_adjust.strip().strip("```")
             adjust_data = json.loads(response_adjust)
             corrected_container = adjust_data.get("improved", "").strip()
+            container_summary = adjust_data.get("summary", "").strip()
         except Exception as e:
+            logging.error(f"fine_tune_report: Error parsing container adjustment JSON: {e}")
             continue
         if not corrected_container:
+            logging.warning("fine_tune_report: No improved container was returned by the LLM; skipping this container.")
             continue
+        corrections_summary.append(f"Container corrected: {container_summary}")
         # ---------------------------------------------------------------
+        # Step 4: Replace the original container with the corrected container in the BeautifulSoup object.
         # ---------------------------------------------------------------
         candidate.replace_with(BeautifulSoup(corrected_container, "html.parser"))
+        logging.info("fine_tune_report: Updated container re-injected into the report.")
     # Get the updated report HTML from the modified soup.
     updated_report_html = str(soup)
     # ---------------------------------------------------------------
+    # Step 5: (Optional) Update the reference table if new inline citations exist.
     # ---------------------------------------------------------------
     prompt_refs = (
         f"You are a technical editor. Review the following updated report HTML. "
     if updated_refs:
         soup_updated = BeautifulSoup(updated_report_html, "html.parser")
         ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "Reference Summary Table" in tag.get_text())
         if ref_heading:
             next_sibling = ref_heading.find_next_sibling()
                 except Exception as e:
                     logging.error(f"fine_tune_report: Error replacing the reference table: {e}")
             else:
+                logging.info("fine_tune_report: No sibling element found after reference table heading; skipping reference update.")
             updated_report_html = str(soup_updated)
         else:
             logging.info("fine_tune_report: No reference table heading found; reference update skipped.")