Guiyom committed on
Commit
3bddbef
·
verified ·
1 Parent(s): c264689

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -24
app.py CHANGED
@@ -86,6 +86,28 @@ def send_chat_message(user_message, chat_history, report_text, crumbs_text):
86
  updated_history = chat_history + [[user_message, answer]]
87
  return updated_history, ""
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  def fine_tune_report(adjustmentguidelines: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
90
  initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
91
  complementary_guidance: str) -> (str, str):
@@ -93,12 +115,16 @@ def fine_tune_report(adjustmentguidelines: str, openai_api_key: str, serpapi_api
93
  os.environ["OPENAI_API_KEY"] = openai_api_key
94
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
95
 
96
- # Parse the existing report HTML.
97
  soup = BeautifulSoup(report_html, "html.parser")
98
 
99
- # --- Specific adjustment: extract only the snippets indicated by the user ---
 
 
 
 
100
  if adjustmentguidelines.strip():
101
- extraction_prompt = f"""You are a technical editor. Review the following report HTML and, based on the specific user instruction below, extract only the precise HTML snippet(s) (including surrounding context if needed) that must be improved.
102
 
103
  User Instruction: "{adjustmentguidelines}"
104
 
@@ -106,7 +132,7 @@ Report HTML:
106
  {report_html}
107
 
108
  Provide a JSON object with a single key "identified_snippets" mapping to an array of HTML snippets that require adjustment.
109
- Do not output any extra commentary or markdown formatting.
110
  """
111
  extraction_result = openai_call(prompt=extraction_prompt, model="o3-mini", max_tokens_param=1500, temperature=0.5)
112
  try:
@@ -117,11 +143,14 @@ Do not output any extra commentary or markdown formatting.
117
  logging.error(f"Error extracting snippets: {e}. Raw result: {extraction_result}")
118
  identified_snippets = []
119
  if identified_snippets:
120
- all_chunks = identified_snippets
121
- # Use the exact provided user instruction as the guideline for every extracted snippet.
122
- all_guidelines = [adjustmentguidelines.strip() for _ in range(len(identified_snippets))]
123
- # Use a default token size (or adjust as needed).
124
- all_token_sizes = [1000] * len(identified_snippets)
 
 
 
125
  else:
126
  logging.info("No specific snippets extracted with the adjustment instruction. Falling back to default global analysis.")
127
  all_chunks = []
@@ -132,15 +161,16 @@ Do not output any extra commentary or markdown formatting.
132
  all_guidelines = []
133
  all_token_sizes = []
134
 
135
- # --- Fallback logic if no specific snippets provided or extracted ---
136
  if not all_chunks:
137
  designated_chunks = soup.find_all("div", class_="improvable-chunk")
138
- global_chunk_prompt = f"""Review the entire report HTML provided below and identify specific sections that should be improved for clarity, consistency, and overall readability at a chunk level. The identified chunks should be distributed across the document in order to enhance alignment with the initial request and the insights from the complementary guidance.
 
139
 
140
- Please provide three pieces of information in a JSON object with exactly three fields (no extra commentary):
141
 
142
  "identified_chunks": An array of HTML snippets representing the chunks to be adjusted.
143
- "chunk_adjustment_guidelines": A list of guideline strings (each with bullet points) specifying the adjustments for each chunk.
144
  "chunk_token_sizes": A list of integers indicating the recommended token size for processing each corresponding chunk.
145
 
146
  Report HTML:
@@ -174,7 +204,6 @@ Knowledge Crumbs (search results):
174
  chunk_adjustment_guidelines_from_llm = []
175
  chunk_token_sizes_from_llm = []
176
 
177
- # Process designated chunks (if any)
178
  designated_chunks_html = []
179
  designated_guidelines = []
180
  designated_token_sizes = []
@@ -184,8 +213,8 @@ Knowledge Crumbs (search results):
184
  designated_prompt = f"""Given the following report chunk:
185
  {chunk_html}
186
 
187
- Generate a JSON object with exactly two fields (no extra commentary):
188
- "guideline": A string with bullet-point guidelines on how to adjust this chunk, ensuring that modifications align with the research query and that citations are updated as needed ([x]).
189
  "token_size": An integer representing the recommended token size for processing this chunk.
190
  """
191
  try:
@@ -200,7 +229,6 @@ Generate a JSON object with exactly two fields (no extra commentary):
200
  designated_guidelines.append("")
201
  designated_token_sizes.append(1000)
202
  designated_chunks_html.append(chunk_html)
203
- # Merge the global LLM results and any designated chunks.
204
  all_chunks = []
205
  all_guidelines = []
206
  all_token_sizes = []
@@ -212,7 +240,6 @@ Generate a JSON object with exactly two fields (no extra commentary):
212
  all_chunks.extend(identified_chunks_from_llm)
213
  all_guidelines.extend(chunk_adjustment_guidelines_from_llm)
214
  all_token_sizes.extend(chunk_token_sizes_from_llm)
215
- # If still nothing, fall back to grouping paragraphs.
216
  if not all_chunks:
217
  all_paragraphs = soup.find_all("p")
218
  group_size = max(1, len(all_paragraphs) // 10)
@@ -230,7 +257,7 @@ Generate a JSON object with exactly two fields (no extra commentary):
230
 
231
  improvements_summary = [] # To store a plain text summary for each processed chunk
232
 
233
- # --- Process each chunk individually ---
234
  for idx, (chunk_html, guideline, token_size) in enumerate(zip(all_chunks, all_guidelines, all_token_sizes), start=1):
235
  chunk_prompt = f"""Improve the following report chunk based on these guidelines:
236
  {guideline}
@@ -247,7 +274,7 @@ Knowledge Crumbs: {knowledge_crumbs}
247
  Complementary Guidance: {complementary_guidance}
248
  Full Report: {report_html}
249
 
250
- Please output a JSON object with exactly two fields (no extra commentary):
251
  {{"improved": "<the improved chunk in valid HTML>", "summary": "<a brief summary of changes>"}}
252
  """
253
  try:
@@ -258,7 +285,6 @@ Please output a JSON object with exactly two fields (no extra commentary):
258
  chunk_summary = chunk_json.get("summary")
259
  if improved_chunk and chunk_summary:
260
  improvements_summary.append(f"Chunk {idx}: {chunk_summary}")
261
- # Replace the original chunk; find the corresponding content in the soup and replace it.
262
  orig = BeautifulSoup(chunk_html, "html.parser")
263
  new_chunk = BeautifulSoup(improved_chunk, "html.parser")
264
  replacement = soup.find(string=lambda text: text and text.strip() in orig.get_text())
@@ -274,12 +300,9 @@ Please output a JSON object with exactly two fields (no extra commentary):
274
  except Exception as e:
275
  logging.error(f"Error processing chunk {idx}: {e}. Raw result: {chunk_result}")
276
 
277
- # Get the updated report HTML as a string.
278
  final_report_html = str(soup)
279
- # Create a plain text summary combining all improvements.
280
  summary_text = "Summary of Fine-Tuning Improvements:\n" + "\n".join(improvements_summary)
281
  global_summary = "Combined Chunk Improvement Guidelines:\n" + "\n".join(all_guidelines)
282
- # Append both summaries (with a separator) to the original Q&A.
283
  updated_qa = qa.strip() + "\n----------\n" + global_summary + "\n" + summary_text
284
 
285
  return final_report_html, updated_qa
 
86
  updated_history = chat_history + [[user_message, answer]]
87
  return updated_history, ""
88
 
89
def expand_snippet_area(full_html: str, snippet: str) -> str:
    """
    Return the outer HTML of the smallest allowed container enclosing *snippet*.

    Given the full report HTML and a snippet (itself an HTML fragment, as
    produced by the LLM extraction step), locate the deepest element whose
    text contains the snippet's text, then walk upward through the DOM while
    the parent is one of the allowed container tags (div, table, iframe,
    section), stopping before reaching <body>.

    Args:
        full_html: The complete report HTML document.
        snippet: An HTML fragment extracted from the report.

    Returns:
        The outer HTML of the expanded container, or *snippet* unchanged when
        no enclosing element can be located.
    """
    allowed_tags = {"div", "table", "iframe", "section"}
    soup = BeautifulSoup(full_html, "html.parser")

    # The snippet may contain markup, but tag.get_text() is plain text;
    # strip the snippet's own markup first so we compare text against text.
    # (Testing the raw HTML snippet against get_text() would almost never
    # match, silently degrading the function to returning the bare snippet.)
    snippet_text = BeautifulSoup(snippet, "html.parser").get_text().strip()
    if not snippet_text:
        return snippet

    # soup.find(...) would return the FIRST tag in document order whose text
    # contains the snippet -- i.e. the outermost ancestor (often <html>),
    # defeating the purpose of the upward expansion.  Instead collect every
    # matching tag and take the last: pre-order traversal lists ancestors
    # before descendants, so the final match is the deepest element whose
    # text still contains the full snippet text.
    matches = [tag for tag in soup.find_all(True) if snippet_text in tag.get_text()]
    if not matches:
        return snippet

    current = matches[-1]
    # Expand upward through allowed containers only, never crossing <body>.
    while (current.parent is not None
           and current.parent.name
           and current.parent.name.lower() != "body"):
        if current.parent.name.lower() in allowed_tags:
            current = current.parent
        else:
            break
    return str(current)
111
  def fine_tune_report(adjustmentguidelines: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
112
  initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
113
  complementary_guidance: str) -> (str, str):
 
115
  os.environ["OPENAI_API_KEY"] = openai_api_key
116
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
117
 
118
+ # Parse the entire report HTML.
119
  soup = BeautifulSoup(report_html, "html.parser")
120
 
121
+ # --- Specific adjustment extraction ---
122
+ # When an adjustment instruction (such as "rewrite the visual after 6.1") is provided,
123
+ # ask the LLM to locate every occurrence and extract the HTML snippet that needs adjustment.
124
+ # These snippets might be many (or none), and for each snippet, we then expand the selection
125
+ # to include its outer container (like a div, table, or iframe) for full context.
126
  if adjustmentguidelines.strip():
127
+ extraction_prompt = f"""You are a technical editor. Review the following report HTML and, based on the specific user instruction below, extract only the precise HTML snippet(s) (including any meaningful surrounding context) that must be improved.
128
 
129
  User Instruction: "{adjustmentguidelines}"
130
 
 
132
  {report_html}
133
 
134
  Provide a JSON object with a single key "identified_snippets" mapping to an array of HTML snippets that require adjustment.
135
+ Do not include any additional commentary or markdown formatting.
136
  """
137
  extraction_result = openai_call(prompt=extraction_prompt, model="o3-mini", max_tokens_param=1500, temperature=0.5)
138
  try:
 
143
  logging.error(f"Error extracting snippets: {e}. Raw result: {extraction_result}")
144
  identified_snippets = []
145
  if identified_snippets:
146
+ expanded_snippets = []
147
+ # Process EVERY occurrence in the returned array.
148
+ for snippet in identified_snippets:
149
+ expanded = expand_snippet_area(report_html, snippet)
150
+ expanded_snippets.append(expanded)
151
+ all_chunks = expanded_snippets
152
+ all_guidelines = [adjustmentguidelines.strip() for _ in range(len(expanded_snippets))]
153
+ all_token_sizes = [1000] * len(expanded_snippets)
154
  else:
155
  logging.info("No specific snippets extracted with the adjustment instruction. Falling back to default global analysis.")
156
  all_chunks = []
 
161
  all_guidelines = []
162
  all_token_sizes = []
163
 
164
+ # --- Fallback global analysis if no specific snippets were extracted ---
165
  if not all_chunks:
166
  designated_chunks = soup.find_all("div", class_="improvable-chunk")
167
+ global_chunk_prompt = f"""Review the entire report HTML provided below and identify specific sections that should be improved for clarity, consistency, and overall readability.
168
+ The identified chunks should be distributed across the document in order to enhance alignment with the initial request and complementary guidance.
169
 
170
+ Please provide a JSON object with exactly three keys (without additional commentary):
171
 
172
  "identified_chunks": An array of HTML snippets representing the chunks to be adjusted.
173
+ "chunk_adjustment_guidelines": A list of guideline strings (each with bullet points) for each chunk.
174
  "chunk_token_sizes": A list of integers indicating the recommended token size for processing each corresponding chunk.
175
 
176
  Report HTML:
 
204
  chunk_adjustment_guidelines_from_llm = []
205
  chunk_token_sizes_from_llm = []
206
 
 
207
  designated_chunks_html = []
208
  designated_guidelines = []
209
  designated_token_sizes = []
 
213
  designated_prompt = f"""Given the following report chunk:
214
  {chunk_html}
215
 
216
+ Generate a JSON object with exactly two keys (no extra commentary):
217
+ "guideline": A string with bullet-point guidelines on how to adjust this chunk, ensuring modifications align with the research query and that citations are updated ([x]).
218
  "token_size": An integer representing the recommended token size for processing this chunk.
219
  """
220
  try:
 
229
  designated_guidelines.append("")
230
  designated_token_sizes.append(1000)
231
  designated_chunks_html.append(chunk_html)
 
232
  all_chunks = []
233
  all_guidelines = []
234
  all_token_sizes = []
 
240
  all_chunks.extend(identified_chunks_from_llm)
241
  all_guidelines.extend(chunk_adjustment_guidelines_from_llm)
242
  all_token_sizes.extend(chunk_token_sizes_from_llm)
 
243
  if not all_chunks:
244
  all_paragraphs = soup.find_all("p")
245
  group_size = max(1, len(all_paragraphs) // 10)
 
257
 
258
  improvements_summary = [] # To store a plain text summary for each processed chunk
259
 
260
+ # --- Process each chunk ---
261
  for idx, (chunk_html, guideline, token_size) in enumerate(zip(all_chunks, all_guidelines, all_token_sizes), start=1):
262
  chunk_prompt = f"""Improve the following report chunk based on these guidelines:
263
  {guideline}
 
274
  Complementary Guidance: {complementary_guidance}
275
  Full Report: {report_html}
276
 
277
+ Please output a JSON object with exactly two keys (no extra commentary):
278
  {{"improved": "<the improved chunk in valid HTML>", "summary": "<a brief summary of changes>"}}
279
  """
280
  try:
 
285
  chunk_summary = chunk_json.get("summary")
286
  if improved_chunk and chunk_summary:
287
  improvements_summary.append(f"Chunk {idx}: {chunk_summary}")
 
288
  orig = BeautifulSoup(chunk_html, "html.parser")
289
  new_chunk = BeautifulSoup(improved_chunk, "html.parser")
290
  replacement = soup.find(string=lambda text: text and text.strip() in orig.get_text())
 
300
  except Exception as e:
301
  logging.error(f"Error processing chunk {idx}: {e}. Raw result: {chunk_result}")
302
 
 
303
  final_report_html = str(soup)
 
304
  summary_text = "Summary of Fine-Tuning Improvements:\n" + "\n".join(improvements_summary)
305
  global_summary = "Combined Chunk Improvement Guidelines:\n" + "\n".join(all_guidelines)
 
306
  updated_qa = qa.strip() + "\n----------\n" + global_summary + "\n" + summary_text
307
 
308
  return final_report_html, updated_qa