Guiyom commited on
Commit
db9a2da
·
verified ·
1 Parent(s): f31625d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -201
app.py CHANGED
@@ -158,229 +158,147 @@ Chunk to be replaced:
158
  unique_snippet = find_best_matching_snippet(chunk_html, report_html)
159
  return unique_snippet
160
 
161
def fine_tune_report(adjustmentguidelines: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
                     initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
                     complementary_guidance: str) -> (str, str):
    """
    Iteratively improve an HTML report, chunk by chunk, using LLM calls.

    Flow:
      1. If a specific adjustment instruction is given, ask the LLM to extract
         the exact report snippet(s) it applies to.
      2. Otherwise fall back to a global analysis: designated
         ``div.improvable-chunk`` elements plus LLM-identified chunks, or, as a
         last resort, paragraph groups wrapped into chunk divs on the fly.
      3. Each chunk is rewritten by the LLM and substituted back into the
         report via a minimal unique snippet match.
      4. The reference table is regenerated if new inline citations appeared.

    Parameters:
        adjustmentguidelines: Optional user instruction targeting specific content.
        openai_api_key: OpenAI API key, exported to the environment.
        serpapi_api_key: SerpAPI key, exported to the environment.
        report_html: Full HTML of the current report.
        initial_request: The original research query.
        qa: Existing clarification Q&A log.
        target_style: Desired style of the report.
        knowledge_crumbs: Aggregated search/source content.
        complementary_guidance: Any additional guidance.

    Returns:
        (updated_report_html, updated_qa) — the reworked report plus the QA log
        with the improvement summary appended.
    """
    import json
    import logging
    import os
    from bs4 import BeautifulSoup

    def _strip_fences(text: str) -> str:
        # LLM answers are frequently wrapped in markdown fences (``` or
        # ```json). A bare str.strip("`") leaves the language tag behind and
        # breaks json.loads, so peel the fences explicitly.
        text = text.strip()
        if text.startswith("```"):
            text = text[3:]
            lowered = text.lower()
            if lowered.startswith("json") or lowered.startswith("html"):
                text = text[4:]
            text = text.lstrip("\n")
        if text.endswith("```"):
            text = text[:-3]
        return text.strip()

    # Set API keys in environment variables.
    os.environ["OPENAI_API_KEY"] = openai_api_key
    os.environ["SERPAPI_API_KEY"] = serpapi_api_key

    # Parse the original report HTML.
    soup = BeautifulSoup(report_html, "html.parser")
    updated_report_html = report_html  # working copy

    # --- Specific adjustment extraction ---
    if adjustmentguidelines.strip():
        extraction_prompt = f"""You are a technical editor. Review the following report HTML and, based on the specific user instruction below,
extract only the precise HTML snippet(s) (including any meaningful surrounding context) that must be improved.

User Instruction: "{adjustmentguidelines}"

Report HTML:
{report_html}

Provide a JSON object with a single key "identified_snippets" mapping to an array of HTML snippets that require adjustment.
Do not include any additional commentary or markdown formatting.
"""
        extraction_result = openai_call(prompt=extraction_prompt, model="o3-mini", max_tokens_param=1500, temperature=0.5)
        try:
            extraction_result = _strip_fences(extraction_result)
            extraction_json = json.loads(extraction_result)
            identified_snippets = extraction_json.get("identified_snippets", [])
        except Exception as e:
            logging.error(f"Error extracting snippets: {e}. Raw result: {extraction_result}")
            identified_snippets = []
        if identified_snippets:
            # Widen each snippet so the later unique-snippet replacement has
            # enough surrounding context to anchor on.
            expanded_snippets = [expand_snippet_area(report_html, snippet) for snippet in identified_snippets]
            all_chunks = expanded_snippets
            all_guidelines = [adjustmentguidelines.strip() for _ in range(len(expanded_snippets))]
            all_token_sizes = [1000] * len(expanded_snippets)
        else:
            logging.info("No specific snippets extracted with the adjustment instruction. Falling back to default global analysis.")
            all_chunks = []
            all_guidelines = []
            all_token_sizes = []
    else:
        all_chunks = []
        all_guidelines = []
        all_token_sizes = []

    # --- Fallback global analysis if no specific snippets were extracted ---
    if not all_chunks:
        designated_chunks = soup.find_all("div", class_="improvable-chunk")
        global_chunk_prompt = f"""Review the entire report HTML provided below and identify specific sections that should be improved for clarity, consistency, and overall readability.
The identified chunks should be distributed across the document in order to enhance alignment with the initial request and complementary guidance.

Please provide a JSON object with exactly three keys (without additional commentary):

"identified_chunks": An array of HTML snippets representing the chunks to be adjusted.
"chunk_adjustment_guidelines": A list of guideline strings (each with bullet points) for each chunk.
"chunk_token_sizes": A list of integers indicating the recommended token size for processing each corresponding chunk.

Report HTML:
{report_html}

Initial Request:
{initial_request}

Complementary Guidance:
{complementary_guidance}

Clarification Q&A:
{qa}

Target Style:
{target_style}

Knowledge Crumbs (search results):
{knowledge_crumbs}
"""
        # Pre-bind so the except handler can log it even when openai_call
        # itself raises (previously that produced a NameError inside the
        # handler, masking the real error).
        global_result = ""
        try:
            global_result = openai_call(prompt=global_chunk_prompt, model="o3-mini", max_tokens_param=4000, temperature=0.5)
            global_result = _strip_fences(global_result)
            global_json = json.loads(global_result)
            identified_chunks_from_llm = global_json.get("identified_chunks", [])
            chunk_adjustment_guidelines_from_llm = global_json.get("chunk_adjustment_guidelines", [])
            chunk_token_sizes_from_llm = global_json.get("chunk_token_sizes", [])
        except Exception as e:
            logging.error(f"Error during global analysis: {e}. Raw result: {global_result}")
            identified_chunks_from_llm = []
            chunk_adjustment_guidelines_from_llm = []
            chunk_token_sizes_from_llm = []
        designated_chunks_html = []
        designated_guidelines = []
        designated_token_sizes = []
        if designated_chunks:
            for chunk in designated_chunks:
                chunk_html = str(chunk)
                designated_prompt = f"""Given the following report chunk:
{chunk_html}

Generate a JSON object with exactly two keys (no extra commentary):
"guideline": A string with bullet-point guidelines on how to adjust this chunk, ensuring modifications align with the research query and that citations are updated ([x]).
"token_size": An integer representing the recommended token size for processing this chunk.
"""
                # Pre-bind for the same NameError-in-handler reason as above.
                result = ""
                try:
                    result = openai_call(prompt=designated_prompt, model="o3-mini", max_tokens_param=500, temperature=0.5)
                    result = _strip_fences(result)
                    result_json = json.loads(result)
                    designated_guidelines.append(result_json.get("guideline", ""))
                    designated_token_sizes.append(result_json.get("token_size", 1000))
                    designated_chunks_html.append(chunk_html)
                except Exception as e:
                    logging.error(f"Error processing a designated chunk: {e}. Raw result: {result}")
                    # Keep the chunk with defaults so lists stay aligned.
                    designated_guidelines.append("")
                    designated_token_sizes.append(1000)
                    designated_chunks_html.append(chunk_html)
        # Reset all_chunks, guidelines and token sizes
        all_chunks = []
        all_guidelines = []
        all_token_sizes = []
        if designated_chunks_html:
            all_chunks.extend(designated_chunks_html)
            all_guidelines.extend(designated_guidelines)
            all_token_sizes.extend(designated_token_sizes)
        if identified_chunks_from_llm and isinstance(identified_chunks_from_llm, list):
            all_chunks.extend(identified_chunks_from_llm)
            all_guidelines.extend(chunk_adjustment_guidelines_from_llm)
            all_token_sizes.extend(chunk_token_sizes_from_llm)
        if not all_chunks:
            # Last resort: wrap paragraph groups into improvable-chunk divs so
            # there is always something to process.
            all_paragraphs = soup.find_all("p")
            group_size = max(1, len(all_paragraphs) // 10)
            for i in range(0, len(all_paragraphs), group_size):
                new_div = soup.new_tag("div", **{"class": "improvable-chunk"})
                for p in all_paragraphs[i:i+group_size]:
                    new_div.append(p.extract())
                if soup.body:
                    soup.body.append(new_div)
                else:
                    soup.append(new_div)
                all_chunks.append(str(new_div))
                all_guidelines.append("Improve clarity and conciseness; ensure consistency regarding citations ([x]).")
                all_token_sizes.append(1000)

    improvements_summary = []

    # --- Process each chunk with robust DOM-based replacement ---
    for idx, (chunk_html, guideline, token_size) in enumerate(zip(all_chunks, all_guidelines, all_token_sizes), start=1):
        chunk_prompt = f"""Improve the following report chunk based on these guidelines:
{guideline}
Use a maximum of {token_size} tokens to generate the improved content.
IMPORTANT: Only modify parts that require improvement. If no changes are necessary, return the original content unchanged.
Additionally, ensure that the improved content includes concrete real-world examples—such as persons with names and titles, company names, institution names, research report titles, quotes, products, and use-case examples—complete with proper inline citations ([x]) as sourced.

--- Chunk #{idx} Original Content ---
{chunk_html}

Initial Request: {initial_request}
Clarification Q&A: {qa}
Target Style: {target_style}
Knowledge Crumbs: {knowledge_crumbs}
Complementary Guidance: {complementary_guidance}
Full Report: {report_html}

Please output a JSON object with exactly two keys (no extra commentary):
{{"improved": "<the improved chunk in valid HTML>", "summary": "<a brief summary of changes>"}}
"""
        # Pre-bind for the except handler's log message.
        chunk_result = ""
        try:
            chunk_result = openai_call(prompt=chunk_prompt, model="o3-mini", max_tokens_param=token_size+1500)
            chunk_result = _strip_fences(chunk_result)
            chunk_json = json.loads(chunk_result)
            improved_chunk = chunk_json.get("improved")
            chunk_summary = chunk_json.get("summary")
            if improved_chunk and chunk_summary:
                improvements_summary.append(f"Chunk {idx}: {chunk_summary}")
                # Determine a minimal unique snippet for the current chunk.
                unique_snippet = get_unique_snippet(chunk_html, report_html)
                improved_chunk_clean = improved_chunk.strip()
                if unique_snippet and unique_snippet in updated_report_html:
                    updated_report_html = updated_report_html.replace(unique_snippet, improved_chunk_clean, 1)
                else:
                    logging.warning(f"Chunk {idx}: Unable to locate the unique snippet ({unique_snippet}). Replacement not applied.")
            else:
                logging.error(f"Chunk {idx}: Incomplete JSON result: {chunk_result}")
        except Exception as e:
            logging.error(f"Error processing chunk {idx}: {e}. Raw result: {chunk_result}")

    # --- Post-process the report to update the reference table without appending extra content ---
    references_prompt = f"""Review the following report HTML.
If any new inline citations (formatted as [x] where x is a number) have been introduced
that are not yet included in the references table at the end of the report,
generate an updated reference summary table that includes all citations.
Each reference entry must include: reference number, name, author(s), and full URL.
Output only the HTML code for the updated reference table.
Report HTML:
{updated_report_html}
"""
    updated_references = openai_call(prompt=references_prompt, model="o3-mini", max_tokens_param=1000, temperature=0.5)
    updated_references = _strip_fences(updated_references)

    if updated_references:
        soup_updated = BeautifulSoup(updated_report_html, "html.parser")
        ref_heading = soup_updated.find(lambda tag: tag.name == "h1" and "Reference Summary Table" in tag.get_text())
        if ref_heading:
            next_sibling = ref_heading.find_next_sibling()
            if next_sibling:
                new_ref_html = BeautifulSoup(updated_references, "html.parser")
                next_sibling.replace_with(new_ref_html)
            updated_report_html = str(soup_updated)
        else:
            logging.info("No existing reference table found; reference update skipped.")
    else:
        logging.info("Generated updated references empty; leaving original references unchanged.")

    global_summary = "Combined Chunk Improvement Guidelines:\n" + "\n".join(all_guidelines)
    summary_text = "Summary of Fine-Tuning Improvements:\n" + "\n".join(improvements_summary)
    updated_qa = qa.strip() + "\n----------\n" + global_summary + "\n" + summary_text

    return updated_report_html, updated_qa
385
 
386
  def generate_graph_snippet(placeholder_text: str, context: str, initial_query: str, crumbs: str) -> str:
 
158
  unique_snippet = find_best_matching_snippet(chunk_html, report_html)
159
  return unique_snippet
160
 
161
def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
                     initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
                     complementary_guidance: str) -> (str, str):
    """
    Fine-tune an HTML report based on a user's correction request.

    Steps:
      1. Ask the LLM to identify the minimal, unique HTML snippet(s) of the
         report that the adjustment request targets.
      2. For each identified snippet that occurs verbatim in the report, ask
         the LLM for a corrected version (given the full report context,
         target style, knowledge crumbs and complementary guidance).
      3. Substitute each corrected snippet back into the report via exact
         string replacement (first occurrence only).
      4. Ask the LLM to regenerate the Reference Summary Table if new inline
         citations were introduced, and splice it in with BeautifulSoup.
      5. Return the updated report and append a summary of applied
         corrections to the QA log.

    Parameters:
        adjustment_request: The user request for corrections (e.g. "fix the visual after 'xyz'").
        openai_api_key: OpenAI API Key.
        serpapi_api_key: SERPAPI API Key.
        report_html: The full HTML of the current report.
        initial_request: The original research query/original request.
        qa: Existing clarification Q&A.
        target_style: The target style for the report.
        knowledge_crumbs: Aggregated source/crumb content.
        complementary_guidance: Any additional guidance.

    Returns:
        A tuple (updated_report_html, updated_qa). The inputs are returned
        unchanged when no snippet could be identified.
    """
    import os
    import json
    import logging
    from bs4 import BeautifulSoup

    def _strip_fences(text: str) -> str:
        # LLM answers are frequently wrapped in markdown fences (``` or
        # ```json). A bare str.strip("`") leaves the language tag behind and
        # breaks json.loads, so peel the fences explicitly.
        text = text.strip()
        if text.startswith("```"):
            text = text[3:]
            lowered = text.lower()
            if lowered.startswith("json") or lowered.startswith("html"):
                text = text[4:]
            text = text.lstrip("\n")
        if text.endswith("```"):
            text = text[:-3]
        return text.strip()

    # Set API keys in environment variables
    os.environ["OPENAI_API_KEY"] = openai_api_key
    os.environ["SERPAPI_API_KEY"] = serpapi_api_key

    logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")

    # Step 1: Identify the snippet(s) in the report relevant to the adjustment.
    prompt_identify = (f"You are a meticulous technical editor. Below is the full report HTML and a user adjustment request. "
                       f"Based on the user instruction, extract and output the minimal, unique HTML snippet(s) (including their container tags) "
                       f"from the report that need fixing. Output your answer as a JSON object with a key \"identified_snippets\" mapping to a list of HTML snippets only (no commentary).\n\n"
                       f"Full Report HTML:\n{report_html}\n\n"
                       f"User Adjustment Request:\n{adjustment_request}\n\n"
                       f"Only output valid JSON.")

    response_identify = openai_call(prompt=prompt_identify, model="o3-mini", max_tokens_param=1500, temperature=0)
    logging.info(f"fine_tune_report: Raw snippet identification response: {response_identify}")
    try:
        response_identify = _strip_fences(response_identify)
        id_data = json.loads(response_identify)
        identified_snippets = id_data.get("identified_snippets", [])
    except Exception as e:
        logging.error(f"fine_tune_report: Error parsing identified snippets JSON: {e}")
        identified_snippets = []

    # If no snippets were identified, leave the report untouched.
    if not identified_snippets:
        logging.warning("fine_tune_report: No specific snippets were identified for adjustment. Returning original report.")
        return report_html, qa

    # Step 2: For each identified snippet, verify it and prepare to correct it.
    updated_report_html = report_html
    corrections_summary = []

    for snippet in identified_snippets:
        snippet = snippet.strip()
        # Replacement below is a literal string substitution, so skip snippets
        # the LLM paraphrased or hallucinated — they would never match.
        if snippet not in updated_report_html:
            logging.warning(f"fine_tune_report: The following snippet was not found exactly in the report and will be skipped:\n{snippet}")
            continue

        # Step 3: For each snippet, prompt the LLM to apply the user-specified correction.
        prompt_adjust = (f"You are a technical editor. Given the following HTML snippet extracted from a larger report and the user request, "
                         f"make only the changes necessary to address the instruction. Preserve all existing citations, formatting, and context. "
                         f"Ensure that the overall style of the report remains consistent with the provided target style and that any new references (if any) "
                         f"are clearly indicated. Output your answer as a JSON object with two keys: \"improved\" (the corrected HTML snippet) and \"summary\" "
                         f"(a brief summary of the changes applied).\n\n"
                         f"Overall Report HTML:\n{report_html}\n\n"
                         f"Current Snippet to Adjust:\n{snippet}\n\n"
                         f"User Adjustment Request:\n{adjustment_request}\n\n"
                         f"Additional Guidance:\nTarget Style: {target_style}\nKnowledge Crumbs: {knowledge_crumbs}\nComplementary Guidance: {complementary_guidance}\n\n"
                         f"Only output valid JSON.")
        response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
        logging.info(f"fine_tune_report: Raw adjustment response: {response_adjust}")
        try:
            response_adjust = _strip_fences(response_adjust)
            adjust_data = json.loads(response_adjust)
            corrected_snippet = adjust_data.get("improved", "").strip()
            snippet_summary = adjust_data.get("summary", "").strip()
        except Exception as e:
            logging.error(f"fine_tune_report: Error parsing snippet adjustment JSON: {e}")
            continue

        if not corrected_snippet:
            logging.warning("fine_tune_report: No improved snippet was returned by the LLM; skipping this snippet.")
            continue

        corrections_summary.append(f"Changes applied to snippet: {snippet_summary}")
        # Step 4: Replace the original snippet with the improved snippet in the report HTML.
        updated_report_html = updated_report_html.replace(snippet, corrected_snippet, 1)
        logging.info("fine_tune_report: Snippet replaced in the report.")

    # Step 5: Update the reference table. Ask the LLM to review the updated report and generate an updated reference table if needed.
    prompt_refs = (f"You are a technical editor. Review the following updated report HTML. "
                   f"If there are any new inline citations (formatted as [x]) that are not in the existing reference table, "
                   f"generate an updated Reference Summary Table in valid HTML that includes all references. "
                   f"Output only the HTML code for the updated reference table without any extra commentary.\n\n"
                   f"Updated Report HTML:\n{updated_report_html}")
    updated_refs = openai_call(prompt=prompt_refs, model="o3-mini", max_tokens_param=1000, temperature=0.5)
    updated_refs = _strip_fences(updated_refs)

    if updated_refs:
        soup_updated = BeautifulSoup(updated_report_html, "html.parser")
        # Look for a heading that includes "Reference Summary Table"
        ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "Reference Summary Table" in tag.get_text())
        if ref_heading:
            next_sibling = ref_heading.find_next_sibling()
            if next_sibling:
                try:
                    new_ref_html = BeautifulSoup(updated_refs, "html.parser")
                    next_sibling.replace_with(new_ref_html)
                    logging.info("fine_tune_report: Reference table updated successfully.")
                except Exception as e:
                    logging.error(f"fine_tune_report: Error replacing the reference table: {e}")
            else:
                logging.info("fine_tune_report: No sibling element found after the reference heading; skipping reference table update.")
            updated_report_html = str(soup_updated)
        else:
            logging.info("fine_tune_report: No existing reference table heading found; reference update skipped.")
    else:
        logging.info("fine_tune_report: LLM did not return an updated reference table; leaving original references intact.")

    # Step 6: Append corrections summary to the Q&A log.
    global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
    updated_qa = qa.strip() + "\n----------\n" + global_summary

    logging.info("fine_tune_report: Fine-tuning process completed.")
    return updated_report_html, updated_qa
303
 
304
  def generate_graph_snippet(placeholder_text: str, context: str, initial_query: str, crumbs: str) -> str: