Spaces:

10gen
/

deepsearchitv2

Running

App Files Files Community

Guiyom committed on Mar 7, 2025

Commit

508a2e2

verified ·

1 Parent(s): 3e33744

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -55

app.py CHANGED Viewed

@@ -1719,37 +1719,18 @@ def compress_text(text: str, target_length: int) -> str:
 def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
                           aggregated_crumbs: str, references: list, pages: int = 8) -> str:
     """
-    Revised generate_final_report with placeholder allocation decisions in the initial JSON skeleton.
-    The function proceeds as follows:
-    1. **Skeleton Generation:**
-       It first builds a JSON skeleton outline for the report. For core sections,
-       in addition to "section_name", "instructions", "target_word_count", and "key_content_elements",
-       an extra field "placeholders" is generated. This field is an object with boolean values
-       indicating whether to include a visual, graph, and/or focus placeholder.
-       The prompt instructs the LLM that, overall, the report should have:
-         - Visual placeholders between ⌊pages/10⌋ and ⌈pages/5⌉ in total.
-         - Graph placeholders in the same range as visual.
-         - Focus placeholders between ⌊pages/20⌋ and ⌈pages/10⌉ in total.
-       Not every section need have every placeholder.
-    2. **Section Generation:**
-       Each core section is generated using a detailed prompt that incorporates the respective
-       placeholder decisions along with context, initial query, report style, learnings, aggregated crumbs,
-       and previously generated sections.
-    3. **Final Sections Generation and Assembly:**
-       The final sections (Introduction, Abstract, Conclusion, Reference Summary Table) are generated afterward
-       and a Table of Contents is created from all section titles. Finally, all parts are assembled into a complete
-       HTML document.
-    4. **Placeholder Replacement:**
-       Once the HTML report is assembled, the placeholder markers (e.g. [[Visual Placeholder: …]]) are replaced
-       via the dedicated replacement functions.
     """
-    import json, logging
     # Calculate overall target word count (approximate)
     total_word_count = pages * 500
@@ -1757,7 +1738,7 @@ def generate_final_report(initial_query: str, context: str, reportstyle: str, le
         "No external summaries were directly extracted. It is not possible to analyze relevance."
     )
-    # Step 1: Generate the JSON skeleton outline with explicit placeholder allocation decisions.
     prompt_skeleton = f"""
 You are a master technical editor.
 Produce a detailed JSON skeleton outline for a comprehensive academic research report titled "{initial_query}".
@@ -1771,45 +1752,57 @@ Divide the report into two groups:
        • "instructions": Detailed guidelines on which sub-topics, facts, and arguments to cover.
        • "target_word_count": An approximate desired word count for that section.
        • "key_content_elements": An array of 3 to 5 bullet points that must be mentioned.
-       • "placeholders": An object indicating which placeholder types to include.
-           - Include "visual": true or false.
-           - Include "graph": true or false.
-           - Include "focus": true or false.
-         **Overall guidance**: Across all core sections, the total number of visual placeholders should be between ⌊{pages}/10⌋ and ⌈{pages}/5⌉, graph placeholders should follow the same rule, and focus placeholders should appear between ⌊{pages}/20⌋ and ⌈{pages}/10⌉. Decide per section which placeholder(s) to activate, ensuring that not every section receives all three.
-2. "final_sections": These sections frame the report and include:
        - "Introduction"
        - "Abstract"
        - "Conclusion"
        - "Reference Summary Table"
-   The combined target word count for final sections should be about 30% of the total (approximately {int(0.3 * total_word_count)} words),
    distributed evenly among them.
 Return only valid JSON with two keys: "core_sections" and "final_sections", with no additional commentary.
 """
     skeleton_response = openai_call(
         prompt=prompt_skeleton,
         model="o3-mini",
-        max_tokens_param=1500,
         temperature=0
     )
     try:
         skeleton = json.loads(skeleton_response)
     except Exception as e:
         logging.error(f"Error parsing skeleton JSON: {e}")
-        skeleton = {"core_sections": [], "final_sections": []}
-    # Step 2: Generate content for each core section sequentially.
     generated_core_sections = {}
     previous_sections_content = ""
     if "core_sections" in skeleton:
         for section in skeleton["core_sections"]:
             section_name = section.get("section_name", "Untitled Section")
             instructions = section.get("instructions", "")
-            target_wc = section.get("target_word_count", 500)
             key_elements = section.get("key_content_elements", [])
             placeholders = section.get("placeholders", {})
-            # Build a placeholder directive based on the allocated booleans.
             placeholder_directive = ""
             if placeholders.get("visual", False):
                 placeholder_directive += "[[Visual Placeholder: Insert one visual here.]]\n"
@@ -1838,21 +1831,23 @@ Return only the HTML content for this section (do not include outer <html> or <b
             section_content = openai_call(
                 prompt=prompt_section,
                 model="o3-mini",
-                max_tokens_param=target_wc * 5,
                 temperature=0
             )
             section_content = section_content.strip()
             generated_core_sections[section_name] = section_content
-            # Accumulate current section's content into a shared context for continuity.
             previous_sections_content += f"\n<!-- {section_name} -->\n" + section_content
-    # Step 3: Generate final sections (Introduction, Abstract, Conclusion, Reference Summary Table).
     generated_final_sections = {}
     if "final_sections" in skeleton:
         for section in skeleton["final_sections"]:
             section_name = section.get("section_name", "Untitled Final Section")
             instructions = section.get("instructions", "")
-            target_wc = section.get("target_word_count", 500)
             prompt_final = f"""
 You are a master technical editor.
 Generate detailed HTML content for the final section titled "{section_name}".
@@ -1870,14 +1865,14 @@ Return only the HTML content for this section (do not include outer <html> or <b
             final_section_content = openai_call(
                 prompt=prompt_final,
                 model="o3-mini",
-                max_tokens_param=target_wc * 5,
                 temperature=0
             )
             final_section_content = final_section_content.strip()
             generated_final_sections[section_name] = final_section_content
             previous_sections_content += f"\n<!-- {section_name} -->\n" + final_section_content
-    # Step 4: Generate a Table of Contents from the section titles.
     toc_titles = []
     for section in skeleton.get("core_sections", []):
         if "section_name" in section:
@@ -1889,8 +1884,8 @@ Return only the HTML content for this section (do not include outer <html> or <b
 You are a technical editor.
 Based on the following list of section titles: {', '.join(toc_titles)},
 generate a concise HTML snippet for a Table of Contents,
-with each item appearing on a separate line and numbered (e.g., "1. Section Title", "2. Section Title", etc.).
-Return only the HTML snippet for the Table of Contents without additional commentary.
 """
     toc_html = openai_call(
         prompt=prompt_toc,
@@ -1899,7 +1894,7 @@ Return only the HTML snippet for the Table of Contents without additional commen
         temperature=0
     ).strip()
-    # Step 5: Assemble the final HTML document.
     final_report_html = f"""<html>
   <head>
     <meta charset="utf-8" />
@@ -1956,7 +1951,7 @@ Return only the HTML snippet for the Table of Contents without additional commen
   {content}
 </div>
 """
-    # Append a designated report ending marker.
     final_report_html += """
     <iframe class="visual-frame" srcdoc='
     <!DOCTYPE html>
@@ -1971,8 +1966,7 @@ Return only the HTML snippet for the Table of Contents without additional commen
   </body>
 </html>
 """
-    # Step 6: Process the raw HTML to replace placeholder markers with actual placeholder code.
     final_report_html = replace_visual_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
     final_report_html = replace_graph_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
     final_report_html = replace_focus_placeholders(final_report_html, context, initial_query, aggregated_crumbs)

 def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
                           aggregated_crumbs: str, references: list, pages: int = 8) -> str:
     """
+    Revised generate_final_report with explicit integer conversions for max_tokens values
+    and a fallback that extracts the JSON from a markdown code fence when direct parsing fails.
+    This function:
+      1. Generates a JSON skeleton outlining the report sections and placeholder allocations.
+      2. For each core section, generates HTML content with a token budget of
+         target_wc * 5, after coercing target_wc to an integer.
+      3. Generates final sections (Introduction, Abstract, Conclusion, Reference Summary Table).
+      4. Assembles the Table of Contents and the final HTML.
+      5. Passes the raw HTML through the placeholder replacement functions before returning.
     """
+    import json, logging, re
     # Calculate overall target word count (approximate)
     total_word_count = pages * 500
         "No external summaries were directly extracted. It is not possible to analyze relevance."
     )
+    # --- Step 1: Generate the JSON skeleton outline with placeholder allocation decisions ---
     prompt_skeleton = f"""
 You are a master technical editor.
 Produce a detailed JSON skeleton outline for a comprehensive academic research report titled "{initial_query}".
        • "instructions": Detailed guidelines on which sub-topics, facts, and arguments to cover.
        • "target_word_count": An approximate desired word count for that section.
        • "key_content_elements": An array of 3 to 5 bullet points that must be mentioned.
+       • "placeholders": An object with boolean keys "visual", "graph", and "focus" indicating which placeholders to include.
+         **Overall guidance**: Across all core sections, the total number of visual placeholders should be between ⌊{pages}/10⌋ and ⌈{pages}/5⌉,
+         graph placeholders should be in the same range, and focus placeholders between ⌊{pages}/20⌋ and ⌈{pages}/10⌉. Decide per section which to activate.
+2. "final_sections": These should be generated after core sections and include:
        - "Introduction"
        - "Abstract"
        - "Conclusion"
        - "Reference Summary Table"
+   Their combined target word count should be about 30% of the total (approximately {int(0.3 * total_word_count)} words),
    distributed evenly among them.
 Return only valid JSON with two keys: "core_sections" and "final_sections", with no additional commentary.
 """
     skeleton_response = openai_call(
         prompt=prompt_skeleton,
         model="o3-mini",
+        max_tokens_param=int(1500),
         temperature=0
     )
     try:
         skeleton = json.loads(skeleton_response)
     except Exception as e:
         logging.error(f"Error parsing skeleton JSON: {e}")
+        # Fallback: attempt to extract JSON from a markdown code fence.
+        match = re.search(r"```json(.*?)```", skeleton_response, re.DOTALL)
+        if match:
+            try:
+                skeleton = json.loads(match.group(1).strip())
+            except Exception as e2:
+                logging.error(f"Fallback JSON parsing failed: {e2}")
+                skeleton = {"core_sections": [], "final_sections": []}
+        else:
+            skeleton = {"core_sections": [], "final_sections": []}
+    # --- Step 2: Generate content for each core section sequentially.
     generated_core_sections = {}
     previous_sections_content = ""
     if "core_sections" in skeleton:
         for section in skeleton["core_sections"]:
             section_name = section.get("section_name", "Untitled Section")
             instructions = section.get("instructions", "")
+            # Ensure target_word_count is an integer:
+            try:
+                target_wc = int(section.get("target_word_count", 500))
+            except ValueError:
+                target_wc = 500
             key_elements = section.get("key_content_elements", [])
             placeholders = section.get("placeholders", {})
+            # Build placeholder directive based on allocated booleans.
             placeholder_directive = ""
             if placeholders.get("visual", False):
                 placeholder_directive += "[[Visual Placeholder: Insert one visual here.]]\n"
             section_content = openai_call(
                 prompt=prompt_section,
                 model="o3-mini",
+                max_tokens_param=int(target_wc * 5),
                 temperature=0
             )
             section_content = section_content.strip()
             generated_core_sections[section_name] = section_content
             previous_sections_content += f"\n<!-- {section_name} -->\n" + section_content
+    # --- Step 3: Generate content for each final section.
     generated_final_sections = {}
     if "final_sections" in skeleton:
         for section in skeleton["final_sections"]:
             section_name = section.get("section_name", "Untitled Final Section")
             instructions = section.get("instructions", "")
+            try:
+                target_wc = int(section.get("target_word_count", 500))
+            except ValueError:
+                target_wc = 500
             prompt_final = f"""
 You are a master technical editor.
 Generate detailed HTML content for the final section titled "{section_name}".
             final_section_content = openai_call(
                 prompt=prompt_final,
                 model="o3-mini",
+                max_tokens_param=int(target_wc * 5),
                 temperature=0
             )
             final_section_content = final_section_content.strip()
             generated_final_sections[section_name] = final_section_content
             previous_sections_content += f"\n<!-- {section_name} -->\n" + final_section_content
+    # --- Step 4: Generate a Table of Contents from section titles.
     toc_titles = []
     for section in skeleton.get("core_sections", []):
         if "section_name" in section:
 You are a technical editor.
 Based on the following list of section titles: {', '.join(toc_titles)},
 generate a concise HTML snippet for a Table of Contents,
+with each item on a separate numbered line (e.g., "1. Section Title", "2. Section Title", etc.).
+Return only the HTML snippet without additional commentary.
 """
     toc_html = openai_call(
         prompt=prompt_toc,
         temperature=0
     ).strip()
+    # --- Step 5: Assemble the final HTML document.
     final_report_html = f"""<html>
   <head>
     <meta charset="utf-8" />
   {content}
 </div>
 """
+    # Append an ending marker.
     final_report_html += """
     <iframe class="visual-frame" srcdoc='
     <!DOCTYPE html>
   </body>
 </html>
 """
+    # --- Step 6: Replace placeholder markers with actual content.
     final_report_html = replace_visual_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
     final_report_html = replace_graph_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
     final_report_html = replace_focus_placeholders(final_report_html, context, initial_query, aggregated_crumbs)