Spaces:

10gen
/

deepsearchitv2

Running

App Files Files Community

Guiyom commited on Mar 7, 2025

Commit

55e4cf1

verified ·

1 Parent(s): 65fa006

Update app.py

Browse files

Files changed (1) hide show

app.py +226 -250

app.py CHANGED Viewed

@@ -1718,272 +1718,248 @@ def compress_text(text: str, target_length: int) -> str:
 def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
                           aggregated_crumbs: str, references: list, pages: int = 8) -> str:
-    """
-    Revised generate_final_report with improved JSON extraction for the skeleton output.
-    The function:
-      1. Generates a JSON skeleton outlining the report sections and placeholder allocations.
-      2. For each core section, generates HTML content using the assigned token (target_wc * 5),
-         ensuring that target_wc is treated as an integer.
-      3. Generates final sections (Introduction, Abstract, Conclusion, Reference Summary Table).
-      4. Assembles the Table of Contents and the final HTML.
-      5. Passes the raw HTML through placeholder replacement functions before returning.
-    Improvements:
-      - Increased fallback extraction attempts if the JSON skeleton is incomplete.
-      - Ensures that max_tokens parameters are integers.
-    """
-    import json, logging, re
-    # Calculate overall target word count (approximate)
-    total_word_count = pages * 500
-    combined_learnings = "\n".join(learnings) if learnings else (
-        "No external summaries were directly extracted. It is not possible to analyze relevance."
-    )
-    # --- Step 1: Generate the JSON skeleton outline with placeholder allocation decisions ---
-    prompt_skeleton = f"""
-You are a master technical editor.
-Produce a detailed JSON skeleton outline for a comprehensive academic research report titled "{initial_query}".
-The overall report should be approximately {total_word_count} words long.
-Divide the report into two groups:
-1. "core_sections": These are the main content sections that address key sub-topics drawn from the context, research learnings, and search results.
-   - There should be between 4 and 6 core sections. Their combined target word count should be about 70% of the total (approximately {int(0.7 * total_word_count)} words).
-   - For each core section, provide:
-       • "section_name": A concise title.
-       • "instructions": Detailed guidelines on which sub-topics, facts, and arguments to cover.
-       • "target_word_count": An approximate desired word count for that section.
-       • "key_content_elements": An array of 3 to 5 bullet points that must be mentioned.
-       • "placeholders": An object with boolean keys "visual", "graph", and "focus" indicating which placeholders to include.
-         **Overall guidance**: Across all core sections, the total number of visual placeholders should be between ⌊{pages}/10⌋ and ⌈{pages}/5⌉,
-         graph placeholders should be in the same range, and focus placeholders should be between ⌊{pages}/20⌋ and ⌈{pages}/10⌉. Decide per section which to activate.
-2. "final_sections": These should be generated after core sections and include:
-       - "Introduction"
-       - "Abstract"
-       - "Conclusion"
-       - "Reference Summary Table"
-   Their combined target word count should be about 30% of the total (approximately {int(0.3 * total_word_count)} words),
-   distributed evenly among them.
-Return only valid JSON with two keys: "core_sections" and "final_sections", with no additional commentary.
-"""
-    # Increase the token allocation if needed (e.g., 2000 tokens)
-    skeleton_response = openai_call(
-        prompt=prompt_skeleton,
-        model="o3-mini",
-        max_tokens_param=int(2000),
-        temperature=0
-    )
-    # --- Fallback extraction for JSON skeleton ---
-    try:
-        skeleton = json.loads(skeleton_response)
-    except Exception as e:
-        logging.error(f"Error parsing skeleton JSON: {e}")
-        # First attempt: extract JSON from a markdown code fence.
-        match = re.search(r"```json(.*?)```", skeleton_response, re.DOTALL)
-        json_str = ""
-        if match:
-            json_str = match.group(1).strip()
-        else:
-            # Second attempt: extract any substring that starts with '{' and ends with '}'.
-            json_match = re.search(r'({.*})', skeleton_response, re.DOTALL)
-            if json_match:
-                json_str = json_match.group(1).strip()
-        try:
-            skeleton = json.loads(json_str) if json_str else {"core_sections": [], "final_sections": []}
-        except Exception as e2:
-            logging.error(f"Fallback JSON parsing failed: {e2}")
-            skeleton = {"core_sections": [], "final_sections": []}
-    # --- Step 2: Generate content for each core section sequentially.
-    generated_core_sections = {}
-    previous_sections_content = ""
-    if "core_sections" in skeleton:
-        for section in skeleton["core_sections"]:
-            section_name = section.get("section_name", "Untitled Section")
-            instructions = section.get("instructions", "")
-            # Ensure target_word_count is an integer:
-            try:
-                target_wc = int(section.get("target_word_count", 500))
-            except ValueError:
-                target_wc = 500
-            key_elements = section.get("key_content_elements", [])
-            placeholders = section.get("placeholders", {})
-            # Build a placeholder directive based on allocated booleans.
-            placeholder_directive = ""
-            if placeholders.get("visual", False):
-                placeholder_directive += "[[Visual Placeholder: Insert one visual here.]]\n"
-            if placeholders.get("graph", False):
-                placeholder_directive += "[[Graph Placeholder: Insert one graph here.]]\n"
-            if placeholders.get("focus", False):
-                placeholder_directive += "[[Focus Placeholder: Insert one focus box here if deeper analysis is needed.]]\n"
-            prompt_section = f"""
-You are an expert technical editor.
-Generate detailed HTML content for the research report section titled "{section_name}".
-Instructions: {instructions}
-Target word count: Approximately {target_wc} words.
-Key content elements to include: {", ".join(key_elements)}.
-Additionally, please embed the following placeholder directives exactly where appropriate in the content:
-{placeholder_directive if placeholder_directive else "No placeholders required for this section."}
-Context: {context}
-Initial Query: {initial_query}
-Report Style: {reportstyle}
-Learnings: {combined_learnings}
-Aggregated Search Results: {aggregated_crumbs}
-Previously generated sections (if any): {previous_sections_content}
-Return only the HTML content for this section (do not include outer <html> or <body> tags).
-"""
-            section_content = openai_call(
-                prompt=prompt_section,
-                model="o3-mini",
-                max_tokens_param=int(target_wc * 5),
-                temperature=0
-            )
-            section_content = section_content.strip()
-            generated_core_sections[section_name] = section_content
-            previous_sections_content += f"\n<!-- {section_name} -->\n" + section_content
-    # --- Step 3: Generate content for each final section.
-    generated_final_sections = {}
-    if "final_sections" in skeleton:
-        for section in skeleton["final_sections"]:
-            section_name = section.get("section_name", "Untitled Final Section")
-            instructions = section.get("instructions", "")
-            try:
-                target_wc = int(section.get("target_word_count", 500))
-            except ValueError:
-                target_wc = 500
-            prompt_final = f"""
-You are a master technical editor.
-Generate detailed HTML content for the final section titled "{section_name}".
-Instructions: {instructions}
-Target word count: Approximately {target_wc} words.
-Context: {context}
-Initial Query: {initial_query}
-Report Style: {reportstyle}
-Learnings: {combined_learnings}
-Aggregated Search Results: {aggregated_crumbs}
-Previously generated core sections: {previous_sections_content}
-Return only the HTML content for this section (do not include outer <html> or <body> tags).
-"""
-            final_section_content = openai_call(
-                prompt=prompt_final,
-                model="o3-mini",
-                max_tokens_param=int(target_wc * 5),
-                temperature=0
-            )
-            final_section_content = final_section_content.strip()
-            generated_final_sections[section_name] = final_section_content
-            previous_sections_content += f"\n<!-- {section_name} -->\n" + final_section_content
-    # --- Step 4: Generate a Table of Contents from section titles.
-    toc_titles = []
-    for section in skeleton.get("core_sections", []):
-        if "section_name" in section:
-            toc_titles.append(section["section_name"])
-    for section in skeleton.get("final_sections", []):
-        if "section_name" in section:
-            toc_titles.append(section["section_name"])
-    prompt_toc = f"""
-You are a technical editor.
-Based on the following list of section titles: {', '.join(toc_titles)},
-generate a concise HTML snippet for a Table of Contents,
-with each item on a separate numbered line (e.g., "1. Section Title", "2. Section Title", etc.).
-Return only the HTML snippet without additional commentary.
-"""
-    toc_html = openai_call(
-        prompt=prompt_toc,
-        model="o3-mini",
-        max_tokens_param=int(500),
-        temperature=0
-    ).strip()
-    # --- Step 5: Assemble the final HTML document.
-    final_report_html = f"""<html>
-  <head>
-    <meta charset="utf-8" />
-    <meta name="viewport" content="width=device-width, initial-scale=1">
-    <style>
-      body {{
-        font-family: Arial, sans-serif;
-        margin: 20px;
-        padding: 0;
-        background-color: #ffffff;
-      }}
-      h1 {{
-        text-align: center;
-        margin-bottom: 20px;
-      }}
-      h2 {{
-        text-align: left;
-        margin-top: 20px;
-        margin-bottom: 10px;
-      }}
-      .section {{
-        margin-bottom: 30px;
-      }}
-      .toc {{
-        margin: 20px 0;
-        border: 1px solid #ccc;
-        padding: 10px;
-      }}
-    </style>
-  </head>
-  <body>
-    <!-- Report Title -->
-    <h1>{initial_query}</h1>
-    <!-- Table of Contents -->
-    <div class="toc">
-      {toc_html}
-    </div>
-"""
-    # Append core sections.
-    for section in skeleton.get("core_sections", []):
-        section_name = section.get("section_name", "Untitled Section")
-        content = generated_core_sections.get(section_name, "")
-        final_report_html += f"""<div class="section">
-  <h2>{section_name}</h2>
-  {content}
-</div>
-"""
-    # Append final sections.
-    for section in skeleton.get("final_sections", []):
-        section_name = section.get("section_name", "Untitled Final Section")
-        content = generated_final_sections.get(section_name, "")
-        final_report_html += f"""<div class="section">
-  <h2>{section_name}</h2>
-  {content}
-</div>
-"""
-    # Append an ending marker.
-    final_report_html += """
     <iframe class="visual-frame" srcdoc='
     <!DOCTYPE html>
     <html>
-    <head></head>
         <body>
             <div>
 -end-
             </div>
         </body>
     </html>' width="100px" height="15px" style="border:none;"></iframe>
-  </body>
 </html>
-"""
-    # --- Step 6: Replace placeholder markers with actual content.
-    final_report_html = replace_visual_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
-    final_report_html = replace_graph_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
-    final_report_html = replace_focus_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
-    logging.info("generate_final_report: Report generated successfully with integrated placeholder allocation decisions.")
-    return final_report_html
 def filter_search_results(results: list, visited_urls: set, query: str, clarifications: str) -> list:
     # Filter out already seen results
     new_results = []

 def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
                           aggregated_crumbs: str, references: list, pages: int = 8) -> str:
+    fallback_text = ""
+    if not learnings:
+        fallback_text = "No external summaries were directly extracted. It is not possible to analyze relevance."
+    combined_learnings = "\n".join(learnings) if learnings else fallback_text
+    word_count = pages * 500
+    prompt = (f"""
+Produce a comprehensive report in html format.
+The report should be very detailed and lengthy.
+// Requirements
+- All text alignment has to be on the left
+- The report should be {pages} pages long or {word_count} words (excluding html formatting)
+- It must include inline citations (e.g., [1], [2], etc.) from real sources provided in the search results below
+Note: citations sources in-line need to be in this format: blablabla - Source [x] / "pdf" is not a source, provide the title or author
+- No more than 10 sentences per div blocks, skip lines and add line breaks when changing topic.
+- The report must include between {round(pages/10,0)} and {round(pages/5,0)} tables from the sources used (add citations if necessary) and use facts and figures extensively to ground the analysis.
+- For the numbering of titles or numbered lists, use numbers (ex: 1.) and sub-units (1.1, 1.2... 1.1.1...,1.1.2...).
+Note: Exclude the use of html numbered lists format, they don't get correctly implemented. Use plain text format for numbering of sections and sub-sections
+- Do not put a numbered list (ex: 1.1, ...) for every sentence! It should be used parsimoniously for real sub-sections.
+- Put paragraphs, sentences that are part of the same section in a div tag, this will be used for formatting.
+- Add on top of the report the report title (with the <h1> tag) - this is the only part that should be centered (in-line style)
+- Titles for sections and sub-sections should systematically use the tags:
+  <h1> for sections (ex: <h1>3. Examination of State-of-the-Art of AI</h1>)
+  <h2> for sub-sections (ex: <h2>3.2 AI Performance in Mathematics</h2>)
+  <h3> for sub-sub-sections (ex: <h3>3.2.1 Illustration with math conjecture demonstration</h3>)
+  <h4> for bulletpoint title (ex: <h4>item to detail:</h4> description of the item to detail ...)
+- Use inline formatting for the tables with homogeneous border and colors
+- Avoid Chinese characters in the output (use the Pinyin version) since they won't display correctly in the pdf (black boxes)
+- For the Table of contents: do not mention the pages, but make each item on separate line
+- Put "Table of contents" and "Abstract" title in h1 format.
+- The Table of contents should skip the abstract and table of contents, the numbering should start from the introduction and end with References Summary Table
+- Exceptionally - for sections requiring specific improvements - put it between <div class="improvable-chunk">...</div> (but don't mention it in the report, this will be managed through post-processing)
+// Reference citations
+- The name of the reference table should be: "Reference Summary Table"
+- The reference table at the end containing the citations details should have 4 columns: the ref number, the title of the document, the author(s), the URL - with hyperlink
+- The report MUST include a reference summary table with between 10 (for a 8 page report) and 30 references (for a 40 pages report). All inline citations (e.g., [1], [2], …) present in the report and in any focus placeholders MUST have a corresponding entry in this table with its full URL.
+- For the reference citations, add systematically the urls from the Learnings (no need to put them in numbered list format since we already have the [x] that serves as number list)
+- Do not add any inline citations reference in the visual and graph placeholders descriptions below, you can add them in focus though.
+- Do not make false references / citations. It has to be grounded from the sources in the search results / crumbs below (no example.com/... type references!)
+- The references / citations should be only coming from the most reputable sources amongst all the Learnings and Results from searches below
+- The table generated should have in-line styling to have word-wrap and 100% width
+// Instructions:
+1. Integrate numbers from the sources but always mention the source
+2. Whenever you mention a figure or quote, add an inline reference [x] matching its source from the references.
+3. Again, Specifically name relevant organizations, tools, project names, and people encountered in the crumbs or learnings.
+Note: This is for academic purposes, so thorough citations and referencing are essential.
+4. Focus on reputable sources that will not be disputed (generally social media posts cannot be an opposable sources, but some of them may mention reputable sources)
+Note: put the full reference url (no generic domain address), down to the html page or the pdf
+// Style
+The report must follow this writing style {reportstyle}.
+// Format when mentioning sources, organisations and individuals
+- We will perform a post-processing on the output
+- For this reason, use this format for any specific name, organisation or project: {{[{{name}}]}}
+example 1: {{[{{Organisation}}]}}'s CEO, {{[{{CEO name}}]}} ...
+example 2: in a report from the {{[{{University name}}]}} titled "{{[{{report title}}]}}"...
+example 3: the CEO of {{[{{Company name}}]}} , {{[{{Name}}]}}, said that "the best way to..."
+example 4: the project {{[{{project name}}]}}, announced by {{[{{...}}]}} in collaboration with {{[{{...}}]}}
+example 5: Mr. {{[{{person}}]}}, Marketing director in {{[{{company}}]}}, mentioned that ...
+Note: the output will be processed through regex and the identifiers removed, but this way we can keep track of all sources and citations without disclosing them.
+- This should apply to names, people/titles, dates, papers, reports, organisation/institute/NGO/government bodies quotes, products, project names, ...
+- You should have approximately {2 * pages} mentions of organisations, people or projects, use the prescribed format
+- The same item cannot be mentioned more than 3 times, don't over do it
+- Do not mix sources that are not directly related in the search results, don't put together organisations or people that have nothing to do with each other
+- DO NOT MENTION this formatting requirement, just apply it. The user doesn't have to know about this technicality.
+Note: LinkedIn is not a relevant source - if you want to use a source related to LinkedIn, you should check the author of the page visited, this is the real source, mention the name of the author as "'authorName' from LinkedIn Pulse"
+// Sources
+Use the following learnings and merged reference details from a deep research process on:
+'{initial_query}'
+Taking also into consideration the context:
+{context}
+--------------- Placeholders -----------
+In order to enrich the content, within the core sections (between introduction and conclusion), you can inject some placeholders that will be developed later on.
+There are 3 types: visual, graphs, focus - each with their own purpose
+// Visual placeholders
+- Create special visual placeholders that will be rendered in mermaid afterwards.
+- The Visual placeholders should follow this format:
+Source:source_name [y]
+[[Visual Placeholder n:
+- Purpose of this visual is:...
+- Relevant content to generate it:
+o ex: arguments
+o ex: lists of elements
+o ex: data points
+o ...
+- Message to convey: ...
+]]
+with:
+- n as the reference number,
+- source_name as the full name of the main source used and
+- y as the number ref of the source reference in the reference table.
+Important note for visual placeholders:
+- on the line before [[...]] mention the source with the reference number [x] in the form: "Source: abc [n]" - only one source should be mentioned
+- after [[ put "Visual Placeholder n:" explicitly (with n as the ref number of the placeholder box created). This will be used in a regex
+- the only types of mermaid diagram that can be generated are: flowchart, sequence, gantt, pie, mindmap (no charts) // Take this into consideration when providing the instructions for the diagram
+- do not make mention in the report to "visual placeholders" just mention the visual and the number..
+- in the placeholder, no need to add the references to the source or its ref number, but make sure ALL of the data points required has a source from the learning and reference material hereafter
+- these placeholders text should contain:
+    o the purpose of the future visual
+    o the relevant data to generate it
+- there should be between {round(pages/10,0)} and {round(pages/5,0)} of these visuals placeholders within the report (all between introduction and conclusion)
+- 2 visual placeholders cannot be in the same section
+Note: the placeholders will then be processed separately by a llm to generate the specific code to display each of them so the instruction need to be clear enough.
+// Graph placeholders
+- Create special graph placeholders that will be rendered in d3.js afterwards based on your guidance:
+Source:source_name [y]
+[[Graph Placeholder n:
+- Purpose of this graph is:...
+- Relevant numbers to generate it:
+table format
+- Message to convey: ...
+]]
+with:
+- n as the reference number,
+- source_name as the  full name of the main source used and
+- y as the source reference in the reference table.
+- the table containing all the required data has to include data points FROM the learnings / results from the search below
+Important note for graph placeholders:
+- on the line before [[...]] mention the source with the reference number [x] in the form: "Source: abc [n]" - only one source should be mentioned
+- use p tag for the source and source reference number
+- after [[ put "Graph Placeholder n:" explicitly (with n as the ref number of the graph created). This will be used in a regex
+- Do not make things up - every data points have to be from a real source
+- All types of graphs (using d3.js library) can be generated // Take this into consideration when providing the instructions for the graph data
+- do not make mention in the report to "graph placeholders" just mention graph.
+- in the placeholder, no need to add the references to the source or its ref number, but make sure ALL of the data points required has a source from the learning and reference material hereafter
+- these placeholders text should contain:
+    o the purpose of the future graph
+    o the relevant data to generate it
+- there should be between {round(pages/10,0)} and {round(pages/5,0)} of these graphs placeholders within the report (all between introduction and conclusion)
+- 2 graph placeholders cannot be in the same section
+Note: the placeholders will then be processed separately by a llm to generate the specific code to display each of them so the instruction need to be clear enough.
+// Focus placeholders
+- To drill down on specific topics that deserve to be developed extensively separately, create special focus placeholders in [[...]] double brackets
+Note: outside of the placeholder, do not make reference in the report to "focus placeholders" just mention the "Focus box n".
+- in the Focus placeholder, make a mention to the prescribed sources used (no need to add the source before or after the placeholder)
+- do not make the placeholder on the exact same topic as the section or the sub-section where it is positioned, it has to be either:
+    o a special case that deserves attention
+    o a recent development / innovation
+    o a theoretical drill-down
+    o a contrarian point of view / objection
+- these placeholders text should contain:
+    o the purpose of the focus box
+    o the relevant data to generate it
+    o the guidance in terms of style and message to convey
+    Note: Be specific if you want some particular point developed, keep it consistent across the report.
+- there should be between {round(pages/20,0)} and {round(pages/10,0)} of these focus placeholders within the report (all between introduction and conclusion)
+- 2 focus placeholders cannot be in the same section and should be a few pages apart in the report
+- Mention all the sources that should be used to generate this focus placeholder and list also the references that will be mentioned in the References section later (ex: [1], [2])
+Note: the Focus placeholders will then be processed separately by a llm to generate the specific code to display each of them so the instruction need to be clear enough.
+// Format:
+[[Focus Placeholder n:
+- Topic of this focus:...
+- Relevant info to generate it:...
+- Specific angle of the focus placeholder:...
+- Key elements to mention:
+o ...
+o ...
+...
+]]
+with:
+- n as the reference number,
+Important note for focus placeholders:
+- after [[ put "Focus Placeholder n:" explicitly (with n as the ref number of the focus box created). This will be used in a regex
+- Do not add a title for the Focus placeholder just before the [[...]], the content that will replace the focus placeholder - generated later on - will already include a title
+// Report ending required
+End the report with the following sequence:
     <iframe class="visual-frame" srcdoc='
     <!DOCTYPE html>
     <html>
+    <head></head>
         <body>
             <div>
 -end-
             </div>
         </body>
     </html>' width="100px" height="15px" style="border:none;"></iframe>
+Then close the html code from the broader report
+</body>
 </html>
+// Structure the overall report as follows:
+{{Do not add anything before - no introductory meta comment or content}}
+- Abstract
+- Table of contents
+- Introduction
+- [Sections and sub-sections, depending on the size and relevant topic - including visual, graph and focus placeholders]
+- Conclusion
+- References summary table
+- Report ending formatting (as mentioned before)
+{{Do not add anything after - no conclusive meta comment or content}}
+Important note: placeholders (visual, graph or focus) can only appear in the sections or sub-sections not in introduction, the conclusion, the references or after the references
+// Material to use to ground your report:
+- Learnings:
+{json.dumps(learnings, indent=2)}
+- Results from searches:
+{aggregated_crumbs}
+Take a deep breath, do your best.
+Now, produce the report please.
+"""
+    )
+    tokentarget = word_count * 5  # rough multiplier for token target
+    report = openai_call(prompt, model="o3-mini", max_tokens_param=tokentarget)
+    # Post-processing
+    report = re.sub(r'\{\[\{(.*?)\}\]\}', r'\1', report)
+    report = re.sub(r'\[\{(.*?)\}\]', r'\1', report)
+    # If the report is too long, compress it.
+    if len(report) > MAX_MESSAGE_LENGTH:
+        report = compress_text(report, MAX_MESSAGE_LENGTH)
+    if report.startswith("Error calling OpenAI API"):
+        logging.error(f"generate_final_report error: {report}")
+        return f"Error generating report: {report}"
+    logging.info("generate_final_report: Report generated successfully.")
+    return report
 def filter_search_results(results: list, visited_urls: set, query: str, clarifications: str) -> list:
     # Filter out already seen results
     new_results = []