Guiyom committed on
Commit
55e4cf1
·
verified ·
1 Parent(s): 65fa006

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +226 -250
app.py CHANGED
@@ -1718,272 +1718,248 @@ def compress_text(text: str, target_length: int) -> str:
1718
 
1719
  def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
1720
  aggregated_crumbs: str, references: list, pages: int = 8) -> str:
1721
- """
1722
- Revised generate_final_report with improved JSON extraction for the skeleton output.
1723
-
1724
- The function:
1725
- 1. Generates a JSON skeleton outlining the report sections and placeholder allocations.
1726
- 2. For each core section, generates HTML content using the assigned token (target_wc * 5),
1727
- ensuring that target_wc is treated as an integer.
1728
- 3. Generates final sections (Introduction, Abstract, Conclusion, Reference Summary Table).
1729
- 4. Assembles the Table of Contents and the final HTML.
1730
- 5. Passes the raw HTML through placeholder replacement functions before returning.
1731
-
1732
- Improvements:
1733
- - Increased fallback extraction attempts if the JSON skeleton is incomplete.
1734
- - Ensures that max_tokens parameters are integers.
1735
- """
1736
- import json, logging, re
1737
 
1738
- # Calculate overall target word count (approximate)
1739
- total_word_count = pages * 500
1740
- combined_learnings = "\n".join(learnings) if learnings else (
1741
- "No external summaries were directly extracted. It is not possible to analyze relevance."
1742
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1743
 
1744
- # --- Step 1: Generate the JSON skeleton outline with placeholder allocation decisions ---
1745
- prompt_skeleton = f"""
1746
- You are a master technical editor.
1747
- Produce a detailed JSON skeleton outline for a comprehensive academic research report titled "{initial_query}".
1748
- The overall report should be approximately {total_word_count} words long.
1749
- Divide the report into two groups:
1750
-
1751
- 1. "core_sections": These are the main content sections that address key sub-topics drawn from the context, research learnings, and search results.
1752
- - There should be between 4 and 6 core sections. Their combined target word count should be about 70% of the total (approximately {int(0.7 * total_word_count)} words).
1753
- - For each core section, provide:
1754
- • "section_name": A concise title.
1755
- • "instructions": Detailed guidelines on which sub-topics, facts, and arguments to cover.
1756
- • "target_word_count": An approximate desired word count for that section.
1757
- • "key_content_elements": An array of 3 to 5 bullet points that must be mentioned.
1758
- • "placeholders": An object with boolean keys "visual", "graph", and "focus" indicating which placeholders to include.
1759
- **Overall guidance**: Across all core sections, the total number of visual placeholders should be between ⌊{pages}/10⌋ and ⌈{pages}/5⌉,
1760
- graph placeholders should be in the same range, and focus placeholders should be between ⌊{pages}/20⌋ and ⌈{pages}/10⌉. Decide per section which to activate.
1761
-
1762
- 2. "final_sections": These should be generated after core sections and include:
1763
- - "Introduction"
1764
- - "Abstract"
1765
- - "Conclusion"
1766
- - "Reference Summary Table"
1767
- Their combined target word count should be about 30% of the total (approximately {int(0.3 * total_word_count)} words),
1768
- distributed evenly among them.
1769
- Return only valid JSON with two keys: "core_sections" and "final_sections", with no additional commentary.
1770
- """
1771
- # Increase the token allocation if needed (e.g., 2000 tokens)
1772
- skeleton_response = openai_call(
1773
- prompt=prompt_skeleton,
1774
- model="o3-mini",
1775
- max_tokens_param=int(2000),
1776
- temperature=0
1777
- )
1778
 
1779
- # --- Fallback extraction for JSON skeleton ---
1780
- try:
1781
- skeleton = json.loads(skeleton_response)
1782
- except Exception as e:
1783
- logging.error(f"Error parsing skeleton JSON: {e}")
1784
- # First attempt: extract JSON from a markdown code fence.
1785
- match = re.search(r"```json(.*?)```", skeleton_response, re.DOTALL)
1786
- json_str = ""
1787
- if match:
1788
- json_str = match.group(1).strip()
1789
- else:
1790
- # Second attempt: extract any substring that starts with '{' and ends with '}'.
1791
- json_match = re.search(r'({.*})', skeleton_response, re.DOTALL)
1792
- if json_match:
1793
- json_str = json_match.group(1).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1794
 
1795
- try:
1796
- skeleton = json.loads(json_str) if json_str else {"core_sections": [], "final_sections": []}
1797
- except Exception as e2:
1798
- logging.error(f"Fallback JSON parsing failed: {e2}")
1799
- skeleton = {"core_sections": [], "final_sections": []}
1800
-
1801
- # --- Step 2: Generate content for each core section sequentially.
1802
- generated_core_sections = {}
1803
- previous_sections_content = ""
1804
- if "core_sections" in skeleton:
1805
- for section in skeleton["core_sections"]:
1806
- section_name = section.get("section_name", "Untitled Section")
1807
- instructions = section.get("instructions", "")
1808
- # Ensure target_word_count is an integer:
1809
- try:
1810
- target_wc = int(section.get("target_word_count", 500))
1811
- except ValueError:
1812
- target_wc = 500
1813
- key_elements = section.get("key_content_elements", [])
1814
- placeholders = section.get("placeholders", {})
1815
- # Build a placeholder directive based on allocated booleans.
1816
- placeholder_directive = ""
1817
- if placeholders.get("visual", False):
1818
- placeholder_directive += "[[Visual Placeholder: Insert one visual here.]]\n"
1819
- if placeholders.get("graph", False):
1820
- placeholder_directive += "[[Graph Placeholder: Insert one graph here.]]\n"
1821
- if placeholders.get("focus", False):
1822
- placeholder_directive += "[[Focus Placeholder: Insert one focus box here if deeper analysis is needed.]]\n"
1823
-
1824
- prompt_section = f"""
1825
- You are an expert technical editor.
1826
- Generate detailed HTML content for the research report section titled "{section_name}".
1827
- Instructions: {instructions}
1828
- Target word count: Approximately {target_wc} words.
1829
- Key content elements to include: {", ".join(key_elements)}.
1830
- Additionally, please embed the following placeholder directives exactly where appropriate in the content:
1831
- {placeholder_directive if placeholder_directive else "No placeholders required for this section."}
1832
- Context: {context}
1833
- Initial Query: {initial_query}
1834
- Report Style: {reportstyle}
1835
- Learnings: {combined_learnings}
1836
- Aggregated Search Results: {aggregated_crumbs}
1837
- Previously generated sections (if any): {previous_sections_content}
1838
-
1839
- Return only the HTML content for this section (do not include outer <html> or <body> tags).
1840
- """
1841
- section_content = openai_call(
1842
- prompt=prompt_section,
1843
- model="o3-mini",
1844
- max_tokens_param=int(target_wc * 5),
1845
- temperature=0
1846
- )
1847
- section_content = section_content.strip()
1848
- generated_core_sections[section_name] = section_content
1849
- previous_sections_content += f"\n<!-- {section_name} -->\n" + section_content
1850
-
1851
- # --- Step 3: Generate content for each final section.
1852
- generated_final_sections = {}
1853
- if "final_sections" in skeleton:
1854
- for section in skeleton["final_sections"]:
1855
- section_name = section.get("section_name", "Untitled Final Section")
1856
- instructions = section.get("instructions", "")
1857
- try:
1858
- target_wc = int(section.get("target_word_count", 500))
1859
- except ValueError:
1860
- target_wc = 500
1861
- prompt_final = f"""
1862
- You are a master technical editor.
1863
- Generate detailed HTML content for the final section titled "{section_name}".
1864
- Instructions: {instructions}
1865
- Target word count: Approximately {target_wc} words.
1866
- Context: {context}
1867
- Initial Query: {initial_query}
1868
- Report Style: {reportstyle}
1869
- Learnings: {combined_learnings}
1870
- Aggregated Search Results: {aggregated_crumbs}
1871
- Previously generated core sections: {previous_sections_content}
1872
-
1873
- Return only the HTML content for this section (do not include outer <html> or <body> tags).
1874
- """
1875
- final_section_content = openai_call(
1876
- prompt=prompt_final,
1877
- model="o3-mini",
1878
- max_tokens_param=int(target_wc * 5),
1879
- temperature=0
1880
- )
1881
- final_section_content = final_section_content.strip()
1882
- generated_final_sections[section_name] = final_section_content
1883
- previous_sections_content += f"\n<!-- {section_name} -->\n" + final_section_content
1884
-
1885
- # --- Step 4: Generate a Table of Contents from section titles.
1886
- toc_titles = []
1887
- for section in skeleton.get("core_sections", []):
1888
- if "section_name" in section:
1889
- toc_titles.append(section["section_name"])
1890
- for section in skeleton.get("final_sections", []):
1891
- if "section_name" in section:
1892
- toc_titles.append(section["section_name"])
1893
- prompt_toc = f"""
1894
- You are a technical editor.
1895
- Based on the following list of section titles: {', '.join(toc_titles)},
1896
- generate a concise HTML snippet for a Table of Contents,
1897
- with each item on a separate numbered line (e.g., "1. Section Title", "2. Section Title", etc.).
1898
- Return only the HTML snippet without additional commentary.
1899
- """
1900
- toc_html = openai_call(
1901
- prompt=prompt_toc,
1902
- model="o3-mini",
1903
- max_tokens_param=int(500),
1904
- temperature=0
1905
- ).strip()
1906
-
1907
- # --- Step 5: Assemble the final HTML document.
1908
- final_report_html = f"""<html>
1909
- <head>
1910
- <meta charset="utf-8" />
1911
- <meta name="viewport" content="width=device-width, initial-scale=1">
1912
- <style>
1913
- body {{
1914
- font-family: Arial, sans-serif;
1915
- margin: 20px;
1916
- padding: 0;
1917
- background-color: #ffffff;
1918
- }}
1919
- h1 {{
1920
- text-align: center;
1921
- margin-bottom: 20px;
1922
- }}
1923
- h2 {{
1924
- text-align: left;
1925
- margin-top: 20px;
1926
- margin-bottom: 10px;
1927
- }}
1928
- .section {{
1929
- margin-bottom: 30px;
1930
- }}
1931
- .toc {{
1932
- margin: 20px 0;
1933
- border: 1px solid #ccc;
1934
- padding: 10px;
1935
- }}
1936
- </style>
1937
- </head>
1938
- <body>
1939
- <!-- Report Title -->
1940
- <h1>{initial_query}</h1>
1941
- <!-- Table of Contents -->
1942
- <div class="toc">
1943
- {toc_html}
1944
- </div>
1945
- """
1946
- # Append core sections.
1947
- for section in skeleton.get("core_sections", []):
1948
- section_name = section.get("section_name", "Untitled Section")
1949
- content = generated_core_sections.get(section_name, "")
1950
- final_report_html += f"""<div class="section">
1951
- <h2>{section_name}</h2>
1952
- {content}
1953
- </div>
1954
- """
1955
- # Append final sections.
1956
- for section in skeleton.get("final_sections", []):
1957
- section_name = section.get("section_name", "Untitled Final Section")
1958
- content = generated_final_sections.get(section_name, "")
1959
- final_report_html += f"""<div class="section">
1960
- <h2>{section_name}</h2>
1961
- {content}
1962
- </div>
1963
- """
1964
- # Append an ending marker.
1965
- final_report_html += """
1966
  <iframe class="visual-frame" srcdoc='
1967
  <!DOCTYPE html>
1968
  <html>
1969
- <head></head>
1970
  <body>
1971
  <div>
1972
  -end-
1973
  </div>
1974
  </body>
1975
  </html>' width="100px" height="15px" style="border:none;"></iframe>
1976
- </body>
 
 
1977
  </html>
1978
- """
1979
- # --- Step 6: Replace placeholder markers with actual content.
1980
- final_report_html = replace_visual_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
1981
- final_report_html = replace_graph_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
1982
- final_report_html = replace_focus_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
1983
 
1984
- logging.info("generate_final_report: Report generated successfully with integrated placeholder allocation decisions.")
1985
- return final_report_html
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1986
 
 
 
 
 
 
 
 
 
 
1987
  def filter_search_results(results: list, visited_urls: set, query: str, clarifications: str) -> list:
1988
  # Filter out already seen results
1989
  new_results = []
 
1718
 
1719
def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
                          aggregated_crumbs: str, references: list, pages: int = 8) -> str:
    """Generate the final HTML research report with a single LLM call.

    Builds one large prompt (formatting requirements, citation rules,
    placeholder conventions, report structure and grounding material), sends
    it to ``openai_call`` and post-processes the answer.

    Args:
        initial_query: Research question; inserted in the "Sources" part of
            the prompt.
        context: Additional context inserted verbatim in the prompt.
        reportstyle: Writing style the report is asked to follow.
        learnings: Collected learnings; serialized with ``json.dumps`` into
            the prompt. When empty, a fixed "no summaries" sentence is used.
        visited_urls: Accepted for interface compatibility; not used here.
        aggregated_crumbs: Raw search results inserted verbatim in the prompt.
        references: Accepted for interface compatibility; not used here.
        pages: Target report length in pages (500 words per page).

    Returns:
        The report text, or a string starting with
        ``"Error generating report: "`` when the model output starts with the
        error prefix this function checks for.
    """
    # Local imports keep the function self-contained.
    import json
    import logging
    import re

    error_prefix = "Error calling OpenAI API"
    word_count = pages * 500

    # Whole-number bounds: round() without ndigits returns an int, so the
    # prompt reads "between 1 and 2" instead of "between 1.0 and 2.0".
    count_tenth = round(pages / 10)
    count_fifth = round(pages / 5)
    count_twentieth = round(pages / 20)

    # When nothing was extracted, tell the model so explicitly instead of
    # handing it an empty JSON list.
    if learnings:
        learnings_block = json.dumps(learnings, indent=2)
    else:
        learnings_block = (
            "No external summaries were directly extracted. It is not possible to analyze relevance."
        )

    # NOTE: doubled braces ({{ }}) below are literal braces in the rendered prompt.
    prompt = (f"""
Produce a comprehensive report in html format.
The report should be very detailed and lengthy.

// Requirements
- All text alignment has to be on the left
- The report should be {pages} pages long or {word_count} words (excluding html formatting)
- It must include inline citations (e.g., [1], [2], etc.) from real sources provided in the search results below
Note: citations sources in-line need to be in this format: blablabla - Source [x] / "pdf" is not a source, provide the title or author
- No more than 10 sentences per div blocks, skip lines and add line breaks when changing topic.
- The report must include between {count_tenth} and {count_fifth} tables from the sources used (add citations if necessary) and use facts and figures extensively to ground the analysis.
- For the numbering of titles or numbered lists, use numbers (ex: 1.) and sub-units (1.1, 1.2... 1.1.1...,1.1.2...).
Note: Exclude the use of html numbered lists format, they don't get correctly implemented. Use plain text format for numbering of sections and sub-sections
- Do not put a numbered list (ex: 1.1, ...) for every sentences! It should be used parcimoniously for real sub-sections.
- Put paragraphs, sentences that are part of the same section in a div tag, this will be used for formatting.
- Add on top of the report the report title (with the <h1> tag) - this is the only part that should be centered (in-line style)
- Titles for sections and sub-sections should systematically use the tags:
<h1> for sections (ex: <h1>3. Examination of State-of-the-Art of AI</h1>)
<h2> for sub-sections (ex: <h2>3.2 AI Performance in Mathematics</h2>)
<h3> for sub-sub-sections (ex: <h3>3.2.1 Illustration with math conjecture demonstration</h3>)
<h4> for bulletpoint title (ex: <h4>item to detail:</h4> description of the item to detail ...)
- Use inline formatting for the tables with homogeneous border and colors
- Avoid Chinese characters in the output (use the Pinyin version) since they won't display correcly in the pdf (black boxes)
- For the Table of contents: do not mention the pages, but make each item on separate line
- Put "Table of contents" and "Abstract" title in h1 format.
- The Table of contents should skip the abstract and table of contents, the numbering should start from the introduction and end with References Summary Table
- Exceptionally - for sections requiring specific improvements - put it between <div class="improvable-chunk">...</div> (but don't mention it in the report, this will be managed through post-processing)

// Reference citations
- The name of the reference table should be: "Reference Summary Table"
- The reference table at the end containing the citations details should have 4 columns: the ref number, the title of the document, the author(s, the URL - with hyperlink)
- The report MUST include a reference summary table with between 10 (for a 8 page report) and 30 references (for a 40 pages report). All inline citations (e.g., [1], [2], …) present in the report and in any focus placeholders MUST have a corresponding entry in this table with its full URL.
- For the reference citations, add systematically the urls from the Learnings (no need to put them in numbered list format since we alredy have the [x] that serves as number list)
- Do not add any inline citations reference in the visual and graph placeholders descriptions belo, you can add them in focus though.
- Do not make false references / citations. It has to be grounded from the sources in the rsearch results / crumbs below (no example.com/... type references!)
- The references / citations should be only coming from the most reputable sources amongst all the Learnings and Results from searches below
- The table generated should have in-line styling to have word-wrap and 100% width

// Instructions:
1. Integrate numbers from the sources but always mention the source
2. Whenever you mention a figure or quote, add an inline reference [x] matching its source from the references.
3. Again, Specifically name relevant organizations, tools, project names, and people encountered in the crumbs or learnings.
Note: This is for academic purposes, so thorough citations and referencing are essential.
4. Focus on reputable sources that will not be disputed (generally social media posts cannot be an opposable sources, but some of them may mention reputable sources)
Note: put the full reference url (no generic domain address), down to the html page or the pdf


// Style
The report must follow this writing style {reportstyle}.

// Format when mentioning sources, organisations and individuals
- We will perform a post-processing on the output
- For this reasons use this format for any specific name, organisation or project: {{[{{name}}]}}
example 1: {{[{{Organisation}}]}}'s CEO, {{[{{CEO name}}]}} ...
example 2: in a report from the {{[{{University name}}]}} titled "{{[{{report title}}]}}"...
example 3: the CEO of {{[{{Company name}}]}} , {{[{{Name}}]}}, said that "the best way to..."
eexample 4: the project {{[{{project name}}]}}, anounced by {{[{{...}}]}} in collaboration with {{[{{...}}]}}
example 5: Mr. {{[{{person}}]}}, Marketing director in {{[{{company}}]}}, mentioned that ...
Note: the output will be processed through regex and the identifiers removed, but this way we can keep track of all sources and citations without disclosing them.
- This should apply to names, people/titles, dates, papers, reports, organisation/institute/NGO/government bodies quotes, products, project names, ...
- You should have approximately {2 * pages} mention of organisations, people, projects or people, use the prescribed format
- The same item cannot be mentioned more than 3 times, don't over do it
- Do not mix sources that are not directly related in the search results, don't put together organisations or people that have nothing to do with each other
- DO NOT MENTION this formmatting requirement, just apply it. The user doesn't have to know about this technicality.
Note: LinkedIn is not a relevant source - if you want to use a source related to LinkedIn, you should check the author of the page visited, this is the real source, mention the name of the author as "'authorName' from LinkedIn Pulse"

// Sources
Use the following learnings and merged reference details from a deep research process on:
'{initial_query}'

Taking also into consideration the context:
{context}

--------------- Placeholders -----------
In order to enrich the content, within the core sections (between introduction and conclusion), you can inject some placeholders that will be developped later on.
There are 3 types: visual, graphs, focus - each with their own purpose

// Visual placeholders
- Create special visual placeholders that will be rendered in mermaid afterwards.
- The Visual placeholders should follow this format:

Source:source_name [y]
[[Visual Placeholder n:
- Purpose of this visual is:...
- Relevant content to generate it:
o ex: arguments
o ex: lists of elements
o ex: data points
o ...
- Message to convey: ...
]]

with:
- n as the reference number,
- source_name as the full name of the main source used and
- y as the number ref of the source reference in the reference table.
Important note for visual placeholders:
- on the line before [[...]] mention the source with the reference number [x] in the form: ""Source: abc [n]" - only one source should be mentioned
- after [[ put "Visual Placeholder n:" explicitly (with n as the ref number of the placeholder box created). This will be used in a regex
- the only types of mermaid diagram that can be generated are: flowchart, sequence, gantt, pie, mindmap (no charts) // Take this into consideration when providing the instructions for the diagram
- do not make mention in the report to "visual placeholders" just mention the visual and the number..
- in the placeholder, no need to add the references to the source or its ref number, but make sure ALL of the data points required has a source from the learning and reference material hereafter
- these placeholders text should contain:
o the purpose of the future visual
o the relevant data to generate it
- there should be between {count_tenth} and {count_fifth} of these visuals placeholders within the report (all between introduction and conclusion)
- 2 visual placeholders cannot be in the same section
Note: the placeholders will then be processed separately by a llm to generate the specific code to display each of them so the instruction need to be clear enough.

// Graph placeholders
- Create special graph placeholders that will be rendered in d3.js afterwards based on your guidance:

Source:source_name [y]
[[Graph Placeholder n:
- Purpose of this graph is:...
- Relevant numbers to generate it:
table format
- Message to convey: ...
]]

with:
- n as the reference number,
- source_name as the full name of the main source used and
- y as the source reference in the reference table.
- the table containing all the required data has to include data points FROM the learnings / results from the search below
Important note for graph placeholders:
- on the line before [[...]] mention the source with the reference number [x] in the form: ""Source: abc [n]" - only one source should be mentioned
- use p tag for the source and source reference number
- after [[ put "Graph Placeholder n:" explicitly (with n as the ref number of the graph created). This will be used in a regex
- Do not make things up - every data points have to be from a real source
- All types of graphs (using d3.js library) can be generated // Take this into consideration when providing the instructions for the graph data
- do not make mention in the report to "graph placeholders" just mention graph.
- in the placeholder, no need to add the references to the source or its ref number, but make sure ALL of the data points required has a source from the learning and reference material hereafter
- these placeholders text should contain:
o the purpose of the future graph
o the relevant data to generate it
- there should be between {count_tenth} and {count_fifth} of these graphs placeholders within the report (all between introduction and conclusion)
- 2 graph placeholders cannot be in the same section
Note: the placeholders will then be processed separately by a llm to generate the specific code to display each of them so the instruction need to be clear enough.

// Focus placeholders
- To drill down on specific topic that would be deserve to be developped extensively separately, create special focus placeholders in [[...]] double backets
Note: outside of the placeholder, do not make reference in the report to "focus placeholders" just mention the "Focus box n".
- in the Focus placeholder, make a mention to the prescribed sources used (no need to add the source before or after the placeholder)
- do not make the placeholder on the exact same topic as the section or the sub-section where it is positioned, it has to be either:
o a special case that deserves attention
o a recent development / innovation
o a theoretical drill-down
o a contrarian point of view / objection
- these placeholders text should contain:
o the purpose of the focus box
o the relevant data to generate it
o the guidance in terms of style and message to convey
Note: Be specific if you want some particular point developped, keep it consistent across the report.
- there should be between {count_twentieth} and {count_tenth} of these focus placeholders within the report (all between introduction and conclusion)
- 2 focus placeholders cannot be in the same section and should be a few pages apart in the report
- Mention all the sources that should be used to generate this focus placeholder and list also the references that will be mentioned in the References section later (ex: [1], [2])
Note: the Focus placeholders will then be processed separately by a llm to generate the specific code to display each of them so the instruction need to be clear enough.

// Format:
[[Focus Placeholder n:
- Topic of this focus:...
- Relevant info to generate it:...
- Specific angle of the focus placeholder:...
- Key elements to mention:
o ...
o ...
...
]]

with:
- n as the reference number,

Important note for focus placeholders:
- after [[ put "Focus Placeholder n:" explicitly (with n as the ref number of the focus box created). This will be used in a regex
- Do not add a title for the Focus placeholder just before the [[...]], the content that will replace the focus placeholder - generated later on - will already include a title

// Report ending required
End the report with the following sequence:

<iframe class="visual-frame" srcdoc='
<!DOCTYPE html>
<html>
<head></head>
<body>
<div>
-end-
</div>
</body>
</html>' width="100px" height="15px" style="border:none;"></iframe>

Then close the html code from the broader report
</body>
</html>

// Structure the overall report as follows:

{{Do not add anything before - no introductory meta comment or content}}

- Abstract
- Table of contents
- Introduction
- [Sections and sub-sections, depending on the size and relevant topic - including visual, graph and focus placeholders]
- Conclusion
- References summary table
- Report ending formatting (as mentioned before)

{{Do not add anything after - no conclusive meta comment or content}}

Important note: placeholders (visual, graph or focus) can only appear in the sections or sub-sections not in introduction, the conclusion, the references or after the references

// Material to use to ground your report:
- Learnings:
{learnings_block}

- Results from searches:
{aggregated_crumbs}

Take a deep breath, do your best.
Now, produce the report please.
"""
    )

    tokentarget = word_count * 5  # rough multiplier for token target
    report = openai_call(prompt, model="o3-mini", max_tokens_param=tokentarget)

    # Detect a failed call before any post-processing, so an error message is
    # never rewritten by the regexes or sent for compression.
    if report.startswith(error_prefix):
        logging.error("generate_final_report error: %s", report)
        return f"Error generating report: {report}"

    # Post-processing: strip the {[{name}]} tracking markers requested in the
    # prompt, then any partially formed [{name}] variants.
    report = re.sub(r'\{\[\{(.*?)\}\]\}', r'\1', report)
    report = re.sub(r'\[\{(.*?)\}\]', r'\1', report)

    # If the report is too long, compress it.
    if len(report) > MAX_MESSAGE_LENGTH:
        report = compress_text(report, MAX_MESSAGE_LENGTH)
        # Re-check after compression so a result carrying the same error
        # prefix is still reported as an error, as it was before.
        if report.startswith(error_prefix):
            logging.error("generate_final_report error: %s", report)
            return f"Error generating report: {report}"

    logging.info("generate_final_report: Report generated successfully.")
    return report
1962
+
1963
  def filter_search_results(results: list, visited_urls: set, query: str, clarifications: str) -> list:
1964
  # Filter out already seen results
1965
  new_results = []