Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1718,272 +1718,248 @@ def compress_text(text: str, target_length: int) -> str:
|
|
| 1718 |
|
| 1719 |
def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
|
| 1720 |
aggregated_crumbs: str, references: list, pages: int = 8) -> str:
|
| 1721 |
-
""
|
| 1722 |
-
|
| 1723 |
-
|
| 1724 |
-
|
| 1725 |
-
|
| 1726 |
-
|
| 1727 |
-
|
| 1728 |
-
|
| 1729 |
-
4. Assembles the Table of Contents and the final HTML.
|
| 1730 |
-
5. Passes the raw HTML through placeholder replacement functions before returning.
|
| 1731 |
-
|
| 1732 |
-
Improvements:
|
| 1733 |
-
- Increased fallback extraction attempts if the JSON skeleton is incomplete.
|
| 1734 |
-
- Ensures that max_tokens parameters are integers.
|
| 1735 |
-
"""
|
| 1736 |
-
import json, logging, re
|
| 1737 |
|
| 1738 |
-
|
| 1739 |
-
|
| 1740 |
-
|
| 1741 |
-
|
| 1742 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1743 |
|
| 1744 |
-
|
| 1745 |
-
|
| 1746 |
-
|
| 1747 |
-
Produce a detailed JSON skeleton outline for a comprehensive academic research report titled "{initial_query}".
|
| 1748 |
-
The overall report should be approximately {total_word_count} words long.
|
| 1749 |
-
Divide the report into two groups:
|
| 1750 |
-
|
| 1751 |
-
1. "core_sections": These are the main content sections that address key sub-topics drawn from the context, research learnings, and search results.
|
| 1752 |
-
- There should be between 4 and 6 core sections. Their combined target word count should be about 70% of the total (approximately {int(0.7 * total_word_count)} words).
|
| 1753 |
-
- For each core section, provide:
|
| 1754 |
-
• "section_name": A concise title.
|
| 1755 |
-
• "instructions": Detailed guidelines on which sub-topics, facts, and arguments to cover.
|
| 1756 |
-
• "target_word_count": An approximate desired word count for that section.
|
| 1757 |
-
• "key_content_elements": An array of 3 to 5 bullet points that must be mentioned.
|
| 1758 |
-
• "placeholders": An object with boolean keys "visual", "graph", and "focus" indicating which placeholders to include.
|
| 1759 |
-
**Overall guidance**: Across all core sections, the total number of visual placeholders should be between ⌊{pages}/10⌋ and ⌈{pages}/5⌉,
|
| 1760 |
-
graph placeholders should be in the same range, and focus placeholders should be between ⌊{pages}/20⌋ and ⌈{pages}/10⌉. Decide per section which to activate.
|
| 1761 |
-
|
| 1762 |
-
2. "final_sections": These should be generated after core sections and include:
|
| 1763 |
-
- "Introduction"
|
| 1764 |
-
- "Abstract"
|
| 1765 |
-
- "Conclusion"
|
| 1766 |
-
- "Reference Summary Table"
|
| 1767 |
-
Their combined target word count should be about 30% of the total (approximately {int(0.3 * total_word_count)} words),
|
| 1768 |
-
distributed evenly among them.
|
| 1769 |
-
Return only valid JSON with two keys: "core_sections" and "final_sections", with no additional commentary.
|
| 1770 |
-
"""
|
| 1771 |
-
# Increase the token allocation if needed (e.g., 2000 tokens)
|
| 1772 |
-
skeleton_response = openai_call(
|
| 1773 |
-
prompt=prompt_skeleton,
|
| 1774 |
-
model="o3-mini",
|
| 1775 |
-
max_tokens_param=int(2000),
|
| 1776 |
-
temperature=0
|
| 1777 |
-
)
|
| 1778 |
|
| 1779 |
-
|
| 1780 |
-
|
| 1781 |
-
|
| 1782 |
-
|
| 1783 |
-
|
| 1784 |
-
|
| 1785 |
-
|
| 1786 |
-
|
| 1787 |
-
|
| 1788 |
-
|
| 1789 |
-
|
| 1790 |
-
|
| 1791 |
-
|
| 1792 |
-
|
| 1793 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1794 |
|
| 1795 |
-
try:
|
| 1796 |
-
skeleton = json.loads(json_str) if json_str else {"core_sections": [], "final_sections": []}
|
| 1797 |
-
except Exception as e2:
|
| 1798 |
-
logging.error(f"Fallback JSON parsing failed: {e2}")
|
| 1799 |
-
skeleton = {"core_sections": [], "final_sections": []}
|
| 1800 |
-
|
| 1801 |
-
# --- Step 2: Generate content for each core section sequentially.
|
| 1802 |
-
generated_core_sections = {}
|
| 1803 |
-
previous_sections_content = ""
|
| 1804 |
-
if "core_sections" in skeleton:
|
| 1805 |
-
for section in skeleton["core_sections"]:
|
| 1806 |
-
section_name = section.get("section_name", "Untitled Section")
|
| 1807 |
-
instructions = section.get("instructions", "")
|
| 1808 |
-
# Ensure target_word_count is an integer:
|
| 1809 |
-
try:
|
| 1810 |
-
target_wc = int(section.get("target_word_count", 500))
|
| 1811 |
-
except ValueError:
|
| 1812 |
-
target_wc = 500
|
| 1813 |
-
key_elements = section.get("key_content_elements", [])
|
| 1814 |
-
placeholders = section.get("placeholders", {})
|
| 1815 |
-
# Build a placeholder directive based on allocated booleans.
|
| 1816 |
-
placeholder_directive = ""
|
| 1817 |
-
if placeholders.get("visual", False):
|
| 1818 |
-
placeholder_directive += "[[Visual Placeholder: Insert one visual here.]]\n"
|
| 1819 |
-
if placeholders.get("graph", False):
|
| 1820 |
-
placeholder_directive += "[[Graph Placeholder: Insert one graph here.]]\n"
|
| 1821 |
-
if placeholders.get("focus", False):
|
| 1822 |
-
placeholder_directive += "[[Focus Placeholder: Insert one focus box here if deeper analysis is needed.]]\n"
|
| 1823 |
-
|
| 1824 |
-
prompt_section = f"""
|
| 1825 |
-
You are an expert technical editor.
|
| 1826 |
-
Generate detailed HTML content for the research report section titled "{section_name}".
|
| 1827 |
-
Instructions: {instructions}
|
| 1828 |
-
Target word count: Approximately {target_wc} words.
|
| 1829 |
-
Key content elements to include: {", ".join(key_elements)}.
|
| 1830 |
-
Additionally, please embed the following placeholder directives exactly where appropriate in the content:
|
| 1831 |
-
{placeholder_directive if placeholder_directive else "No placeholders required for this section."}
|
| 1832 |
-
Context: {context}
|
| 1833 |
-
Initial Query: {initial_query}
|
| 1834 |
-
Report Style: {reportstyle}
|
| 1835 |
-
Learnings: {combined_learnings}
|
| 1836 |
-
Aggregated Search Results: {aggregated_crumbs}
|
| 1837 |
-
Previously generated sections (if any): {previous_sections_content}
|
| 1838 |
-
|
| 1839 |
-
Return only the HTML content for this section (do not include outer <html> or <body> tags).
|
| 1840 |
-
"""
|
| 1841 |
-
section_content = openai_call(
|
| 1842 |
-
prompt=prompt_section,
|
| 1843 |
-
model="o3-mini",
|
| 1844 |
-
max_tokens_param=int(target_wc * 5),
|
| 1845 |
-
temperature=0
|
| 1846 |
-
)
|
| 1847 |
-
section_content = section_content.strip()
|
| 1848 |
-
generated_core_sections[section_name] = section_content
|
| 1849 |
-
previous_sections_content += f"\n<!-- {section_name} -->\n" + section_content
|
| 1850 |
-
|
| 1851 |
-
# --- Step 3: Generate content for each final section.
|
| 1852 |
-
generated_final_sections = {}
|
| 1853 |
-
if "final_sections" in skeleton:
|
| 1854 |
-
for section in skeleton["final_sections"]:
|
| 1855 |
-
section_name = section.get("section_name", "Untitled Final Section")
|
| 1856 |
-
instructions = section.get("instructions", "")
|
| 1857 |
-
try:
|
| 1858 |
-
target_wc = int(section.get("target_word_count", 500))
|
| 1859 |
-
except ValueError:
|
| 1860 |
-
target_wc = 500
|
| 1861 |
-
prompt_final = f"""
|
| 1862 |
-
You are a master technical editor.
|
| 1863 |
-
Generate detailed HTML content for the final section titled "{section_name}".
|
| 1864 |
-
Instructions: {instructions}
|
| 1865 |
-
Target word count: Approximately {target_wc} words.
|
| 1866 |
-
Context: {context}
|
| 1867 |
-
Initial Query: {initial_query}
|
| 1868 |
-
Report Style: {reportstyle}
|
| 1869 |
-
Learnings: {combined_learnings}
|
| 1870 |
-
Aggregated Search Results: {aggregated_crumbs}
|
| 1871 |
-
Previously generated core sections: {previous_sections_content}
|
| 1872 |
-
|
| 1873 |
-
Return only the HTML content for this section (do not include outer <html> or <body> tags).
|
| 1874 |
-
"""
|
| 1875 |
-
final_section_content = openai_call(
|
| 1876 |
-
prompt=prompt_final,
|
| 1877 |
-
model="o3-mini",
|
| 1878 |
-
max_tokens_param=int(target_wc * 5),
|
| 1879 |
-
temperature=0
|
| 1880 |
-
)
|
| 1881 |
-
final_section_content = final_section_content.strip()
|
| 1882 |
-
generated_final_sections[section_name] = final_section_content
|
| 1883 |
-
previous_sections_content += f"\n<!-- {section_name} -->\n" + final_section_content
|
| 1884 |
-
|
| 1885 |
-
# --- Step 4: Generate a Table of Contents from section titles.
|
| 1886 |
-
toc_titles = []
|
| 1887 |
-
for section in skeleton.get("core_sections", []):
|
| 1888 |
-
if "section_name" in section:
|
| 1889 |
-
toc_titles.append(section["section_name"])
|
| 1890 |
-
for section in skeleton.get("final_sections", []):
|
| 1891 |
-
if "section_name" in section:
|
| 1892 |
-
toc_titles.append(section["section_name"])
|
| 1893 |
-
prompt_toc = f"""
|
| 1894 |
-
You are a technical editor.
|
| 1895 |
-
Based on the following list of section titles: {', '.join(toc_titles)},
|
| 1896 |
-
generate a concise HTML snippet for a Table of Contents,
|
| 1897 |
-
with each item on a separate numbered line (e.g., "1. Section Title", "2. Section Title", etc.).
|
| 1898 |
-
Return only the HTML snippet without additional commentary.
|
| 1899 |
-
"""
|
| 1900 |
-
toc_html = openai_call(
|
| 1901 |
-
prompt=prompt_toc,
|
| 1902 |
-
model="o3-mini",
|
| 1903 |
-
max_tokens_param=int(500),
|
| 1904 |
-
temperature=0
|
| 1905 |
-
).strip()
|
| 1906 |
-
|
| 1907 |
-
# --- Step 5: Assemble the final HTML document.
|
| 1908 |
-
final_report_html = f"""<html>
|
| 1909 |
-
<head>
|
| 1910 |
-
<meta charset="utf-8" />
|
| 1911 |
-
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 1912 |
-
<style>
|
| 1913 |
-
body {{
|
| 1914 |
-
font-family: Arial, sans-serif;
|
| 1915 |
-
margin: 20px;
|
| 1916 |
-
padding: 0;
|
| 1917 |
-
background-color: #ffffff;
|
| 1918 |
-
}}
|
| 1919 |
-
h1 {{
|
| 1920 |
-
text-align: center;
|
| 1921 |
-
margin-bottom: 20px;
|
| 1922 |
-
}}
|
| 1923 |
-
h2 {{
|
| 1924 |
-
text-align: left;
|
| 1925 |
-
margin-top: 20px;
|
| 1926 |
-
margin-bottom: 10px;
|
| 1927 |
-
}}
|
| 1928 |
-
.section {{
|
| 1929 |
-
margin-bottom: 30px;
|
| 1930 |
-
}}
|
| 1931 |
-
.toc {{
|
| 1932 |
-
margin: 20px 0;
|
| 1933 |
-
border: 1px solid #ccc;
|
| 1934 |
-
padding: 10px;
|
| 1935 |
-
}}
|
| 1936 |
-
</style>
|
| 1937 |
-
</head>
|
| 1938 |
-
<body>
|
| 1939 |
-
<!-- Report Title -->
|
| 1940 |
-
<h1>{initial_query}</h1>
|
| 1941 |
-
<!-- Table of Contents -->
|
| 1942 |
-
<div class="toc">
|
| 1943 |
-
{toc_html}
|
| 1944 |
-
</div>
|
| 1945 |
-
"""
|
| 1946 |
-
# Append core sections.
|
| 1947 |
-
for section in skeleton.get("core_sections", []):
|
| 1948 |
-
section_name = section.get("section_name", "Untitled Section")
|
| 1949 |
-
content = generated_core_sections.get(section_name, "")
|
| 1950 |
-
final_report_html += f"""<div class="section">
|
| 1951 |
-
<h2>{section_name}</h2>
|
| 1952 |
-
{content}
|
| 1953 |
-
</div>
|
| 1954 |
-
"""
|
| 1955 |
-
# Append final sections.
|
| 1956 |
-
for section in skeleton.get("final_sections", []):
|
| 1957 |
-
section_name = section.get("section_name", "Untitled Final Section")
|
| 1958 |
-
content = generated_final_sections.get(section_name, "")
|
| 1959 |
-
final_report_html += f"""<div class="section">
|
| 1960 |
-
<h2>{section_name}</h2>
|
| 1961 |
-
{content}
|
| 1962 |
-
</div>
|
| 1963 |
-
"""
|
| 1964 |
-
# Append an ending marker.
|
| 1965 |
-
final_report_html += """
|
| 1966 |
<iframe class="visual-frame" srcdoc='
|
| 1967 |
<!DOCTYPE html>
|
| 1968 |
<html>
|
| 1969 |
-
<
|
| 1970 |
<body>
|
| 1971 |
<div>
|
| 1972 |
-end-
|
| 1973 |
</div>
|
| 1974 |
</body>
|
| 1975 |
</html>' width="100px" height="15px" style="border:none;"></iframe>
|
| 1976 |
-
|
|
|
|
|
|
|
| 1977 |
</html>
|
| 1978 |
-
"""
|
| 1979 |
-
# --- Step 6: Replace placeholder markers with actual content.
|
| 1980 |
-
final_report_html = replace_visual_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
|
| 1981 |
-
final_report_html = replace_graph_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
|
| 1982 |
-
final_report_html = replace_focus_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
|
| 1983 |
|
| 1984 |
-
|
| 1985 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1986 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1987 |
def filter_search_results(results: list, visited_urls: set, query: str, clarifications: str) -> list:
|
| 1988 |
# Filter out already seen results
|
| 1989 |
new_results = []
|
|
|
|
| 1718 |
|
| 1719 |
def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
|
| 1720 |
aggregated_crumbs: str, references: list, pages: int = 8) -> str:
|
| 1721 |
+
fallback_text = ""
|
| 1722 |
+
if not learnings:
|
| 1723 |
+
fallback_text = "No external summaries were directly extracted. It is not possible to analyze relevance."
|
| 1724 |
+
combined_learnings = "\n".join(learnings) if learnings else fallback_text
|
| 1725 |
+
word_count = pages * 500
|
| 1726 |
+
prompt = (f"""
|
| 1727 |
+
Produce a comprehensive report in html format.
|
| 1728 |
+
The report should be very detailed and lengthy.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1729 |
|
| 1730 |
+
// Requirements
|
| 1731 |
+
- All text alignment has to be on the left
|
| 1732 |
+
- The report should be {pages} pages long or {word_count} words (excluding html formatting)
|
| 1733 |
+
- It must include inline citations (e.g., [1], [2], etc.) from real sources provided in the search results below
|
| 1734 |
+
Note: citations sources in-line need to be in this format: blablabla - Source [x] / "pdf" is not a source, provide the title or author
|
| 1735 |
+
- No more than 10 sentences per div blocks, skip lines and add line breaks when changing topic.
|
| 1736 |
+
- The report must include between {round(pages/10,0)} and {round(pages/5,0)} tables from the sources used (add citations if necessary) and use facts and figures extensively to ground the analysis.
|
| 1737 |
+
- For the numbering of titles or numbered lists, use numbers (ex: 1.) and sub-units (1.1, 1.2... 1.1.1...,1.1.2...).
|
| 1738 |
+
Note: Exclude the use of html numbered lists format, they don't get correctly implemented. Use plain text format for numbering of sections and sub-sections
|
| 1739 |
+
- Do not put a numbered list (ex: 1.1, ...) for every sentence! It should be used parsimoniously for real sub-sections.
|
| 1740 |
+
- Put paragraphs, sentences that are part of the same section in a div tag, this will be used for formatting.
|
| 1741 |
+
- Add on top of the report the report title (with the <h1> tag) - this is the only part that should be centered (in-line style)
|
| 1742 |
+
- Titles for sections and sub-sections should systematically use the tags:
|
| 1743 |
+
<h1> for sections (ex: <h1>3. Examination of State-of-the-Art of AI</h1>)
|
| 1744 |
+
<h2> for sub-sections (ex: <h2>3.2 AI Performance in Mathematics</h2>)
|
| 1745 |
+
<h3> for sub-sub-sections (ex: <h3>3.2.1 Illustration with math conjecture demonstration</h3>)
|
| 1746 |
+
<h4> for bulletpoint title (ex: <h4>item to detail:</h4> description of the item to detail ...)
|
| 1747 |
+
- Use inline formatting for the tables with homogeneous border and colors
|
| 1748 |
+
- Avoid Chinese characters in the output (use the Pinyin version) since they won't display correctly in the pdf (black boxes)
|
| 1749 |
+
- For the Table of contents: do not mention the pages, but make each item on separate line
|
| 1750 |
+
- Put "Table of contents" and "Abstract" title in h1 format.
|
| 1751 |
+
- The Table of contents should skip the abstract and table of contents, the numbering should start from the introduction and end with References Summary Table
|
| 1752 |
+
- Exceptionally - for sections requiring specific improvements - put it between <div class="improvable-chunk">...</div> (but don't mention it in the report, this will be managed through post-processing)
|
| 1753 |
+
|
| 1754 |
+
// Reference citations
|
| 1755 |
+
- The name of the reference table should be: "Reference Summary Table"
|
| 1756 |
+
- The reference table at the end containing the citations details should have 4 columns: the ref number, the title of the document, the author(s), the URL (with hyperlink)
|
| 1757 |
+
- The report MUST include a reference summary table with between 10 (for a 8 page report) and 30 references (for a 40 pages report). All inline citations (e.g., [1], [2], …) present in the report and in any focus placeholders MUST have a corresponding entry in this table with its full URL.
|
| 1758 |
+
- For the reference citations, add systematically the urls from the Learnings (no need to put them in numbered list format since we already have the [x] that serves as number list)
|
| 1759 |
+
- Do not add any inline citations reference in the visual and graph placeholders descriptions below, you can add them in focus though.
|
| 1760 |
+
- Do not make false references / citations. It has to be grounded from the sources in the research results / crumbs below (no example.com/... type references!)
|
| 1761 |
+
- The references / citations should be only coming from the most reputable sources amongst all the Learnings and Results from searches below
|
| 1762 |
+
- The table generated should have in-line styling to have word-wrap and 100% width
|
| 1763 |
+
|
| 1764 |
+
// Instructions:
|
| 1765 |
+
1. Integrate numbers from the sources but always mention the source
|
| 1766 |
+
2. Whenever you mention a figure or quote, add an inline reference [x] matching its source from the references.
|
| 1767 |
+
3. Again, Specifically name relevant organizations, tools, project names, and people encountered in the crumbs or learnings.
|
| 1768 |
+
Note: This is for academic purposes, so thorough citations and referencing are essential.
|
| 1769 |
+
4. Focus on reputable sources that will not be disputed (generally social media posts cannot be an opposable sources, but some of them may mention reputable sources)
|
| 1770 |
+
Note: put the full reference url (no generic domain address), down to the html page or the pdf
|
| 1771 |
+
|
| 1772 |
+
|
| 1773 |
+
// Style
|
| 1774 |
+
The report must follow this writing style {reportstyle}.
|
| 1775 |
+
|
| 1776 |
+
// Format when mentioning sources, organisations and individuals
|
| 1777 |
+
- We will perform a post-processing on the output
|
| 1778 |
+
- For this reasons use this format for any specific name, organisation or project: {{[{{name}}]}}
|
| 1779 |
+
example 1: {{[{{Organisation}}]}}'s CEO, {{[{{CEO name}}]}} ...
|
| 1780 |
+
example 2: in a report from the {{[{{University name}}]}} titled "{{[{{report title}}]}}"...
|
| 1781 |
+
example 3: the CEO of {{[{{Company name}}]}} , {{[{{Name}}]}}, said that "the best way to..."
|
| 1782 |
+
example 4: the project {{[{{project name}}]}}, announced by {{[{{...}}]}} in collaboration with {{[{{...}}]}}
|
| 1783 |
+
example 5: Mr. {{[{{person}}]}}, Marketing director in {{[{{company}}]}}, mentioned that ...
|
| 1784 |
+
Note: the output will be processed through regex and the identifiers removed, but this way we can keep track of all sources and citations without disclosing them.
|
| 1785 |
+
- This should apply to names, people/titles, dates, papers, reports, organisation/institute/NGO/government bodies quotes, products, project names, ...
|
| 1786 |
+
- You should have approximately {2 * pages} mention of organisations, people, projects or people, use the prescribed format
|
| 1787 |
+
- The same item cannot be mentioned more than 3 times, don't over do it
|
| 1788 |
+
- Do not mix sources that are not directly related in the search results, don't put together organisations or people that have nothing to do with each other
|
| 1789 |
+
- DO NOT MENTION this formatting requirement, just apply it. The user doesn't have to know about this technicality.
|
| 1790 |
+
Note: LinkedIn is not a relevant source - if you want to use a source related to LinkedIn, you should check the author of the page visited, this is the real source, mention the name of the author as "'authorName' from LinkedIn Pulse"
|
| 1791 |
|
| 1792 |
+
// Sources
|
| 1793 |
+
Use the following learnings and merged reference details from a deep research process on:
|
| 1794 |
+
'{initial_query}'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1795 |
|
| 1796 |
+
Taking also into consideration the context:
|
| 1797 |
+
{context}
|
| 1798 |
+
|
| 1799 |
+
--------------- Placeholders -----------
|
| 1800 |
+
In order to enrich the content, within the core sections (between introduction and conclusion), you can inject some placeholders that will be developed later on.
|
| 1801 |
+
There are 3 types: visual, graphs, focus - each with their own purpose
|
| 1802 |
+
|
| 1803 |
+
// Visual placeholders
|
| 1804 |
+
- Create special visual placeholders that will be rendered in mermaid afterwards.
|
| 1805 |
+
- The Visual placeholders should follow this format:
|
| 1806 |
+
|
| 1807 |
+
Source:source_name [y]
|
| 1808 |
+
[[Visual Placeholder n:
|
| 1809 |
+
- Purpose of this visual is:...
|
| 1810 |
+
- Relevant content to generate it:
|
| 1811 |
+
o ex: arguments
|
| 1812 |
+
o ex: lists of elements
|
| 1813 |
+
o ex: data points
|
| 1814 |
+
o ...
|
| 1815 |
+
- Message to convey: ...
|
| 1816 |
+
]]
|
| 1817 |
+
|
| 1818 |
+
with:
|
| 1819 |
+
- n as the reference number,
|
| 1820 |
+
- source_name as the full name of the main source used and
|
| 1821 |
+
- y as the number ref of the source reference in the reference table.
|
| 1822 |
+
Important note for visual placeholders:
|
| 1823 |
+
- on the line before [[...]] mention the source with the reference number [x] in the form: "Source: abc [n]" - only one source should be mentioned
|
| 1824 |
+
- after [[ put "Visual Placeholder n:" explicitly (with n as the ref number of the placeholder box created). This will be used in a regex
|
| 1825 |
+
- the only types of mermaid diagram that can be generated are: flowchart, sequence, gantt, pie, mindmap (no charts) // Take this into consideration when providing the instructions for the diagram
|
| 1826 |
+
- do not make mention in the report to "visual placeholders" just mention the visual and the number..
|
| 1827 |
+
- in the placeholder, no need to add the references to the source or its ref number, but make sure ALL of the data points required has a source from the learning and reference material hereafter
|
| 1828 |
+
- these placeholders text should contain:
|
| 1829 |
+
o the purpose of the future visual
|
| 1830 |
+
o the relevant data to generate it
|
| 1831 |
+
- there should be between {round(pages/10,0)} and {round(pages/5,0)} of these visuals placeholders within the report (all between introduction and conclusion)
|
| 1832 |
+
- 2 visual placeholders cannot be in the same section
|
| 1833 |
+
Note: the placeholders will then be processed separately by a llm to generate the specific code to display each of them so the instruction need to be clear enough.
|
| 1834 |
+
|
| 1835 |
+
// Graph placeholders
|
| 1836 |
+
- Create special graph placeholders that will be rendered in d3.js afterwards based on your guidance:
|
| 1837 |
+
|
| 1838 |
+
Source:source_name [y]
|
| 1839 |
+
[[Graph Placeholder n:
|
| 1840 |
+
- Purpose of this graph is:...
|
| 1841 |
+
- Relevant numbers to generate it:
|
| 1842 |
+
table format
|
| 1843 |
+
- Message to convey: ...
|
| 1844 |
+
]]
|
| 1845 |
+
|
| 1846 |
+
with:
|
| 1847 |
+
- n as the reference number,
|
| 1848 |
+
- source_name as the full name of the main source used and
|
| 1849 |
+
- y as the source reference in the reference table.
|
| 1850 |
+
- the table containing all the required data has to include data points FROM the learnings / results from the search below
|
| 1851 |
+
Important note for graph placeholders:
|
| 1852 |
+
- on the line before [[...]] mention the source with the reference number [x] in the form: "Source: abc [n]" - only one source should be mentioned
|
| 1853 |
+
- use p tag for the source and source reference number
|
| 1854 |
+
- after [[ put "Graph Placeholder n:" explicitly (with n as the ref number of the graph created). This will be used in a regex
|
| 1855 |
+
- Do not make things up - every data points have to be from a real source
|
| 1856 |
+
- All types of graphs (using d3.js library) can be generated // Take this into consideration when providing the instructions for the graph data
|
| 1857 |
+
- do not make mention in the report to "graph placeholders" just mention graph.
|
| 1858 |
+
- in the placeholder, no need to add the references to the source or its ref number, but make sure ALL of the data points required has a source from the learning and reference material hereafter
|
| 1859 |
+
- these placeholders text should contain:
|
| 1860 |
+
o the purpose of the future graph
|
| 1861 |
+
o the relevant data to generate it
|
| 1862 |
+
- there should be between {round(pages/10,0)} and {round(pages/5,0)} of these graphs placeholders within the report (all between introduction and conclusion)
|
| 1863 |
+
- 2 graph placeholders cannot be in the same section
|
| 1864 |
+
Note: the placeholders will then be processed separately by a llm to generate the specific code to display each of them so the instruction need to be clear enough.
|
| 1865 |
+
|
| 1866 |
+
// Focus placeholders
|
| 1867 |
+
- To drill down on a specific topic that deserves to be developed extensively on its own, create special focus placeholders in [[...]] double brackets
|
| 1868 |
+
Note: outside of the placeholder, do not make reference in the report to "focus placeholders" just mention the "Focus box n".
|
| 1869 |
+
- in the Focus placeholder, make a mention to the prescribed sources used (no need to add the source before or after the placeholder)
|
| 1870 |
+
- do not make the placeholder on the exact same topic as the section or the sub-section where it is positioned, it has to be either:
|
| 1871 |
+
o a special case that deserves attention
|
| 1872 |
+
o a recent development / innovation
|
| 1873 |
+
o a theoretical drill-down
|
| 1874 |
+
o a contrarian point of view / objection
|
| 1875 |
+
- these placeholders text should contain:
|
| 1876 |
+
o the purpose of the focus box
|
| 1877 |
+
o the relevant data to generate it
|
| 1878 |
+
o the guidance in terms of style and message to convey
|
| 1879 |
+
Note: Be specific if you want some particular point developed, keep it consistent across the report.
|
| 1880 |
+
- there should be between {round(pages/20,0)} and {round(pages/10,0)} of these focus placeholders within the report (all between introduction and conclusion)
|
| 1881 |
+
- 2 focus placeholders cannot be in the same section and should be a few pages apart in the report
|
| 1882 |
+
- Mention all the sources that should be used to generate this focus placeholder and list also the references that will be mentioned in the References section later (ex: [1], [2])
|
| 1883 |
+
Note: the Focus placeholders will then be processed separately by a llm to generate the specific code to display each of them so the instruction need to be clear enough.
|
| 1884 |
+
|
| 1885 |
+
// Format:
|
| 1886 |
+
[[Focus Placeholder n:
|
| 1887 |
+
- Topic of this focus:...
|
| 1888 |
+
- Relevant info to generate it:...
|
| 1889 |
+
- Specific angle of the focus placeholder:...
|
| 1890 |
+
- Key elements to mention:
|
| 1891 |
+
o ...
|
| 1892 |
+
o ...
|
| 1893 |
+
...
|
| 1894 |
+
]]
|
| 1895 |
+
|
| 1896 |
+
with:
|
| 1897 |
+
- n as the reference number,
|
| 1898 |
+
|
| 1899 |
+
Important note for focus placeholders:
|
| 1900 |
+
- after [[ put "Focus Placeholder n:" explicitly (with n as the ref number of the focus box created). This will be used in a regex
|
| 1901 |
+
- Do not add a title for the Focus placeholder just before the [[...]], the content that will replace the focus placeholder - generated later on - will already include a title
|
| 1902 |
+
|
| 1903 |
+
// Report ending required
|
| 1904 |
+
End the report with the following sequence:
|
| 1905 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1906 |
<iframe class="visual-frame" srcdoc='
|
| 1907 |
<!DOCTYPE html>
|
| 1908 |
<html>
|
| 1909 |
+
</head>
|
| 1910 |
<body>
|
| 1911 |
<div>
|
| 1912 |
-end-
|
| 1913 |
</div>
|
| 1914 |
</body>
|
| 1915 |
</html>' width="100px" height="15px" style="border:none;"></iframe>
|
| 1916 |
+
|
| 1917 |
+
Then close the html code from the broader report
|
| 1918 |
+
</body>
|
| 1919 |
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1920 |
|
| 1921 |
+
// Structure the overall report as follows:
|
| 1922 |
+
|
| 1923 |
+
{{Do not add anything before - no introductory meta comment or content}}
|
| 1924 |
+
|
| 1925 |
+
- Abstract
|
| 1926 |
+
- Table of contents
|
| 1927 |
+
- Introduction
|
| 1928 |
+
- [Sections and sub-sections, depending on the size and relevant topic - including visual, graph and focus placeholders]
|
| 1929 |
+
- Conclusion
|
| 1930 |
+
- References summary table
|
| 1931 |
+
- Report ending formatting (as mentioned before)
|
| 1932 |
+
|
| 1933 |
+
{{Do not add anything after - no conclusive meta comment or content}}
|
| 1934 |
+
|
| 1935 |
+
Important note: placeholders (visual, graph or focus) can only appear in the sections or sub-sections not in introduction, the conclusion, the references or after the references
|
| 1936 |
+
|
| 1937 |
+
// Material to use to ground your report:
|
| 1938 |
+
- Learnings:
|
| 1939 |
+
{json.dumps(learnings, indent=2)}
|
| 1940 |
+
|
| 1941 |
+
- Results from searches:
|
| 1942 |
+
{aggregated_crumbs}
|
| 1943 |
+
|
| 1944 |
+
Take a deep breath, do your best.
|
| 1945 |
+
Now, produce the report please.
|
| 1946 |
+
"""
|
| 1947 |
+
)
|
| 1948 |
+
tokentarget = word_count * 5 # rough multiplier for token target
|
| 1949 |
+
report = openai_call(prompt, model="o3-mini", max_tokens_param=tokentarget)
|
| 1950 |
+
# Post-processing
|
| 1951 |
+
report = re.sub(r'\{\[\{(.*?)\}\]\}', r'\1', report)
|
| 1952 |
+
report = re.sub(r'\[\{(.*?)\}\]', r'\1', report)
|
| 1953 |
|
| 1954 |
+
# If the report is too long, compress it.
|
| 1955 |
+
if len(report) > MAX_MESSAGE_LENGTH:
|
| 1956 |
+
report = compress_text(report, MAX_MESSAGE_LENGTH)
|
| 1957 |
+
if report.startswith("Error calling OpenAI API"):
|
| 1958 |
+
logging.error(f"generate_final_report error: {report}")
|
| 1959 |
+
return f"Error generating report: {report}"
|
| 1960 |
+
logging.info("generate_final_report: Report generated successfully.")
|
| 1961 |
+
return report
|
| 1962 |
+
|
| 1963 |
def filter_search_results(results: list, visited_urls: set, query: str, clarifications: str) -> list:
|
| 1964 |
# Filter out already seen results
|
| 1965 |
new_results = []
|