Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1719,37 +1719,18 @@ def compress_text(text: str, target_length: int) -> str:
|
|
| 1719 |
def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
|
| 1720 |
aggregated_crumbs: str, references: list, pages: int = 8) -> str:
|
| 1721 |
"""
|
| 1722 |
-
Revised generate_final_report with
|
|
|
|
| 1723 |
|
| 1724 |
-
|
| 1725 |
-
|
| 1726 |
-
|
| 1727 |
-
|
| 1728 |
-
|
| 1729 |
-
|
| 1730 |
-
|
| 1731 |
-
|
| 1732 |
-
The prompt instructs the LLM that, overall, the report should have:
|
| 1733 |
-
- Visual placeholders between ⌊pages/10⌋ and ⌈pages/5⌉ in total.
|
| 1734 |
-
- Graph placeholders in the same range as visual.
|
| 1735 |
-
- Focus placeholders between ⌊pages/20⌋ and ⌈pages/10⌉ in total.
|
| 1736 |
-
Not every section need have every placeholder.
|
| 1737 |
-
|
| 1738 |
-
2. **Section Generation:**
|
| 1739 |
-
Each core section is generated using a detailed prompt that incorporates the respective
|
| 1740 |
-
placeholder decisions along with context, initial query, report style, learnings, aggregated crumbs,
|
| 1741 |
-
and previously generated sections.
|
| 1742 |
-
|
| 1743 |
-
3. **Final Sections Generation and Assembly:**
|
| 1744 |
-
The final sections (Introduction, Abstract, Conclusion, Reference Summary Table) are generated afterward
|
| 1745 |
-
and a Table of Contents is created from all section titles. Finally, all parts are assembled into a complete
|
| 1746 |
-
HTML document.
|
| 1747 |
-
|
| 1748 |
-
4. **Placeholder Replacement:**
|
| 1749 |
-
Once the HTML report is assembled, the placeholder markers (e.g. [[Visual Placeholder: …]]) are replaced
|
| 1750 |
-
via the dedicated replacement functions.
|
| 1751 |
"""
|
| 1752 |
-
import json, logging
|
| 1753 |
|
| 1754 |
# Calculate overall target word count (approximate)
|
| 1755 |
total_word_count = pages * 500
|
|
@@ -1757,7 +1738,7 @@ def generate_final_report(initial_query: str, context: str, reportstyle: str, le
|
|
| 1757 |
"No external summaries were directly extracted. It is not possible to analyze relevance."
|
| 1758 |
)
|
| 1759 |
|
| 1760 |
-
# Step 1: Generate the JSON skeleton outline with
|
| 1761 |
prompt_skeleton = f"""
|
| 1762 |
You are a master technical editor.
|
| 1763 |
Produce a detailed JSON skeleton outline for a comprehensive academic research report titled "{initial_query}".
|
|
@@ -1771,45 +1752,57 @@ Divide the report into two groups:
|
|
| 1771 |
• "instructions": Detailed guidelines on which sub-topics, facts, and arguments to cover.
|
| 1772 |
• "target_word_count": An approximate desired word count for that section.
|
| 1773 |
• "key_content_elements": An array of 3 to 5 bullet points that must be mentioned.
|
| 1774 |
-
• "placeholders": An object indicating which
|
| 1775 |
-
|
| 1776 |
-
|
| 1777 |
-
- Include "focus": true or false.
|
| 1778 |
-
**Overall guidance**: Across all core sections, the total number of visual placeholders should be between ⌊{pages}/10⌋ and ⌈{pages}/5⌉, graph placeholders should follow the same rule, and focus placeholders should appear between ⌊{pages}/20⌋ and ⌈{pages}/10⌉. Decide per section which placeholder(s) to activate, ensuring that not every section receives all three.
|
| 1779 |
|
| 1780 |
-
2. "final_sections": These
|
| 1781 |
- "Introduction"
|
| 1782 |
- "Abstract"
|
| 1783 |
- "Conclusion"
|
| 1784 |
- "Reference Summary Table"
|
| 1785 |
-
|
| 1786 |
distributed evenly among them.
|
| 1787 |
-
|
| 1788 |
Return only valid JSON with two keys: "core_sections" and "final_sections", with no additional commentary.
|
| 1789 |
"""
|
| 1790 |
skeleton_response = openai_call(
|
| 1791 |
prompt=prompt_skeleton,
|
| 1792 |
model="o3-mini",
|
| 1793 |
-
max_tokens_param=1500,
|
| 1794 |
temperature=0
|
| 1795 |
)
|
|
|
|
| 1796 |
try:
|
| 1797 |
skeleton = json.loads(skeleton_response)
|
| 1798 |
except Exception as e:
|
| 1799 |
logging.error(f"Error parsing skeleton JSON: {e}")
|
| 1800 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1801 |
|
| 1802 |
-
# Step 2: Generate content for each core section sequentially.
|
| 1803 |
generated_core_sections = {}
|
| 1804 |
previous_sections_content = ""
|
| 1805 |
if "core_sections" in skeleton:
|
| 1806 |
for section in skeleton["core_sections"]:
|
| 1807 |
section_name = section.get("section_name", "Untitled Section")
|
| 1808 |
instructions = section.get("instructions", "")
|
| 1809 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1810 |
key_elements = section.get("key_content_elements", [])
|
| 1811 |
placeholders = section.get("placeholders", {})
|
| 1812 |
-
# Build
|
| 1813 |
placeholder_directive = ""
|
| 1814 |
if placeholders.get("visual", False):
|
| 1815 |
placeholder_directive += "[[Visual Placeholder: Insert one visual here.]]\n"
|
|
@@ -1838,21 +1831,23 @@ Return only the HTML content for this section (do not include outer <html> or <b
|
|
| 1838 |
section_content = openai_call(
|
| 1839 |
prompt=prompt_section,
|
| 1840 |
model="o3-mini",
|
| 1841 |
-
max_tokens_param=target_wc * 5,
|
| 1842 |
temperature=0
|
| 1843 |
)
|
| 1844 |
section_content = section_content.strip()
|
| 1845 |
generated_core_sections[section_name] = section_content
|
| 1846 |
-
# Accumulate current section's content into a shared context for continuity.
|
| 1847 |
previous_sections_content += f"\n<!-- {section_name} -->\n" + section_content
|
| 1848 |
|
| 1849 |
-
# Step 3: Generate
|
| 1850 |
generated_final_sections = {}
|
| 1851 |
if "final_sections" in skeleton:
|
| 1852 |
for section in skeleton["final_sections"]:
|
| 1853 |
section_name = section.get("section_name", "Untitled Final Section")
|
| 1854 |
instructions = section.get("instructions", "")
|
| 1855 |
-
|
|
|
|
|
|
|
|
|
|
| 1856 |
prompt_final = f"""
|
| 1857 |
You are a master technical editor.
|
| 1858 |
Generate detailed HTML content for the final section titled "{section_name}".
|
|
@@ -1870,14 +1865,14 @@ Return only the HTML content for this section (do not include outer <html> or <b
|
|
| 1870 |
final_section_content = openai_call(
|
| 1871 |
prompt=prompt_final,
|
| 1872 |
model="o3-mini",
|
| 1873 |
-
max_tokens_param=target_wc * 5,
|
| 1874 |
temperature=0
|
| 1875 |
)
|
| 1876 |
final_section_content = final_section_content.strip()
|
| 1877 |
generated_final_sections[section_name] = final_section_content
|
| 1878 |
previous_sections_content += f"\n<!-- {section_name} -->\n" + final_section_content
|
| 1879 |
|
| 1880 |
-
# Step 4: Generate a Table of Contents from
|
| 1881 |
toc_titles = []
|
| 1882 |
for section in skeleton.get("core_sections", []):
|
| 1883 |
if "section_name" in section:
|
|
@@ -1889,8 +1884,8 @@ Return only the HTML content for this section (do not include outer <html> or <b
|
|
| 1889 |
You are a technical editor.
|
| 1890 |
Based on the following list of section titles: {', '.join(toc_titles)},
|
| 1891 |
generate a concise HTML snippet for a Table of Contents,
|
| 1892 |
-
with each item
|
| 1893 |
-
Return only the HTML snippet for the Table of Contents without additional commentary.
|
| 1894 |
"""
|
| 1895 |
toc_html = openai_call(
|
| 1896 |
prompt=prompt_toc,
|
|
@@ -1899,7 +1894,7 @@ Return only the HTML snippet for the Table of Contents without additional commen
|
|
| 1899 |
temperature=0
|
| 1900 |
).strip()
|
| 1901 |
|
| 1902 |
-
# Step 5: Assemble the final HTML document.
|
| 1903 |
final_report_html = f"""<html>
|
| 1904 |
<head>
|
| 1905 |
<meta charset="utf-8" />
|
|
@@ -1956,7 +1951,7 @@ Return only the HTML snippet for the Table of Contents without additional commen
|
|
| 1956 |
{content}
|
| 1957 |
</div>
|
| 1958 |
"""
|
| 1959 |
-
# Append
|
| 1960 |
final_report_html += """
|
| 1961 |
<iframe class="visual-frame" srcdoc='
|
| 1962 |
<!DOCTYPE html>
|
|
@@ -1971,8 +1966,7 @@ Return only the HTML snippet for the Table of Contents without additional commen
|
|
| 1971 |
</body>
|
| 1972 |
</html>
|
| 1973 |
"""
|
| 1974 |
-
|
| 1975 |
-
# Step 6: Process the raw HTML to replace placeholder markers with actual placeholder code.
|
| 1976 |
final_report_html = replace_visual_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
|
| 1977 |
final_report_html = replace_graph_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
|
| 1978 |
final_report_html = replace_focus_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
|
|
|
|
| 1719 |
def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
|
| 1720 |
aggregated_crumbs: str, references: list, pages: int = 8) -> str:
|
| 1721 |
"""
|
| 1722 |
+
Revised generate_final_report with explicit type conversions for max_tokens values
|
| 1723 |
+
and a fallback for incomplete JSON parsing.
|
| 1724 |
|
| 1725 |
+
This function:
|
| 1726 |
+
1. Generates a JSON skeleton outlining the report sections and placeholder allocations.
|
| 1727 |
+
2. For each core section, generates HTML content using the assigned token
|
| 1728 |
+
(target_wc * 5) ensuring target_wc is an integer.
|
| 1729 |
+
3. Generates final sections (Introduction, Abstract, Conclusion, Reference Summary Table).
|
| 1730 |
+
4. Assembles the Table of Contents and the final HTML.
|
| 1731 |
+
5. Passes the raw HTML through the placeholder replacement functions before returning.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1732 |
"""
|
| 1733 |
+
import json, logging, re
|
| 1734 |
|
| 1735 |
# Calculate overall target word count (approximate)
|
| 1736 |
total_word_count = pages * 500
|
|
|
|
| 1738 |
"No external summaries were directly extracted. It is not possible to analyze relevance."
|
| 1739 |
)
|
| 1740 |
|
| 1741 |
+
# --- Step 1: Generate the JSON skeleton outline with placeholder allocation decisions ---
|
| 1742 |
prompt_skeleton = f"""
|
| 1743 |
You are a master technical editor.
|
| 1744 |
Produce a detailed JSON skeleton outline for a comprehensive academic research report titled "{initial_query}".
|
|
|
|
| 1752 |
• "instructions": Detailed guidelines on which sub-topics, facts, and arguments to cover.
|
| 1753 |
• "target_word_count": An approximate desired word count for that section.
|
| 1754 |
• "key_content_elements": An array of 3 to 5 bullet points that must be mentioned.
|
| 1755 |
+
• "placeholders": An object with boolean keys "visual", "graph", and "focus" indicating which placeholders to include.
|
| 1756 |
+
**Overall guidance**: Across all core sections, the total number of visual placeholders should be between ⌊{pages}/10⌋ and ⌈{pages}/5⌉,
|
| 1757 |
+
graph placeholders should be in the same range, and focus placeholders between ⌊{pages}/20⌋ and ⌈{pages}/10⌉. Decide per section which to activate.
|
|
|
|
|
|
|
| 1758 |
|
| 1759 |
+
2. "final_sections": These should be generated after core sections and include:
|
| 1760 |
- "Introduction"
|
| 1761 |
- "Abstract"
|
| 1762 |
- "Conclusion"
|
| 1763 |
- "Reference Summary Table"
|
| 1764 |
+
Their combined target word count should be about 30% of the total (approximately {int(0.3 * total_word_count)} words),
|
| 1765 |
distributed evenly among them.
|
| 1766 |
+
|
| 1767 |
Return only valid JSON with two keys: "core_sections" and "final_sections", with no additional commentary.
|
| 1768 |
"""
|
| 1769 |
skeleton_response = openai_call(
|
| 1770 |
prompt=prompt_skeleton,
|
| 1771 |
model="o3-mini",
|
| 1772 |
+
max_tokens_param=int(1500),
|
| 1773 |
temperature=0
|
| 1774 |
)
|
| 1775 |
+
|
| 1776 |
try:
|
| 1777 |
skeleton = json.loads(skeleton_response)
|
| 1778 |
except Exception as e:
|
| 1779 |
logging.error(f"Error parsing skeleton JSON: {e}")
|
| 1780 |
+
# Fallback: attempt to extract JSON from a markdown code fence.
|
| 1781 |
+
match = re.search(r"```json(.*?)```", skeleton_response, re.DOTALL)
|
| 1782 |
+
if match:
|
| 1783 |
+
try:
|
| 1784 |
+
skeleton = json.loads(match.group(1).strip())
|
| 1785 |
+
except Exception as e2:
|
| 1786 |
+
logging.error(f"Fallback JSON parsing failed: {e2}")
|
| 1787 |
+
skeleton = {"core_sections": [], "final_sections": []}
|
| 1788 |
+
else:
|
| 1789 |
+
skeleton = {"core_sections": [], "final_sections": []}
|
| 1790 |
|
| 1791 |
+
# --- Step 2: Generate content for each core section sequentially.
|
| 1792 |
generated_core_sections = {}
|
| 1793 |
previous_sections_content = ""
|
| 1794 |
if "core_sections" in skeleton:
|
| 1795 |
for section in skeleton["core_sections"]:
|
| 1796 |
section_name = section.get("section_name", "Untitled Section")
|
| 1797 |
instructions = section.get("instructions", "")
|
| 1798 |
+
# Ensure target_word_count is an integer:
|
| 1799 |
+
try:
|
| 1800 |
+
target_wc = int(section.get("target_word_count", 500))
|
| 1801 |
+
except ValueError:
|
| 1802 |
+
target_wc = 500
|
| 1803 |
key_elements = section.get("key_content_elements", [])
|
| 1804 |
placeholders = section.get("placeholders", {})
|
| 1805 |
+
# Build placeholder directive based on allocated booleans.
|
| 1806 |
placeholder_directive = ""
|
| 1807 |
if placeholders.get("visual", False):
|
| 1808 |
placeholder_directive += "[[Visual Placeholder: Insert one visual here.]]\n"
|
|
|
|
| 1831 |
section_content = openai_call(
|
| 1832 |
prompt=prompt_section,
|
| 1833 |
model="o3-mini",
|
| 1834 |
+
max_tokens_param=int(target_wc * 5),
|
| 1835 |
temperature=0
|
| 1836 |
)
|
| 1837 |
section_content = section_content.strip()
|
| 1838 |
generated_core_sections[section_name] = section_content
|
|
|
|
| 1839 |
previous_sections_content += f"\n<!-- {section_name} -->\n" + section_content
|
| 1840 |
|
| 1841 |
+
# --- Step 3: Generate content for each final section.
|
| 1842 |
generated_final_sections = {}
|
| 1843 |
if "final_sections" in skeleton:
|
| 1844 |
for section in skeleton["final_sections"]:
|
| 1845 |
section_name = section.get("section_name", "Untitled Final Section")
|
| 1846 |
instructions = section.get("instructions", "")
|
| 1847 |
+
try:
|
| 1848 |
+
target_wc = int(section.get("target_word_count", 500))
|
| 1849 |
+
except ValueError:
|
| 1850 |
+
target_wc = 500
|
| 1851 |
prompt_final = f"""
|
| 1852 |
You are a master technical editor.
|
| 1853 |
Generate detailed HTML content for the final section titled "{section_name}".
|
|
|
|
| 1865 |
final_section_content = openai_call(
|
| 1866 |
prompt=prompt_final,
|
| 1867 |
model="o3-mini",
|
| 1868 |
+
max_tokens_param=int(target_wc * 5),
|
| 1869 |
temperature=0
|
| 1870 |
)
|
| 1871 |
final_section_content = final_section_content.strip()
|
| 1872 |
generated_final_sections[section_name] = final_section_content
|
| 1873 |
previous_sections_content += f"\n<!-- {section_name} -->\n" + final_section_content
|
| 1874 |
|
| 1875 |
+
# --- Step 4: Generate a Table of Contents from section titles.
|
| 1876 |
toc_titles = []
|
| 1877 |
for section in skeleton.get("core_sections", []):
|
| 1878 |
if "section_name" in section:
|
|
|
|
| 1884 |
You are a technical editor.
|
| 1885 |
Based on the following list of section titles: {', '.join(toc_titles)},
|
| 1886 |
generate a concise HTML snippet for a Table of Contents,
|
| 1887 |
+
with each item on a separate numbered line (e.g., "1. Section Title", "2. Section Title", etc.).
|
| 1888 |
+
Return only the HTML snippet without additional commentary.
|
| 1889 |
"""
|
| 1890 |
toc_html = openai_call(
|
| 1891 |
prompt=prompt_toc,
|
|
|
|
| 1894 |
temperature=0
|
| 1895 |
).strip()
|
| 1896 |
|
| 1897 |
+
# --- Step 5: Assemble the final HTML document.
|
| 1898 |
final_report_html = f"""<html>
|
| 1899 |
<head>
|
| 1900 |
<meta charset="utf-8" />
|
|
|
|
| 1951 |
{content}
|
| 1952 |
</div>
|
| 1953 |
"""
|
| 1954 |
+
# Append an ending marker.
|
| 1955 |
final_report_html += """
|
| 1956 |
<iframe class="visual-frame" srcdoc='
|
| 1957 |
<!DOCTYPE html>
|
|
|
|
| 1966 |
</body>
|
| 1967 |
</html>
|
| 1968 |
"""
|
| 1969 |
+
# --- Step 6: Replace placeholder markers with actual content.
|
|
|
|
| 1970 |
final_report_html = replace_visual_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
|
| 1971 |
final_report_html = replace_graph_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
|
| 1972 |
final_report_html = replace_focus_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
|