Guiyom committed on
Commit
508a2e2
·
verified ·
1 Parent(s): 3e33744

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -55
app.py CHANGED
@@ -1719,37 +1719,18 @@ def compress_text(text: str, target_length: int) -> str:
1719
  def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
1720
  aggregated_crumbs: str, references: list, pages: int = 8) -> str:
1721
  """
1722
- Revised generate_final_report with placeholder allocation decisions in the initial JSON skeleton.
 
1723
 
1724
- The function proceeds as follows:
1725
-
1726
- 1. **Skeleton Generation:**
1727
- It first builds a JSON skeleton outline for the report. For core sections,
1728
- in addition to "section_name", "instructions", "target_word_count", and "key_content_elements",
1729
- an extra field "placeholders" is generated. This field is an object with boolean values
1730
- indicating whether to include a visual, graph, and/or focus placeholder.
1731
-
1732
- The prompt instructs the LLM that, overall, the report should have:
1733
- - Visual placeholders between ⌊pages/10⌋ and ⌈pages/5⌉ in total.
1734
- - Graph placeholders in the same range as visual.
1735
- - Focus placeholders between ⌊pages/20⌋ and ⌈pages/10⌉ in total.
1736
- Not every section need have every placeholder.
1737
-
1738
- 2. **Section Generation:**
1739
- Each core section is generated using a detailed prompt that incorporates the respective
1740
- placeholder decisions along with context, initial query, report style, learnings, aggregated crumbs,
1741
- and previously generated sections.
1742
-
1743
- 3. **Final Sections Generation and Assembly:**
1744
- The final sections (Introduction, Abstract, Conclusion, Reference Summary Table) are generated afterward
1745
- and a Table of Contents is created from all section titles. Finally, all parts are assembled into a complete
1746
- HTML document.
1747
-
1748
- 4. **Placeholder Replacement:**
1749
- Once the HTML report is assembled, the placeholder markers (e.g. [[Visual Placeholder: …]]) are replaced
1750
- via the dedicated replacement functions.
1751
  """
1752
- import json, logging
1753
 
1754
  # Calculate overall target word count (approximate)
1755
  total_word_count = pages * 500
@@ -1757,7 +1738,7 @@ def generate_final_report(initial_query: str, context: str, reportstyle: str, le
1757
  "No external summaries were directly extracted. It is not possible to analyze relevance."
1758
  )
1759
 
1760
- # Step 1: Generate the JSON skeleton outline with explicit placeholder allocation decisions.
1761
  prompt_skeleton = f"""
1762
  You are a master technical editor.
1763
  Produce a detailed JSON skeleton outline for a comprehensive academic research report titled "{initial_query}".
@@ -1771,45 +1752,57 @@ Divide the report into two groups:
1771
  • "instructions": Detailed guidelines on which sub-topics, facts, and arguments to cover.
1772
  • "target_word_count": An approximate desired word count for that section.
1773
  • "key_content_elements": An array of 3 to 5 bullet points that must be mentioned.
1774
- • "placeholders": An object indicating which placeholder types to include.
1775
- - Include "visual": true or false.
1776
- - Include "graph": true or false.
1777
- - Include "focus": true or false.
1778
- **Overall guidance**: Across all core sections, the total number of visual placeholders should be between ⌊{pages}/10⌋ and ⌈{pages}/5⌉, graph placeholders should follow the same rule, and focus placeholders should appear between ⌊{pages}/20⌋ and ⌈{pages}/10⌉. Decide per section which placeholder(s) to activate, ensuring that not every section receives all three.
1779
 
1780
- 2. "final_sections": These sections frame the report and include:
1781
  - "Introduction"
1782
  - "Abstract"
1783
  - "Conclusion"
1784
  - "Reference Summary Table"
1785
- The combined target word count for final sections should be about 30% of the total (approximately {int(0.3 * total_word_count)} words),
1786
  distributed evenly among them.
1787
-
1788
  Return only valid JSON with two keys: "core_sections" and "final_sections", with no additional commentary.
1789
  """
1790
  skeleton_response = openai_call(
1791
  prompt=prompt_skeleton,
1792
  model="o3-mini",
1793
- max_tokens_param=1500,
1794
  temperature=0
1795
  )
 
1796
  try:
1797
  skeleton = json.loads(skeleton_response)
1798
  except Exception as e:
1799
  logging.error(f"Error parsing skeleton JSON: {e}")
1800
- skeleton = {"core_sections": [], "final_sections": []}
 
 
 
 
 
 
 
 
 
1801
 
1802
- # Step 2: Generate content for each core section sequentially.
1803
  generated_core_sections = {}
1804
  previous_sections_content = ""
1805
  if "core_sections" in skeleton:
1806
  for section in skeleton["core_sections"]:
1807
  section_name = section.get("section_name", "Untitled Section")
1808
  instructions = section.get("instructions", "")
1809
- target_wc = section.get("target_word_count", 500)
 
 
 
 
1810
  key_elements = section.get("key_content_elements", [])
1811
  placeholders = section.get("placeholders", {})
1812
- # Build a placeholder directive based on the allocated booleans.
1813
  placeholder_directive = ""
1814
  if placeholders.get("visual", False):
1815
  placeholder_directive += "[[Visual Placeholder: Insert one visual here.]]\n"
@@ -1838,21 +1831,23 @@ Return only the HTML content for this section (do not include outer <html> or <b
1838
  section_content = openai_call(
1839
  prompt=prompt_section,
1840
  model="o3-mini",
1841
- max_tokens_param=target_wc * 5,
1842
  temperature=0
1843
  )
1844
  section_content = section_content.strip()
1845
  generated_core_sections[section_name] = section_content
1846
- # Accumulate current section's content into a shared context for continuity.
1847
  previous_sections_content += f"\n<!-- {section_name} -->\n" + section_content
1848
 
1849
- # Step 3: Generate final sections (Introduction, Abstract, Conclusion, Reference Summary Table).
1850
  generated_final_sections = {}
1851
  if "final_sections" in skeleton:
1852
  for section in skeleton["final_sections"]:
1853
  section_name = section.get("section_name", "Untitled Final Section")
1854
  instructions = section.get("instructions", "")
1855
- target_wc = section.get("target_word_count", 500)
 
 
 
1856
  prompt_final = f"""
1857
  You are a master technical editor.
1858
  Generate detailed HTML content for the final section titled "{section_name}".
@@ -1870,14 +1865,14 @@ Return only the HTML content for this section (do not include outer <html> or <b
1870
  final_section_content = openai_call(
1871
  prompt=prompt_final,
1872
  model="o3-mini",
1873
- max_tokens_param=target_wc * 5,
1874
  temperature=0
1875
  )
1876
  final_section_content = final_section_content.strip()
1877
  generated_final_sections[section_name] = final_section_content
1878
  previous_sections_content += f"\n<!-- {section_name} -->\n" + final_section_content
1879
 
1880
- # Step 4: Generate a Table of Contents from the section titles.
1881
  toc_titles = []
1882
  for section in skeleton.get("core_sections", []):
1883
  if "section_name" in section:
@@ -1889,8 +1884,8 @@ Return only the HTML content for this section (do not include outer <html> or <b
1889
  You are a technical editor.
1890
  Based on the following list of section titles: {', '.join(toc_titles)},
1891
  generate a concise HTML snippet for a Table of Contents,
1892
- with each item appearing on a separate line and numbered (e.g., "1. Section Title", "2. Section Title", etc.).
1893
- Return only the HTML snippet for the Table of Contents without additional commentary.
1894
  """
1895
  toc_html = openai_call(
1896
  prompt=prompt_toc,
@@ -1899,7 +1894,7 @@ Return only the HTML snippet for the Table of Contents without additional commen
1899
  temperature=0
1900
  ).strip()
1901
 
1902
- # Step 5: Assemble the final HTML document.
1903
  final_report_html = f"""<html>
1904
  <head>
1905
  <meta charset="utf-8" />
@@ -1956,7 +1951,7 @@ Return only the HTML snippet for the Table of Contents without additional commen
1956
  {content}
1957
  </div>
1958
  """
1959
- # Append a designated report ending marker.
1960
  final_report_html += """
1961
  <iframe class="visual-frame" srcdoc='
1962
  <!DOCTYPE html>
@@ -1971,8 +1966,7 @@ Return only the HTML snippet for the Table of Contents without additional commen
1971
  </body>
1972
  </html>
1973
  """
1974
-
1975
- # Step 6: Process the raw HTML to replace placeholder markers with actual placeholder code.
1976
  final_report_html = replace_visual_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
1977
  final_report_html = replace_graph_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
1978
  final_report_html = replace_focus_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
 
1719
  def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
1720
  aggregated_crumbs: str, references: list, pages: int = 8) -> str:
1721
  """
1722
+ Revised generate_final_report with explicit type conversions for max_tokens values
1723
+ and a fallback for incomplete JSON parsing.
1724
 
1725
+ This function:
1726
+ 1. Generates a JSON skeleton outlining the report sections and placeholder allocations.
1727
+ 2. For each core section, generates HTML content using the assigned token budget
1728
+ (target_wc * 5) ensuring target_wc is an integer.
1729
+ 3. Generates final sections (Introduction, Abstract, Conclusion, Reference Summary Table).
1730
+ 4. Assembles the Table of Contents and the final HTML.
1731
+ 5. Passes the raw HTML through the placeholder replacement functions before returning.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1732
  """
1733
+ import json, logging, re
1734
 
1735
  # Calculate overall target word count (approximate)
1736
  total_word_count = pages * 500
 
1738
  "No external summaries were directly extracted. It is not possible to analyze relevance."
1739
  )
1740
 
1741
+ # --- Step 1: Generate the JSON skeleton outline with placeholder allocation decisions ---
1742
  prompt_skeleton = f"""
1743
  You are a master technical editor.
1744
  Produce a detailed JSON skeleton outline for a comprehensive academic research report titled "{initial_query}".
 
1752
  • "instructions": Detailed guidelines on which sub-topics, facts, and arguments to cover.
1753
  • "target_word_count": An approximate desired word count for that section.
1754
  • "key_content_elements": An array of 3 to 5 bullet points that must be mentioned.
1755
+ • "placeholders": An object with boolean keys "visual", "graph", and "focus" indicating which placeholders to include.
1756
+ **Overall guidance**: Across all core sections, the total number of visual placeholders should be between ⌊{pages}/10⌋ and ⌈{pages}/5⌉,
1757
+ graph placeholders should be in the same range, and focus placeholders between ⌊{pages}/20⌋ and ⌈{pages}/10⌉. Decide per section which to activate.
 
 
1758
 
1759
+ 2. "final_sections": These should be generated after core sections and include:
1760
  - "Introduction"
1761
  - "Abstract"
1762
  - "Conclusion"
1763
  - "Reference Summary Table"
1764
+ Their combined target word count should be about 30% of the total (approximately {int(0.3 * total_word_count)} words),
1765
  distributed evenly among them.
1766
+
1767
  Return only valid JSON with two keys: "core_sections" and "final_sections", with no additional commentary.
1768
  """
1769
  skeleton_response = openai_call(
1770
  prompt=prompt_skeleton,
1771
  model="o3-mini",
1772
+ max_tokens_param=int(1500),
1773
  temperature=0
1774
  )
1775
+
1776
  try:
1777
  skeleton = json.loads(skeleton_response)
1778
  except Exception as e:
1779
  logging.error(f"Error parsing skeleton JSON: {e}")
1780
+ # Fallback: attempt to extract JSON from a markdown code fence.
1781
+ match = re.search(r"```json(.*?)```", skeleton_response, re.DOTALL)
1782
+ if match:
1783
+ try:
1784
+ skeleton = json.loads(match.group(1).strip())
1785
+ except Exception as e2:
1786
+ logging.error(f"Fallback JSON parsing failed: {e2}")
1787
+ skeleton = {"core_sections": [], "final_sections": []}
1788
+ else:
1789
+ skeleton = {"core_sections": [], "final_sections": []}
1790
 
1791
+ # --- Step 2: Generate content for each core section sequentially.
1792
  generated_core_sections = {}
1793
  previous_sections_content = ""
1794
  if "core_sections" in skeleton:
1795
  for section in skeleton["core_sections"]:
1796
  section_name = section.get("section_name", "Untitled Section")
1797
  instructions = section.get("instructions", "")
1798
+ # Ensure target_word_count is an integer:
1799
+ try:
1800
+ target_wc = int(section.get("target_word_count", 500))
1801
+ except ValueError:
1802
+ target_wc = 500
1803
  key_elements = section.get("key_content_elements", [])
1804
  placeholders = section.get("placeholders", {})
1805
+ # Build placeholder directive based on allocated booleans.
1806
  placeholder_directive = ""
1807
  if placeholders.get("visual", False):
1808
  placeholder_directive += "[[Visual Placeholder: Insert one visual here.]]\n"
 
1831
  section_content = openai_call(
1832
  prompt=prompt_section,
1833
  model="o3-mini",
1834
+ max_tokens_param=int(target_wc * 5),
1835
  temperature=0
1836
  )
1837
  section_content = section_content.strip()
1838
  generated_core_sections[section_name] = section_content
 
1839
  previous_sections_content += f"\n<!-- {section_name} -->\n" + section_content
1840
 
1841
+ # --- Step 3: Generate content for each final section.
1842
  generated_final_sections = {}
1843
  if "final_sections" in skeleton:
1844
  for section in skeleton["final_sections"]:
1845
  section_name = section.get("section_name", "Untitled Final Section")
1846
  instructions = section.get("instructions", "")
1847
+ try:
1848
+ target_wc = int(section.get("target_word_count", 500))
1849
+ except ValueError:
1850
+ target_wc = 500
1851
  prompt_final = f"""
1852
  You are a master technical editor.
1853
  Generate detailed HTML content for the final section titled "{section_name}".
 
1865
  final_section_content = openai_call(
1866
  prompt=prompt_final,
1867
  model="o3-mini",
1868
+ max_tokens_param=int(target_wc * 5),
1869
  temperature=0
1870
  )
1871
  final_section_content = final_section_content.strip()
1872
  generated_final_sections[section_name] = final_section_content
1873
  previous_sections_content += f"\n<!-- {section_name} -->\n" + final_section_content
1874
 
1875
+ # --- Step 4: Generate a Table of Contents from section titles.
1876
  toc_titles = []
1877
  for section in skeleton.get("core_sections", []):
1878
  if "section_name" in section:
 
1884
  You are a technical editor.
1885
  Based on the following list of section titles: {', '.join(toc_titles)},
1886
  generate a concise HTML snippet for a Table of Contents,
1887
+ with each item on a separate numbered line (e.g., "1. Section Title", "2. Section Title", etc.).
1888
+ Return only the HTML snippet without additional commentary.
1889
  """
1890
  toc_html = openai_call(
1891
  prompt=prompt_toc,
 
1894
  temperature=0
1895
  ).strip()
1896
 
1897
+ # --- Step 5: Assemble the final HTML document.
1898
  final_report_html = f"""<html>
1899
  <head>
1900
  <meta charset="utf-8" />
 
1951
  {content}
1952
  </div>
1953
  """
1954
+ # Append an ending marker.
1955
  final_report_html += """
1956
  <iframe class="visual-frame" srcdoc='
1957
  <!DOCTYPE html>
 
1966
  </body>
1967
  </html>
1968
  """
1969
+ # --- Step 6: Replace placeholder markers with actual content.
 
1970
  final_report_html = replace_visual_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
1971
  final_report_html = replace_graph_placeholders(final_report_html, context, initial_query, aggregated_crumbs)
1972
  final_report_html = replace_focus_placeholders(final_report_html, context, initial_query, aggregated_crumbs)