Guiyom committed on
Commit
cf000a0
·
verified ·
1 Parent(s): a76f63d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -117
app.py CHANGED
@@ -84,15 +84,6 @@ def clean_llm_response(response: str) -> str:
84
  # Collapse multiple spaces into one.
85
  cleaned = re.sub(r'\s+', ' ', cleaned)
86
 
87
- # Optionally, if you suspect unescaped quotes in the content,
88
- # you might try to protect them. For example, if the improved field contains inner double quotes,
89
- # ensure they are properly escaped. This can be a bit tricky because you want to preserve valid escapes.
90
- #
91
- # Example (if needed):
92
- # cleaned = cleaned.replace('\"', '\\\"')
93
- #
94
- # But be cautious: too many replacements may ruin valid escapes.
95
-
96
  return cleaned.strip()
97
 
98
  def snippet_in_tag(tag: Tag, snippet: str) -> bool:
@@ -113,15 +104,6 @@ def snippet_in_tag(tag: Tag, snippet: str) -> bool:
113
  return False
114
 
115
  def expand_snippet_area(soup: BeautifulSoup, snippet: str) -> Tag:
116
- """
117
- Given a BeautifulSoup object and a snippet of text, this function finds the element that contains the snippet.
118
- It then uses an iterative while loop to traverse upward (from the immediate parent to the top)
119
- until the highest level <iframe> is reached or (if no <iframe> is present) until a <div> or <table> is
120
- encountered—the first allowed container (<div> or <table>) found is used. If neither is found,
121
- it returns the candidate element itself.
122
-
123
- Logging is provided at each key step.
124
- """
125
  allowed_tags = {"div", "table"}
126
 
127
  logging.info("Searching for all elements containing the snippet: '%s'", snippet)
@@ -173,8 +155,8 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
173
  # Step 1: (LLM call to get unique strings) ...
174
  # [Assume this part remains unchanged and unique_strings is obtained]
175
 
176
- prompt_identify = (
177
- f"""You are a meticulous technical editor.
178
 
179
  Below is the full report HTML and a user adjustment request.
180
  Extract one or more unique plain-text string(s) (without any HTML tags or formatting) that uniquely appear in the area(s) targeted by the adjustment request.
@@ -189,9 +171,9 @@ Extract one or more unique plain-text string(s) (without any HTML tags or format
189
  Output them in a JSON object with the key "identified_unique_strings" mapped to a list of strings.
190
  Ensure these strings exactly match the content in the report.
191
 
192
- Note: if the unique string is from within a code snippet (ex: javascript graph or a mermaid code), don't use the code as snippet,
193
  For example instead of "A[Fundamental AI Research - Emerging Theories and Paradigms] --&gt; B[Algorithm Innovation - Novel ML and NLP Models]"
194
- Rather use "Fundamental AI Research - Emerging Theories and Paradigms"
195
  This would make it easier to find it
196
 
197
  Full Report HTML:
@@ -234,8 +216,8 @@ Only output valid JSON."""
234
  logging.info("fine_tune_report: Found container for unique string adjustment:\n\n%s\n", original_container_html)
235
 
236
  # Step 3: Call the LLM to adjust this container.
237
- prompt_adjust = (
238
- f"""You are a technical editor.
239
  Given the following HTML container (with its outer tags) extracted from a larger report and based on the user adjustment request,
240
  produce a corrected version by making only the necessary changes. Preserve inline citations, formatting, and context.
241
  The updated version will be put back in the exact same location and must have the same outer tags.
@@ -261,7 +243,7 @@ Output a JSON object with exactly two keys:
261
  - "improved" (the corrected container's full HTML) and
262
  - "summary" (a brief explanation of the changes)
263
 
264
- Only output valid JSON."""
265
  )
266
 
267
  response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
@@ -290,11 +272,13 @@ Only output valid JSON."""
290
 
291
  # (Step 5 and Step 6 remain as before to update the reference table and the QA log)
292
 
293
- prompt_refs = (
294
- f"You are a technical editor. Review the following updated report HTML. "
295
- f"If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table, "
296
- f"generate an updated Reference Summary Table as valid HTML. Output only the updated reference table HTML with no explanations.\n\n"
297
- f"Updated Report HTML:\n{updated_report_html}"
 
 
298
  )
299
  updated_refs = openai_call(prompt=prompt_refs, model="o3-mini", max_tokens_param=1000, temperature=0.5)
300
  updated_refs = updated_refs.strip().strip("```")
@@ -329,17 +313,24 @@ def suggest_improvements(report_html: str, openai_api_key: str, serpapi_api_key:
329
  os.environ["OPENAI_API_KEY"] = openai_api_key
330
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
331
 
332
- prompt = (
333
- "You are a technical editor. Based on the following full HTML report, generate exactly 10 proposed improvement suggestions. "
334
- "Format each proposal as a numbered list item in the following style:\n"
335
- "1) in the section xyz, adjust ...\n"
336
- "2) after the paragraph abc, detail the graph further ...\n"
337
- "3) in the focus placeholder xxx, add a mention about ...\n"
338
- "4) make a reference to ... in the section 3.2\n"
339
- "...\n"
340
- "10) final improvement suggestion...\n"
341
- "Only output the suggestions exactly as a numbered list.\n\n"
342
- f"Full Report HTML:\n{report_html}"
 
 
 
 
 
 
 
343
  )
344
  suggestions = openai_call(prompt=prompt, model="o3-mini", max_tokens_param=1000, temperature=0.5)
345
  return suggestions.strip().strip("```").strip()
@@ -366,7 +357,7 @@ def improve_report_from_chat(user_message: str, chat_history: list, report_text:
366
  chat_history.append([user_message, answer])
367
  return chat_history, "", updated_report
368
 
369
- def send_chat_message(user_message, openai_api_key, serpapi_api_key, chat_history, report_text, crumbs_text):
370
  os.environ["OPENAI_API_KEY"] = openai_api_key
371
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
372
 
@@ -376,15 +367,21 @@ def send_chat_message(user_message, openai_api_key, serpapi_api_key, chat_histor
376
  if "http://" in user_message or "https://" in user_message:
377
  answer = handle_link_request(user_message)
378
  else:
379
- system_prompt = f"""You are a knowledgeable research assistant. Based on the following report:
 
 
 
 
380
  {report_text}
381
 
382
- Source Crumbs:
383
  {crumbs_text}
384
 
385
- User Question:
386
  {user_message}
387
 
 
 
388
  Your Answer:"""
389
  answer = openai_call(prompt=system_prompt, model="o3-mini", max_tokens_param=10000)
390
  updated_history = chat_history + [[user_message, answer]]
@@ -1420,8 +1417,10 @@ def summarize_large_text(text: str, target_length: int, chunk_size: int = 1000,
1420
  words = text.split()
1421
  if len(words) <= chunk_size:
1422
  # If the text is short, simply return it (or you could call a simple summarization)
1423
- return text
1424
-
 
 
1425
  chunks = []
1426
  i = 0
1427
  while i < len(words):
@@ -1436,20 +1435,8 @@ def summarize_large_text(text: str, target_length: int, chunk_size: int = 1000,
1436
  chunk_prompt = (f"""
1437
  Summarize the following text, preserving all key details and ensuring that any tables or structured data are also summarized:
1438
  {chunk}
1439
-
1440
- // Mentioning sources, organisations and individuals
1441
- - We will perform a post-processing on the output
1442
- - For this reasons use this format for any specific name, organisation or project: {{[{{name}}]}}
1443
- ex1: {{[{{Google}}]}} CEO, {{[{{Sundar Pichai}}]}} ...
1444
- ex2: in a report from the {{[{{university of Berkeley}}]}} titled "{{[{{The great acceleration}}]}}"...
1445
- ex3: the CEO of {{[{{Softbank}}]}} , {{[{{Masayoshi Son}}]}}, said that "the best way to..."
1446
- ex4: the project {{[{{Stargate}}]}}, anounced by {{[{{OpenAI}}]}} in collaboration with {{[{{Salesforce}}]}}
1447
- ex5: Mr. {{[{{Michael Parrot}}]}}, Marketing director in {{[{{Panasonic}}]}}, mentioned that ...
1448
- Note: the output will be processed through regex and the identifiers removed, but this way we can keep track of all sources and citations without disclosing them.
1449
- - This should apply to names, people/titles, dates, papers, reports, organisation/institute/NGO/government bodies quotes, products, project names, ...
1450
- - You should have approximately 10 mention of organisations, people, projects or people, use the prescribed format
1451
- - DO NOT MENTION this formmatting requirement, just apply it. The user doesn't have to know about this technicality.
1452
- Note: LinkedIn is not a source - if you want to use a source related to LinkedIn, you should check the author of the page visited, this is the real source, mention the name of the author as "'authorName' from LinkedIn Pulse"
1453
  """
1454
  )
1455
  summary_chunk = openai_call(prompt=chunk_prompt, model="gpt-4o-mini", max_tokens_param=500, temperature=0.7)
@@ -1465,20 +1452,8 @@ Note: LinkedIn is not a source - if you want to use a source related to LinkedIn
1465
  final_prompt = (f"""
1466
  Combine the following summaries into one concise summary that preserves all critical details, including any relevant table or structured data:
1467
  {combined_summary}
1468
-
1469
- // Mentioning sources, organisations and individuals
1470
- - We will perform a post-processing on the output
1471
- - For this reasons use this format for any specific name, organisation or project: {{[{{name}}]}}
1472
- ex1: {{[{{Google}}]}} CEO, {{[{{Sundar Pichai}}]}} ...
1473
- ex2: in a report from the {{[{{university of Berkeley}}]}} titled "{{[{{The great acceleration}}]}}"...
1474
- ex3: the CEO of {{[{{Softbank}}]}} , {{[{{Masayoshi Son}}]}}, said that "the best way to..."
1475
- ex4: the project {{[{{Stargate}}]}}, anounced by {{[{{OpenAI}}]}} in collaboration with {{[{{Salesforce}}]}}
1476
- ex5: Mr. {{[{{Michael Parrot}}]}}, Marketing director in {{[{{Panasonic}}]}}, mentioned that ...
1477
- Note: the output will be processed through regex and the identifiers removed, but this way we can keep track of all sources and citations without disclosing them.
1478
- - This should apply to names, people/titles, dates, papers, reports, organisation/institute/NGO/government bodies quotes, products, project names, ...
1479
- - You should have approximately 10 mention of organisations, people, projects or people, use the prescribed format
1480
- - DO NOT MENTION this formmatting requirement, just apply it. The user doesn't have to know about this technicality.
1481
- Note: LinkedIn is not a source - if you want to use a source related to LinkedIn, you should check the author of the page visited, this is the real source, mention the name of the author as "'authorName' from LinkedIn Pulse"
1482
  """
1483
  )
1484
  final_summary = openai_call(prompt=final_prompt, model="gpt-4o-mini", max_tokens_param=target_length, temperature=0.7)
@@ -1502,7 +1477,8 @@ def analyze_with_gpt4o(query: str, snippet: str, breadth: int, temperature: floa
1502
 
1503
  client = os.getenv('OPENAI_API_KEY') # alternatively, pass your API key here if needed.
1504
 
1505
- prompt = (f"""Analyze the following content from a query result:
 
1506
 
1507
  {snippet}
1508
 
@@ -1713,49 +1689,19 @@ def generate_final_report(initial_query: str, context: str, reportstyle: str, le
1713
  combined_learnings = "\n".join(learnings) if learnings else fallback_text
1714
  word_count = pages * 500
1715
  prompt = (f"""
1716
- // Instructions:
1717
- 1. Integrate numbers from the sources but always mention the source
1718
- 2. Whenever you mention a figure or quote, add an inline reference [x] matching its source from the references.
1719
- 3. Again, Specifically name relevant organizations, tools, project names, and people encountered in the crumbs or learnings.
1720
- Note: This is for academic purposes, so thorough citations and referencing are essential.
1721
- 4. Focus on reputable sources that will not be disputed (generally social media posts cannot be an opposable sources, but some of them may mention reputable sources)
1722
- Note: put the full reference url (no generic domain address), down to the html page or the pdf
1723
- 5. It must follow this writing style {reportstyle}.
1724
-
1725
- // Mentioning sources, organisations and individuals
1726
- - We will perform a post-processing on the output
1727
- - For this reasons use this format for any specific name, organisation or project: {{[{{name}}]}}
1728
- ex1: {{[{{Google}}]}} CEO, {{[{{Sundar Pichai}}]}} ...
1729
- ex2: in a report from the {{[{{university of Berkeley}}]}} titled "{{[{{The great acceleration}}]}}"...
1730
- ex3: the CEO of {{[{{Softbank}}]}} , {{[{{Masayoshi Son}}]}}, said that "the best way to..."
1731
- ex4: the project {{[{{Stargate}}]}}, anounced by {{[{{OpenAI}}]}} in collaboration with {{[{{Salesforce}}]}}
1732
- ex5: Mr. {{[{{Michael Parrot}}]}}, Marketing director in {{[{{Panasonic}}]}}, mentioned that ...
1733
- Note: the output will be processed through regex and the identifiers removed, but this way we can keep track of all sources and citations without disclosing them.
1734
- - This should apply to names, people/titles, dates, papers, reports, organisation/institute/NGO/government bodies quotes, products, project names, ...
1735
- - You should have approximately {10 * pages} mention of organisations, people, projects or people, use the prescribed format
1736
- - DO NOT MENTION this formmatting requirement, just apply it. The user doesn't have to know about this technicality.
1737
- Note: LinkedIn is not a source - if you want to use a source related to LinkedIn, you should check the author of the page visited, this is the real source, mention the name of the author as "'authorName' from LinkedIn Pulse"
1738
-
1739
- // Sources
1740
- Use the following learnings and merged reference details from a deep research process on:
1741
- '{initial_query}'
1742
- Taking also into consideration the context:
1743
- {context}
1744
-
1745
- Produce a comprehensive research report in html format.
1746
- The report should be very detailed and lengthy — approximately the equivalent of {pages} pages (or {word_count} words) when printed.
1747
- For sections requiring specific improvements, put it in <div class="improvable-chunk">...</div> (but don't mention it in the report, this will be managed through post-processing)
1748
 
1749
  // Requirements
1750
  - All text alignment has to be on the left
 
1751
  - It must include inline citations (e.g., [1], [2], etc.) from real sources provided in the search results below
1752
  Note: citations sources in-line need to be in this format: blablabla - Source [x] / "pdf" is not a source, provide the title or author
1753
- - No more than 7 sentences per div blocks, skip lines and add line breaks when changing topic.
1754
  - The report must include between {round(pages/10,0)} and {round(pages/5,0)} tables from the sources used (add citations if necessary) and use facts and figures extensively to ground the analysis.
1755
  - For the numbering of titles or numbered lists, use numbers (ex: 1.) and sub-units (1.1, 1.2... 1.1.1...,1.1.2...).
1756
  Note: Exclude the use of html numbered lists format, they don't get correctly implemented. Use plain text format for numbering of sections and sub-sections
1757
  - Put paragraphs, sentences that are part of the same section in a div tag, this will be used for formatting.
1758
- - Text Alignment has to be to the left, including for the titles
1759
  - Add on top of the report the report title (with the <h1> tag) - this is the only part that should be centered (in-line style)
1760
  - Titles for sections and sub-sections should systematically use the tags:
1761
  <h1> for sections (ex: 3. Examination of State-of-the-Art of AI)
@@ -1766,18 +1712,52 @@ Note: Exclude the use of html numbered lists format, they don't get correctly im
1766
  - Avoid Chinese characters in the output (use the Pinyin version) since they won't display correcly in the pdf (black boxes)
1767
  - For the Table of contents: do not mention the pages, but make each item on separate line
1768
  - Put "Table of contents" and "Abstract" title in h1 format.
1769
- - The Table of contents should not mention the abstract and table of contents, the numbering should start from the introduction and end with References Summary Table
 
1770
 
1771
  // Reference citations
1772
  - The name of the reference table should be: "Reference Summary Table"
1773
  - The reference table at the end containing the citations details should have 4 columns: the ref number, the title of the document, the author(s, the URL - with hyperlink)
1774
  - The report MUST include a reference summary table with between 10 (for a 8 page report) and 30 references (for a 40 pages report). All inline citations (e.g., [1], [2], …) present in the report and in any focus placeholders MUST have a corresponding entry in this table with its full URL.
1775
  - For the reference citations, add systematically the urls from the Learnings (no need to put them in numbered list format since we alredy have the [x] that serves as number list)
1776
- - Do not add any inline citations reference in the visual and graph placeholders descriptions below, you can add them in focus though.
1777
  - Do not make false references / citations. It has to be grounded from the sources in the rsearch results / crumbs below (no example.com/... type references!)
1778
  - The references / citations should be only coming from the most reputable sources amongst all the Learnings and Results from searches below
1779
  - The table generated should have in-line styling to have word-wrap and 100% width
1780
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1781
  --------------- Placeholders -----------
1782
  In order to enrich the content, within the core sections (between introduction and conclusion), you can inject some placeholders that will be developped later on.
1783
  There are 3 types: visual, graphs, focus - each with their own purpose
@@ -1888,7 +1868,7 @@ Then close the html code from the broader report
1888
  </body>
1889
  </html>
1890
 
1891
- // Structure of the overall report as follows:
1892
 
1893
  {{Do not add anything before - no introductory meta comment or content}}
1894
 
@@ -1897,7 +1877,7 @@ Then close the html code from the broader report
1897
  - Introduction
1898
  - [Sections and sub-sections, depending on the size and relevant topic - including visual, graph and focus placeholders]
1899
  - Conclusion
1900
- - References of the documents used in the inline citations
1901
  - Report ending formatting (as mentioned before)
1902
 
1903
  {{Do not add anything after - no conclusive meta comment or content}}
@@ -1911,8 +1891,8 @@ Important note: placeholders (visual, graph or focus) can only appear in the sec
1911
  - Results from searches:
1912
  {aggregated_crumbs}
1913
 
1914
-
1915
-
1916
  """
1917
  )
1918
  tokentarget = word_count * 5 # rough multiplier for token target
@@ -3055,7 +3035,7 @@ def main():
3055
 
3056
  send_button.click(
3057
  fn=send_chat_message,
3058
- inputs=[chat_input, openai_api_key_input, serpapi_api_key_input, chatbot, final_report, crumbs_box],
3059
  outputs=[chatbot, chat_input, final_report]
3060
  )
3061
 
 
84
  # Collapse multiple spaces into one.
85
  cleaned = re.sub(r'\s+', ' ', cleaned)
86
 
 
 
 
 
 
 
 
 
 
87
  return cleaned.strip()
88
 
89
  def snippet_in_tag(tag: Tag, snippet: str) -> bool:
 
104
  return False
105
 
106
  def expand_snippet_area(soup: BeautifulSoup, snippet: str) -> Tag:
 
 
 
 
 
 
 
 
 
107
  allowed_tags = {"div", "table"}
108
 
109
  logging.info("Searching for all elements containing the snippet: '%s'", snippet)
 
155
  # Step 1: (LLM call to get unique strings) ...
156
  # [Assume this part remains unchanged and unique_strings is obtained]
157
 
158
+ prompt_identify = (f"""
159
+ You are a meticulous technical editor.
160
 
161
  Below is the full report HTML and a user adjustment request.
162
  Extract one or more unique plain-text string(s) (without any HTML tags or formatting) that uniquely appear in the area(s) targeted by the adjustment request.
 
171
  Output them in a JSON object with the key "identified_unique_strings" mapped to a list of strings.
172
  Ensure these strings exactly match the content in the report.
173
 
174
+ Note: if the unique string is from within a code snippet (ex: javascript graph or a mermaid code), don't use the code as part of the snippet,
175
  For example instead of "A[Fundamental AI Research - Emerging Theories and Paradigms] --&gt; B[Algorithm Innovation - Novel ML and NLP Models]"
176
+ Simply use "Fundamental AI Research - Emerging Theories and Paradigms"
177
  This would make it easier to find it
178
 
179
  Full Report HTML:
 
216
  logging.info("fine_tune_report: Found container for unique string adjustment:\n\n%s\n", original_container_html)
217
 
218
  # Step 3: Call the LLM to adjust this container.
219
+ prompt_adjust = (f"""
220
+ You are a technical editor.
221
  Given the following HTML container (with its outer tags) extracted from a larger report and based on the user adjustment request,
222
  produce a corrected version by making only the necessary changes. Preserve inline citations, formatting, and context.
223
  The updated version will be put back in the exact same location and must have the same outer tags.
 
243
  - "improved" (the corrected container's full HTML) and
244
  - "summary" (a brief explanation of the changes)
245
 
246
+ Only output valid JSON with no comments or code fences."""
247
  )
248
 
249
  response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
 
272
 
273
  # (Step 5 and Step 6 remain as before to update the reference table and the QA log)
274
 
275
+ prompt_refs = (f"""
276
+ You are a technical editor.
277
+
278
+ Review the following updated report HTML.
279
+ If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table,
280
+ generate an updated Reference Summary Table as valid HTML. Output only the updated reference table HTML with no explanations.
281
+ Updated Report HTML:\n{updated_report_html}"""
282
  )
283
  updated_refs = openai_call(prompt=prompt_refs, model="o3-mini", max_tokens_param=1000, temperature=0.5)
284
  updated_refs = updated_refs.strip().strip("```")
 
313
  os.environ["OPENAI_API_KEY"] = openai_api_key
314
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
315
 
316
+ prompt = (f"""
317
+ You are a technical editor.
318
+ Based on the following full HTML report, generate improvement suggestions - at least 3.
319
+ Format each proposal as a numbered list item in the following style:
320
+ Examples:
321
+ 1) in the section xyz, adjust ...
322
+ 2) after the paragraph abc, detail the graph further ...
323
+ 3) in the focus placeholder xxx, add a mention about ...
324
+ 4) make a reference to ... in the section 3.2
325
+ ...
326
+ n) final improvement suggestion...
327
+
328
+ Only output the suggestions exactly as a numbered list (text)
329
+
330
+ Full Report HTML:
331
+ {report_html}
332
+
333
+ Now provide your suggestions."""
334
  )
335
  suggestions = openai_call(prompt=prompt, model="o3-mini", max_tokens_param=1000, temperature=0.5)
336
  return suggestions.strip().strip("```").strip()
 
357
  chat_history.append([user_message, answer])
358
  return chat_history, "", updated_report
359
 
360
+ def send_chat_message(user_message, openai_api_key, serpapi_api_key, chat_history, report_text, crumbs_text, style):
361
  os.environ["OPENAI_API_KEY"] = openai_api_key
362
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
363
 
 
367
  if "http://" in user_message or "https://" in user_message:
368
  answer = handle_link_request(user_message)
369
  else:
370
+ system_prompt = f"""
371
+ You are a knowledgeable research assistant.
372
+
373
+ Based on the following
374
+ - Report:
375
  {report_text}
376
 
377
+ - Source Crumbs:
378
  {crumbs_text}
379
 
380
+ - User Question:
381
  {user_message}
382
 
383
+ Provide a response in the desired style: {style}
384
+
385
  Your Answer:"""
386
  answer = openai_call(prompt=system_prompt, model="o3-mini", max_tokens_param=10000)
387
  updated_history = chat_history + [[user_message, answer]]
 
1417
  words = text.split()
1418
  if len(words) <= chunk_size:
1419
  # If the text is short, simply return it (or you could call a simple summarization)
1420
+ if len(words) < 500:
1421
+ return "Not a coherent text or not worth processing - discard."
1422
+ else
1423
+ return text
1424
  chunks = []
1425
  i = 0
1426
  while i < len(words):
 
1435
  chunk_prompt = (f"""
1436
  Summarize the following text, preserving all key details and ensuring that any tables or structured data are also summarized:
1437
  {chunk}
1438
+ Maintain the original sources.
1439
+ Keep all mentions of names, people/titles, dates, papers, reports, organisation/institute/NGO/government bodies quotes, products, project names, ...
 
 
 
 
 
 
 
 
 
 
 
 
1440
  """
1441
  )
1442
  summary_chunk = openai_call(prompt=chunk_prompt, model="gpt-4o-mini", max_tokens_param=500, temperature=0.7)
 
1452
  final_prompt = (f"""
1453
  Combine the following summaries into one concise summary that preserves all critical details, including any relevant table or structured data:
1454
  {combined_summary}
1455
+ Maintain the original sources.
1456
+ Keep all mentions of names, people/titles, dates, papers, reports, organisation/institute/NGO/government bodies quotes, products, project names, ...
 
 
 
 
 
 
 
 
 
 
 
 
1457
  """
1458
  )
1459
  final_summary = openai_call(prompt=final_prompt, model="gpt-4o-mini", max_tokens_param=target_length, temperature=0.7)
 
1477
 
1478
  client = os.getenv('OPENAI_API_KEY') # alternatively, pass your API key here if needed.
1479
 
1480
+ prompt = (f"""
1481
+ Analyze the following content from a query result:
1482
 
1483
  {snippet}
1484
 
 
1689
  combined_learnings = "\n".join(learnings) if learnings else fallback_text
1690
  word_count = pages * 500
1691
  prompt = (f"""
1692
+ Produce a comprehensive report in html format.
1693
+ The report should be very detailed and lengthy.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1694
 
1695
  // Requirements
1696
  - All text alignment has to be on the left
1697
+ - The report should be {pages} long or {word_count} words (excluding html formatting)
1698
  - It must include inline citations (e.g., [1], [2], etc.) from real sources provided in the search results below
1699
  Note: citations sources in-line need to be in this format: blablabla - Source [x] / "pdf" is not a source, provide the title or author
1700
+ - No more than 10 sentences per div blocks, skip lines and add line breaks when changing topic.
1701
  - The report must include between {round(pages/10,0)} and {round(pages/5,0)} tables from the sources used (add citations if necessary) and use facts and figures extensively to ground the analysis.
1702
  - For the numbering of titles or numbered lists, use numbers (ex: 1.) and sub-units (1.1, 1.2... 1.1.1...,1.1.2...).
1703
  Note: Exclude the use of html numbered lists format, they don't get correctly implemented. Use plain text format for numbering of sections and sub-sections
1704
  - Put paragraphs, sentences that are part of the same section in a div tag, this will be used for formatting.
 
1705
  - Add on top of the report the report title (with the <h1> tag) - this is the only part that should be centered (in-line style)
1706
  - Titles for sections and sub-sections should systematically use the tags:
1707
  <h1> for sections (ex: 3. Examination of State-of-the-Art of AI)
 
1712
  - Avoid Chinese characters in the output (use the Pinyin version) since they won't display correcly in the pdf (black boxes)
1713
  - For the Table of contents: do not mention the pages, but make each item on separate line
1714
  - Put "Table of contents" and "Abstract" title in h1 format.
1715
+ - The Table of contents should skip the abstract and table of contents, the numbering should start from the introduction and end with References Summary Table
1716
+ - For sections requiring specific improvements, put it in <div class="improvable-chunk">...</div> (but don't mention it in the report, this will be managed through post-processing)
1717
 
1718
  // Reference citations
1719
  - The name of the reference table should be: "Reference Summary Table"
1720
  - The reference table at the end containing the citations details should have 4 columns: the ref number, the title of the document, the author(s, the URL - with hyperlink)
1721
  - The report MUST include a reference summary table with between 10 (for a 8 page report) and 30 references (for a 40 pages report). All inline citations (e.g., [1], [2], …) present in the report and in any focus placeholders MUST have a corresponding entry in this table with its full URL.
1722
  - For the reference citations, add systematically the urls from the Learnings (no need to put them in numbered list format since we alredy have the [x] that serves as number list)
1723
+ - Do not add any inline citations reference in the visual and graph placeholders descriptions below, you can add them in focus though.
1724
  - Do not make false references / citations. It has to be grounded from the sources in the rsearch results / crumbs below (no example.com/... type references!)
1725
  - The references / citations should be only coming from the most reputable sources amongst all the Learnings and Results from searches below
1726
  - The table generated should have in-line styling to have word-wrap and 100% width
1727
 
1728
+ // Instructions:
1729
+ 1. Integrate numbers from the sources but always mention the source
1730
+ 2. Whenever you mention a figure or quote, add an inline reference [x] matching its source from the references.
1731
+ 3. Again, Specifically name relevant organizations, tools, project names, and people encountered in the crumbs or learnings.
1732
+ Note: This is for academic purposes, so thorough citations and referencing are essential.
1733
+ 4. Focus on reputable sources that will not be disputed (generally social media posts cannot be an opposable sources, but some of them may mention reputable sources)
1734
+ Note: put the full reference url (no generic domain address), down to the html page or the pdf
1735
+
1736
+
1737
+ // Style
1738
+ The report must follow this writing style {reportstyle}.
1739
+
1740
+ // Format when mentioning sources, organisations and individuals
1741
+ - We will perform a post-processing on the output
1742
+ - For this reasons use this format for any specific name, organisation or project: {{[{{name}}]}}
1743
+ ex1: {{[{{Google}}]}} CEO, {{[{{Sundar Pichai}}]}} ...
1744
+ ex2: in a report from the {{[{{university of Berkeley}}]}} titled "{{[{{The great acceleration}}]}}"...
1745
+ ex3: the CEO of {{[{{Softbank}}]}} , {{[{{Masayoshi Son}}]}}, said that "the best way to..."
1746
+ ex4: the project {{[{{Stargate}}]}}, anounced by {{[{{OpenAI}}]}} in collaboration with {{[{{Salesforce}}]}}
1747
+ ex5: Mr. {{[{{Michael Parrot}}]}}, Marketing director in {{[{{Panasonic}}]}}, mentioned that ...
1748
+ Note: the output will be processed through regex and the identifiers removed, but this way we can keep track of all sources and citations without disclosing them.
1749
+ - This should apply to names, people/titles, dates, papers, reports, organisation/institute/NGO/government bodies quotes, products, project names, ...
1750
+ - You should have approximately {10 * pages} mention of organisations, people, projects or people, use the prescribed format
1751
+ - DO NOT MENTION this formmatting requirement, just apply it. The user doesn't have to know about this technicality.
1752
+ Note: LinkedIn is not a source - if you want to use a source related to LinkedIn, you should check the author of the page visited, this is the real source, mention the name of the author as "'authorName' from LinkedIn Pulse"
1753
+
1754
+ // Sources
1755
+ Use the following learnings and merged reference details from a deep research process on:
1756
+ '{initial_query}'
1757
+
1758
+ Taking also into consideration the context:
1759
+ {context}
1760
+
1761
  --------------- Placeholders -----------
1762
  In order to enrich the content, within the core sections (between introduction and conclusion), you can inject some placeholders that will be developped later on.
1763
  There are 3 types: visual, graphs, focus - each with their own purpose
 
1868
  </body>
1869
  </html>
1870
 
1871
+ // Structure the overall report as follows:
1872
 
1873
  {{Do not add anything before - no introductory meta comment or content}}
1874
 
 
1877
  - Introduction
1878
  - [Sections and sub-sections, depending on the size and relevant topic - including visual, graph and focus placeholders]
1879
  - Conclusion
1880
+ - References summary table
1881
  - Report ending formatting (as mentioned before)
1882
 
1883
  {{Do not add anything after - no conclusive meta comment or content}}
 
1891
  - Results from searches:
1892
  {aggregated_crumbs}
1893
 
1894
+ Take a deep breath, do your best.
1895
+ Now, produce the report please.
1896
  """
1897
  )
1898
  tokentarget = word_count * 5 # rough multiplier for token target
 
3035
 
3036
  send_button.click(
3037
  fn=send_chat_message,
3038
+ inputs=[chat_input, openai_api_key_input, serpapi_api_key_input, chatbot, final_report, crumbs_box, reportstyle],
3039
  outputs=[chatbot, chat_input, final_report]
3040
  )
3041