Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -84,15 +84,6 @@ def clean_llm_response(response: str) -> str:
|
|
| 84 |
# Collapse multiple spaces into one.
|
| 85 |
cleaned = re.sub(r'\s+', ' ', cleaned)
|
| 86 |
|
| 87 |
-
# Optionally, if you suspect unescaped quotes in the content,
|
| 88 |
-
# you might try to protect them. For example, if the improved field contains inner double quotes,
|
| 89 |
-
# ensure they are properly escaped. This can be a bit tricky because you want to preserve valid escapes.
|
| 90 |
-
#
|
| 91 |
-
# Example (if needed):
|
| 92 |
-
# cleaned = cleaned.replace('\"', '\\\"')
|
| 93 |
-
#
|
| 94 |
-
# But be cautious: too many replacements may ruin valid escapes.
|
| 95 |
-
|
| 96 |
return cleaned.strip()
|
| 97 |
|
| 98 |
def snippet_in_tag(tag: Tag, snippet: str) -> bool:
|
|
@@ -113,15 +104,6 @@ def snippet_in_tag(tag: Tag, snippet: str) -> bool:
|
|
| 113 |
return False
|
| 114 |
|
| 115 |
def expand_snippet_area(soup: BeautifulSoup, snippet: str) -> Tag:
|
| 116 |
-
"""
|
| 117 |
-
Given a BeautifulSoup object and a snippet of text, this function finds the element that contains the snippet.
|
| 118 |
-
It then uses an iterative while loop to traverse upward (from the immediate parent to the top)
|
| 119 |
-
until the highest level <iframe> is reached or (if no <iframe> is present) until a <div> or <table> is
|
| 120 |
-
encountered—the first allowed container (<div> or <table>) found is used. If neither is found,
|
| 121 |
-
it returns the candidate element itself.
|
| 122 |
-
|
| 123 |
-
Logging is provided at each key step.
|
| 124 |
-
"""
|
| 125 |
allowed_tags = {"div", "table"}
|
| 126 |
|
| 127 |
logging.info("Searching for all elements containing the snippet: '%s'", snippet)
|
|
@@ -173,8 +155,8 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
|
|
| 173 |
# Step 1: (LLM call to get unique strings) ...
|
| 174 |
# [Assume this part remains unchanged and unique_strings is obtained]
|
| 175 |
|
| 176 |
-
prompt_identify = (
|
| 177 |
-
|
| 178 |
|
| 179 |
Below is the full report HTML and a user adjustment request.
|
| 180 |
Extract one or more unique plain-text string(s) (without any HTML tags or formatting) that uniquely appear in the area(s) targeted by the adjustment request.
|
|
@@ -189,9 +171,9 @@ Extract one or more unique plain-text string(s) (without any HTML tags or format
|
|
| 189 |
Output them in a JSON object with the key "identified_unique_strings" mapped to a list of strings.
|
| 190 |
Ensure these strings exactly match the content in the report.
|
| 191 |
|
| 192 |
-
Note: if the unique string is from within a code snippet (ex: javascript graph or a mermaid code), don't use the code as snippet,
|
| 193 |
For example instead of "A[Fundamental AI Research - Emerging Theories and Paradigms] --> B[Algorithm Innovation - Novel ML and NLP Models]"
|
| 194 |
-
|
| 195 |
This would make it easier to find it
|
| 196 |
|
| 197 |
Full Report HTML:
|
|
@@ -234,8 +216,8 @@ Only output valid JSON."""
|
|
| 234 |
logging.info("fine_tune_report: Found container for unique string adjustment:\n\n%s\n", original_container_html)
|
| 235 |
|
| 236 |
# Step 3: Call the LLM to adjust this container.
|
| 237 |
-
prompt_adjust = (
|
| 238 |
-
|
| 239 |
Given the following HTML container (with its outer tags) extracted from a larger report and based on the user adjustment request,
|
| 240 |
produce a corrected version by making only the necessary changes. Preserve inline citations, formatting, and context.
|
| 241 |
The updated version will be put back in the exact same location and must have the same outer tags.
|
|
@@ -261,7 +243,7 @@ Output a JSON object with exactly two keys:
|
|
| 261 |
- "improved" (the corrected container's full HTML) and
|
| 262 |
- "summary" (a brief explanation of the changes)
|
| 263 |
|
| 264 |
-
Only output valid JSON."""
|
| 265 |
)
|
| 266 |
|
| 267 |
response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
|
|
@@ -290,11 +272,13 @@ Only output valid JSON."""
|
|
| 290 |
|
| 291 |
# (Step 5 and Step 6 remain as before to update the reference table and the QA log)
|
| 292 |
|
| 293 |
-
prompt_refs = (
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
|
|
|
|
|
|
| 298 |
)
|
| 299 |
updated_refs = openai_call(prompt=prompt_refs, model="o3-mini", max_tokens_param=1000, temperature=0.5)
|
| 300 |
updated_refs = updated_refs.strip().strip("```")
|
|
@@ -329,17 +313,24 @@ def suggest_improvements(report_html: str, openai_api_key: str, serpapi_api_key:
|
|
| 329 |
os.environ["OPENAI_API_KEY"] = openai_api_key
|
| 330 |
os.environ["SERPAPI_API_KEY"] = serpapi_api_key
|
| 331 |
|
| 332 |
-
prompt = (
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
)
|
| 344 |
suggestions = openai_call(prompt=prompt, model="o3-mini", max_tokens_param=1000, temperature=0.5)
|
| 345 |
return suggestions.strip().strip("```").strip()
|
|
@@ -366,7 +357,7 @@ def improve_report_from_chat(user_message: str, chat_history: list, report_text:
|
|
| 366 |
chat_history.append([user_message, answer])
|
| 367 |
return chat_history, "", updated_report
|
| 368 |
|
| 369 |
-
def send_chat_message(user_message, openai_api_key, serpapi_api_key, chat_history, report_text, crumbs_text):
|
| 370 |
os.environ["OPENAI_API_KEY"] = openai_api_key
|
| 371 |
os.environ["SERPAPI_API_KEY"] = serpapi_api_key
|
| 372 |
|
|
@@ -376,15 +367,21 @@ def send_chat_message(user_message, openai_api_key, serpapi_api_key, chat_histor
|
|
| 376 |
if "http://" in user_message or "https://" in user_message:
|
| 377 |
answer = handle_link_request(user_message)
|
| 378 |
else:
|
| 379 |
-
system_prompt = f"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
{report_text}
|
| 381 |
|
| 382 |
-
Source Crumbs:
|
| 383 |
{crumbs_text}
|
| 384 |
|
| 385 |
-
User Question:
|
| 386 |
{user_message}
|
| 387 |
|
|
|
|
|
|
|
| 388 |
Your Answer:"""
|
| 389 |
answer = openai_call(prompt=system_prompt, model="o3-mini", max_tokens_param=10000)
|
| 390 |
updated_history = chat_history + [[user_message, answer]]
|
|
@@ -1420,8 +1417,10 @@ def summarize_large_text(text: str, target_length: int, chunk_size: int = 1000,
|
|
| 1420 |
words = text.split()
|
| 1421 |
if len(words) <= chunk_size:
|
| 1422 |
# If the text is short, simply return it (or you could call a simple summarization)
|
| 1423 |
-
|
| 1424 |
-
|
|
|
|
|
|
|
| 1425 |
chunks = []
|
| 1426 |
i = 0
|
| 1427 |
while i < len(words):
|
|
@@ -1436,20 +1435,8 @@ def summarize_large_text(text: str, target_length: int, chunk_size: int = 1000,
|
|
| 1436 |
chunk_prompt = (f"""
|
| 1437 |
Summarize the following text, preserving all key details and ensuring that any tables or structured data are also summarized:
|
| 1438 |
{chunk}
|
| 1439 |
-
|
| 1440 |
-
//
|
| 1441 |
-
- We will perform a post-processing on the output
|
| 1442 |
-
- For this reasons use this format for any specific name, organisation or project: {{[{{name}}]}}
|
| 1443 |
-
ex1: {{[{{Google}}]}} CEO, {{[{{Sundar Pichai}}]}} ...
|
| 1444 |
-
ex2: in a report from the {{[{{university of Berkeley}}]}} titled "{{[{{The great acceleration}}]}}"...
|
| 1445 |
-
ex3: the CEO of {{[{{Softbank}}]}} , {{[{{Masayoshi Son}}]}}, said that "the best way to..."
|
| 1446 |
-
ex4: the project {{[{{Stargate}}]}}, anounced by {{[{{OpenAI}}]}} in collaboration with {{[{{Salesforce}}]}}
|
| 1447 |
-
ex5: Mr. {{[{{Michael Parrot}}]}}, Marketing director in {{[{{Panasonic}}]}}, mentioned that ...
|
| 1448 |
-
Note: the output will be processed through regex and the identifiers removed, but this way we can keep track of all sources and citations without disclosing them.
|
| 1449 |
-
- This should apply to names, people/titles, dates, papers, reports, organisation/institute/NGO/government bodies quotes, products, project names, ...
|
| 1450 |
-
- You should have approximately 10 mention of organisations, people, projects or people, use the prescribed format
|
| 1451 |
-
- DO NOT MENTION this formmatting requirement, just apply it. The user doesn't have to know about this technicality.
|
| 1452 |
-
Note: LinkedIn is not a source - if you want to use a source related to LinkedIn, you should check the author of the page visited, this is the real source, mention the name of the author as "'authorName' from LinkedIn Pulse"
|
| 1453 |
"""
|
| 1454 |
)
|
| 1455 |
summary_chunk = openai_call(prompt=chunk_prompt, model="gpt-4o-mini", max_tokens_param=500, temperature=0.7)
|
|
@@ -1465,20 +1452,8 @@ Note: LinkedIn is not a source - if you want to use a source related to LinkedIn
|
|
| 1465 |
final_prompt = (f"""
|
| 1466 |
Combine the following summaries into one concise summary that preserves all critical details, including any relevant table or structured data:
|
| 1467 |
{combined_summary}
|
| 1468 |
-
|
| 1469 |
-
//
|
| 1470 |
-
- We will perform a post-processing on the output
|
| 1471 |
-
- For this reasons use this format for any specific name, organisation or project: {{[{{name}}]}}
|
| 1472 |
-
ex1: {{[{{Google}}]}} CEO, {{[{{Sundar Pichai}}]}} ...
|
| 1473 |
-
ex2: in a report from the {{[{{university of Berkeley}}]}} titled "{{[{{The great acceleration}}]}}"...
|
| 1474 |
-
ex3: the CEO of {{[{{Softbank}}]}} , {{[{{Masayoshi Son}}]}}, said that "the best way to..."
|
| 1475 |
-
ex4: the project {{[{{Stargate}}]}}, anounced by {{[{{OpenAI}}]}} in collaboration with {{[{{Salesforce}}]}}
|
| 1476 |
-
ex5: Mr. {{[{{Michael Parrot}}]}}, Marketing director in {{[{{Panasonic}}]}}, mentioned that ...
|
| 1477 |
-
Note: the output will be processed through regex and the identifiers removed, but this way we can keep track of all sources and citations without disclosing them.
|
| 1478 |
-
- This should apply to names, people/titles, dates, papers, reports, organisation/institute/NGO/government bodies quotes, products, project names, ...
|
| 1479 |
-
- You should have approximately 10 mention of organisations, people, projects or people, use the prescribed format
|
| 1480 |
-
- DO NOT MENTION this formmatting requirement, just apply it. The user doesn't have to know about this technicality.
|
| 1481 |
-
Note: LinkedIn is not a source - if you want to use a source related to LinkedIn, you should check the author of the page visited, this is the real source, mention the name of the author as "'authorName' from LinkedIn Pulse"
|
| 1482 |
"""
|
| 1483 |
)
|
| 1484 |
final_summary = openai_call(prompt=final_prompt, model="gpt-4o-mini", max_tokens_param=target_length, temperature=0.7)
|
|
@@ -1502,7 +1477,8 @@ def analyze_with_gpt4o(query: str, snippet: str, breadth: int, temperature: floa
|
|
| 1502 |
|
| 1503 |
client = os.getenv('OPENAI_API_KEY') # alternatively, pass your API key here if needed.
|
| 1504 |
|
| 1505 |
-
prompt = (f"""
|
|
|
|
| 1506 |
|
| 1507 |
{snippet}
|
| 1508 |
|
|
@@ -1713,49 +1689,19 @@ def generate_final_report(initial_query: str, context: str, reportstyle: str, le
|
|
| 1713 |
combined_learnings = "\n".join(learnings) if learnings else fallback_text
|
| 1714 |
word_count = pages * 500
|
| 1715 |
prompt = (f"""
|
| 1716 |
-
|
| 1717 |
-
|
| 1718 |
-
2. Whenever you mention a figure or quote, add an inline reference [x] matching its source from the references.
|
| 1719 |
-
3. Again, Specifically name relevant organizations, tools, project names, and people encountered in the crumbs or learnings.
|
| 1720 |
-
Note: This is for academic purposes, so thorough citations and referencing are essential.
|
| 1721 |
-
4. Focus on reputable sources that will not be disputed (generally social media posts cannot be an opposable sources, but some of them may mention reputable sources)
|
| 1722 |
-
Note: put the full reference url (no generic domain address), down to the html page or the pdf
|
| 1723 |
-
5. It must follow this writing style {reportstyle}.
|
| 1724 |
-
|
| 1725 |
-
// Mentioning sources, organisations and individuals
|
| 1726 |
-
- We will perform a post-processing on the output
|
| 1727 |
-
- For this reasons use this format for any specific name, organisation or project: {{[{{name}}]}}
|
| 1728 |
-
ex1: {{[{{Google}}]}} CEO, {{[{{Sundar Pichai}}]}} ...
|
| 1729 |
-
ex2: in a report from the {{[{{university of Berkeley}}]}} titled "{{[{{The great acceleration}}]}}"...
|
| 1730 |
-
ex3: the CEO of {{[{{Softbank}}]}} , {{[{{Masayoshi Son}}]}}, said that "the best way to..."
|
| 1731 |
-
ex4: the project {{[{{Stargate}}]}}, anounced by {{[{{OpenAI}}]}} in collaboration with {{[{{Salesforce}}]}}
|
| 1732 |
-
ex5: Mr. {{[{{Michael Parrot}}]}}, Marketing director in {{[{{Panasonic}}]}}, mentioned that ...
|
| 1733 |
-
Note: the output will be processed through regex and the identifiers removed, but this way we can keep track of all sources and citations without disclosing them.
|
| 1734 |
-
- This should apply to names, people/titles, dates, papers, reports, organisation/institute/NGO/government bodies quotes, products, project names, ...
|
| 1735 |
-
- You should have approximately {10 * pages} mention of organisations, people, projects or people, use the prescribed format
|
| 1736 |
-
- DO NOT MENTION this formmatting requirement, just apply it. The user doesn't have to know about this technicality.
|
| 1737 |
-
Note: LinkedIn is not a source - if you want to use a source related to LinkedIn, you should check the author of the page visited, this is the real source, mention the name of the author as "'authorName' from LinkedIn Pulse"
|
| 1738 |
-
|
| 1739 |
-
// Sources
|
| 1740 |
-
Use the following learnings and merged reference details from a deep research process on:
|
| 1741 |
-
'{initial_query}'
|
| 1742 |
-
Taking also into consideration the context:
|
| 1743 |
-
{context}
|
| 1744 |
-
|
| 1745 |
-
Produce a comprehensive research report in html format.
|
| 1746 |
-
The report should be very detailed and lengthy — approximately the equivalent of {pages} pages (or {word_count} words) when printed.
|
| 1747 |
-
For sections requiring specific improvements, put it in <div class="improvable-chunk">...</div> (but don't mention it in the report, this will be managed through post-processing)
|
| 1748 |
|
| 1749 |
// Requirements
|
| 1750 |
- All text alignment has to be on the left
|
|
|
|
| 1751 |
- It must include inline citations (e.g., [1], [2], etc.) from real sources provided in the search results below
|
| 1752 |
Note: citations sources in-line need to be in this format: blablabla - Source [x] / "pdf" is not a source, provide the title or author
|
| 1753 |
-
- No more than
|
| 1754 |
- The report must include between {round(pages/10,0)} and {round(pages/5,0)} tables from the sources used (add citations if necessary) and use facts and figures extensively to ground the analysis.
|
| 1755 |
- For the numbering of titles or numbered lists, use numbers (ex: 1.) and sub-units (1.1, 1.2... 1.1.1...,1.1.2...).
|
| 1756 |
Note: Exclude the use of html numbered lists format, they don't get correctly implemented. Use plain text format for numbering of sections and sub-sections
|
| 1757 |
- Put paragraphs, sentences that are part of the same section in a div tag, this will be used for formatting.
|
| 1758 |
-
- Text Alignment has to be to the left, including for the titles
|
| 1759 |
- Add on top of the report the report title (with the <h1> tag) - this is the only part that should be centered (in-line style)
|
| 1760 |
- Titles for sections and sub-sections should systematically use the tags:
|
| 1761 |
<h1> for sections (ex: 3. Examination of State-of-the-Art of AI)
|
|
@@ -1766,18 +1712,52 @@ Note: Exclude the use of html numbered lists format, they don't get correctly im
|
|
| 1766 |
- Avoid Chinese characters in the output (use the Pinyin version) since they won't display correcly in the pdf (black boxes)
|
| 1767 |
- For the Table of contents: do not mention the pages, but make each item on separate line
|
| 1768 |
- Put "Table of contents" and "Abstract" title in h1 format.
|
| 1769 |
-
- The Table of contents should
|
|
|
|
| 1770 |
|
| 1771 |
// Reference citations
|
| 1772 |
- The name of the reference table should be: "Reference Summary Table"
|
| 1773 |
- The reference table at the end containing the citations details should have 4 columns: the ref number, the title of the document, the author(s, the URL - with hyperlink)
|
| 1774 |
- The report MUST include a reference summary table with between 10 (for a 8 page report) and 30 references (for a 40 pages report). All inline citations (e.g., [1], [2], …) present in the report and in any focus placeholders MUST have a corresponding entry in this table with its full URL.
|
| 1775 |
- For the reference citations, add systematically the urls from the Learnings (no need to put them in numbered list format since we alredy have the [x] that serves as number list)
|
| 1776 |
-
- Do not add any inline citations reference in the visual and graph placeholders descriptions
|
| 1777 |
- Do not make false references / citations. It has to be grounded from the sources in the rsearch results / crumbs below (no example.com/... type references!)
|
| 1778 |
- The references / citations should be only coming from the most reputable sources amongst all the Learnings and Results from searches below
|
| 1779 |
- The table generated should have in-line styling to have word-wrap and 100% width
|
| 1780 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1781 |
--------------- Placeholders -----------
|
| 1782 |
In order to enrich the content, within the core sections (between introduction and conclusion), you can inject some placeholders that will be developped later on.
|
| 1783 |
There are 3 types: visual, graphs, focus - each with their own purpose
|
|
@@ -1888,7 +1868,7 @@ Then close the html code from the broader report
|
|
| 1888 |
</body>
|
| 1889 |
</html>
|
| 1890 |
|
| 1891 |
-
// Structure
|
| 1892 |
|
| 1893 |
{{Do not add anything before - no introductory meta comment or content}}
|
| 1894 |
|
|
@@ -1897,7 +1877,7 @@ Then close the html code from the broader report
|
|
| 1897 |
- Introduction
|
| 1898 |
- [Sections and sub-sections, depending on the size and relevant topic - including visual, graph and focus placeholders]
|
| 1899 |
- Conclusion
|
| 1900 |
-
- References
|
| 1901 |
- Report ending formatting (as mentioned before)
|
| 1902 |
|
| 1903 |
{{Do not add anything after - no conclusive meta comment or content}}
|
|
@@ -1911,8 +1891,8 @@ Important note: placeholders (visual, graph or focus) can only appear in the sec
|
|
| 1911 |
- Results from searches:
|
| 1912 |
{aggregated_crumbs}
|
| 1913 |
|
| 1914 |
-
|
| 1915 |
-
|
| 1916 |
"""
|
| 1917 |
)
|
| 1918 |
tokentarget = word_count * 5 # rough multiplier for token target
|
|
@@ -3055,7 +3035,7 @@ def main():
|
|
| 3055 |
|
| 3056 |
send_button.click(
|
| 3057 |
fn=send_chat_message,
|
| 3058 |
-
inputs=[chat_input, openai_api_key_input, serpapi_api_key_input, chatbot, final_report, crumbs_box],
|
| 3059 |
outputs=[chatbot, chat_input, final_report]
|
| 3060 |
)
|
| 3061 |
|
|
|
|
| 84 |
# Collapse multiple spaces into one.
|
| 85 |
cleaned = re.sub(r'\s+', ' ', cleaned)
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
return cleaned.strip()
|
| 88 |
|
| 89 |
def snippet_in_tag(tag: Tag, snippet: str) -> bool:
|
|
|
|
| 104 |
return False
|
| 105 |
|
| 106 |
def expand_snippet_area(soup: BeautifulSoup, snippet: str) -> Tag:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
allowed_tags = {"div", "table"}
|
| 108 |
|
| 109 |
logging.info("Searching for all elements containing the snippet: '%s'", snippet)
|
|
|
|
| 155 |
# Step 1: (LLM call to get unique strings) ...
|
| 156 |
# [Assume this part remains unchanged and unique_strings is obtained]
|
| 157 |
|
| 158 |
+
prompt_identify = (f"""
|
| 159 |
+
You are a meticulous technical editor.
|
| 160 |
|
| 161 |
Below is the full report HTML and a user adjustment request.
|
| 162 |
Extract one or more unique plain-text string(s) (without any HTML tags or formatting) that uniquely appear in the area(s) targeted by the adjustment request.
|
|
|
|
| 171 |
Output them in a JSON object with the key "identified_unique_strings" mapped to a list of strings.
|
| 172 |
Ensure these strings exactly match the content in the report.
|
| 173 |
|
| 174 |
+
Note: if the unique string is from within a code snippet (ex: javascript graph or a mermaid code), don't use the code as part of the snippet,
|
| 175 |
For example instead of "A[Fundamental AI Research - Emerging Theories and Paradigms] --> B[Algorithm Innovation - Novel ML and NLP Models]"
|
| 176 |
+
Simply use "Fundamental AI Research - Emerging Theories and Paradigms"
|
| 177 |
This would make it easier to find it
|
| 178 |
|
| 179 |
Full Report HTML:
|
|
|
|
| 216 |
logging.info("fine_tune_report: Found container for unique string adjustment:\n\n%s\n", original_container_html)
|
| 217 |
|
| 218 |
# Step 3: Call the LLM to adjust this container.
|
| 219 |
+
prompt_adjust = (f"""
|
| 220 |
+
You are a technical editor.
|
| 221 |
Given the following HTML container (with its outer tags) extracted from a larger report and based on the user adjustment request,
|
| 222 |
produce a corrected version by making only the necessary changes. Preserve inline citations, formatting, and context.
|
| 223 |
The updated version will be put back in the exact same location and must have the same outer tags.
|
|
|
|
| 243 |
- "improved" (the corrected container's full HTML) and
|
| 244 |
- "summary" (a brief explanation of the changes)
|
| 245 |
|
| 246 |
+
Only output valid JSON with no comments or code fences."""
|
| 247 |
)
|
| 248 |
|
| 249 |
response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
|
|
|
|
| 272 |
|
| 273 |
# (Step 5 and Step 6 remain as before to update the reference table and the QA log)
|
| 274 |
|
| 275 |
+
prompt_refs = (f"""
|
| 276 |
+
You are a technical editor.
|
| 277 |
+
|
| 278 |
+
Review the following updated report HTML.
|
| 279 |
+
If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table,
|
| 280 |
+
generate an updated Reference Summary Table as valid HTML. Output only the updated reference table HTML with no explanations.
|
| 281 |
+
Updated Report HTML:\n{updated_report_html}"""
|
| 282 |
)
|
| 283 |
updated_refs = openai_call(prompt=prompt_refs, model="o3-mini", max_tokens_param=1000, temperature=0.5)
|
| 284 |
updated_refs = updated_refs.strip().strip("```")
|
|
|
|
| 313 |
os.environ["OPENAI_API_KEY"] = openai_api_key
|
| 314 |
os.environ["SERPAPI_API_KEY"] = serpapi_api_key
|
| 315 |
|
| 316 |
+
prompt = (f"""
|
| 317 |
+
You are a technical editor.
|
| 318 |
+
Based on the following full HTML report, generate improvement suggestions - at least 3.
|
| 319 |
+
Format each proposal as a numbered list item in the following style:
|
| 320 |
+
Examples:
|
| 321 |
+
1) in the section xyz, adjust ...
|
| 322 |
+
2) after the paragraph abc, detail the graph further ...
|
| 323 |
+
3) in the focus placeholder xxx, add a mention about ...
|
| 324 |
+
4) make a reference to ... in the section 3.2
|
| 325 |
+
...
|
| 326 |
+
n) final improvement suggestion...
|
| 327 |
+
|
| 328 |
+
Only output the suggestions exactly as a numbered list (text)
|
| 329 |
+
|
| 330 |
+
Full Report HTML:
|
| 331 |
+
{report_html}
|
| 332 |
+
|
| 333 |
+
Now provide your suggestions."""
|
| 334 |
)
|
| 335 |
suggestions = openai_call(prompt=prompt, model="o3-mini", max_tokens_param=1000, temperature=0.5)
|
| 336 |
return suggestions.strip().strip("```").strip()
|
|
|
|
| 357 |
chat_history.append([user_message, answer])
|
| 358 |
return chat_history, "", updated_report
|
| 359 |
|
| 360 |
+
def send_chat_message(user_message, openai_api_key, serpapi_api_key, chat_history, report_text, crumbs_text, style):
|
| 361 |
os.environ["OPENAI_API_KEY"] = openai_api_key
|
| 362 |
os.environ["SERPAPI_API_KEY"] = serpapi_api_key
|
| 363 |
|
|
|
|
| 367 |
if "http://" in user_message or "https://" in user_message:
|
| 368 |
answer = handle_link_request(user_message)
|
| 369 |
else:
|
| 370 |
+
system_prompt = f"""
|
| 371 |
+
You are a knowledgeable research assistant.
|
| 372 |
+
|
| 373 |
+
Based on the following
|
| 374 |
+
- Report:
|
| 375 |
{report_text}
|
| 376 |
|
| 377 |
+
- Source Crumbs:
|
| 378 |
{crumbs_text}
|
| 379 |
|
| 380 |
+
- User Question:
|
| 381 |
{user_message}
|
| 382 |
|
| 383 |
+
Provide a response in the desired style: {style}
|
| 384 |
+
|
| 385 |
Your Answer:"""
|
| 386 |
answer = openai_call(prompt=system_prompt, model="o3-mini", max_tokens_param=10000)
|
| 387 |
updated_history = chat_history + [[user_message, answer]]
|
|
|
|
| 1417 |
words = text.split()
|
| 1418 |
if len(words) <= chunk_size:
|
| 1419 |
# If the text is short, simply return it (or you could call a simple summarization)
|
| 1420 |
+
if len(words) < 500:
|
| 1421 |
+
return "Not a coherent text or not worth processing - discard."
|
| 1422 |
+
else:
|
| 1423 |
+
return text
|
| 1424 |
chunks = []
|
| 1425 |
i = 0
|
| 1426 |
while i < len(words):
|
|
|
|
| 1435 |
chunk_prompt = (f"""
|
| 1436 |
Summarize the following text, preserving all key details and ensuring that any tables or structured data are also summarized:
|
| 1437 |
{chunk}
|
| 1438 |
+
Maintain the original sources.
|
| 1439 |
+
Keep all mentions of names, people/titles, dates, papers, reports, organisation/institute/NGO/government bodies quotes, products, project names, ...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1440 |
"""
|
| 1441 |
)
|
| 1442 |
summary_chunk = openai_call(prompt=chunk_prompt, model="gpt-4o-mini", max_tokens_param=500, temperature=0.7)
|
|
|
|
| 1452 |
final_prompt = (f"""
|
| 1453 |
Combine the following summaries into one concise summary that preserves all critical details, including any relevant table or structured data:
|
| 1454 |
{combined_summary}
|
| 1455 |
+
Maintain the original sources.
|
| 1456 |
+
Keep all mentions of names, people/titles, dates, papers, reports, organisation/institute/NGO/government bodies quotes, products, project names, ...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1457 |
"""
|
| 1458 |
)
|
| 1459 |
final_summary = openai_call(prompt=final_prompt, model="gpt-4o-mini", max_tokens_param=target_length, temperature=0.7)
|
|
|
|
| 1477 |
|
| 1478 |
client = os.getenv('OPENAI_API_KEY') # alternatively, pass your API key here if needed.
|
| 1479 |
|
| 1480 |
+
prompt = (f"""
|
| 1481 |
+
Analyze the following content from a query result:
|
| 1482 |
|
| 1483 |
{snippet}
|
| 1484 |
|
|
|
|
| 1689 |
combined_learnings = "\n".join(learnings) if learnings else fallback_text
|
| 1690 |
word_count = pages * 500
|
| 1691 |
prompt = (f"""
|
| 1692 |
+
Produce a comprehensive report in html format.
|
| 1693 |
+
The report should be very detailed and lengthy.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1694 |
|
| 1695 |
// Requirements
|
| 1696 |
- All text alignment has to be on the left
|
| 1697 |
+
- The report should be {pages} pages long or {word_count} words (excluding html formatting)
|
| 1698 |
- It must include inline citations (e.g., [1], [2], etc.) from real sources provided in the search results below
|
| 1699 |
Note: citations sources in-line need to be in this format: blablabla - Source [x] / "pdf" is not a source, provide the title or author
|
| 1700 |
+
- No more than 10 sentences per div blocks, skip lines and add line breaks when changing topic.
|
| 1701 |
- The report must include between {round(pages/10,0)} and {round(pages/5,0)} tables from the sources used (add citations if necessary) and use facts and figures extensively to ground the analysis.
|
| 1702 |
- For the numbering of titles or numbered lists, use numbers (ex: 1.) and sub-units (1.1, 1.2... 1.1.1...,1.1.2...).
|
| 1703 |
Note: Exclude the use of html numbered lists format, they don't get correctly implemented. Use plain text format for numbering of sections and sub-sections
|
| 1704 |
- Put paragraphs, sentences that are part of the same section in a div tag, this will be used for formatting.
|
|
|
|
| 1705 |
- Add on top of the report the report title (with the <h1> tag) - this is the only part that should be centered (in-line style)
|
| 1706 |
- Titles for sections and sub-sections should systematically use the tags:
|
| 1707 |
<h1> for sections (ex: 3. Examination of State-of-the-Art of AI)
|
|
|
|
| 1712 |
- Avoid Chinese characters in the output (use the Pinyin version) since they won't display correcly in the pdf (black boxes)
|
| 1713 |
- For the Table of contents: do not mention the pages, but make each item on separate line
|
| 1714 |
- Put "Table of contents" and "Abstract" title in h1 format.
|
| 1715 |
+
- The Table of contents should skip the abstract and table of contents, the numbering should start from the introduction and end with References Summary Table
|
| 1716 |
+
- For sections requiring specific improvements, put it in <div class="improvable-chunk">...</div> (but don't mention it in the report, this will be managed through post-processing)
|
| 1717 |
|
| 1718 |
// Reference citations
|
| 1719 |
- The name of the reference table should be: "Reference Summary Table"
|
| 1720 |
- The reference table at the end containing the citations details should have 4 columns: the ref number, the title of the document, the author(s), the URL (with hyperlink)
|
| 1721 |
- The report MUST include a reference summary table with between 10 (for a 8 page report) and 30 references (for a 40 pages report). All inline citations (e.g., [1], [2], …) present in the report and in any focus placeholders MUST have a corresponding entry in this table with its full URL.
|
| 1722 |
- For the reference citations, add systematically the urls from the Learnings (no need to put them in numbered list format since we alredy have the [x] that serves as number list)
|
| 1723 |
+
- Do not add any inline citation references in the visual and graph placeholder descriptions below; you can add them in focus placeholders though.
|
| 1724 |
- Do not make false references / citations. It has to be grounded from the sources in the rsearch results / crumbs below (no example.com/... type references!)
|
| 1725 |
- The references / citations should be only coming from the most reputable sources amongst all the Learnings and Results from searches below
|
| 1726 |
- The table generated should have in-line styling to have word-wrap and 100% width
|
| 1727 |
|
| 1728 |
+
// Instructions:
|
| 1729 |
+
1. Integrate numbers from the sources but always mention the source
|
| 1730 |
+
2. Whenever you mention a figure or quote, add an inline reference [x] matching its source from the references.
|
| 1731 |
+
3. Again, Specifically name relevant organizations, tools, project names, and people encountered in the crumbs or learnings.
|
| 1732 |
+
Note: This is for academic purposes, so thorough citations and referencing are essential.
|
| 1733 |
+
4. Focus on reputable sources that will not be disputed (generally social media posts cannot be an opposable sources, but some of them may mention reputable sources)
|
| 1734 |
+
Note: put the full reference url (no generic domain address), down to the html page or the pdf
|
| 1735 |
+
|
| 1736 |
+
|
| 1737 |
+
// Style
|
| 1738 |
+
The report must follow this writing style {reportstyle}.
|
| 1739 |
+
|
| 1740 |
+
// Format when mentioning sources, organisations and individuals
|
| 1741 |
+
- We will perform a post-processing on the output
|
| 1742 |
+
- For this reasons use this format for any specific name, organisation or project: {{[{{name}}]}}
|
| 1743 |
+
ex1: {{[{{Google}}]}} CEO, {{[{{Sundar Pichai}}]}} ...
|
| 1744 |
+
ex2: in a report from the {{[{{university of Berkeley}}]}} titled "{{[{{The great acceleration}}]}}"...
|
| 1745 |
+
ex3: the CEO of {{[{{Softbank}}]}} , {{[{{Masayoshi Son}}]}}, said that "the best way to..."
|
| 1746 |
+
ex4: the project {{[{{Stargate}}]}}, anounced by {{[{{OpenAI}}]}} in collaboration with {{[{{Salesforce}}]}}
|
| 1747 |
+
ex5: Mr. {{[{{Michael Parrot}}]}}, Marketing director in {{[{{Panasonic}}]}}, mentioned that ...
|
| 1748 |
+
Note: the output will be processed through regex and the identifiers removed, but this way we can keep track of all sources and citations without disclosing them.
|
| 1749 |
+
- This should apply to names, people/titles, dates, papers, reports, organisation/institute/NGO/government bodies quotes, products, project names, ...
|
| 1750 |
+
- You should have approximately {10 * pages} mentions of organisations, people or projects; use the prescribed format
|
| 1751 |
+
- DO NOT MENTION this formatting requirement, just apply it. The user doesn't have to know about this technicality.
|
| 1752 |
+
Note: LinkedIn is not a source - if you want to use a source related to LinkedIn, you should check the author of the page visited, this is the real source, mention the name of the author as "'authorName' from LinkedIn Pulse"
|
| 1753 |
+
|
| 1754 |
+
// Sources
|
| 1755 |
+
Use the following learnings and merged reference details from a deep research process on:
|
| 1756 |
+
'{initial_query}'
|
| 1757 |
+
|
| 1758 |
+
Taking also into consideration the context:
|
| 1759 |
+
{context}
|
| 1760 |
+
|
| 1761 |
--------------- Placeholders -----------
|
| 1762 |
In order to enrich the content, within the core sections (between introduction and conclusion), you can inject some placeholders that will be developped later on.
|
| 1763 |
There are 3 types: visual, graphs, focus - each with their own purpose
|
|
|
|
| 1868 |
</body>
|
| 1869 |
</html>
|
| 1870 |
|
| 1871 |
+
// Structure the overall report as follows:
|
| 1872 |
|
| 1873 |
{{Do not add anything before - no introductory meta comment or content}}
|
| 1874 |
|
|
|
|
| 1877 |
- Introduction
|
| 1878 |
- [Sections and sub-sections, depending on the size and relevant topic - including visual, graph and focus placeholders]
|
| 1879 |
- Conclusion
|
| 1880 |
+
- References summary table
|
| 1881 |
- Report ending formatting (as mentioned before)
|
| 1882 |
|
| 1883 |
{{Do not add anything after - no conclusive meta comment or content}}
|
|
|
|
| 1891 |
- Results from searches:
|
| 1892 |
{aggregated_crumbs}
|
| 1893 |
|
| 1894 |
+
Take a deep breath, do your best.
|
| 1895 |
+
Now, produce the report please.
|
| 1896 |
"""
|
| 1897 |
)
|
| 1898 |
tokentarget = word_count * 5 # rough multiplier for token target
|
|
|
|
| 3035 |
|
| 3036 |
send_button.click(
|
| 3037 |
fn=send_chat_message,
|
| 3038 |
+
inputs=[chat_input, openai_api_key_input, serpapi_api_key_input, chatbot, final_report, crumbs_box, reportstyle],
|
| 3039 |
outputs=[chatbot, chat_input, final_report]
|
| 3040 |
)
|
| 3041 |
|