Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -110,9 +110,11 @@ G --> H[Performance Evaluation - 45% Speed Improvement, 35% Risk Profiling, 50%
|
|
| 110 |
- Take a deep breath, think step by step and think it well.
|
| 111 |
|
| 112 |
// Examples
|
|
|
|
|
|
|
| 113 |
-- flowchart --
|
| 114 |
Important:
|
| 115 |
-
- If the flow is "broader" than deep, choose LR (Left Right)
|
| 116 |
- If the flow is "deeper" than broad (>3 levels), choose TD (Top Down)
|
| 117 |
|
| 118 |
Top Down:
|
|
@@ -393,6 +395,12 @@ def openai_call(prompt: str, messages: list = None, model: str = "o3-mini",
|
|
| 393 |
return err_msg
|
| 394 |
|
| 395 |
def analyze_with_gpt4o(query: str, snippet: str, breadth: int, temperature: float = 0.7, max_tokens: int = 8000) -> dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
|
| 397 |
prompt = (f"""Analyze the following content from a query result:
|
| 398 |
|
|
@@ -423,6 +431,9 @@ Note: General Optimization Guidelines:
|
|
| 423 |
For example: "Artificial intelligence" AND (mathematics OR geometry) -algebra,science AND history AND mathematics,...
|
| 424 |
Return the result as a JSON object with the keys 'relevant', 'structure', and 'followups'. The 'structure' value should itself be a JSON object with keys 'Key Facts', 'Key Figures', 'Key Arguments', 'Key Quotes' and 'Summary'.
|
| 425 |
|
|
|
|
|
|
|
|
|
|
| 426 |
Proceed."""
|
| 427 |
)
|
| 428 |
try:
|
|
@@ -533,7 +544,17 @@ def generate_final_report(initial_query: str, context: str, reportstyle: str, le
|
|
| 533 |
word_count = pages * 500
|
| 534 |
prompt = (f"""
|
| 535 |
// Instructions:
|
| 536 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 537 |
'{initial_query}'
|
| 538 |
Taking also into consideration the context:
|
| 539 |
{context}
|
|
@@ -854,7 +875,7 @@ def generate_tailored_questions(openai_api_key: str, query: str, existing_qa: st
|
|
| 854 |
def backup_fields(research_query: str,
|
| 855 |
include_domains: str, exclude_keywords: str, additional_clarifications: str,
|
| 856 |
selected_engines, results_per_query, breadth, depth, clarification_text: str,
|
| 857 |
-
existing_report: str, existing_log: str, crumbs_box: str, final_report: str) -> str:
|
| 858 |
data = {
|
| 859 |
"openai_api_key": "",
|
| 860 |
"serpapi_api_key": "",
|
|
@@ -870,7 +891,8 @@ def backup_fields(research_query: str,
|
|
| 870 |
"existing_report": existing_report,
|
| 871 |
"existing_log": existing_log,
|
| 872 |
"crumbs_box": crumbs_box,
|
| 873 |
-
"final_report": final_report
|
|
|
|
| 874 |
}
|
| 875 |
backup_json = json.dumps(data, indent=2)
|
| 876 |
logging.info(f"backup_fields: Data backed up: {backup_json}")
|
|
@@ -894,10 +916,11 @@ def load_fields(backup_json: str):
|
|
| 894 |
data.get("existing_report", ""),
|
| 895 |
data.get("existing_log", ""),
|
| 896 |
data.get("crumbs_box", ""),
|
| 897 |
-
data.get("final_report", "")
|
|
|
|
| 898 |
except Exception as e:
|
| 899 |
logging.error(f"load_fields error: {e}")
|
| 900 |
-
return ("", "", "", "", "", "", [], 10, 4, 2, "", "", "", "", "")
|
| 901 |
|
| 902 |
def refine_query(query: str, openai_api_key: str) -> str:
|
| 903 |
os.environ["OPENAI_API_KEY"] = openai_api_key
|
|
@@ -1192,7 +1215,7 @@ def generate_surprise_report(previous_report: str, crumbs_list: list, initial_qu
|
|
| 1192 |
generator = iterative_deep_research_gen(
|
| 1193 |
disruptive_query, reportstyle, breadth, depth, followup_clarifications,
|
| 1194 |
include_domains, exclude_keywords, additional_clarifications,
|
| 1195 |
-
extra_context="", selected_engines=selected_engines, results_per_query=results_per_query, go_deeper=1
|
| 1196 |
)
|
| 1197 |
extension_report = ""
|
| 1198 |
for progress, rep, proc_log, new_crumbs in generator:
|
|
@@ -1203,6 +1226,32 @@ def generate_surprise_report(previous_report: str, crumbs_list: list, initial_qu
|
|
| 1203 |
appended_report = previous_report + "\n\n<div style='page-break-before: always;'></div>\n<h2>Surprise-Me Extension Report</h2>\n\n" + clarifications_for_new + "\n\n" + extension_report
|
| 1204 |
return appended_report
|
| 1205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1206 |
def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: int, depth: int,
|
| 1207 |
followup_clarifications: str,
|
| 1208 |
include_domains: str,
|
|
@@ -1211,6 +1260,7 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 1211 |
extra_context: str = "",
|
| 1212 |
selected_engines=None,
|
| 1213 |
results_per_query: int = 10,
|
|
|
|
| 1214 |
go_deeper: int = 8):
|
| 1215 |
overall_context = extra_context + f"Initial Query: {initial_query}\n"
|
| 1216 |
if followup_clarifications.strip():
|
|
@@ -1218,6 +1268,12 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 1218 |
process_log = "Starting research with context:\n" + overall_context + "\n"
|
| 1219 |
overall_learnings = []
|
| 1220 |
visited_urls = set()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1221 |
crumbs_list = []
|
| 1222 |
ref_counter = 1
|
| 1223 |
references_list = []
|
|
@@ -1232,10 +1288,20 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 1232 |
unique_suggestions = list(set(followup_suggestions))
|
| 1233 |
combined_context += "\nFollow-up suggestions: " + ", ".join(unique_suggestions)
|
| 1234 |
queries = generate_serp_queries(combined_context, breadth, depth, initial_query, selected_engines, results_per_query)
|
| 1235 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1236 |
iteration_learnings = []
|
| 1237 |
followup_suggestions = [] # reset for current iteration
|
| 1238 |
-
for query_tuple in
|
| 1239 |
query_str, engine = query_tuple
|
| 1240 |
mod_query = query_str
|
| 1241 |
if include_domains.strip():
|
|
@@ -1272,9 +1338,13 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 1272 |
logging.error(f"Error retrieving content from {url}: {e}")
|
| 1273 |
process_log += f"Error retrieving content from {url}: {e}\n"
|
| 1274 |
continue
|
| 1275 |
-
|
| 1276 |
-
# Clean
|
| 1277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1278 |
|
| 1279 |
# Analyze the cleaned content with GPT-4o-mini
|
| 1280 |
analysis = analyze_with_gpt4o(initial_query, cleaned_content, breadth)
|
|
@@ -1317,7 +1387,17 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 1317 |
process_log += "Appended additional clarifications to the context.\n"
|
| 1318 |
progress_pct = int((iteration / depth) * 100)
|
| 1319 |
yield (f"Progress: {progress_pct}%", None, None, None)
|
| 1320 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1321 |
final_report = generate_final_report(initial_query, combined_context, reportstyle, overall_learnings, list(visited_urls), aggregated_crumbs, references_list, pages=go_deeper)
|
| 1322 |
|
| 1323 |
# --- NEW STEP: Post-process final_report to replace visual and focus placeholders ---
|
|
@@ -1330,8 +1410,49 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 1330 |
f"<p>---------</p><p><b>Report alignment assessment:</b> {alignment_assessment}</p> </div> </body></html>"
|
| 1331 |
)
|
| 1332 |
logging.info("iterative_deep_research_gen: Final report generated.")
|
| 1333 |
-
|
|
|
|
|
|
|
|
|
|
| 1334 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1335 |
def assess_report_alignment(report: str, initial_query: str, clarifications: str) -> str:
|
| 1336 |
prompt = (
|
| 1337 |
"Please assess the following research report in terms of its alignment with the initial user request "
|
|
@@ -1349,7 +1470,7 @@ def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query:
|
|
| 1349 |
followup_clarifications: str, include_domains: str,
|
| 1350 |
exclude_keywords: str, additional_clarifications: str,
|
| 1351 |
results_per_query: int, selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
|
| 1352 |
-
pages: str, surprise_me: bool):
|
| 1353 |
if not openai_api_key or not serpapi_api_key:
|
| 1354 |
logging.error("run_deep_research: Invalid API keys provided.")
|
| 1355 |
return "Please input valid API keys", "", "", "", ""
|
|
@@ -1370,13 +1491,13 @@ def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query:
|
|
| 1370 |
final_process_log = ""
|
| 1371 |
final_crumbs = ""
|
| 1372 |
logging.info("run_deep_research: Starting deep research process.")
|
| 1373 |
-
for progress, rep, proc_log, crumbs in iterative_deep_research_gen(
|
| 1374 |
initial_query, reportstyle, breadth, depth, followup_clarifications,
|
| 1375 |
include_domains, exclude_keywords, additional_clarifications,
|
| 1376 |
-
extra_context, selected_engines, results_per_query, go_deeper=int(pages)):
|
| 1377 |
if rep is None:
|
| 1378 |
final_progress = progress
|
| 1379 |
-
yield final_progress, None, None, None, None
|
| 1380 |
else:
|
| 1381 |
final_report = rep
|
| 1382 |
final_process_log = proc_log
|
|
@@ -1391,7 +1512,7 @@ def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query:
|
|
| 1391 |
final_report = extended_report
|
| 1392 |
final_progress = "Progress: 100% (\"Surprise Me\" extension complete)"
|
| 1393 |
logging.info("run_deep_research: Deep research process completed.")
|
| 1394 |
-
yield (final_progress, final_report, final_report, final_process_log, final_crumbs)
|
| 1395 |
|
| 1396 |
def load_example(example_choice: str) -> str:
|
| 1397 |
filename = ""
|
|
@@ -1521,10 +1642,11 @@ def main():
|
|
| 1521 |
report_file = gr.File(label="Download Report", visible=False, interactive=False, file_types=[".pdf"])
|
| 1522 |
generate_button = gr.Button("Generate Report")
|
| 1523 |
|
| 1524 |
-
with gr.Accordion("6] Extra Context (Crumbs, Existing Report & Log)", open=False):
|
| 1525 |
-
existing_report = gr.Textbox(label="Existing Report (if any)",
|
| 1526 |
-
existing_log = gr.Textbox(label="Existing Process Log (if any)",
|
| 1527 |
-
crumbs_box = gr.Textbox(label="Existing Crumbs (All
|
|
|
|
| 1528 |
|
| 1529 |
with gr.Accordion("7] Backup / Restore Fields", open=False):
|
| 1530 |
backup_text = gr.Textbox(label="Backup JSON", placeholder="Backup output will appear here. You can also paste JSON here to load fields.", lines=6, interactive=True)
|
|
@@ -1550,9 +1672,9 @@ def main():
|
|
| 1550 |
run_btn.click(
|
| 1551 |
fn=run_deep_research,
|
| 1552 |
inputs=[openai_api_key_input, serpapi_api_key_input, research_query, reportstyle, breadth, depth, clarification_text, include_domains, exclude_keywords,
|
| 1553 |
-
additional_clarifications, results_per_query, selected_engines, existing_report, existing_log, crumbs_box,
|
| 1554 |
pages_dropdown, surprise_me_checkbox],
|
| 1555 |
-
outputs=[progress_display, final_report, existing_report, existing_log, crumbs_box],
|
| 1556 |
show_progress=True,
|
| 1557 |
api_name="deep_research"
|
| 1558 |
)
|
|
|
|
| 110 |
- Take a deep breath, think step by step and think it well.
|
| 111 |
|
| 112 |
// Examples
|
| 113 |
+
Note: Pay attention for each example to what type of parenthesis / bracket is used and respect it scrupulously
|
| 114 |
+
|
| 115 |
-- flowchart --
|
| 116 |
Important:
|
| 117 |
+
- If the flow is "broader" than deep (>3 branches at the same level), choose LR (Left Right)
|
| 118 |
- If the flow is "deeper" than broad (>3 levels), choose TD (Top Down)
|
| 119 |
|
| 120 |
Top Down:
|
|
|
|
| 395 |
return err_msg
|
| 396 |
|
| 397 |
def analyze_with_gpt4o(query: str, snippet: str, breadth: int, temperature: float = 0.7, max_tokens: int = 8000) -> dict:
|
| 398 |
+
# measure snippet length
|
| 399 |
+
snippet_words = len(snippet.split())
|
| 400 |
+
# decide a proportional max tokens (cap at 3000 for example)
|
| 401 |
+
# e.g. 1 token ~ ~0.75 words, so we do something simplistic:
|
| 402 |
+
dynamic_tokens = min(3000, max(250, int(snippet_words * 0.5)))
|
| 403 |
+
|
| 404 |
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
|
| 405 |
prompt = (f"""Analyze the following content from a query result:
|
| 406 |
|
|
|
|
| 431 |
For example: "Artificial intelligence" AND (mathematics OR geometry) -algebra,science AND history AND mathematics,...
|
| 432 |
Return the result as a JSON object with the keys 'relevant', 'structure', and 'followups'. The 'structure' value should itself be a JSON object with keys 'Key Facts', 'Key Figures', 'Key Arguments', 'Key Quotes' and 'Summary'.
|
| 433 |
|
| 434 |
+
4. Ensure that the summary length and level of detail is proportional to the source length.
|
| 435 |
+
Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
|
| 436 |
+
|
| 437 |
Proceed."""
|
| 438 |
)
|
| 439 |
try:
|
|
|
|
| 544 |
word_count = pages * 500
|
| 545 |
prompt = (f"""
|
| 546 |
// Instructions:
|
| 547 |
+
- We want to incorporate as many relevant numbers, statistics, factual references, quotes from the sources,
|
| 548 |
+
- Explicit mentions of organizations, tools, projects, or people from the crumb data as possible.
|
| 549 |
+
- In your writing, do the following:
|
| 550 |
+
1. Integrate numbers, quotes, and factual references systematically.
|
| 551 |
+
2. Whenever you mention a figure or quote, add an inline reference [x] matching its source from the references.
|
| 552 |
+
3. Specifically name relevant organizations, tools, project names, and people encountered in the crumbs or learnings.
|
| 553 |
+
4. This is for academic purposes, so thorough citations and referencing are essential.
|
| 554 |
+
Note: Do not be shy to use the names (organizations, people, project, application, tools...) mentioned in the sources, we need this for academic correctness
|
| 555 |
+
|
| 556 |
+
// Sources
|
| 557 |
+
Use the following learnings and merged reference details from a deep research process on:
|
| 558 |
'{initial_query}'
|
| 559 |
Taking also into consideration the context:
|
| 560 |
{context}
|
|
|
|
| 875 |
def backup_fields(research_query: str,
|
| 876 |
include_domains: str, exclude_keywords: str, additional_clarifications: str,
|
| 877 |
selected_engines, results_per_query, breadth, depth, clarification_text: str,
|
| 878 |
+
existing_report: str, existing_log: str, crumbs_box: str, final_report: str, existing_queries_box: str) -> str:
|
| 879 |
data = {
|
| 880 |
"openai_api_key": "",
|
| 881 |
"serpapi_api_key": "",
|
|
|
|
| 891 |
"existing_report": existing_report,
|
| 892 |
"existing_log": existing_log,
|
| 893 |
"crumbs_box": crumbs_box,
|
| 894 |
+
"final_report": final_report,
|
| 895 |
+
"existing_queries": existing_queries_box
|
| 896 |
}
|
| 897 |
backup_json = json.dumps(data, indent=2)
|
| 898 |
logging.info(f"backup_fields: Data backed up: {backup_json}")
|
|
|
|
| 916 |
data.get("existing_report", ""),
|
| 917 |
data.get("existing_log", ""),
|
| 918 |
data.get("crumbs_box", ""),
|
| 919 |
+
data.get("final_report", ""),
|
| 920 |
+
data.get("existing_queries",""))
|
| 921 |
except Exception as e:
|
| 922 |
logging.error(f"load_fields error: {e}")
|
| 923 |
+
return ("", "", "", "", "", "", [], 10, 4, 2, "", "", "", "", "", "")
|
| 924 |
|
| 925 |
def refine_query(query: str, openai_api_key: str) -> str:
|
| 926 |
os.environ["OPENAI_API_KEY"] = openai_api_key
|
|
|
|
| 1215 |
generator = iterative_deep_research_gen(
|
| 1216 |
disruptive_query, reportstyle, breadth, depth, followup_clarifications,
|
| 1217 |
include_domains, exclude_keywords, additional_clarifications,
|
| 1218 |
+
extra_context="", selected_engines=selected_engines, results_per_query=results_per_query, existing_queries, go_deeper=1
|
| 1219 |
)
|
| 1220 |
extension_report = ""
|
| 1221 |
for progress, rep, proc_log, new_crumbs in generator:
|
|
|
|
| 1226 |
appended_report = previous_report + "\n\n<div style='page-break-before: always;'></div>\n<h2>Surprise-Me Extension Report</h2>\n\n" + clarifications_for_new + "\n\n" + extension_report
|
| 1227 |
return appended_report
|
| 1228 |
|
| 1229 |
+
def extract_structured_insights(html_text: str, min_words: int = 30, fallback_chars: int = 2000) -> str:
    """Extract a concise snippet of facts, figures, arguments, and quotes from HTML.

    Parses *html_text* with BeautifulSoup and keeps only ``<p>`` paragraphs that
    contain digits or fact/argument/quote-style keywords, so the downstream LLM
    summarizer receives a semantically dense excerpt instead of the whole page.

    Args:
        html_text: Raw (or pre-cleaned) HTML of a retrieved page.
        min_words: If the curated snippet has fewer words than this, fall back
            to the plainly cleaned full text (default 30, the original threshold).
        fallback_chars: Maximum length of the fallback snippet in characters
            (default 2000, the original cap).

    Returns:
        A short plain-text snippet suitable for LLM summarization.
    """
    soup = BeautifulSoup(html_text, "html.parser")

    # One compiled pattern instead of two re.search calls per paragraph:
    # match a digit anywhere, or one of the "insight" keywords as a whole
    # word. Hoisted out of the loop so the pattern lookup happens once.
    insight_re = re.compile(
        r'\d+|\b(?:argument|fact|figure|study|quote)\b',
        re.IGNORECASE,
    )

    # Heuristic: paragraphs carrying numbers or argument/quote keywords are
    # the ones likely to contain citable facts for the report.
    curated_excerpts = []
    for p in soup.find_all('p'):
        text = p.get_text().strip()
        if insight_re.search(text):
            curated_excerpts.append(text)

    # Combine the kept paragraphs into a shorter snippet.
    snippet = "\n".join(curated_excerpts)
    # If the heuristic kept too little, fall back to the cleaned full text so
    # downstream analysis always has something to work with.
    if len(snippet.split()) < min_words:
        snippet = clean_content(html_text)[:fallback_chars]

    return snippet
|
| 1254 |
+
|
| 1255 |
def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: int, depth: int,
|
| 1256 |
followup_clarifications: str,
|
| 1257 |
include_domains: str,
|
|
|
|
| 1260 |
extra_context: str = "",
|
| 1261 |
selected_engines=None,
|
| 1262 |
results_per_query: int = 10,
|
| 1263 |
+
existing_queries: str,
|
| 1264 |
go_deeper: int = 8):
|
| 1265 |
overall_context = extra_context + f"Initial Query: {initial_query}\n"
|
| 1266 |
if followup_clarifications.strip():
|
|
|
|
| 1268 |
process_log = "Starting research with context:\n" + overall_context + "\n"
|
| 1269 |
overall_learnings = []
|
| 1270 |
visited_urls = set()
|
| 1271 |
+
# Parse previously processed queries from existing_queries if provided
|
| 1272 |
+
processed_queries = set()
|
| 1273 |
+
for q_line in existing_queries.splitlines():
|
| 1274 |
+
q_line = q_line.strip()
|
| 1275 |
+
if q_line:
|
| 1276 |
+
processed_queries.add(q_line)
|
| 1277 |
crumbs_list = []
|
| 1278 |
ref_counter = 1
|
| 1279 |
references_list = []
|
|
|
|
| 1288 |
unique_suggestions = list(set(followup_suggestions))
|
| 1289 |
combined_context += "\nFollow-up suggestions: " + ", ".join(unique_suggestions)
|
| 1290 |
queries = generate_serp_queries(combined_context, breadth, depth, initial_query, selected_engines, results_per_query)
|
| 1291 |
+
|
| 1292 |
+
# ===================================================================
|
| 1293 |
+
# Skip queries already in processed_queries
|
| 1294 |
+
filtered_query_tuples = []
|
| 1295 |
+
for q_tuple in queries:
|
| 1296 |
+
q_text, eng = q_tuple
|
| 1297 |
+
if q_text not in processed_queries:
|
| 1298 |
+
filtered_query_tuples.append(q_tuple)
|
| 1299 |
+
processed_queries.add(q_text) # remember we've processed it
|
| 1300 |
+
# ===================================================================
|
| 1301 |
+
process_log += f"\nWill run {len(filtered_query_tuples)} new queries this iteration instead of {len(queries)} total.\n"
|
| 1302 |
iteration_learnings = []
|
| 1303 |
followup_suggestions = [] # reset for current iteration
|
| 1304 |
+
for query_tuple in filtered_query_tuples:
|
| 1305 |
query_str, engine = query_tuple
|
| 1306 |
mod_query = query_str
|
| 1307 |
if include_domains.strip():
|
|
|
|
| 1338 |
logging.error(f"Error retrieving content from {url}: {e}")
|
| 1339 |
process_log += f"Error retrieving content from {url}: {e}\n"
|
| 1340 |
continue
|
| 1341 |
+
|
| 1342 |
+
# 1) Clean and do minimal parse
|
| 1343 |
+
cleaned_html = clean_content(raw_content)
|
| 1344 |
+
# 2) Extract structured data
|
| 1345 |
+
semantically_rich_snippet = extract_structured_insights(cleaned_html)
|
| 1346 |
+
# 3) Summarize with LLM
|
| 1347 |
+
analysis = analyze_with_gpt4o(initial_query, semantically_rich_snippet, breadth)
|
| 1348 |
|
| 1349 |
# Analyze the cleaned content with GPT-4o-mini
|
| 1350 |
analysis = analyze_with_gpt4o(initial_query, cleaned_content, breadth)
|
|
|
|
| 1387 |
process_log += "Appended additional clarifications to the context.\n"
|
| 1388 |
progress_pct = int((iteration / depth) * 100)
|
| 1389 |
yield (f"Progress: {progress_pct}%", None, None, None)
|
| 1390 |
+
|
| 1391 |
+
# chunk and filter all crumbs if breadth>3 and depth>2
|
| 1392 |
+
filtered_crumbs_list = crumbs_list
|
| 1393 |
+
if breadth > 3 and depth > 2:
|
| 1394 |
+
filtered_crumbs_list = filter_crumbs_in_batches(crumbs_list, initial_query, followup_clarifications)
|
| 1395 |
+
|
| 1396 |
+
# Now build aggregated crumb text from filtered_crumbs_list only
|
| 1397 |
+
aggregated_crumbs = "\n\n".join([
|
| 1398 |
+
f"Title: {c.get('title','No Title')}\nURL: {c['url']}\nSummary: {c['summary']}"
|
| 1399 |
+
for c in filtered_crumbs_list
|
| 1400 |
+
])
|
| 1401 |
final_report = generate_final_report(initial_query, combined_context, reportstyle, overall_learnings, list(visited_urls), aggregated_crumbs, references_list, pages=go_deeper)
|
| 1402 |
|
| 1403 |
# --- NEW STEP: Post-process final_report to replace visual and focus placeholders ---
|
|
|
|
| 1410 |
f"<p>---------</p><p><b>Report alignment assessment:</b> {alignment_assessment}</p> </div> </body></html>"
|
| 1411 |
)
|
| 1412 |
logging.info("iterative_deep_research_gen: Final report generated.")
|
| 1413 |
+
# We convert processed_queries to a string suitable for storing
|
| 1414 |
+
all_processed_queries_str = "\n".join(sorted(processed_queries))
|
| 1415 |
+
|
| 1416 |
+
yield ("", final_report, process_log, crumbs_list, all_processed_queries_str)
|
| 1417 |
|
| 1418 |
+
def filter_crumbs_in_batches(crumbs_list: list, initial_query: str, clarifications: str, batch_size: int = 20) -> list:
    """Filter crumbs with an LLM, in batches, keeping only the valuable ones.

    Splits *crumbs_list* into batches of *batch_size*, asks the model to mark
    each crumb 'yes' (keep) or 'no' (drop) as a JSON object keyed by the
    crumb's index within the batch, and returns the accepted crumbs in their
    original order. A crumb is dropped when the model answers 'no' or when
    the model's response cannot be parsed as JSON.

    Args:
        crumbs_list: Crumb dicts; each must have a 'summary' key.
        initial_query: The original research query (currently unused in the
            prompt; kept for interface stability).
        clarifications: User clarifications (currently unused in the prompt;
            kept for interface stability).
        batch_size: Number of crumbs per LLM call (default 20, as before).

    Returns:
        The sub-list of accepted crumbs.
    """
    accepted = []
    for i in range(0, len(crumbs_list), batch_size):
        batch = crumbs_list[i:i + batch_size]
        # Build a prompt describing each crumb (truncated to a short snippet).
        prompt = "We have a set of crumbs. For each crumb, decide if it significantly adds new facts, figures, references, or quotes.\n"
        prompt += "Mark 'yes' if it is valuable for the final report, otherwise 'no'. Output JSON.\n\n"
        listing = []
        for idx, c in enumerate(batch):
            snippet_for_prompt = c["summary"][:500]  # short snippet
            listing.append(f"Crumb {idx}: {snippet_for_prompt}")
        prompt += "\n".join(listing)

        prompt += """
Return a JSON object with structure:
{
"0": "yes" or "no",
"1": "yes" or "no",
...
}
"""
        decision_str = openai_call(prompt, model="o3-mini", max_tokens_param=1500)
        # Parse the model's JSON verdict. Narrowed from a bare `except:` (which
        # also swallowed KeyboardInterrupt/SystemExit) to the decode failures
        # we actually expect; on failure the whole batch is rejected, exactly
        # as before.
        try:
            decisions = json.loads(decision_str)
        except (json.JSONDecodeError, TypeError, ValueError):
            decisions = {}
        for idx, c in enumerate(batch):
            # str() guards against non-string JSON values (e.g. booleans),
            # which previously raised AttributeError on .lower().
            if str(decisions.get(str(idx), "no")).lower() == "yes":
                accepted.append(c)
    return accepted
|
| 1455 |
+
|
| 1456 |
def assess_report_alignment(report: str, initial_query: str, clarifications: str) -> str:
|
| 1457 |
prompt = (
|
| 1458 |
"Please assess the following research report in terms of its alignment with the initial user request "
|
|
|
|
| 1470 |
followup_clarifications: str, include_domains: str,
|
| 1471 |
exclude_keywords: str, additional_clarifications: str,
|
| 1472 |
results_per_query: int, selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
|
| 1473 |
+
existing_queries: str, pages: str, surprise_me: bool):
|
| 1474 |
if not openai_api_key or not serpapi_api_key:
|
| 1475 |
logging.error("run_deep_research: Invalid API keys provided.")
|
| 1476 |
return "Please input valid API keys", "", "", "", ""
|
|
|
|
| 1491 |
final_process_log = ""
|
| 1492 |
final_crumbs = ""
|
| 1493 |
logging.info("run_deep_research: Starting deep research process.")
|
| 1494 |
+
for progress, rep, proc_log, crumbs, all_processed_queries_str in iterative_deep_research_gen(
|
| 1495 |
initial_query, reportstyle, breadth, depth, followup_clarifications,
|
| 1496 |
include_domains, exclude_keywords, additional_clarifications,
|
| 1497 |
+
extra_context, selected_engines, results_per_query, existing_queries, go_deeper=int(pages)):
|
| 1498 |
if rep is None:
|
| 1499 |
final_progress = progress
|
| 1500 |
+
yield final_progress, None, None, None, None, all_processed_queries_str
|
| 1501 |
else:
|
| 1502 |
final_report = rep
|
| 1503 |
final_process_log = proc_log
|
|
|
|
| 1512 |
final_report = extended_report
|
| 1513 |
final_progress = "Progress: 100% (\"Surprise Me\" extension complete)"
|
| 1514 |
logging.info("run_deep_research: Deep research process completed.")
|
| 1515 |
+
yield (final_progress, final_report, final_report, final_process_log, final_crumbs, all_processed_queries_str)
|
| 1516 |
|
| 1517 |
def load_example(example_choice: str) -> str:
|
| 1518 |
filename = ""
|
|
|
|
| 1642 |
report_file = gr.File(label="Download Report", visible=False, interactive=False, file_types=[".pdf"])
|
| 1643 |
generate_button = gr.Button("Generate Report")
|
| 1644 |
|
| 1645 |
+
with gr.Accordion("6] Extra Context (Crumbs, Existing Report & Log, Processed Queries)", open=False):
|
| 1646 |
+
existing_report = gr.Textbox(label="Existing Report (if any)", ...)
|
| 1647 |
+
existing_log = gr.Textbox(label="Existing Process Log (if any)", ...)
|
| 1648 |
+
crumbs_box = gr.Textbox(label="Existing Crumbs (All sources, JSON)", ...)
|
| 1649 |
+
existing_queries_box = gr.Textbox(label="Existing Queries (processed queries)", placeholder="Paste processed queries here...", lines=4)
|
| 1650 |
|
| 1651 |
with gr.Accordion("7] Backup / Restore Fields", open=False):
|
| 1652 |
backup_text = gr.Textbox(label="Backup JSON", placeholder="Backup output will appear here. You can also paste JSON here to load fields.", lines=6, interactive=True)
|
|
|
|
| 1672 |
run_btn.click(
|
| 1673 |
fn=run_deep_research,
|
| 1674 |
inputs=[openai_api_key_input, serpapi_api_key_input, research_query, reportstyle, breadth, depth, clarification_text, include_domains, exclude_keywords,
|
| 1675 |
+
additional_clarifications, results_per_query, selected_engines, existing_report, existing_log, existing_queries, crumbs_box,
|
| 1676 |
pages_dropdown, surprise_me_checkbox],
|
| 1677 |
+
outputs=[progress_display, final_report, existing_report, existing_log, crumbs_box, existing_queries_box],
|
| 1678 |
show_progress=True,
|
| 1679 |
api_name="deep_research"
|
| 1680 |
)
|