Guiyom commited on
Commit
559e3f2
·
verified ·
1 Parent(s): 4c364e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +194 -186
app.py CHANGED
@@ -12,8 +12,6 @@ import tempfile
12
  import logging
13
  import markdown
14
  import unicodedata
15
- import asyncio
16
- import aiohttp
17
  from datetime import datetime
18
  from reportlab.lib.pagesizes import A4
19
  from xhtml2pdf import pisa
@@ -29,7 +27,7 @@ HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/
29
  "(KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"}
30
 
31
  # =============================================================================
32
- # Helper functions for external APIs, PDF Processing and Asynchronous Requests
33
  # =============================================================================
34
 
35
  def display_image():
@@ -178,24 +176,32 @@ def generate_final_report(initial_query: str, reportstyle: str, learnings: list,
178
  prompt = (f"""
179
  Using the following learnings and merged reference details from a deep research process on '{initial_query}', produce a comprehensive research report in Markdown format.
180
  The report should be very detailed and lengthy — approximately the equivalent of {pages} pages (or {word_count} words) when printed.
181
- It must include inline citations (e.g., [1], [2], etc.) and follow this writing style: {reportstyle}.
182
- Include at least {round(pages/3,0)} tables from the sources used (citations added if necessary).
183
- The structure should have:
 
 
184
  - Abstract
185
  - Table of contents
186
  - Introduction
187
- - [Sections and sub-sections as needed]
188
  - Conclusion
189
- - References
 
 
 
 
 
190
 
191
- Important: Number titles and lists as 1., 1.1, etc.
192
  Learnings:
193
  {json.dumps(learnings, indent=2)}
 
194
  Merged Reference Details:
195
  {aggregated_crumbs}"""
196
  )
197
  tokentarget = word_count * 3 # rough multiplier for token target
198
  report = openai_call(prompt, model="o3-mini", max_tokens_param=tokentarget)
 
199
  if len(report) > MAX_MESSAGE_LENGTH:
200
  report = compress_text(report, MAX_MESSAGE_LENGTH)
201
  if report.startswith("Error calling OpenAI API"):
@@ -205,21 +211,17 @@ Merged Reference Details:
205
  return report
206
 
207
  def filter_search_results(results: list, visited_urls: set, query: str, clarifications: str) -> list:
208
- # Filter out already seen results by URL and domain (robust deduplication)
209
  new_results = []
210
  candidate_indexes = []
211
- seen_domains = set()
212
  for idx, res in enumerate(results):
213
  url = res.get("link", "")
214
  if url and url not in visited_urls:
215
- domain = url.split("/")[2] if "://" in url else url
216
- if domain in seen_domains:
217
- continue
218
  new_results.append(res)
219
  candidate_indexes.append(idx)
220
- seen_domains.add(domain)
221
  if not new_results:
222
  return []
 
223
  results_text = ""
224
  for idx, res in enumerate(new_results):
225
  title = res.get("title", "No Title")
@@ -229,18 +231,25 @@ def filter_search_results(results: list, visited_urls: set, query: str, clarific
229
  prompt = (
230
  f"The following search results were obtained for the query '{query}' with clarifications:\n"
231
  f"{clarifications}\n\n"
232
- "For each result, decide if it might be relevant for deeper research. Return a JSON object with keys as result indices and values as 'yes' or 'no'.\n"
233
- f"Results:{results_text}\nOutput only the JSON object."
 
 
 
 
 
234
  )
235
  llm_response = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=200)
236
  try:
237
  decision_map = json.loads(llm_response)
238
  except Exception as e:
239
  logging.error(f"filter_search_results: JSON decode error: {e}; Full response: {llm_response}")
 
240
  decision_map = {}
241
  filtered = []
242
  for idx, res in enumerate(new_results):
243
  url = res.get("link", "")
 
244
  visited_urls.add(url)
245
  decision = decision_map.get(str(idx), "no").strip().lower()
246
  if decision == "yes":
@@ -249,11 +258,12 @@ def filter_search_results(results: list, visited_urls: set, query: str, clarific
249
  return filtered
250
 
251
  def make_multilingual_query(query: str, context: str, languagesdetected: str) -> str:
252
- finalquery = f"({query})" # original query in parentheses
253
  languages_detected_list = languagesdetected.split(",")
254
  for lang in languages_detected_list:
255
  prompt2 = f"""The research query is: "{query}".
256
- Based on this query and context: "{context}", and using the detected language {lang}, provide a translated version (less than 20 words) preserving search operators.
 
257
  Output only the translated query."""
258
  translatedquery = openai_call(prompt2, model="gpt-4o-mini", max_tokens_param=50)
259
  finalquery += f" OR ({translatedquery})"
@@ -261,14 +271,11 @@ Output only the translated query."""
261
  return finalquery
262
 
263
  def generate_query_tree(initial_query: str, breadth: int, depth: int) -> list:
264
- # Generate several variants of the query based on the desired breadth.
265
  base_terms = initial_query.strip()
266
- variants = [base_terms,
267
- base_terms + " detailed analysis",
268
- base_terms + " review",
269
- base_terms + " case study"]
270
- # Return only as many as needed (up to 'breadth')
271
- final_queries = variants[:min(len(variants), breadth)]
272
  logging.info(f"generate_query_tree: Generated queries: {final_queries}")
273
  return final_queries
274
 
@@ -276,11 +283,13 @@ def generate_serp_queries(context: str, breadth: int, depth: int, initial_query:
276
  selected_engines=None, results_per_query: int = 10) -> list:
277
  queries = generate_query_tree(initial_query, breadth, depth)
278
  prompt = f"""The research query is: "{initial_query}".
279
- Based on the context: "{context}", suggest non-English languages (if any) relevant.
280
  Output either:
281
  - "No local attributes detected"
282
- - A comma-separated list (e.g., "Spanish,Italian")
283
- Output only the result."""
 
 
284
  languages_detected = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=20)
285
  if languages_detected != "No local attributes detected":
286
  queries = [make_multilingual_query(q, context, languages_detected) for q in queries]
@@ -288,12 +297,17 @@ Output only the result."""
288
  prompt_engines = f"""
289
  Examine these queries:
290
  {queries}
291
- Considering the context:
292
  {context}
293
- Identify among these search engines: google,google_jobs_listing,google_trends,google_news,google_scholar,google_ai_overview,bing,bing_news,baidu,baidu_news,yandex,youtube_video,linkedin,linkedin_profile,duckduckgo_news,yelp_reviews.
294
- Return a comma separated list (default "google" if none)."""
 
 
 
295
  identified_engines = openai_call(prompt_engines, model="gpt-4o-mini", max_tokens_param=20)
296
  selected_engines = identified_engines.split(",")
 
 
297
  final_queries = []
298
  for q in queries:
299
  for engine in selected_engines:
@@ -379,23 +393,6 @@ def refine_query(query: str, openai_api_key: str) -> str:
379
  logging.info(f"refine_query: Refined query: {refined}")
380
  return refined
381
 
382
- # --- New Asynchronous Helper for Parallel URL Fetching --- #
383
- async def async_fetch_url(session: aiohttp.ClientSession, url: str) -> str:
384
- """Fetch the URL asynchronously using aiohttp."""
385
- try:
386
- async with session.get(url, headers=HEADERS, timeout=10) as response:
387
- response.raise_for_status()
388
- text = await response.text()
389
- logging.info(f"async_fetch_url: Fetched content from {url}")
390
- return text
391
- except Exception as e:
392
- logging.error(f"async_fetch_url: Error retrieving content from {url}: {e}")
393
- return ""
394
-
395
- # =============================================================================
396
- # ReportGenerator and PDF generation (Enhanced CSS added)
397
- # =============================================================================
398
-
399
  class ReportGenerator:
400
  def __init__(self):
401
  pass
@@ -406,9 +403,9 @@ class ReportGenerator:
406
  solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
407
  # Remove markdown hyperlink syntax: replace [text](link) with just text.
408
  solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
409
- # Convert markdown to HTML using the "extra" and "tables" extensions.
410
  html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
411
- # Insert explicit page breaks before key headings (with added CSS for dynamic styling).
412
  html_content = html_content.replace("<h2>Table of Contents</h2>",
413
  "<div style='page-break-before: always;'></div><h2>Table of Contents</h2>")
414
  html_content = html_content.replace("<h2>Introduction</h2>",
@@ -417,8 +414,10 @@ class ReportGenerator:
417
  "<div style='page-break-before: always;'></div><h2>Conclusion</h2>")
418
  html_content = html_content.replace("<h2>References</h2>",
419
  "<div style='page-break-before: always;'></div><h2>References</h2>")
 
420
  html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>",
421
  "<div style='page-break-before: always;'></div><h2>Surprise-Me Extension Report</h2>")
 
422
  date_str = datetime.now().strftime("%Y-%m-%d")
423
  header = ""
424
  if metadata:
@@ -426,21 +425,33 @@ class ReportGenerator:
426
  <p>Author: {metadata.get('User name', 'N/A')}</p>
427
  <p>Date: {metadata.get('Date', date_str)}</p>
428
  <hr/>"""
 
429
  full_html = f"""
430
  <html>
431
  <head>
432
  <meta charset="utf-8" />
433
  <style>
434
- body {{ font-family: Helvetica, sans-serif; margin: 40px; background: #fefefe; }}
435
  h1 {{ font-size: 24pt; margin-bottom: 12px; }}
436
  h2 {{ font-size: 20pt; margin-bottom: 10px; }}
437
  h3 {{ font-size: 18pt; margin-bottom: 8px; }}
438
  p {{ font-size: 11pt; line-height: 1.5; margin-bottom: 10px; }}
439
- ol, ul {{ font-size: 11pt; margin-left: 20px; line-height: 1.5; }}
 
440
  hr {{ border: 1px solid #ccc; margin: 20px 0; }}
441
- table {{ border-collapse: collapse; width: 100%; margin-bottom: 10px; }}
442
- th, td {{ border: 1px solid #ccc; padding: 8px; text-align: left; }}
443
- th {{ background-color: #f2f2f2; }}
 
 
 
 
 
 
 
 
 
 
444
  </style>
445
  </head>
446
  <body>
@@ -449,6 +460,7 @@ class ReportGenerator:
449
  </body>
450
  </html>
451
  """
 
452
  pdf_buffer = io.BytesIO()
453
  pisa_status = pisa.CreatePDF(full_html, dest=pdf_buffer)
454
  if pisa_status.err:
@@ -469,6 +481,7 @@ def handle_generate_report(query_name: str, user_name: str, final_report: str):
469
  final_report = compress_text(final_report, MAX_MESSAGE_LENGTH)
470
  pdf_bytes = report_generator.generate_report_pdf_html(solution_content=final_report,
471
  metadata=metadata)
 
472
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
473
  tmp_file.write(pdf_bytes)
474
  tmp_path = tmp_file.name
@@ -479,6 +492,10 @@ def handle_generate_report(query_name: str, user_name: str, final_report: str):
479
  return f"Error generating report: {str(e)}", None
480
 
481
  def extract_summary_from_crumbs(crumbs_list: list) -> str:
 
 
 
 
482
  aggregated = "\n".join([f"URL: {c['url']}\nSummary: {c['summary']}" for c in crumbs_list])
483
  logging.info("extract_summary_from_crumbs: Aggregated crumb summary created.")
484
  return aggregated
@@ -497,12 +514,16 @@ def generate_surprise_report(previous_report: str, crumbs_list: list, initial_qu
497
  "Formulate this as a new research query that could lead to innovative insights.")
498
  disruptive_query = openai_call(new_prompt, model="gpt-4o-mini", max_tokens_param=500)
499
  logging.info(f"generate_surprise_report: Disruptive new query generated: {disruptive_query}")
 
 
500
  clarifications_for_new = generate_tailored_questions(
501
  os.getenv("OPENAI_API_KEY"),
502
- disruptive_query + "\n\n IMPORTANT NOTE: in this iteration, generate also simulated responses for the questions asked",
503
  "", "", "", ""
504
  )
505
  logging.info(f"generate_surprise_report: Clarification questions for new query: {clarifications_for_new}")
 
 
506
  generator = iterative_deep_research_gen(
507
  disruptive_query, reportstyle, breadth, depth, followup_clarifications,
508
  include_domains, exclude_keywords, additional_clarifications,
@@ -517,8 +538,7 @@ def generate_surprise_report(previous_report: str, crumbs_list: list, initial_qu
517
  appended_report = previous_report + "\n\n<div style='page-break-before: always;'></div>\n<h2>Surprise-Me Extension Report</h2>\n\n" + clarifications_for_new + "\n\n" + extension_report
518
  return appended_report
519
 
520
- # --- Adaptive and Parallel Organized Research (Dynamic Agent Orchestration) --- #
521
- async def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: int, depth: int,
522
  followup_clarifications: str,
523
  include_domains: str,
524
  exclude_keywords: str,
@@ -538,93 +558,90 @@ async def iterative_deep_research_gen(initial_query: str, reportstyle: str, brea
538
  references_list = []
539
  followup_suggestions = []
540
  logging.info("iterative_deep_research_gen: Research started.")
541
-
542
- # Create a single aiohttp session for parallel page fetching
543
- async with aiohttp.ClientSession() as session:
544
- for iteration in range(1, depth + 1):
545
- process_log += f"\n--- Iteration {iteration} ---\n"
546
- logging.info(f"iterative_deep_research_gen: Starting iteration {iteration}.")
547
- combined_context = overall_context
548
- if followup_suggestions:
549
- unique_suggestions = list(set(followup_suggestions))
550
- combined_context += "\nFollow-up suggestions: " + ", ".join(unique_suggestions)
551
- queries = generate_serp_queries(combined_context, breadth, depth, initial_query, selected_engines, results_per_query)
552
- process_log += f"Generated queries: {queries}\n"
553
- iteration_learnings = []
554
- followup_suggestions = [] # reset for current iteration
555
-
556
- # For each query, perform SERPAPI search and fetch pages concurrently:
557
- for query_str, engine in queries:
558
- mod_query = query_str
559
- if include_domains.strip():
560
- domains = [d.strip() for d in include_domains.split(",") if d.strip()]
561
- domain_str = " OR ".join([f"site:{d}" for d in domains])
562
- mod_query += f" ({domain_str})"
563
- if exclude_keywords.strip():
564
- for ex in [ex.strip() for ex in exclude_keywords.split(",") if ex.strip()]:
565
- mod_query += f" -{ex}"
566
- process_log += f"\nPerforming SERPAPI search with query: {mod_query} using engine: {engine}\n"
567
- results = perform_serpapi_search(mod_query, engine, results_per_query)
568
- filtered_results = filter_search_results(results, visited_urls, initial_query, followup_clarifications)
569
- process_log += f"After filtering, {len(filtered_results)} results remain for processing.\n"
570
- async_tasks = []
571
- for res in filtered_results:
572
- url = res.get("link", "")
573
- if not url:
 
 
 
574
  continue
575
- if url.lower().endswith(".pdf"):
576
- content = process_pdf(url)
577
- process_log += f"Extracted PDF content from {url}\n"
578
- # Process synchronously for PDFs
579
- analysis = analyze_with_gpt4o(initial_query, content)
580
- analysis_summary = analysis.get("summary", "").strip() or (content[:200] + "..." if len(content) > 200 else content)
581
- crumbs_list.append({"url": url, "summary": analysis_summary, "full_content": content})
582
- if analysis.get("relevant", "no").lower() == "yes":
583
- link_str = f" <a href='{url}'>[{ref_counter}]</a>"
584
- summary_with_ref = analysis_summary + link_str
585
- iteration_learnings.append(summary_with_ref)
586
- references_list.append((ref_counter, url))
587
- ref_counter += 1
588
- if isinstance(analysis.get("followups"), list):
589
- followup_suggestions.extend(analysis.get("followups"))
 
 
 
 
 
 
 
 
 
590
  else:
591
- # Schedule asynchronous fetching for non-PDF pages
592
- async_tasks.append(async_fetch_url(session, url))
593
- # Wait for asynchronous fetches to complete
594
- if async_tasks:
595
- fetched_contents = await asyncio.gather(*async_tasks)
596
- for content in fetched_contents:
597
- if not content:
598
- continue
599
- analysis = analyze_with_gpt4o(initial_query, content)
600
- analysis_summary = analysis.get("summary", "").strip() or (content[:200] + "..." if len(content) > 200 else content)
601
- # Here we do not re-fetch URL since it is already processed
602
- crumbs_list.append({"url": "async_url", "summary": analysis_summary, "full_content": content})
603
- if analysis.get("relevant", "no").lower() == "yes":
604
- link_str = f" [*]" # Mark asynchronous fetched URLs.
605
- summary_with_ref = analysis_summary + link_str
606
- iteration_learnings.append(summary_with_ref)
607
- if isinstance(analysis.get("followups"), list):
608
- followup_suggestions.extend(analysis.get("followups"))
609
- process_log += f"Iteration {iteration} extracted {len(iteration_learnings)} learnings.\n"
610
- logging.info(f"iterative_deep_research_gen: Iteration {iteration} extracted {len(iteration_learnings)} learnings.")
611
- overall_learnings.extend(iteration_learnings)
612
- overall_context += f"\nIteration {iteration} learnings:\n" + "\n".join(iteration_learnings) + "\n"
613
- if additional_clarifications.strip():
614
- overall_context += "\nAdditional Clarifications from user: " + additional_clarifications.strip() + "\n"
615
- process_log += "Appended additional clarifications to the context.\n"
616
- # Adaptive follow-up: if new followup suggestions emerged, call tailored questions generator
617
- if followup_suggestions:
618
- extra_questions = generate_tailored_questions(os.getenv("OPENAI_API_KEY"), initial_query, "", "", "", "")
619
- overall_context += "\nAdaptive Follow-Up Questions:\n" + extra_questions + "\n"
620
- progress_pct = int((iteration / depth) * 100)
621
- yield (f"Progress: {progress_pct}%", None, process_log, None)
622
- aggregated_crumbs = "\n".join([f"URL: {c['url']}\nSummary: {c['summary']}" for c in crumbs_list])
623
- final_report = generate_final_report(initial_query, reportstyle, overall_learnings, list(visited_urls), aggregated_crumbs, references_list, pages=go_deeper)
624
- alignment_assessment = assess_report_alignment(final_report, initial_query, followup_clarifications)
625
- final_report += "\n\n\n\n\n**Report alignment assessment:**\n" + alignment_assessment
626
- logging.info("iterative_deep_research_gen: Final report generated.")
627
- yield ("", final_report, process_log, crumbs_list)
628
 
629
  def assess_report_alignment(report: str, initial_query: str, clarifications: str) -> str:
630
  prompt = (
@@ -632,21 +649,21 @@ def assess_report_alignment(report: str, initial_query: str, clarifications: str
632
  "and the clarification Q&A provided. Ensure that the report covers key points of the topic.\n\n"
633
  "Initial Query: " + initial_query + "\nClarifications: " + clarifications + "\n\n"
634
  "Research Report:\n" + report + "\n\n"
635
- "Provide a short paragraph assessment on how well the report aligns with these requirements."
636
  )
637
  assessment = openai_call(prompt, model="gpt-3.5-turbo", max_tokens_param=200)
638
  logging.info(f"assess_report_alignment: Assessment result: {assessment}")
639
  return assessment
640
 
641
- # --- Main Deep Research Orchestrator (Wrapper for async execution) --- #
642
- async def orchestrate_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query: str, reportstyle: str,
643
- breadth: int, depth: int, followup_clarifications: str, include_domains: str,
644
- exclude_keywords: str, additional_clarifications: str, results_per_query: int,
645
- selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
646
- pages: str, surprise_me: bool):
647
  if not openai_api_key or not serpapi_api_key:
648
- logging.error("orchestrate_deep_research: Invalid API keys provided.")
649
- return "Please input valid API keys", "", "", ""
 
650
  os.environ["OPENAI_API_KEY"] = openai_api_key
651
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
652
 
@@ -658,26 +675,33 @@ async def orchestrate_deep_research(openai_api_key: str, serpapi_api_key: str, i
658
  if existing_crumbs:
659
  extra_context += f"Existing Crumbs:\n{existing_crumbs}\n"
660
 
661
- loop = asyncio.get_event_loop()
662
- researcher = iterative_deep_research_gen(initial_query, reportstyle, breadth, depth, followup_clarifications,
663
- include_domains, exclude_keywords, additional_clarifications,
664
- extra_context, selected_engines, results_per_query, go_deeper=int(pages))
665
  final_report = ""
666
- process_log = ""
667
- async for progress, rep, proc_log, crumbs in researcher:
 
 
 
 
 
668
  if rep is None:
669
- current_progress = progress
670
- # You could yield intermediate progress if needed.
671
  else:
672
  final_report = rep
673
- process_log = proc_log
 
674
  break
675
  if surprise_me:
676
- extended_report = generate_surprise_report(final_report, crumbs, initial_query, reportstyle, breadth, depth,
677
- followup_clarifications, include_domains, exclude_keywords,
678
- additional_clarifications, results_per_query, selected_engines)
 
 
679
  final_report = extended_report
680
- return final_report, process_log, extra_context
 
 
681
 
682
  def load_example(example_choice: str) -> str:
683
  filename = ""
@@ -696,22 +720,6 @@ def load_example(example_choice: str) -> str:
696
  logging.error(f"load_example: Error loading {filename}: {e}")
697
  return ""
698
 
699
- def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query: str, reportstyle: str, breadth: int, depth: int,
700
- followup_clarifications: str, include_domains: str, exclude_keywords: str, additional_clarifications: str,
701
- results_per_query: int, selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
702
- pages: str, surprise_me: bool):
703
- final_report, proc_log, extra_context = asyncio.run(
704
- orchestrate_deep_research(openai_api_key, serpapi_api_key, initial_query, reportstyle, breadth, depth,
705
- followup_clarifications, include_domains, exclude_keywords, additional_clarifications,
706
- results_per_query, selected_engines, existing_crumbs, existing_report, existing_log,
707
- pages, surprise_me)
708
- )
709
- return ("Progress: 100%", final_report, existing_report, existing_log, existing_crumbs)
710
-
711
- # =============================================================================
712
- # Gradio Interface using gr.Blocks with Custom CSS
713
- # =============================================================================
714
-
715
  def main():
716
  custom_css = """
717
  /* Overall container customization */
@@ -764,16 +772,16 @@ def main():
764
  openai_api_key_input = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API Key here...", type="password")
765
  serpapi_api_key_input = gr.Textbox(label="SERPAPI API Key", placeholder="Enter your SERPAPI API Key here...", type="password")
766
  gr.Markdown("[Create OpenAI API Key](https://platform.openai.com/account/api-keys) | [Create SERPAPI API Key](https://serpapi.com/manage-api-key)")
767
- gr.Markdown("API keys are not stored or logged.")
768
 
769
- with gr.Accordion("2] Research topic", open=False):
770
  with gr.Row():
771
  research_query = gr.Textbox(label="Research Query", placeholder="Enter your research query here...", lines=2, elem_id="research-query", scale=4)
772
  refine_query_button = gr.Button("Refine my Query", scale=1)
773
 
774
  with gr.Accordion("3] Q&A", open=False):
775
  with gr.Row():
776
- clarification_text = gr.Textbox(label="Clarification / Follow-Up Questions", placeholder="Tailored clarifying suggestions will appear here...", lines=6, scale=4)
777
  gen_followups = gr.Button("Generate Tailored Clarification Questions", scale=1)
778
 
779
  with gr.Accordion("4] Search Parameters", open=False):
@@ -814,7 +822,7 @@ def main():
814
  with gr.Accordion("5] Report", open=False, elem_classes="folder"):
815
  progress_display = gr.Markdown("", elem_id="progress-display")
816
  run_btn = gr.Button("Generate report")
817
- final_report = gr.Markdown(label="Final Report (Markdown)", height=800, min_height=50)
818
  with gr.Accordion("Generate PDF", open=False, elem_classes="folder"):
819
  with gr.Column():
820
  query_name = gr.Textbox(label="Query name", placeholder="Enter query name...", lines=1)
 
12
  import logging
13
  import markdown
14
  import unicodedata
 
 
15
  from datetime import datetime
16
  from reportlab.lib.pagesizes import A4
17
  from xhtml2pdf import pisa
 
27
  "(KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"}
28
 
29
  # =============================================================================
30
+ # Helper functions for external APIs and PDF Processing
31
  # =============================================================================
32
 
33
  def display_image():
 
176
  prompt = (f"""
177
  Using the following learnings and merged reference details from a deep research process on '{initial_query}', produce a comprehensive research report in Markdown format.
178
  The report should be very detailed and lengthy — approximately the equivalent of {pages} pages (or {word_count} words) when printed.
179
+ It must include inline citations (e.g., [1], [2], etc.).
180
+ It must follow this writing style {reportstyle}.
181
+ The report must include at least {round(pages/3,0)} tables from the sources used (add citations if necessary) and use facts and figures extensively to ground the analysis.
182
+
183
+ The structure of the report should be:
184
  - Abstract
185
  - Table of contents
186
  - Introduction
187
+ - [Sections and sub-sections, depending on the size and relevant topic]
188
  - Conclusion
189
+ - References of the documents used in the inline citations
190
+
191
+ Important: For the numbering of titles or numbered lists, use numbers (ex: 1.) and sub-units (1.1, 1.2... 1.1.1...,1.1.2...). This is to avoid issues when converting markdown to html.
192
+ You should still use markdown for the stryling (titles levels, bold, italic), tables...
193
+
194
+ Output the report directly without any introductory meta comments.
195
 
 
196
  Learnings:
197
  {json.dumps(learnings, indent=2)}
198
+
199
  Merged Reference Details:
200
  {aggregated_crumbs}"""
201
  )
202
  tokentarget = word_count * 3 # rough multiplier for token target
203
  report = openai_call(prompt, model="o3-mini", max_tokens_param=tokentarget)
204
+ # If the report is too long, compress it.
205
  if len(report) > MAX_MESSAGE_LENGTH:
206
  report = compress_text(report, MAX_MESSAGE_LENGTH)
207
  if report.startswith("Error calling OpenAI API"):
 
211
  return report
212
 
213
  def filter_search_results(results: list, visited_urls: set, query: str, clarifications: str) -> list:
214
+ # Filter out already seen results
215
  new_results = []
216
  candidate_indexes = []
 
217
  for idx, res in enumerate(results):
218
  url = res.get("link", "")
219
  if url and url not in visited_urls:
 
 
 
220
  new_results.append(res)
221
  candidate_indexes.append(idx)
 
222
  if not new_results:
223
  return []
224
+ # Build the prompt with relaxed criteria.
225
  results_text = ""
226
  for idx, res in enumerate(new_results):
227
  title = res.get("title", "No Title")
 
231
  prompt = (
232
  f"The following search results were obtained for the query '{query}' with clarifications:\n"
233
  f"{clarifications}\n\n"
234
+ "For each result, decide whether it might be of interest for deeper research. "
235
+ "Even if not completely certain, lean towards including more potential references. "
236
+ "Return your decision as a JSON object where each key is the result index (as an integer) and the value is either 'yes' or 'no'. "
237
+ "For example: {\"0\": \"yes\", \"1\": \"no\", \"2\": \"yes\"}.\n"
238
+ "Consider the title, snippet, and URL in your decision."
239
+ f"\nResults:{results_text}\n"
240
+ "Output only the JSON object."
241
  )
242
  llm_response = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=200)
243
  try:
244
  decision_map = json.loads(llm_response)
245
  except Exception as e:
246
  logging.error(f"filter_search_results: JSON decode error: {e}; Full response: {llm_response}")
247
+ # In case of error, default to no results selected.
248
  decision_map = {}
249
  filtered = []
250
  for idx, res in enumerate(new_results):
251
  url = res.get("link", "")
252
+ # Add each URL to visited regardless of decision.
253
  visited_urls.add(url)
254
  decision = decision_map.get(str(idx), "no").strip().lower()
255
  if decision == "yes":
 
258
  return filtered
259
 
260
  def make_multilingual_query(query: str, context: str, languagesdetected: str) -> str:
261
+ finalquery = f"({query})" # original query is wrapped in parentheses
262
  languages_detected_list = languagesdetected.split(",")
263
  for lang in languages_detected_list:
264
  prompt2 = f"""The research query is: "{query}".
265
+ Based on this query and context: "{context}", and with the detected language {lang}, provide the translated version of the query in that language.
266
+ The translation must be less than 20 words and preserve search operators like AND, OR, parenthesis, quotation marks, and exclusion hyphens.
267
  Output only the translated query."""
268
  translatedquery = openai_call(prompt2, model="gpt-4o-mini", max_tokens_param=50)
269
  finalquery += f" OR ({translatedquery})"
 
271
  return finalquery
272
 
273
  def generate_query_tree(initial_query: str, breadth: int, depth: int) -> list:
 
274
  base_terms = initial_query.strip()
275
+ # Here you may add refinements if necessary to keep queries short.
276
+ queries = [base_terms]
277
+ # If topics are to be added, you can extend this list.
278
+ final_queries = queries[:min(len(queries), breadth)]
 
 
279
  logging.info(f"generate_query_tree: Generated queries: {final_queries}")
280
  return final_queries
281
 
 
283
  selected_engines=None, results_per_query: int = 10) -> list:
284
  queries = generate_query_tree(initial_query, breadth, depth)
285
  prompt = f"""The research query is: "{initial_query}".
286
+ Based on this query and the context: "{context}", suggest one or several languages (other than English) that might be relevant.
287
  Output either:
288
  - "No local attributes detected"
289
+ - One language (e.g., "Spanish")
290
+ - Multiple languages comma separated (e.g., "Italian,Putonghua,Cantonese")
291
+ Output only the result.
292
+ """
293
  languages_detected = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=20)
294
  if languages_detected != "No local attributes detected":
295
  queries = [make_multilingual_query(q, context, languages_detected) for q in queries]
 
297
  prompt_engines = f"""
298
  Examine these queries:
299
  {queries}
300
+ and considering the research context:
301
  {context}
302
+ Identify among these search engines:
303
+ google,google_jobs_listing,google_trends,google_news,google_scholar,google_ai_overview,bing,bing_news,baidu,baidu_news,yandex,youtube_video,linkedin,linkedin_profile,duckduckgo_news,yelp_reviews
304
+ Which are most relevant? Output a comma separated list (e.g., "google,baidu").
305
+ If none are found, output "google".
306
+ """
307
  identified_engines = openai_call(prompt_engines, model="gpt-4o-mini", max_tokens_param=20)
308
  selected_engines = identified_engines.split(",")
309
+ else:
310
+ selected_engines = selected_engines
311
  final_queries = []
312
  for q in queries:
313
  for engine in selected_engines:
 
393
  logging.info(f"refine_query: Refined query: {refined}")
394
  return refined
395
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396
  class ReportGenerator:
397
  def __init__(self):
398
  pass
 
403
  solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
404
  # Remove markdown hyperlink syntax: replace [text](link) with just text.
405
  solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
406
+ # Convert markdown to HTML using the "extra" and "tables" extensions to support numbering and table syntax.
407
  html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
408
+ # Insert explicit page breaks before specific headings for main report sections.
409
  html_content = html_content.replace("<h2>Table of Contents</h2>",
410
  "<div style='page-break-before: always;'></div><h2>Table of Contents</h2>")
411
  html_content = html_content.replace("<h2>Introduction</h2>",
 
414
  "<div style='page-break-before: always;'></div><h2>Conclusion</h2>")
415
  html_content = html_content.replace("<h2>References</h2>",
416
  "<div style='page-break-before: always;'></div><h2>References</h2>")
417
+ # For the Surprise-Me section, ensure it starts on a new page.
418
  html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>",
419
  "<div style='page-break-before: always;'></div><h2>Surprise-Me Extension Report</h2>")
420
+ # Build header using metadata if provided.
421
  date_str = datetime.now().strftime("%Y-%m-%d")
422
  header = ""
423
  if metadata:
 
425
  <p>Author: {metadata.get('User name', 'N/A')}</p>
426
  <p>Date: {metadata.get('Date', date_str)}</p>
427
  <hr/>"""
428
+ # Build a complete HTML document with CSS.
429
  full_html = f"""
430
  <html>
431
  <head>
432
  <meta charset="utf-8" />
433
  <style>
434
+ body {{ font-family: Helvetica, sans-serif; margin: 40px; }}
435
  h1 {{ font-size: 24pt; margin-bottom: 12px; }}
436
  h2 {{ font-size: 20pt; margin-bottom: 10px; }}
437
  h3 {{ font-size: 18pt; margin-bottom: 8px; }}
438
  p {{ font-size: 11pt; line-height: 1.5; margin-bottom: 10px; }}
439
+ ol {{ font-size: 11pt; margin-left: 20px; margin-top: 0; margin-bottom: 10px; line-height: 1.5; }}
440
+ ul {{ font-size: 11pt; margin-left: 20px; margin-top: 0; margin-bottom: 10px; line-height: 1.5; }}
441
  hr {{ border: 1px solid #ccc; margin: 20px 0; }}
442
+ table {{
443
+ border-collapse: collapse;
444
+ width: 100%;
445
+ margin-bottom: 10px;
446
+ }}
447
+ th, td {{
448
+ border: 1px solid #ccc;
449
+ padding: 8px;
450
+ text-align: left;
451
+ }}
452
+ th {{
453
+ background-color: #f2f2f2;
454
+ }}
455
  </style>
456
  </head>
457
  <body>
 
460
  </body>
461
  </html>
462
  """
463
+ # Generate PDF from HTML using xhtml2pdf (pisa)
464
  pdf_buffer = io.BytesIO()
465
  pisa_status = pisa.CreatePDF(full_html, dest=pdf_buffer)
466
  if pisa_status.err:
 
481
  final_report = compress_text(final_report, MAX_MESSAGE_LENGTH)
482
  pdf_bytes = report_generator.generate_report_pdf_html(solution_content=final_report,
483
  metadata=metadata)
484
+ # Create a temporary file for PDF download
485
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
486
  tmp_file.write(pdf_bytes)
487
  tmp_path = tmp_file.name
 
492
  return f"Error generating report: {str(e)}", None
493
 
494
  def extract_summary_from_crumbs(crumbs_list: list) -> str:
495
+ """
496
+ Given a list of crumb records (each with 'url', 'summary', and 'full_content'),
497
+ extract and aggregate only the summary parts.
498
+ """
499
  aggregated = "\n".join([f"URL: {c['url']}\nSummary: {c['summary']}" for c in crumbs_list])
500
  logging.info("extract_summary_from_crumbs: Aggregated crumb summary created.")
501
  return aggregated
 
514
  "Formulate this as a new research query that could lead to innovative insights.")
515
  disruptive_query = openai_call(new_prompt, model="gpt-4o-mini", max_tokens_param=500)
516
  logging.info(f"generate_surprise_report: Disruptive new query generated: {disruptive_query}")
517
+
518
+ # Generate tailored clarification questions for the disruptive query
519
  clarifications_for_new = generate_tailored_questions(
520
  os.getenv("OPENAI_API_KEY"),
521
+ disruptive_query + "\n\n IMPORTANT NOTE: in this specific iteration, generate also the responses for the questions asked (simulated)",
522
  "", "", "", ""
523
  )
524
  logging.info(f"generate_surprise_report: Clarification questions for new query: {clarifications_for_new}")
525
+
526
+ # Run iterative deep research for the disruptive query
527
  generator = iterative_deep_research_gen(
528
  disruptive_query, reportstyle, breadth, depth, followup_clarifications,
529
  include_domains, exclude_keywords, additional_clarifications,
 
538
  appended_report = previous_report + "\n\n<div style='page-break-before: always;'></div>\n<h2>Surprise-Me Extension Report</h2>\n\n" + clarifications_for_new + "\n\n" + extension_report
539
  return appended_report
540
 
541
+ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: int, depth: int,
 
542
  followup_clarifications: str,
543
  include_domains: str,
544
  exclude_keywords: str,
 
558
  references_list = []
559
  followup_suggestions = []
560
  logging.info("iterative_deep_research_gen: Research started.")
561
+ for iteration in range(1, depth + 1):
562
+ process_log += f"\n--- Iteration {iteration} ---\n"
563
+ logging.info(f"iterative_deep_research_gen: Starting iteration {iteration}.")
564
+ combined_context = overall_context
565
+ if followup_suggestions:
566
+ # Deduplicate follow-up suggestions before adding them to context.
567
+ unique_suggestions = list(set(followup_suggestions))
568
+ combined_context += "\nFollow-up suggestions: " + ", ".join(unique_suggestions)
569
+ queries = generate_serp_queries(combined_context, breadth, depth, initial_query, selected_engines, results_per_query)
570
+ process_log += f"Generated queries: {queries}\n"
571
+ iteration_learnings = []
572
+ followup_suggestions = [] # reset for current iteration
573
+ for query_tuple in queries:
574
+ query_str, engine = query_tuple
575
+ mod_query = query_str
576
+ if include_domains.strip():
577
+ domains = [d.strip() for d in include_domains.split(",") if d.strip()]
578
+ domain_str = " OR ".join([f"site:{d}" for d in domains])
579
+ mod_query += f" ({domain_str})"
580
+ if exclude_keywords.strip():
581
+ for ex in [ex.strip() for ex in exclude_keywords.split(",") if ex.strip()]:
582
+ mod_query += f" -{ex}"
583
+ process_log += f"\nPerforming SERPAPI search with query: {mod_query} using engine: {engine}\n"
584
+ results = perform_serpapi_search(mod_query, engine, results_per_query)
585
+
586
+ # Instead of processing all results one-by-one, first filter them
587
+ filtered_results = filter_search_results(results, visited_urls, initial_query, followup_clarifications)
588
+ process_log += f"After filtering, {len(filtered_results)} results remain for processing.\n"
589
+ for res in filtered_results:
590
+ url = res.get("link", "")
591
+ if not url:
592
+ continue
593
+ content = ""
594
+ if url.lower().endswith(".pdf"):
595
+ content = process_pdf(url)
596
+ if "Error processing PDF" in content:
597
  continue
598
+ process_log += f"Extracted PDF content from {url}\n"
599
+ else:
600
+ try:
601
+ response = requests.get(url, headers=HEADERS)
602
+ response.raise_for_status()
603
+ content = response.text
604
+ process_log += f"Extracted full page content from {url}\n"
605
+ except Exception as e:
606
+ logging.error(f"Error retrieving content from {url}: {e}")
607
+ process_log += f"Error retrieving content from {url}: {e}\n"
608
+ continue
609
+ analysis = analyze_with_gpt4o(initial_query, content)
610
+ analysis_summary = analysis.get("summary", "").strip()
611
+ process_log += (f"Summary: {analysis.get('summary')}, Follow-ups: {analysis.get('followups')}\n")
612
+ if not analysis_summary:
613
+ analysis_summary = content[:200] + "..." if len(content) > 200 else content
614
+ crumbs_list.append({
615
+ "url": url,
616
+ "summary": analysis_summary,
617
+ "full_content": content
618
+ })
619
+ if analysis.get("relevant", "no").lower() == "yes":
620
+ if url.startswith("http://") or url.startswith("https://"):
621
+ link_str = f" <a href='{url}'>[{ref_counter}]</a>"
622
  else:
623
+ link_str = f" [{ref_counter}]"
624
+ summary_with_ref = analysis_summary + link_str
625
+ iteration_learnings.append(summary_with_ref)
626
+ references_list.append((ref_counter, url))
627
+ ref_counter += 1
628
+ if isinstance(analysis.get("followups"), list):
629
+ followup_suggestions.extend(analysis.get("followups"))
630
+ process_log += f"Iteration {iteration} extracted {len(iteration_learnings)} learnings.\n"
631
+ logging.info(f"iterative_deep_research_gen: Iteration {iteration} extracted {len(iteration_learnings)} learnings.")
632
+ overall_learnings.extend(iteration_learnings)
633
+ overall_context += f"\nIteration {iteration} learnings:\n" + "\n".join(iteration_learnings) + "\n"
634
+ if additional_clarifications.strip():
635
+ overall_context += "\nAdditional Clarifications from user: " + additional_clarifications.strip() + "\n"
636
+ process_log += "Appended additional clarifications to the context.\n"
637
+ progress_pct = int((iteration / depth) * 100)
638
+ yield (f"Progress: {progress_pct}%", None, None, None)
639
+ aggregated_crumbs = "\n".join([f"URL: {c['url']}\nSummary: {c['summary']}" for c in crumbs_list])
640
+ final_report = generate_final_report(initial_query, reportstyle, overall_learnings, list(visited_urls), aggregated_crumbs, references_list, pages=go_deeper)
641
+ alignment_assessment = assess_report_alignment(final_report, initial_query, followup_clarifications)
642
+ final_report += "\n\n\n\n\n**Report alignment assessment:**\n" + alignment_assessment
643
+ logging.info("iterative_deep_research_gen: Final report generated.")
644
+ yield ("", final_report, process_log, crumbs_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
645
 
646
  def assess_report_alignment(report: str, initial_query: str, clarifications: str) -> str:
647
  prompt = (
 
649
  "and the clarification Q&A provided. Ensure that the report covers key points of the topic.\n\n"
650
  "Initial Query: " + initial_query + "\nClarifications: " + clarifications + "\n\n"
651
  "Research Report:\n" + report + "\n\n"
652
+ "Provide a short assessment in one paragraph on how well the report aligns with these requirements."
653
  )
654
  assessment = openai_call(prompt, model="gpt-3.5-turbo", max_tokens_param=200)
655
  logging.info(f"assess_report_alignment: Assessment result: {assessment}")
656
  return assessment
657
 
658
+ def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query: str, reportstyle: str, breadth: int, depth: int,
659
+ followup_clarifications: str, include_domains: str,
660
+ exclude_keywords: str, additional_clarifications: str,
661
+ results_per_query: int, selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
662
+ pages: str, surprise_me: bool):
 
663
  if not openai_api_key or not serpapi_api_key:
664
+ logging.error("run_deep_research: Invalid API keys provided.")
665
+ return "Please input valid API keys", "", "", "", ""
666
+
667
  os.environ["OPENAI_API_KEY"] = openai_api_key
668
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
669
 
 
675
  if existing_crumbs:
676
  extra_context += f"Existing Crumbs:\n{existing_crumbs}\n"
677
 
678
+ final_progress = ""
 
 
 
679
  final_report = ""
680
+ final_process_log = ""
681
+ final_crumbs = ""
682
+ logging.info("run_deep_research: Starting deep research process.")
683
+ for progress, rep, proc_log, crumbs in iterative_deep_research_gen(
684
+ initial_query, reportstyle, breadth, depth, followup_clarifications,
685
+ include_domains, exclude_keywords, additional_clarifications,
686
+ extra_context, selected_engines, results_per_query, go_deeper=int(pages)):
687
  if rep is None:
688
+ final_progress = progress
689
+ yield final_progress, None, None, None, None
690
  else:
691
  final_report = rep
692
+ final_process_log = proc_log
693
+ final_crumbs = crumbs
694
  break
695
  if surprise_me:
696
+ extended_report = generate_surprise_report(
697
+ final_report, final_crumbs, initial_query, reportstyle, breadth, depth,
698
+ followup_clarifications, include_domains, exclude_keywords, additional_clarifications,
699
+ results_per_query, selected_engines
700
+ )
701
  final_report = extended_report
702
+ final_progress = "Progress: 100% (\"Surprise Me\" extension complete)"
703
+ logging.info("run_deep_research: Deep research process completed.")
704
+ yield (final_progress, final_report, final_report, final_process_log, final_crumbs)
705
 
706
  def load_example(example_choice: str) -> str:
707
  filename = ""
 
720
  logging.error(f"load_example: Error loading {filename}: {e}")
721
  return ""
722
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
723
  def main():
724
  custom_css = """
725
  /* Overall container customization */
 
772
  openai_api_key_input = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API Key here...", type="password")
773
  serpapi_api_key_input = gr.Textbox(label="SERPAPI API Key", placeholder="Enter your SERPAPI API Key here...", type="password")
774
  gr.Markdown("[Create OpenAI API Key](https://platform.openai.com/account/api-keys) | [Create SERPAPI API Key](https://serpapi.com/manage-api-key)")
775
+ gr.Markdown("You can check the open-source code - None of the user API keys are stored or logged.")
776
 
777
+ with gr.Accordion ("2] Research topic", open=False):
778
  with gr.Row():
779
  research_query = gr.Textbox(label="Research Query", placeholder="Enter your research query here...", lines=2, elem_id="research-query", scale=4)
780
  refine_query_button = gr.Button("Refine my Query", scale=1)
781
 
782
  with gr.Accordion("3] Q&A", open=False):
783
  with gr.Row():
784
+ clarification_text = gr.Textbox(label="Clarification / Follow-Up Questions", placeholder="Tailored clarifying suggestions will appear here...", lines=6, scale = 4)
785
  gen_followups = gr.Button("Generate Tailored Clarification Questions", scale=1)
786
 
787
  with gr.Accordion("4] Search Parameters", open=False):
 
822
  with gr.Accordion("5] Report", open=False, elem_classes="folder"):
823
  progress_display = gr.Markdown("", elem_id="progress-display")
824
  run_btn = gr.Button("Generate report")
825
+ final_report = gr.Markdown(label="Final Report (Markdown)", height = 800, min_height = 50)
826
  with gr.Accordion("Generate PDF", open=False, elem_classes="folder"):
827
  with gr.Column():
828
  query_name = gr.Textbox(label="Query name", placeholder="Enter query name...", lines=1)