Guiyom committed on
Commit
f807498
·
verified ·
1 Parent(s): 9db83ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +186 -211
app.py CHANGED
@@ -12,6 +12,8 @@ import tempfile
12
  import logging
13
  import markdown
14
  import unicodedata
 
 
15
  from datetime import datetime
16
  from reportlab.lib.pagesizes import A4
17
  from xhtml2pdf import pisa
@@ -27,7 +29,7 @@ HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/
27
  "(KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"}
28
 
29
  # =============================================================================
30
- # Helper functions for external APIs and PDF Processing
31
  # =============================================================================
32
 
33
  def display_image():
@@ -176,32 +178,24 @@ def generate_final_report(initial_query: str, reportstyle: str, learnings: list,
176
  prompt = (f"""
177
  Using the following learnings and merged reference details from a deep research process on '{initial_query}', produce a comprehensive research report in Markdown format.
178
  The report should be very detailed and lengthy — approximately the equivalent of {pages} pages (or {word_count} words) when printed.
179
- It must include inline citations (e.g., [1], [2], etc.).
180
- It must follow this writing style {reportstyle}.
181
- The report must include at least {round(pages/3,0)} tables from the sources used (add citations if necessary) and use facts and figures extensively to ground the analysis.
182
-
183
- The structure of the report should be:
184
  - Abstract
185
  - Table of contents
186
  - Introduction
187
- - [Sections and sub-sections, depending on the size and relevant topic]
188
  - Conclusion
189
- - References of the documents used in the inline citations
190
-
191
- Important: For the numbering of titles or numbered lists, use numbers (ex: 1.) and sub-units (1.1, 1.2... 1.1.1...,1.1.2...). This is to avoid issues when converting markdown to html.
192
- You should still use markdown for the stryling (titles levels, bold, italic), tables...
193
-
194
- Output the report directly without any introductory meta comments.
195
 
 
196
  Learnings:
197
  {json.dumps(learnings, indent=2)}
198
-
199
  Merged Reference Details:
200
  {aggregated_crumbs}"""
201
  )
202
  tokentarget = word_count * 3 # rough multiplier for token target
203
  report = openai_call(prompt, model="o3-mini", max_tokens_param=tokentarget)
204
- # If the report is too long, compress it.
205
  if len(report) > MAX_MESSAGE_LENGTH:
206
  report = compress_text(report, MAX_MESSAGE_LENGTH)
207
  if report.startswith("Error calling OpenAI API"):
@@ -211,17 +205,21 @@ Merged Reference Details:
211
  return report
212
 
213
  def filter_search_results(results: list, visited_urls: set, query: str, clarifications: str) -> list:
214
- # Filter out already seen results
215
  new_results = []
216
  candidate_indexes = []
 
217
  for idx, res in enumerate(results):
218
  url = res.get("link", "")
219
  if url and url not in visited_urls:
 
 
 
220
  new_results.append(res)
221
  candidate_indexes.append(idx)
 
222
  if not new_results:
223
  return []
224
- # Build the prompt with relaxed criteria.
225
  results_text = ""
226
  for idx, res in enumerate(new_results):
227
  title = res.get("title", "No Title")
@@ -231,25 +229,18 @@ def filter_search_results(results: list, visited_urls: set, query: str, clarific
231
  prompt = (
232
  f"The following search results were obtained for the query '{query}' with clarifications:\n"
233
  f"{clarifications}\n\n"
234
- "For each result, decide whether it might be of interest for deeper research. "
235
- "Even if not completely certain, lean towards including more potential references. "
236
- "Return your decision as a JSON object where each key is the result index (as an integer) and the value is either 'yes' or 'no'. "
237
- "For example: {\"0\": \"yes\", \"1\": \"no\", \"2\": \"yes\"}.\n"
238
- "Consider the title, snippet, and URL in your decision."
239
- f"\nResults:{results_text}\n"
240
- "Output only the JSON object."
241
  )
242
  llm_response = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=200)
243
  try:
244
  decision_map = json.loads(llm_response)
245
  except Exception as e:
246
  logging.error(f"filter_search_results: JSON decode error: {e}; Full response: {llm_response}")
247
- # In case of error, default to no results selected.
248
  decision_map = {}
249
  filtered = []
250
  for idx, res in enumerate(new_results):
251
  url = res.get("link", "")
252
- # Add each URL to visited regardless of decision.
253
  visited_urls.add(url)
254
  decision = decision_map.get(str(idx), "no").strip().lower()
255
  if decision == "yes":
@@ -258,12 +249,11 @@ def filter_search_results(results: list, visited_urls: set, query: str, clarific
258
  return filtered
259
 
260
  def make_multilingual_query(query: str, context: str, languagesdetected: str) -> str:
261
- finalquery = f"({query})" # original query is wrapped in parentheses
262
  languages_detected_list = languagesdetected.split(",")
263
  for lang in languages_detected_list:
264
  prompt2 = f"""The research query is: "{query}".
265
- Based on this query and context: "{context}", and with the detected language {lang}, provide the translated version of the query in that language.
266
- The translation must be less than 20 words and preserve search operators like AND, OR, parenthesis, quotation marks, and exclusion hyphens.
267
  Output only the translated query."""
268
  translatedquery = openai_call(prompt2, model="gpt-4o-mini", max_tokens_param=50)
269
  finalquery += f" OR ({translatedquery})"
@@ -271,11 +261,14 @@ Output only the translated query."""
271
  return finalquery
272
 
273
  def generate_query_tree(initial_query: str, breadth: int, depth: int) -> list:
 
274
  base_terms = initial_query.strip()
275
- # Here you may add refinements if necessary to keep queries short.
276
- queries = [base_terms]
277
- # If topics are to be added, you can extend this list.
278
- final_queries = queries[:min(len(queries), breadth)]
 
 
279
  logging.info(f"generate_query_tree: Generated queries: {final_queries}")
280
  return final_queries
281
 
@@ -283,13 +276,11 @@ def generate_serp_queries(context: str, breadth: int, depth: int, initial_query:
283
  selected_engines=None, results_per_query: int = 10) -> list:
284
  queries = generate_query_tree(initial_query, breadth, depth)
285
  prompt = f"""The research query is: "{initial_query}".
286
- Based on this query and the context: "{context}", suggest one or several languages (other than English) that might be relevant.
287
  Output either:
288
  - "No local attributes detected"
289
- - One language (e.g., "Spanish")
290
- - Multiple languages comma separated (e.g., "Italian,Putonghua,Cantonese")
291
- Output only the result.
292
- """
293
  languages_detected = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=20)
294
  if languages_detected != "No local attributes detected":
295
  queries = [make_multilingual_query(q, context, languages_detected) for q in queries]
@@ -297,17 +288,12 @@ Output only the result.
297
  prompt_engines = f"""
298
  Examine these queries:
299
  {queries}
300
- and considering the research context:
301
  {context}
302
- Identify among these search engines:
303
- google,google_jobs_listing,google_trends,google_news,google_scholar,google_ai_overview,bing,bing_news,baidu,baidu_news,yandex,youtube_video,linkedin,linkedin_profile,duckduckgo_news,yelp_reviews
304
- Which are most relevant? Output a comma separated list (e.g., "google,baidu").
305
- If none are found, output "google".
306
- """
307
  identified_engines = openai_call(prompt_engines, model="gpt-4o-mini", max_tokens_param=20)
308
  selected_engines = identified_engines.split(",")
309
- else:
310
- selected_engines = selected_engines
311
  final_queries = []
312
  for q in queries:
313
  for engine in selected_engines:
@@ -393,6 +379,23 @@ def refine_query(query: str, openai_api_key: str) -> str:
393
  logging.info(f"refine_query: Refined query: {refined}")
394
  return refined
395
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396
  class ReportGenerator:
397
  def __init__(self):
398
  pass
@@ -403,9 +406,9 @@ class ReportGenerator:
403
  solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
404
  # Remove markdown hyperlink syntax: replace [text](link) with just text.
405
  solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
406
- # Convert markdown to HTML using the "extra" and "tables" extensions to support numbering and table syntax.
407
  html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
408
- # Insert explicit page breaks before specific headings for main report sections.
409
  html_content = html_content.replace("<h2>Table of Contents</h2>",
410
  "<div style='page-break-before: always;'></div><h2>Table of Contents</h2>")
411
  html_content = html_content.replace("<h2>Introduction</h2>",
@@ -414,10 +417,8 @@ class ReportGenerator:
414
  "<div style='page-break-before: always;'></div><h2>Conclusion</h2>")
415
  html_content = html_content.replace("<h2>References</h2>",
416
  "<div style='page-break-before: always;'></div><h2>References</h2>")
417
- # For the Surprise-Me section, ensure it starts on a new page.
418
  html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>",
419
  "<div style='page-break-before: always;'></div><h2>Surprise-Me Extension Report</h2>")
420
- # Build header using metadata if provided.
421
  date_str = datetime.now().strftime("%Y-%m-%d")
422
  header = ""
423
  if metadata:
@@ -425,33 +426,21 @@ class ReportGenerator:
425
  <p>Author: {metadata.get('User name', 'N/A')}</p>
426
  <p>Date: {metadata.get('Date', date_str)}</p>
427
  <hr/>"""
428
- # Build a complete HTML document with CSS.
429
  full_html = f"""
430
  <html>
431
  <head>
432
  <meta charset="utf-8" />
433
  <style>
434
- body {{ font-family: Helvetica, sans-serif; margin: 40px; }}
435
  h1 {{ font-size: 24pt; margin-bottom: 12px; }}
436
  h2 {{ font-size: 20pt; margin-bottom: 10px; }}
437
  h3 {{ font-size: 18pt; margin-bottom: 8px; }}
438
  p {{ font-size: 11pt; line-height: 1.5; margin-bottom: 10px; }}
439
- ol {{ font-size: 11pt; margin-left: 20px; margin-top: 0; margin-bottom: 10px; line-height: 1.5; }}
440
- ul {{ font-size: 11pt; margin-left: 20px; margin-top: 0; margin-bottom: 10px; line-height: 1.5; }}
441
  hr {{ border: 1px solid #ccc; margin: 20px 0; }}
442
- table {{
443
- border-collapse: collapse;
444
- width: 100%;
445
- margin-bottom: 10px;
446
- }}
447
- th, td {{
448
- border: 1px solid #ccc;
449
- padding: 8px;
450
- text-align: left;
451
- }}
452
- th {{
453
- background-color: #f2f2f2;
454
- }}
455
  </style>
456
  </head>
457
  <body>
@@ -460,7 +449,6 @@ class ReportGenerator:
460
  </body>
461
  </html>
462
  """
463
- # Generate PDF from HTML using xhtml2pdf (pisa)
464
  pdf_buffer = io.BytesIO()
465
  pisa_status = pisa.CreatePDF(full_html, dest=pdf_buffer)
466
  if pisa_status.err:
@@ -481,7 +469,6 @@ def handle_generate_report(query_name: str, user_name: str, final_report: str):
481
  final_report = compress_text(final_report, MAX_MESSAGE_LENGTH)
482
  pdf_bytes = report_generator.generate_report_pdf_html(solution_content=final_report,
483
  metadata=metadata)
484
- # Create a temporary file for PDF download
485
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
486
  tmp_file.write(pdf_bytes)
487
  tmp_path = tmp_file.name
@@ -492,10 +479,6 @@ def handle_generate_report(query_name: str, user_name: str, final_report: str):
492
  return f"Error generating report: {str(e)}", None
493
 
494
  def extract_summary_from_crumbs(crumbs_list: list) -> str:
495
- """
496
- Given a list of crumb records (each with 'url', 'summary', and 'full_content'),
497
- extract and aggregate only the summary parts.
498
- """
499
  aggregated = "\n".join([f"URL: {c['url']}\nSummary: {c['summary']}" for c in crumbs_list])
500
  logging.info("extract_summary_from_crumbs: Aggregated crumb summary created.")
501
  return aggregated
@@ -514,16 +497,12 @@ def generate_surprise_report(previous_report: str, crumbs_list: list, initial_qu
514
  "Formulate this as a new research query that could lead to innovative insights.")
515
  disruptive_query = openai_call(new_prompt, model="gpt-4o-mini", max_tokens_param=500)
516
  logging.info(f"generate_surprise_report: Disruptive new query generated: {disruptive_query}")
517
-
518
- # Generate tailored clarification questions for the disruptive query
519
  clarifications_for_new = generate_tailored_questions(
520
  os.getenv("OPENAI_API_KEY"),
521
- disruptive_query + "\n\n IMPORTANT NOTE: in this specific iteration, generate also the responses for the questions asked (simulated)",
522
  "", "", "", ""
523
  )
524
  logging.info(f"generate_surprise_report: Clarification questions for new query: {clarifications_for_new}")
525
-
526
- # Run iterative deep research for the disruptive query
527
  generator = iterative_deep_research_gen(
528
  disruptive_query, reportstyle, breadth, depth, followup_clarifications,
529
  include_domains, exclude_keywords, additional_clarifications,
@@ -538,7 +517,8 @@ def generate_surprise_report(previous_report: str, crumbs_list: list, initial_qu
538
  appended_report = previous_report + "\n\n<div style='page-break-before: always;'></div>\n<h2>Surprise-Me Extension Report</h2>\n\n" + clarifications_for_new + "\n\n" + extension_report
539
  return appended_report
540
 
541
- def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: int, depth: int,
 
542
  followup_clarifications: str,
543
  include_domains: str,
544
  exclude_keywords: str,
@@ -558,90 +538,93 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
558
  references_list = []
559
  followup_suggestions = []
560
  logging.info("iterative_deep_research_gen: Research started.")
561
- for iteration in range(1, depth + 1):
562
- process_log += f"\n--- Iteration {iteration} ---\n"
563
- logging.info(f"iterative_deep_research_gen: Starting iteration {iteration}.")
564
- combined_context = overall_context
565
- if followup_suggestions:
566
- # Deduplicate follow-up suggestions before adding them to context.
567
- unique_suggestions = list(set(followup_suggestions))
568
- combined_context += "\nFollow-up suggestions: " + ", ".join(unique_suggestions)
569
- queries = generate_serp_queries(combined_context, breadth, depth, initial_query, selected_engines, results_per_query)
570
- process_log += f"Generated queries: {queries}\n"
571
- iteration_learnings = []
572
- followup_suggestions = [] # reset for current iteration
573
- for query_tuple in queries:
574
- query_str, engine = query_tuple
575
- mod_query = query_str
576
- if include_domains.strip():
577
- domains = [d.strip() for d in include_domains.split(",") if d.strip()]
578
- domain_str = " OR ".join([f"site:{d}" for d in domains])
579
- mod_query += f" ({domain_str})"
580
- if exclude_keywords.strip():
581
- for ex in [ex.strip() for ex in exclude_keywords.split(",") if ex.strip()]:
582
- mod_query += f" -{ex}"
583
- process_log += f"\nPerforming SERPAPI search with query: {mod_query} using engine: {engine}\n"
584
- results = perform_serpapi_search(mod_query, engine, results_per_query)
585
-
586
- # Instead of processing all results one-by-one, first filter them
587
- filtered_results = filter_search_results(results, visited_urls, initial_query, followup_clarifications)
588
- process_log += f"After filtering, {len(filtered_results)} results remain for processing.\n"
589
- for res in filtered_results:
590
- url = res.get("link", "")
591
- if not url:
592
- continue
593
- content = ""
594
- if url.lower().endswith(".pdf"):
595
- content = process_pdf(url)
596
- if "Error processing PDF" in content:
597
  continue
598
- process_log += f"Extracted PDF content from {url}\n"
599
- else:
600
- try:
601
- response = requests.get(url, headers=HEADERS)
602
- response.raise_for_status()
603
- content = response.text
604
- process_log += f"Extracted full page content from {url}\n"
605
- except Exception as e:
606
- logging.error(f"Error retrieving content from {url}: {e}")
607
- process_log += f"Error retrieving content from {url}: {e}\n"
608
- continue
609
- analysis = analyze_with_gpt4o(initial_query, content)
610
- analysis_summary = analysis.get("summary", "").strip()
611
- process_log += (f"Summary: {analysis.get('summary')}, Follow-ups: {analysis.get('followups')}\n")
612
- if not analysis_summary:
613
- analysis_summary = content[:200] + "..." if len(content) > 200 else content
614
- crumbs_list.append({
615
- "url": url,
616
- "summary": analysis_summary,
617
- "full_content": content
618
- })
619
- if analysis.get("relevant", "no").lower() == "yes":
620
- if url.startswith("http://") or url.startswith("https://"):
621
- link_str = f" <a href='{url}'>[{ref_counter}]</a>"
622
  else:
623
- link_str = f" [{ref_counter}]"
624
- summary_with_ref = analysis_summary + link_str
625
- iteration_learnings.append(summary_with_ref)
626
- references_list.append((ref_counter, url))
627
- ref_counter += 1
628
- if isinstance(analysis.get("followups"), list):
629
- followup_suggestions.extend(analysis.get("followups"))
630
- process_log += f"Iteration {iteration} extracted {len(iteration_learnings)} learnings.\n"
631
- logging.info(f"iterative_deep_research_gen: Iteration {iteration} extracted {len(iteration_learnings)} learnings.")
632
- overall_learnings.extend(iteration_learnings)
633
- overall_context += f"\nIteration {iteration} learnings:\n" + "\n".join(iteration_learnings) + "\n"
634
- if additional_clarifications.strip():
635
- overall_context += "\nAdditional Clarifications from user: " + additional_clarifications.strip() + "\n"
636
- process_log += "Appended additional clarifications to the context.\n"
637
- progress_pct = int((iteration / depth) * 100)
638
- yield (f"Progress: {progress_pct}%", None, None, None)
639
- aggregated_crumbs = "\n".join([f"URL: {c['url']}\nSummary: {c['summary']}" for c in crumbs_list])
640
- final_report = generate_final_report(initial_query, reportstyle, overall_learnings, list(visited_urls), aggregated_crumbs, references_list, pages=go_deeper)
641
- alignment_assessment = assess_report_alignment(final_report, initial_query, followup_clarifications)
642
- final_report += "\n\n\n\n\n**Report alignment assessment:**\n" + alignment_assessment
643
- logging.info("iterative_deep_research_gen: Final report generated.")
644
- yield ("", final_report, process_log, crumbs_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
645
 
646
  def assess_report_alignment(report: str, initial_query: str, clarifications: str) -> str:
647
  prompt = (
@@ -649,21 +632,21 @@ def assess_report_alignment(report: str, initial_query: str, clarifications: str
649
  "and the clarification Q&A provided. Ensure that the report covers key points of the topic.\n\n"
650
  "Initial Query: " + initial_query + "\nClarifications: " + clarifications + "\n\n"
651
  "Research Report:\n" + report + "\n\n"
652
- "Provide a short assessment in one paragraph on how well the report aligns with these requirements."
653
  )
654
  assessment = openai_call(prompt, model="gpt-3.5-turbo", max_tokens_param=200)
655
  logging.info(f"assess_report_alignment: Assessment result: {assessment}")
656
  return assessment
657
 
658
- def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query: str, reportstyle: str, breadth: int, depth: int,
659
- followup_clarifications: str, include_domains: str,
660
- exclude_keywords: str, additional_clarifications: str,
661
- results_per_query: int, selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
662
- pages: str, surprise_me: bool):
 
663
  if not openai_api_key or not serpapi_api_key:
664
- logging.error("run_deep_research: Invalid API keys provided.")
665
- return "Please input valid API keys", "", "", "", ""
666
-
667
  os.environ["OPENAI_API_KEY"] = openai_api_key
668
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
669
 
@@ -675,50 +658,42 @@ def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query:
675
  if existing_crumbs:
676
  extra_context += f"Existing Crumbs:\n{existing_crumbs}\n"
677
 
678
- final_progress = ""
 
 
 
679
  final_report = ""
680
- final_process_log = ""
681
- final_crumbs = ""
682
- logging.info("run_deep_research: Starting deep research process.")
683
- for progress, rep, proc_log, crumbs in iterative_deep_research_gen(
684
- initial_query, reportstyle, breadth, depth, followup_clarifications,
685
- include_domains, exclude_keywords, additional_clarifications,
686
- extra_context, selected_engines, results_per_query, go_deeper=int(pages)):
687
  if rep is None:
688
- final_progress = progress
689
- yield final_progress, None, None, None, None
690
  else:
691
  final_report = rep
692
- final_process_log = proc_log
693
- final_crumbs = crumbs
694
  break
695
  if surprise_me:
696
- extended_report = generate_surprise_report(
697
- final_report, final_crumbs, initial_query, reportstyle, breadth, depth,
698
- followup_clarifications, include_domains, exclude_keywords, additional_clarifications,
699
- results_per_query, selected_engines
700
- )
701
  final_report = extended_report
702
- final_progress = "Progress: 100% (\"Surprise Me\" extension complete)"
703
- logging.info("run_deep_research: Deep research process completed.")
704
- yield (final_progress, final_report, final_report, final_process_log, final_crumbs)
705
-
706
- def load_example(example_choice: str) -> str:
707
- filename = ""
708
- if example_choice == "Implications of the release of advanced Deep Research solutions":
709
- filename = "example1.txt"
710
- elif example_choice == "AI regulation in finance":
711
- filename = "example2.txt"
712
- elif example_choice == "AI top voices":
713
- filename = "example3.txt"
714
- try:
715
- with open(filename, "r", encoding="utf-8") as f:
716
- content = f.read()
717
- logging.info(f"load_example: Loaded content from {filename}")
718
- return content
719
- except Exception as e:
720
- logging.error(f"load_example: Error loading {filename}: {e}")
721
- return ""
722
 
723
  def main():
724
  custom_css = """
@@ -772,16 +747,16 @@ def main():
772
  openai_api_key_input = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API Key here...", type="password")
773
  serpapi_api_key_input = gr.Textbox(label="SERPAPI API Key", placeholder="Enter your SERPAPI API Key here...", type="password")
774
  gr.Markdown("[Create OpenAI API Key](https://platform.openai.com/account/api-keys) | [Create SERPAPI API Key](https://serpapi.com/manage-api-key)")
775
- gr.Markdown("You can check the open-source code - None of the user API keys are stored or logged.")
776
 
777
- with gr.Accordion ("2] Research topic", open=False):
778
  with gr.Row():
779
  research_query = gr.Textbox(label="Research Query", placeholder="Enter your research query here...", lines=2, elem_id="research-query", scale=4)
780
  refine_query_button = gr.Button("Refine my Query", scale=1)
781
 
782
  with gr.Accordion("3] Q&A", open=False):
783
  with gr.Row():
784
- clarification_text = gr.Textbox(label="Clarification / Follow-Up Questions", placeholder="Tailored clarifying suggestions will appear here...", lines=6, scale = 4)
785
  gen_followups = gr.Button("Generate Tailored Clarification Questions", scale=1)
786
 
787
  with gr.Accordion("4] Search Parameters", open=False):
@@ -822,7 +797,7 @@ def main():
822
  with gr.Accordion("5] Report", open=False, elem_classes="folder"):
823
  progress_display = gr.Markdown("", elem_id="progress-display")
824
  run_btn = gr.Button("Generate report")
825
- final_report = gr.Markdown(label="Final Report (Markdown)", height = 800, min_height = 50)
826
  with gr.Accordion("Generate PDF", open=False, elem_classes="folder"):
827
  with gr.Column():
828
  query_name = gr.Textbox(label="Query name", placeholder="Enter query name...", lines=1)
 
12
  import logging
13
  import markdown
14
  import unicodedata
15
+ import asyncio
16
+ import aiohttp
17
  from datetime import datetime
18
  from reportlab.lib.pagesizes import A4
19
  from xhtml2pdf import pisa
 
29
  "(KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"}
30
 
31
  # =============================================================================
32
+ # Helper functions for external APIs, PDF Processing and Asynchronous Requests
33
  # =============================================================================
34
 
35
  def display_image():
 
178
  prompt = (f"""
179
  Using the following learnings and merged reference details from a deep research process on '{initial_query}', produce a comprehensive research report in Markdown format.
180
  The report should be very detailed and lengthy — approximately the equivalent of {pages} pages (or {word_count} words) when printed.
181
+ It must include inline citations (e.g., [1], [2], etc.) and follow this writing style: {reportstyle}.
182
+ Include at least {round(pages/3,0)} tables from the sources used (citations added if necessary).
183
+ The structure should have:
 
 
184
  - Abstract
185
  - Table of contents
186
  - Introduction
187
+ - [Sections and sub-sections as needed]
188
  - Conclusion
189
+ - References
 
 
 
 
 
190
 
191
+ Important: Number titles and lists as 1., 1.1, etc.
192
  Learnings:
193
  {json.dumps(learnings, indent=2)}
 
194
  Merged Reference Details:
195
  {aggregated_crumbs}"""
196
  )
197
  tokentarget = word_count * 3 # rough multiplier for token target
198
  report = openai_call(prompt, model="o3-mini", max_tokens_param=tokentarget)
 
199
  if len(report) > MAX_MESSAGE_LENGTH:
200
  report = compress_text(report, MAX_MESSAGE_LENGTH)
201
  if report.startswith("Error calling OpenAI API"):
 
205
  return report
206
 
207
  def filter_search_results(results: list, visited_urls: set, query: str, clarifications: str) -> list:
208
+ # Filter out already seen results by URL and domain (robust deduplication)
209
  new_results = []
210
  candidate_indexes = []
211
+ seen_domains = set()
212
  for idx, res in enumerate(results):
213
  url = res.get("link", "")
214
  if url and url not in visited_urls:
215
+ domain = url.split("/")[2] if "://" in url else url
216
+ if domain in seen_domains:
217
+ continue
218
  new_results.append(res)
219
  candidate_indexes.append(idx)
220
+ seen_domains.add(domain)
221
  if not new_results:
222
  return []
 
223
  results_text = ""
224
  for idx, res in enumerate(new_results):
225
  title = res.get("title", "No Title")
 
229
  prompt = (
230
  f"The following search results were obtained for the query '{query}' with clarifications:\n"
231
  f"{clarifications}\n\n"
232
+ "For each result, decide if it might be relevant for deeper research. Return a JSON object with keys as result indices and values as 'yes' or 'no'.\n"
233
+ f"Results:{results_text}\nOutput only the JSON object."
 
 
 
 
 
234
  )
235
  llm_response = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=200)
236
  try:
237
  decision_map = json.loads(llm_response)
238
  except Exception as e:
239
  logging.error(f"filter_search_results: JSON decode error: {e}; Full response: {llm_response}")
 
240
  decision_map = {}
241
  filtered = []
242
  for idx, res in enumerate(new_results):
243
  url = res.get("link", "")
 
244
  visited_urls.add(url)
245
  decision = decision_map.get(str(idx), "no").strip().lower()
246
  if decision == "yes":
 
249
  return filtered
250
 
251
  def make_multilingual_query(query: str, context: str, languagesdetected: str) -> str:
252
+ finalquery = f"({query})" # original query in parentheses
253
  languages_detected_list = languagesdetected.split(",")
254
  for lang in languages_detected_list:
255
  prompt2 = f"""The research query is: "{query}".
256
+ Based on this query and context: "{context}", and using the detected language {lang}, provide a translated version (less than 20 words) preserving search operators.
 
257
  Output only the translated query."""
258
  translatedquery = openai_call(prompt2, model="gpt-4o-mini", max_tokens_param=50)
259
  finalquery += f" OR ({translatedquery})"
 
261
  return finalquery
262
 
263
def generate_query_tree(initial_query: str, breadth: int, depth: int) -> list:
    """Build up to ``breadth`` search-query variants from the initial query.

    The stripped query is used as-is for the first variant; the remaining
    variants append a fixed suffix (" detailed analysis", " review",
    " case study").

    Args:
        initial_query: The user's research query. Surrounding whitespace is
            stripped before variants are built.
        breadth: Maximum number of variants to return. Values of zero or
            below yield an empty list; values above the number of available
            variants return all of them.
        depth: Accepted for interface compatibility; not used here.

    Returns:
        A list of query strings, at most ``breadth`` long. Empty when the
        query is blank or ``breadth`` is not positive.
    """
    base_terms = initial_query.strip()
    if not base_terms:
        # A blank query would otherwise produce suffix-only searches such as
        # " review", which are meaningless.
        logging.warning("generate_query_tree: Empty initial query; no queries generated.")
        return []

    suffixes = ("", " detailed analysis", " review", " case study")
    variants = [base_terms + suffix for suffix in suffixes]

    # Clamp at zero: a negative breadth used directly as a slice bound would
    # count from the end of the list and return variants instead of none.
    # Slicing past the end is already safe, so no upper clamp is needed.
    final_queries = variants[:max(breadth, 0)]
    logging.info(f"generate_query_tree: Generated queries: {final_queries}")
    return final_queries
274
 
 
276
  selected_engines=None, results_per_query: int = 10) -> list:
277
  queries = generate_query_tree(initial_query, breadth, depth)
278
  prompt = f"""The research query is: "{initial_query}".
279
+ Based on the context: "{context}", suggest non-English languages (if any) relevant.
280
  Output either:
281
  - "No local attributes detected"
282
+ - A comma-separated list (e.g., "Spanish,Italian")
283
+ Output only the result."""
 
 
284
  languages_detected = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=20)
285
  if languages_detected != "No local attributes detected":
286
  queries = [make_multilingual_query(q, context, languages_detected) for q in queries]
 
288
  prompt_engines = f"""
289
  Examine these queries:
290
  {queries}
291
+ Considering the context:
292
  {context}
293
+ Identify among these search engines: google,google_jobs_listing,google_trends,google_news,google_scholar,google_ai_overview,bing,bing_news,baidu,baidu_news,yandex,youtube_video,linkedin,linkedin_profile,duckduckgo_news,yelp_reviews.
294
+ Return a comma separated list (default "google" if none)."""
 
 
 
295
  identified_engines = openai_call(prompt_engines, model="gpt-4o-mini", max_tokens_param=20)
296
  selected_engines = identified_engines.split(",")
 
 
297
  final_queries = []
298
  for q in queries:
299
  for engine in selected_engines:
 
379
  logging.info(f"refine_query: Refined query: {refined}")
380
  return refined
381
 
382
+ # --- New Asynchronous Helper for Parallel URL Fetching --- #
383
async def async_fetch_url(session: aiohttp.ClientSession, url: str) -> str:
    """Fetch ``url`` with the given aiohttp session and return the body text.

    This is a best-effort helper: any failure (connection error, non-2xx
    status, timeout, decoding problem) is logged and reported to the caller
    as an empty string rather than raised.

    Args:
        session: An open ``aiohttp.ClientSession`` used to issue the request.
        url: The address to retrieve.

    Returns:
        The decoded response body, or ``""`` if the request failed.
    """
    # aiohttp expects a ClientTimeout object; passing a bare number is
    # deprecated. ``total=10`` keeps the original 10-second overall limit.
    timeout = aiohttp.ClientTimeout(total=10)
    try:
        async with session.get(url, headers=HEADERS, timeout=timeout) as response:
            response.raise_for_status()
            text = await response.text()
    except Exception as e:
        # Deliberately broad: one bad URL must not abort a batch of fetches.
        logging.error(f"async_fetch_url: Error retrieving content from {url}: {e}")
        return ""
    logging.info(f"async_fetch_url: Fetched content from {url}")
    return text
394
+
395
+ # =============================================================================
396
+ # ReportGenerator and PDF generation (Enhanced CSS added)
397
+ # =============================================================================
398
+
399
  class ReportGenerator:
400
  def __init__(self):
401
  pass
 
406
  solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
407
  # Remove markdown hyperlink syntax: replace [text](link) with just text.
408
  solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
409
+ # Convert markdown to HTML using the "extra" and "tables" extensions.
410
  html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
411
+ # Insert explicit page breaks before key headings (with added CSS for dynamic styling).
412
  html_content = html_content.replace("<h2>Table of Contents</h2>",
413
  "<div style='page-break-before: always;'></div><h2>Table of Contents</h2>")
414
  html_content = html_content.replace("<h2>Introduction</h2>",
 
417
  "<div style='page-break-before: always;'></div><h2>Conclusion</h2>")
418
  html_content = html_content.replace("<h2>References</h2>",
419
  "<div style='page-break-before: always;'></div><h2>References</h2>")
 
420
  html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>",
421
  "<div style='page-break-before: always;'></div><h2>Surprise-Me Extension Report</h2>")
 
422
  date_str = datetime.now().strftime("%Y-%m-%d")
423
  header = ""
424
  if metadata:
 
426
  <p>Author: {metadata.get('User name', 'N/A')}</p>
427
  <p>Date: {metadata.get('Date', date_str)}</p>
428
  <hr/>"""
 
429
  full_html = f"""
430
  <html>
431
  <head>
432
  <meta charset="utf-8" />
433
  <style>
434
+ body {{ font-family: Helvetica, sans-serif; margin: 40px; background: #fefefe; }}
435
  h1 {{ font-size: 24pt; margin-bottom: 12px; }}
436
  h2 {{ font-size: 20pt; margin-bottom: 10px; }}
437
  h3 {{ font-size: 18pt; margin-bottom: 8px; }}
438
  p {{ font-size: 11pt; line-height: 1.5; margin-bottom: 10px; }}
439
+ ol, ul {{ font-size: 11pt; margin-left: 20px; line-height: 1.5; }}
 
440
  hr {{ border: 1px solid #ccc; margin: 20px 0; }}
441
+ table {{ border-collapse: collapse; width: 100%; margin-bottom: 10px; }}
442
+ th, td {{ border: 1px solid #ccc; padding: 8px; text-align: left; }}
443
+ th {{ background-color: #f2f2f2; }}
 
 
 
 
 
 
 
 
 
 
444
  </style>
445
  </head>
446
  <body>
 
449
  </body>
450
  </html>
451
  """
 
452
  pdf_buffer = io.BytesIO()
453
  pisa_status = pisa.CreatePDF(full_html, dest=pdf_buffer)
454
  if pisa_status.err:
 
469
  final_report = compress_text(final_report, MAX_MESSAGE_LENGTH)
470
  pdf_bytes = report_generator.generate_report_pdf_html(solution_content=final_report,
471
  metadata=metadata)
 
472
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
473
  tmp_file.write(pdf_bytes)
474
  tmp_path = tmp_file.name
 
479
  return f"Error generating report: {str(e)}", None
480
 
481
def extract_summary_from_crumbs(crumbs_list: list) -> str:
    """Flatten crumb records into a single newline-separated text digest.

    Args:
        crumbs_list: Sequence of mappings, each providing ``'url'`` and
            ``'summary'`` entries.

    Returns:
        One ``URL: ...`` line followed by one ``Summary: ...`` line per crumb,
        all joined with newlines. An empty input yields an empty string.

    Raises:
        KeyError: If a crumb lacks the ``'url'`` or ``'summary'`` key.
    """
    entries = []
    for crumb in crumbs_list:
        entries.append(f"URL: {crumb['url']}\nSummary: {crumb['summary']}")
    digest = "\n".join(entries)
    logging.info("extract_summary_from_crumbs: Aggregated crumb summary created.")
    return digest
 
497
  "Formulate this as a new research query that could lead to innovative insights.")
498
  disruptive_query = openai_call(new_prompt, model="gpt-4o-mini", max_tokens_param=500)
499
  logging.info(f"generate_surprise_report: Disruptive new query generated: {disruptive_query}")
 
 
500
  clarifications_for_new = generate_tailored_questions(
501
  os.getenv("OPENAI_API_KEY"),
502
+ disruptive_query + "\n\n IMPORTANT NOTE: in this iteration, generate also simulated responses for the questions asked",
503
  "", "", "", ""
504
  )
505
  logging.info(f"generate_surprise_report: Clarification questions for new query: {clarifications_for_new}")
 
 
506
  generator = iterative_deep_research_gen(
507
  disruptive_query, reportstyle, breadth, depth, followup_clarifications,
508
  include_domains, exclude_keywords, additional_clarifications,
 
517
  appended_report = previous_report + "\n\n<div style='page-break-before: always;'></div>\n<h2>Surprise-Me Extension Report</h2>\n\n" + clarifications_for_new + "\n\n" + extension_report
518
  return appended_report
519
 
520
+ # --- Adaptive and Parallel Organized Research (Dynamic Agent Orchestration) --- #
521
+ async def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: int, depth: int,
522
  followup_clarifications: str,
523
  include_domains: str,
524
  exclude_keywords: str,
 
538
  references_list = []
539
  followup_suggestions = []
540
  logging.info("iterative_deep_research_gen: Research started.")
541
+
542
+ # Create a single aiohttp session for parallel page fetching
543
+ async with aiohttp.ClientSession() as session:
544
+ for iteration in range(1, depth + 1):
545
+ process_log += f"\n--- Iteration {iteration} ---\n"
546
+ logging.info(f"iterative_deep_research_gen: Starting iteration {iteration}.")
547
+ combined_context = overall_context
548
+ if followup_suggestions:
549
+ unique_suggestions = list(set(followup_suggestions))
550
+ combined_context += "\nFollow-up suggestions: " + ", ".join(unique_suggestions)
551
+ queries = generate_serp_queries(combined_context, breadth, depth, initial_query, selected_engines, results_per_query)
552
+ process_log += f"Generated queries: {queries}\n"
553
+ iteration_learnings = []
554
+ followup_suggestions = [] # reset for current iteration
555
+
556
+ # For each query, perform SERPAPI search and fetch pages concurrently:
557
+ for query_str, engine in queries:
558
+ mod_query = query_str
559
+ if include_domains.strip():
560
+ domains = [d.strip() for d in include_domains.split(",") if d.strip()]
561
+ domain_str = " OR ".join([f"site:{d}" for d in domains])
562
+ mod_query += f" ({domain_str})"
563
+ if exclude_keywords.strip():
564
+ for ex in [ex.strip() for ex in exclude_keywords.split(",") if ex.strip()]:
565
+ mod_query += f" -{ex}"
566
+ process_log += f"\nPerforming SERPAPI search with query: {mod_query} using engine: {engine}\n"
567
+ results = perform_serpapi_search(mod_query, engine, results_per_query)
568
+ filtered_results = filter_search_results(results, visited_urls, initial_query, followup_clarifications)
569
+ process_log += f"After filtering, {len(filtered_results)} results remain for processing.\n"
570
+ async_tasks = []
571
+ for res in filtered_results:
572
+ url = res.get("link", "")
573
+ if not url:
 
 
 
574
  continue
575
+ if url.lower().endswith(".pdf"):
576
+ content = process_pdf(url)
577
+ process_log += f"Extracted PDF content from {url}\n"
578
+ # Process synchronously for PDFs
579
+ analysis = analyze_with_gpt4o(initial_query, content)
580
+ analysis_summary = analysis.get("summary", "").strip() or (content[:200] + "..." if len(content) > 200 else content)
581
+ crumbs_list.append({"url": url, "summary": analysis_summary, "full_content": content})
582
+ if analysis.get("relevant", "no").lower() == "yes":
583
+ link_str = f" <a href='{url}'>[{ref_counter}]</a>"
584
+ summary_with_ref = analysis_summary + link_str
585
+ iteration_learnings.append(summary_with_ref)
586
+ references_list.append((ref_counter, url))
587
+ ref_counter += 1
588
+ if isinstance(analysis.get("followups"), list):
589
+ followup_suggestions.extend(analysis.get("followups"))
 
 
 
 
 
 
 
 
 
590
  else:
591
+ # Schedule asynchronous fetching for non-PDF pages
592
+ async_tasks.append(async_fetch_url(session, url))
593
+ # Wait for asynchronous fetches to complete
594
+ if async_tasks:
595
+ fetched_contents = await asyncio.gather(*async_tasks)
596
+ for content in fetched_contents:
597
+ if not content:
598
+ continue
599
+ analysis = analyze_with_gpt4o(initial_query, content)
600
+ analysis_summary = analysis.get("summary", "").strip() or (content[:200] + "..." if len(content) > 200 else content)
601
+ # Here we do not re-fetch URL since it is already processed
602
+ crumbs_list.append({"url": "async_url", "summary": analysis_summary, "full_content": content})
603
+ if analysis.get("relevant", "no").lower() == "yes":
604
+ link_str = f" [*]" # Mark asynchronous fetched URLs.
605
+ summary_with_ref = analysis_summary + link_str
606
+ iteration_learnings.append(summary_with_ref)
607
+ if isinstance(analysis.get("followups"), list):
608
+ followup_suggestions.extend(analysis.get("followups"))
609
+ process_log += f"Iteration {iteration} extracted {len(iteration_learnings)} learnings.\n"
610
+ logging.info(f"iterative_deep_research_gen: Iteration {iteration} extracted {len(iteration_learnings)} learnings.")
611
+ overall_learnings.extend(iteration_learnings)
612
+ overall_context += f"\nIteration {iteration} learnings:\n" + "\n".join(iteration_learnings) + "\n"
613
+ if additional_clarifications.strip():
614
+ overall_context += "\nAdditional Clarifications from user: " + additional_clarifications.strip() + "\n"
615
+ process_log += "Appended additional clarifications to the context.\n"
616
+ # Adaptive follow-up: if new followup suggestions emerged, call tailored questions generator
617
+ if followup_suggestions:
618
+ extra_questions = generate_tailored_questions(os.getenv("OPENAI_API_KEY"), initial_query, "", "", "", "")
619
+ overall_context += "\nAdaptive Follow-Up Questions:\n" + extra_questions + "\n"
620
+ progress_pct = int((iteration / depth) * 100)
621
+ yield (f"Progress: {progress_pct}%", None, process_log, None)
622
+ aggregated_crumbs = "\n".join([f"URL: {c['url']}\nSummary: {c['summary']}" for c in crumbs_list])
623
+ final_report = generate_final_report(initial_query, reportstyle, overall_learnings, list(visited_urls), aggregated_crumbs, references_list, pages=go_deeper)
624
+ alignment_assessment = assess_report_alignment(final_report, initial_query, followup_clarifications)
625
+ final_report += "\n\n\n\n\n**Report alignment assessment:**\n" + alignment_assessment
626
+ logging.info("iterative_deep_research_gen: Final report generated.")
627
+ yield ("", final_report, process_log, crumbs_list)
628
 
629
  def assess_report_alignment(report: str, initial_query: str, clarifications: str) -> str:
630
  prompt = (
 
632
  "and the clarification Q&A provided. Ensure that the report covers key points of the topic.\n\n"
633
  "Initial Query: " + initial_query + "\nClarifications: " + clarifications + "\n\n"
634
  "Research Report:\n" + report + "\n\n"
635
+ "Provide a short paragraph assessment on how well the report aligns with these requirements."
636
  )
637
  assessment = openai_call(prompt, model="gpt-3.5-turbo", max_tokens_param=200)
638
  logging.info(f"assess_report_alignment: Assessment result: {assessment}")
639
  return assessment
640
 
641
+ # --- Main Deep Research Orchestrator (Wrapper for async execution) --- #
642
+ async def orchestrate_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query: str, reportstyle: str,
643
+ breadth: int, depth: int, followup_clarifications: str, include_domains: str,
644
+ exclude_keywords: str, additional_clarifications: str, results_per_query: int,
645
+ selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
646
+ pages: str, surprise_me: bool):
647
  if not openai_api_key or not serpapi_api_key:
648
+ logging.error("orchestrate_deep_research: Invalid API keys provided.")
649
+ return "Please input valid API keys", "", "", ""
 
650
  os.environ["OPENAI_API_KEY"] = openai_api_key
651
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
652
 
 
658
  if existing_crumbs:
659
  extra_context += f"Existing Crumbs:\n{existing_crumbs}\n"
660
 
661
+ loop = asyncio.get_event_loop()
662
+ researcher = iterative_deep_research_gen(initial_query, reportstyle, breadth, depth, followup_clarifications,
663
+ include_domains, exclude_keywords, additional_clarifications,
664
+ extra_context, selected_engines, results_per_query, go_deeper=int(pages))
665
  final_report = ""
666
+ process_log = ""
667
+ async for progress, rep, proc_log, crumbs in researcher:
 
 
 
 
 
668
  if rep is None:
669
+ current_progress = progress
670
+ # You could yield intermediate progress if needed.
671
  else:
672
  final_report = rep
673
+ process_log = proc_log
 
674
  break
675
  if surprise_me:
676
+ extended_report = generate_surprise_report(final_report, crumbs, initial_query, reportstyle, breadth, depth,
677
+ followup_clarifications, include_domains, exclude_keywords,
678
+ additional_clarifications, results_per_query, selected_engines)
 
 
679
  final_report = extended_report
680
+ return final_report, process_log, extra_context
681
+
682
def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query: str, reportstyle: str, breadth: int, depth: int,
                      followup_clarifications: str, include_domains: str, exclude_keywords: str, additional_clarifications: str,
                      results_per_query: int, selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
                      pages: str, surprise_me: bool):
    """Synchronous entry point that drives the async research orchestrator.

    Runs ``orchestrate_deep_research`` to completion with ``asyncio.run`` and
    packages the outcome as a 5-tuple.

    Args:
        All arguments are forwarded unchanged, in the same order, to
        ``orchestrate_deep_research``.

    Returns:
        ``("Progress: 100%", final_report, existing_report, existing_log,
        existing_crumbs)`` where ``final_report`` is the first element returned
        by the orchestrator (the report text, or its error message when the
        API keys are missing). The three ``existing_*`` values are passed
        through untouched.
    """
    outcome = asyncio.run(
        orchestrate_deep_research(openai_api_key, serpapi_api_key, initial_query, reportstyle, breadth, depth,
                                  followup_clarifications, include_domains, exclude_keywords, additional_clarifications,
                                  results_per_query, selected_engines, existing_crumbs, existing_report, existing_log,
                                  pages, surprise_me)
    )
    # The orchestrator returns four values on its missing-API-key path and
    # three otherwise. Unpacking a fixed count raised ValueError on the former,
    # hiding the "Please input valid API keys" message. Only the first element
    # is consumed here, so index it directly.
    final_report = outcome[0]
    # NOTE(review): the process log produced by this run is not returned; the
    # caller-supplied existing_log is passed back instead - confirm intended.
    return ("Progress: 100%", final_report, existing_report, existing_log, existing_crumbs)
693
+
694
+ # =============================================================================
695
+ # Gradio Interface using gr.Blocks with Custom CSS
696
+ # =============================================================================
 
 
 
697
 
698
  def main():
699
  custom_css = """
 
747
  openai_api_key_input = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API Key here...", type="password")
748
  serpapi_api_key_input = gr.Textbox(label="SERPAPI API Key", placeholder="Enter your SERPAPI API Key here...", type="password")
749
  gr.Markdown("[Create OpenAI API Key](https://platform.openai.com/account/api-keys) | [Create SERPAPI API Key](https://serpapi.com/manage-api-key)")
750
+ gr.Markdown("API keys are not stored or logged.")
751
 
752
+ with gr.Accordion("2] Research topic", open=False):
753
  with gr.Row():
754
  research_query = gr.Textbox(label="Research Query", placeholder="Enter your research query here...", lines=2, elem_id="research-query", scale=4)
755
  refine_query_button = gr.Button("Refine my Query", scale=1)
756
 
757
  with gr.Accordion("3] Q&A", open=False):
758
  with gr.Row():
759
+ clarification_text = gr.Textbox(label="Clarification / Follow-Up Questions", placeholder="Tailored clarifying suggestions will appear here...", lines=6, scale=4)
760
  gen_followups = gr.Button("Generate Tailored Clarification Questions", scale=1)
761
 
762
  with gr.Accordion("4] Search Parameters", open=False):
 
797
  with gr.Accordion("5] Report", open=False, elem_classes="folder"):
798
  progress_display = gr.Markdown("", elem_id="progress-display")
799
  run_btn = gr.Button("Generate report")
800
+ final_report = gr.Markdown(label="Final Report (Markdown)", height=800, min_height=50)
801
  with gr.Accordion("Generate PDF", open=False, elem_classes="folder"):
802
  with gr.Column():
803
  query_name = gr.Textbox(label="Query name", placeholder="Enter query name...", lines=1)