Spaces:

10gen
/

deepsearchitv2

Running

App Files Community

Guiyom committed on Feb 13, 2025

Commit

559e3f2

verified ·

1 Parent(s): 4c364e9

Update app.py

Browse files

Files changed (1) hide show

app.py +194 -186

app.py CHANGED Viewed

@@ -12,8 +12,6 @@ import tempfile
 import logging
 import markdown
 import unicodedata
-import asyncio
-import aiohttp
 from datetime import datetime
 from reportlab.lib.pagesizes import A4
 from xhtml2pdf import pisa
@@ -29,7 +27,7 @@ HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/
                          "(KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"}
 # =============================================================================
-# Helper functions for external APIs, PDF Processing and Asynchronous Requests
 # =============================================================================
 def display_image():
@@ -178,24 +176,32 @@ def generate_final_report(initial_query: str, reportstyle: str, learnings: list,
     prompt = (f"""
 Using the following learnings and merged reference details from a deep research process on '{initial_query}', produce a comprehensive research report in Markdown format.
 The report should be very detailed and lengthy — approximately the equivalent of {pages} pages (or {word_count} words) when printed.
-It must include inline citations (e.g., [1], [2], etc.) and follow this writing style: {reportstyle}.
-Include at least {round(pages/3,0)} tables from the sources used (citations added if necessary).
-The structure should have:
 - Abstract
 - Table of contents
 - Introduction
-- [Sections and sub-sections as needed]
 - Conclusion
-- References
-Important: Number titles and lists as 1., 1.1, etc.
 Learnings:
 {json.dumps(learnings, indent=2)}
 Merged Reference Details:
 {aggregated_crumbs}"""
     )
     tokentarget = word_count * 3  # rough multiplier for token target
     report = openai_call(prompt, model="o3-mini", max_tokens_param=tokentarget)
     if len(report) > MAX_MESSAGE_LENGTH:
         report = compress_text(report, MAX_MESSAGE_LENGTH)
     if report.startswith("Error calling OpenAI API"):
@@ -205,21 +211,17 @@ Merged Reference Details:
     return report
 def filter_search_results(results: list, visited_urls: set, query: str, clarifications: str) -> list:
-    # Filter out already seen results by URL and domain (robust deduplication)
     new_results = []
     candidate_indexes = []
-    seen_domains = set()
     for idx, res in enumerate(results):
         url = res.get("link", "")
         if url and url not in visited_urls:
-            domain = url.split("/")[2] if "://" in url else url
-            if domain in seen_domains:
-                continue
             new_results.append(res)
             candidate_indexes.append(idx)
-            seen_domains.add(domain)
     if not new_results:
         return []
     results_text = ""
     for idx, res in enumerate(new_results):
         title = res.get("title", "No Title")
@@ -229,18 +231,25 @@ def filter_search_results(results: list, visited_urls: set, query: str, clarific
     prompt = (
         f"The following search results were obtained for the query '{query}' with clarifications:\n"
         f"{clarifications}\n\n"
-        "For each result, decide if it might be relevant for deeper research. Return a JSON object with keys as result indices and values as 'yes' or 'no'.\n"
-        f"Results:{results_text}\nOutput only the JSON object."
     )
     llm_response = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=200)
     try:
         decision_map = json.loads(llm_response)
     except Exception as e:
         logging.error(f"filter_search_results: JSON decode error: {e}; Full response: {llm_response}")
         decision_map = {}
     filtered = []
     for idx, res in enumerate(new_results):
         url = res.get("link", "")
         visited_urls.add(url)
         decision = decision_map.get(str(idx), "no").strip().lower()
         if decision == "yes":
@@ -249,11 +258,12 @@ def filter_search_results(results: list, visited_urls: set, query: str, clarific
     return filtered
 def make_multilingual_query(query: str, context: str, languagesdetected: str) -> str:
-    finalquery = f"({query})"  # original query in parentheses
     languages_detected_list = languagesdetected.split(",")
     for lang in languages_detected_list:
         prompt2 = f"""The research query is: "{query}".
-Based on this query and context: "{context}", and using the detected language {lang}, provide a translated version (less than 20 words) preserving search operators.
 Output only the translated query."""
         translatedquery = openai_call(prompt2, model="gpt-4o-mini", max_tokens_param=50)
         finalquery += f" OR ({translatedquery})"
@@ -261,14 +271,11 @@ Output only the translated query."""
     return finalquery
 def generate_query_tree(initial_query: str, breadth: int, depth: int) -> list:
-    # Generate several variants of the query based on the desired breadth.
     base_terms = initial_query.strip()
-    variants = [base_terms,
-                base_terms + " detailed analysis",
-                base_terms + " review",
-                base_terms + " case study"]
-    # Return only as many as needed (up to 'breadth')
-    final_queries = variants[:min(len(variants), breadth)]
     logging.info(f"generate_query_tree: Generated queries: {final_queries}")
     return final_queries
@@ -276,11 +283,13 @@ def generate_serp_queries(context: str, breadth: int, depth: int, initial_query:
                           selected_engines=None, results_per_query: int = 10) -> list:
     queries = generate_query_tree(initial_query, breadth, depth)
     prompt = f"""The research query is: "{initial_query}".
-Based on the context: "{context}", suggest non-English languages (if any) relevant.
 Output either:
 - "No local attributes detected"
-- A comma-separated list (e.g., "Spanish,Italian")
-Output only the result."""
     languages_detected = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=20)
     if languages_detected != "No local attributes detected":
         queries = [make_multilingual_query(q, context, languages_detected) for q in queries]
@@ -288,12 +297,17 @@ Output only the result."""
         prompt_engines = f"""
 Examine these queries:
 {queries}
-Considering the context:
 {context}
-Identify among these search engines: google,google_jobs_listing,google_trends,google_news,google_scholar,google_ai_overview,bing,bing_news,baidu,baidu_news,yandex,youtube_video,linkedin,linkedin_profile,duckduckgo_news,yelp_reviews.
-Return a comma separated list (default "google" if none)."""
         identified_engines = openai_call(prompt_engines, model="gpt-4o-mini", max_tokens_param=20)
         selected_engines = identified_engines.split(",")
     final_queries = []
     for q in queries:
         for engine in selected_engines:
@@ -379,23 +393,6 @@ def refine_query(query: str, openai_api_key: str) -> str:
     logging.info(f"refine_query: Refined query: {refined}")
     return refined
-# --- New Asynchronous Helper for Parallel URL Fetching --- #
-async def async_fetch_url(session: aiohttp.ClientSession, url: str) -> str:
-    """Fetch the URL asynchronously using aiohttp."""
-    try:
-        async with session.get(url, headers=HEADERS, timeout=10) as response:
-            response.raise_for_status()
-            text = await response.text()
-            logging.info(f"async_fetch_url: Fetched content from {url}")
-            return text
-    except Exception as e:
-        logging.error(f"async_fetch_url: Error retrieving content from {url}: {e}")
-        return ""
-# =============================================================================
-# ReportGenerator and PDF generation (Enhanced CSS added)
-# =============================================================================
 class ReportGenerator:
     def __init__(self):
         pass
@@ -406,9 +403,9 @@ class ReportGenerator:
         solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
         # Remove markdown hyperlink syntax: replace [text](link) with just text.
         solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
-        # Convert markdown to HTML using the "extra" and "tables" extensions.
         html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
-        # Insert explicit page breaks before key headings (with added CSS for dynamic styling).
         html_content = html_content.replace("<h2>Table of Contents</h2>",
             "<div style='page-break-before: always;'></div><h2>Table of Contents</h2>")
         html_content = html_content.replace("<h2>Introduction</h2>",
@@ -417,8 +414,10 @@ class ReportGenerator:
             "<div style='page-break-before: always;'></div><h2>Conclusion</h2>")
         html_content = html_content.replace("<h2>References</h2>",
             "<div style='page-break-before: always;'></div><h2>References</h2>")
         html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>",
             "<div style='page-break-before: always;'></div><h2>Surprise-Me Extension Report</h2>")
         date_str = datetime.now().strftime("%Y-%m-%d")
         header = ""
         if metadata:
@@ -426,21 +425,33 @@ class ReportGenerator:
 <p>Author: {metadata.get('User name', 'N/A')}</p>
 <p>Date: {metadata.get('Date', date_str)}</p>
 <hr/>"""
         full_html = f"""
 <html>
 <head>
     <meta charset="utf-8" />
     <style>
-        body {{ font-family: Helvetica, sans-serif; margin: 40px; background: #fefefe; }}
         h1 {{ font-size: 24pt; margin-bottom: 12px; }}
         h2 {{ font-size: 20pt; margin-bottom: 10px; }}
         h3 {{ font-size: 18pt; margin-bottom: 8px; }}
         p {{ font-size: 11pt; line-height: 1.5; margin-bottom: 10px; }}
-        ol, ul {{ font-size: 11pt; margin-left: 20px; line-height: 1.5; }}
         hr {{ border: 1px solid #ccc; margin: 20px 0; }}
-        table {{ border-collapse: collapse; width: 100%; margin-bottom: 10px; }}
-        th, td {{ border: 1px solid #ccc; padding: 8px; text-align: left; }}
-        th {{ background-color: #f2f2f2; }}
     </style>
 </head>
 <body>
@@ -449,6 +460,7 @@ class ReportGenerator:
 </body>
 </html>
 """
         pdf_buffer = io.BytesIO()
         pisa_status = pisa.CreatePDF(full_html, dest=pdf_buffer)
         if pisa_status.err:
@@ -469,6 +481,7 @@ def handle_generate_report(query_name: str, user_name: str, final_report: str):
             final_report = compress_text(final_report, MAX_MESSAGE_LENGTH)
         pdf_bytes = report_generator.generate_report_pdf_html(solution_content=final_report,
                                                               metadata=metadata)
         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
             tmp_file.write(pdf_bytes)
             tmp_path = tmp_file.name
@@ -479,6 +492,10 @@ def handle_generate_report(query_name: str, user_name: str, final_report: str):
         return f"Error generating report: {str(e)}", None
 def extract_summary_from_crumbs(crumbs_list: list) -> str:
     aggregated = "\n".join([f"URL: {c['url']}\nSummary: {c['summary']}" for c in crumbs_list])
     logging.info("extract_summary_from_crumbs: Aggregated crumb summary created.")
     return aggregated
@@ -497,12 +514,16 @@ def generate_surprise_report(previous_report: str, crumbs_list: list, initial_qu
                   "Formulate this as a new research query that could lead to innovative insights.")
     disruptive_query = openai_call(new_prompt, model="gpt-4o-mini", max_tokens_param=500)
     logging.info(f"generate_surprise_report: Disruptive new query generated: {disruptive_query}")
     clarifications_for_new = generate_tailored_questions(
         os.getenv("OPENAI_API_KEY"),
-        disruptive_query + "\n\n IMPORTANT NOTE: in this iteration, generate also simulated responses for the questions asked",
         "", "", "", ""
     )
     logging.info(f"generate_surprise_report: Clarification questions for new query: {clarifications_for_new}")
     generator = iterative_deep_research_gen(
         disruptive_query, reportstyle, breadth, depth, followup_clarifications,
         include_domains, exclude_keywords, additional_clarifications,
@@ -517,8 +538,7 @@ def generate_surprise_report(previous_report: str, crumbs_list: list, initial_qu
     appended_report = previous_report + "\n\n<div style='page-break-before: always;'></div>\n<h2>Surprise-Me Extension Report</h2>\n\n" + clarifications_for_new + "\n\n" + extension_report
     return appended_report
-# --- Adaptive and Parallel Organized Research (Dynamic Agent Orchestration) --- #
-async def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: int, depth: int,
                                 followup_clarifications: str,
                                 include_domains: str,
                                 exclude_keywords: str,
@@ -538,93 +558,90 @@ async def iterative_deep_research_gen(initial_query: str, reportstyle: str, brea
     references_list = []
     followup_suggestions = []
     logging.info("iterative_deep_research_gen: Research started.")
-    # Create a single aiohttp session for parallel page fetching
-    async with aiohttp.ClientSession() as session:
-        for iteration in range(1, depth + 1):
-            process_log += f"\n--- Iteration {iteration} ---\n"
-            logging.info(f"iterative_deep_research_gen: Starting iteration {iteration}.")
-            combined_context = overall_context
-            if followup_suggestions:
-                unique_suggestions = list(set(followup_suggestions))
-                combined_context += "\nFollow-up suggestions: " + ", ".join(unique_suggestions)
-            queries = generate_serp_queries(combined_context, breadth, depth, initial_query, selected_engines, results_per_query)
-            process_log += f"Generated queries: {queries}\n"
-            iteration_learnings = []
-            followup_suggestions = []  # reset for current iteration
-            # For each query, perform SERPAPI search and fetch pages concurrently:
-            for query_str, engine in queries:
-                mod_query = query_str
-                if include_domains.strip():
-                    domains = [d.strip() for d in include_domains.split(",") if d.strip()]
-                    domain_str = " OR ".join([f"site:{d}" for d in domains])
-                    mod_query += f" ({domain_str})"
-                if exclude_keywords.strip():
-                    for ex in [ex.strip() for ex in exclude_keywords.split(",") if ex.strip()]:
-                        mod_query += f" -{ex}"
-                process_log += f"\nPerforming SERPAPI search with query: {mod_query} using engine: {engine}\n"
-                results = perform_serpapi_search(mod_query, engine, results_per_query)
-                filtered_results = filter_search_results(results, visited_urls, initial_query, followup_clarifications)
-                process_log += f"After filtering, {len(filtered_results)} results remain for processing.\n"
-                async_tasks = []
-                for res in filtered_results:
-                    url = res.get("link", "")
-                    if not url:
                         continue
-                    if url.lower().endswith(".pdf"):
-                        content = process_pdf(url)
-                        process_log += f"Extracted PDF content from {url}\n"
-                        # Process synchronously for PDFs
-                        analysis = analyze_with_gpt4o(initial_query, content)
-                        analysis_summary = analysis.get("summary", "").strip() or (content[:200] + "..." if len(content) > 200 else content)
-                        crumbs_list.append({"url": url, "summary": analysis_summary, "full_content": content})
-                        if analysis.get("relevant", "no").lower() == "yes":
-                            link_str = f" <a href='{url}'>[{ref_counter}]</a>"
-                            summary_with_ref = analysis_summary + link_str
-                            iteration_learnings.append(summary_with_ref)
-                            references_list.append((ref_counter, url))
-                            ref_counter += 1
-                            if isinstance(analysis.get("followups"), list):
-                                followup_suggestions.extend(analysis.get("followups"))
                     else:
-                        # Schedule asynchronous fetching for non-PDF pages
-                        async_tasks.append(async_fetch_url(session, url))
-                # Wait for asynchronous fetches to complete
-                if async_tasks:
-                    fetched_contents = await asyncio.gather(*async_tasks)
-                    for content in fetched_contents:
-                        if not content:
-                            continue
-                        analysis = analyze_with_gpt4o(initial_query, content)
-                        analysis_summary = analysis.get("summary", "").strip() or (content[:200] + "..." if len(content) > 200 else content)
-                        # Here we do not re-fetch URL since it is already processed
-                        crumbs_list.append({"url": "async_url", "summary": analysis_summary, "full_content": content})
-                        if analysis.get("relevant", "no").lower() == "yes":
-                            link_str = f" [*]"  # Mark asynchronous fetched URLs.
-                            summary_with_ref = analysis_summary + link_str
-                            iteration_learnings.append(summary_with_ref)
-                            if isinstance(analysis.get("followups"), list):
-                                followup_suggestions.extend(analysis.get("followups"))
-            process_log += f"Iteration {iteration} extracted {len(iteration_learnings)} learnings.\n"
-            logging.info(f"iterative_deep_research_gen: Iteration {iteration} extracted {len(iteration_learnings)} learnings.")
-            overall_learnings.extend(iteration_learnings)
-            overall_context += f"\nIteration {iteration} learnings:\n" + "\n".join(iteration_learnings) + "\n"
-            if additional_clarifications.strip():
-                overall_context += "\nAdditional Clarifications from user: " + additional_clarifications.strip() + "\n"
-                process_log += "Appended additional clarifications to the context.\n"
-            # Adaptive follow-up: if new followup suggestions emerged, call tailored questions generator
-            if followup_suggestions:
-                extra_questions = generate_tailored_questions(os.getenv("OPENAI_API_KEY"), initial_query, "", "", "", "")
-                overall_context += "\nAdaptive Follow-Up Questions:\n" + extra_questions + "\n"
-            progress_pct = int((iteration / depth) * 100)
-            yield (f"Progress: {progress_pct}%", None, process_log, None)
-        aggregated_crumbs = "\n".join([f"URL: {c['url']}\nSummary: {c['summary']}" for c in crumbs_list])
-        final_report = generate_final_report(initial_query, reportstyle, overall_learnings, list(visited_urls), aggregated_crumbs, references_list, pages=go_deeper)
-        alignment_assessment = assess_report_alignment(final_report, initial_query, followup_clarifications)
-        final_report += "\n\n\n\n\n**Report alignment assessment:**\n" + alignment_assessment
-        logging.info("iterative_deep_research_gen: Final report generated.")
-        yield ("", final_report, process_log, crumbs_list)
 def assess_report_alignment(report: str, initial_query: str, clarifications: str) -> str:
     prompt = (
@@ -632,21 +649,21 @@ def assess_report_alignment(report: str, initial_query: str, clarifications: str
         "and the clarification Q&A provided. Ensure that the report covers key points of the topic.\n\n"
         "Initial Query: " + initial_query + "\nClarifications: " + clarifications + "\n\n"
         "Research Report:\n" + report + "\n\n"
-        "Provide a short paragraph assessment on how well the report aligns with these requirements."
     )
     assessment = openai_call(prompt, model="gpt-3.5-turbo", max_tokens_param=200)
     logging.info(f"assess_report_alignment: Assessment result: {assessment}")
     return assessment
-# --- Main Deep Research Orchestrator (Wrapper for async execution) --- #
-async def orchestrate_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query: str, reportstyle: str,
-                                    breadth: int, depth: int, followup_clarifications: str, include_domains: str,
-                                    exclude_keywords: str, additional_clarifications: str, results_per_query: int,
-                                    selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
-                                    pages: str, surprise_me: bool):
     if not openai_api_key or not serpapi_api_key:
-        logging.error("orchestrate_deep_research: Invalid API keys provided.")
-        return "Please input valid API keys", "", "", ""
     os.environ["OPENAI_API_KEY"] = openai_api_key
     os.environ["SERPAPI_API_KEY"] = serpapi_api_key
@@ -658,26 +675,33 @@ async def orchestrate_deep_research(openai_api_key: str, serpapi_api_key: str, i
     if existing_crumbs:
         extra_context += f"Existing Crumbs:\n{existing_crumbs}\n"
-    loop = asyncio.get_event_loop()
-    researcher = iterative_deep_research_gen(initial_query, reportstyle, breadth, depth, followup_clarifications,
-                                               include_domains, exclude_keywords, additional_clarifications,
-                                               extra_context, selected_engines, results_per_query, go_deeper=int(pages))
     final_report = ""
-    process_log = ""
-    async for progress, rep, proc_log, crumbs in researcher:
         if rep is None:
-            current_progress = progress
-            # You could yield intermediate progress if needed.
         else:
             final_report = rep
-            process_log = proc_log
             break
     if surprise_me:
-        extended_report = generate_surprise_report(final_report, crumbs, initial_query, reportstyle, breadth, depth,
-                                                   followup_clarifications, include_domains, exclude_keywords,
-                                                   additional_clarifications, results_per_query, selected_engines)
         final_report = extended_report
-    return final_report, process_log, extra_context
 def load_example(example_choice: str) -> str:
     filename = ""
@@ -696,22 +720,6 @@ def load_example(example_choice: str) -> str:
         logging.error(f"load_example: Error loading {filename}: {e}")
         return ""
-def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query: str, reportstyle: str, breadth: int, depth: int,
-                      followup_clarifications: str, include_domains: str, exclude_keywords: str, additional_clarifications: str,
-                      results_per_query: int, selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
-                      pages: str, surprise_me: bool):
-    final_report, proc_log, extra_context = asyncio.run(
-        orchestrate_deep_research(openai_api_key, serpapi_api_key, initial_query, reportstyle, breadth, depth,
-                                  followup_clarifications, include_domains, exclude_keywords, additional_clarifications,
-                                  results_per_query, selected_engines, existing_crumbs, existing_report, existing_log,
-                                  pages, surprise_me)
-    )
-    return ("Progress: 100%", final_report, existing_report, existing_log, existing_crumbs)
-# =============================================================================
-# Gradio Interface using gr.Blocks with Custom CSS
-# =============================================================================
 def main():
     custom_css = """
     /* Overall container customization */
@@ -764,16 +772,16 @@ def main():
             openai_api_key_input = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API Key here...", type="password")
             serpapi_api_key_input = gr.Textbox(label="SERPAPI API Key", placeholder="Enter your SERPAPI API Key here...", type="password")
             gr.Markdown("[Create OpenAI API Key](https://platform.openai.com/account/api-keys) | [Create SERPAPI API Key](https://serpapi.com/manage-api-key)")
-            gr.Markdown("API keys are not stored or logged.")
-        with gr.Accordion("2] Research topic", open=False):
             with gr.Row():
                 research_query = gr.Textbox(label="Research Query", placeholder="Enter your research query here...", lines=2, elem_id="research-query", scale=4)
                 refine_query_button = gr.Button("Refine my Query", scale=1)
         with gr.Accordion("3] Q&A", open=False):
             with gr.Row():
-                clarification_text = gr.Textbox(label="Clarification / Follow-Up Questions", placeholder="Tailored clarifying suggestions will appear here...", lines=6, scale=4)
                 gen_followups = gr.Button("Generate Tailored Clarification Questions", scale=1)
         with gr.Accordion("4] Search Parameters", open=False):
@@ -814,7 +822,7 @@ def main():
         with gr.Accordion("5] Report", open=False, elem_classes="folder"):
             progress_display = gr.Markdown("", elem_id="progress-display")
             run_btn = gr.Button("Generate report")
-            final_report = gr.Markdown(label="Final Report (Markdown)", height=800, min_height=50)
             with gr.Accordion("Generate PDF", open=False, elem_classes="folder"):
                 with gr.Column():
                     query_name = gr.Textbox(label="Query name", placeholder="Enter query name...", lines=1)

 import logging
 import markdown
 import unicodedata
 from datetime import datetime
 from reportlab.lib.pagesizes import A4
 from xhtml2pdf import pisa
                          "(KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"}
 # =============================================================================
+# Helper functions for external APIs and PDF Processing
 # =============================================================================
 def display_image():
     prompt = (f"""
 Using the following learnings and merged reference details from a deep research process on '{initial_query}', produce a comprehensive research report in Markdown format.
 The report should be very detailed and lengthy — approximately the equivalent of {pages} pages (or {word_count} words) when printed.
+It must include inline citations (e.g., [1], [2], etc.).
+It must follow this writing style {reportstyle}.
+The report must include at least {round(pages/3,0)} tables from the sources used (add citations if necessary) and use facts and figures extensively to ground the analysis.
+The structure of the report should be:
 - Abstract
 - Table of contents
 - Introduction
+- [Sections and sub-sections, depending on the size and relevant topic]
 - Conclusion
+- References of the documents used in the inline citations
+Important: For the numbering of titles or numbered lists, use numbers (ex: 1.) and sub-units (1.1, 1.2... 1.1.1...,1.1.2...). This is to avoid issues when converting markdown to html.
+You should still use markdown for the stryling (titles levels, bold, italic), tables...
+Output the report directly without any introductory meta comments.
 Learnings:
 {json.dumps(learnings, indent=2)}
 Merged Reference Details:
 {aggregated_crumbs}"""
     )
     tokentarget = word_count * 3  # rough multiplier for token target
     report = openai_call(prompt, model="o3-mini", max_tokens_param=tokentarget)
+    # If the report is too long, compress it.
     if len(report) > MAX_MESSAGE_LENGTH:
         report = compress_text(report, MAX_MESSAGE_LENGTH)
     if report.startswith("Error calling OpenAI API"):
     return report
 def filter_search_results(results: list, visited_urls: set, query: str, clarifications: str) -> list:
+    # Filter out already seen results
     new_results = []
     candidate_indexes = []
     for idx, res in enumerate(results):
         url = res.get("link", "")
         if url and url not in visited_urls:
             new_results.append(res)
             candidate_indexes.append(idx)
     if not new_results:
         return []
+    # Build the prompt with relaxed criteria.
     results_text = ""
     for idx, res in enumerate(new_results):
         title = res.get("title", "No Title")
     prompt = (
         f"The following search results were obtained for the query '{query}' with clarifications:\n"
         f"{clarifications}\n\n"
+        "For each result, decide whether it might be of interest for deeper research. "
+        "Even if not completely certain, lean towards including more potential references. "
+        "Return your decision as a JSON object where each key is the result index (as an integer) and the value is either 'yes' or 'no'. "
+        "For example: {\"0\": \"yes\", \"1\": \"no\", \"2\": \"yes\"}.\n"
+        "Consider the title, snippet, and URL in your decision."
+        f"\nResults:{results_text}\n"
+        "Output only the JSON object."
     )
     llm_response = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=200)
     try:
         decision_map = json.loads(llm_response)
     except Exception as e:
         logging.error(f"filter_search_results: JSON decode error: {e}; Full response: {llm_response}")
+        # In case of error, default to no results selected.
         decision_map = {}
     filtered = []
     for idx, res in enumerate(new_results):
         url = res.get("link", "")
+        # Add each URL to visited regardless of decision.
         visited_urls.add(url)
         decision = decision_map.get(str(idx), "no").strip().lower()
         if decision == "yes":
     return filtered
 def make_multilingual_query(query: str, context: str, languagesdetected: str) -> str:
+    finalquery = f"({query})"  # original query is wrapped in parentheses
     languages_detected_list = languagesdetected.split(",")
     for lang in languages_detected_list:
         prompt2 = f"""The research query is: "{query}".
+Based on this query and context: "{context}", and with the detected language {lang}, provide the translated version of the query in that language.
+The translation must be less than 20 words and preserve search operators like AND, OR, parenthesis, quotation marks, and exclusion hyphens.
 Output only the translated query."""
         translatedquery = openai_call(prompt2, model="gpt-4o-mini", max_tokens_param=50)
         finalquery += f" OR ({translatedquery})"
     return finalquery
 def generate_query_tree(initial_query: str, breadth: int, depth: int) -> list:
     base_terms = initial_query.strip()
+    # Here you may add refinements if necessary to keep queries short.
+    queries = [base_terms]
+    # If topics are to be added, you can extend this list.
+    final_queries = queries[:min(len(queries), breadth)]
     logging.info(f"generate_query_tree: Generated queries: {final_queries}")
     return final_queries
                           selected_engines=None, results_per_query: int = 10) -> list:
     queries = generate_query_tree(initial_query, breadth, depth)
     prompt = f"""The research query is: "{initial_query}".
+Based on this query and the context: "{context}", suggest one or several languages (other than English) that might be relevant.
 Output either:
 - "No local attributes detected"
+- One language (e.g., "Spanish")
+- Multiple languages comma separated (e.g., "Italian,Putonghua,Cantonese")
+Output only the result.
+"""
     languages_detected = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=20)
     if languages_detected != "No local attributes detected":
         queries = [make_multilingual_query(q, context, languages_detected) for q in queries]
         prompt_engines = f"""
 Examine these queries:
 {queries}
+and considering the research context:
 {context}
+Identify among these search engines:
+google,google_jobs_listing,google_trends,google_news,google_scholar,google_ai_overview,bing,bing_news,baidu,baidu_news,yandex,youtube_video,linkedin,linkedin_profile,duckduckgo_news,yelp_reviews
+Which are most relevant? Output a comma separated list (e.g., "google,baidu").
+If none are found, output "google".
+"""
         identified_engines = openai_call(prompt_engines, model="gpt-4o-mini", max_tokens_param=20)
         selected_engines = identified_engines.split(",")
+    else:
+        selected_engines = selected_engines
     final_queries = []
     for q in queries:
         for engine in selected_engines:
     logging.info(f"refine_query: Refined query: {refined}")
     return refined
 class ReportGenerator:
     def __init__(self):
         pass
         solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
         # Remove markdown hyperlink syntax: replace [text](link) with just text.
         solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
+        # Convert markdown to HTML using the "extra" and "tables" extensions to support numbering and table syntax.
         html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
+        # Insert explicit page breaks before specific headings for main report sections.
         html_content = html_content.replace("<h2>Table of Contents</h2>",
             "<div style='page-break-before: always;'></div><h2>Table of Contents</h2>")
         html_content = html_content.replace("<h2>Introduction</h2>",
             "<div style='page-break-before: always;'></div><h2>Conclusion</h2>")
         html_content = html_content.replace("<h2>References</h2>",
             "<div style='page-break-before: always;'></div><h2>References</h2>")
+        # For the Surprise-Me section, ensure it starts on a new page.
         html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>",
             "<div style='page-break-before: always;'></div><h2>Surprise-Me Extension Report</h2>")
+        # Build header using metadata if provided.
         date_str = datetime.now().strftime("%Y-%m-%d")
         header = ""
         if metadata:
 <p>Author: {metadata.get('User name', 'N/A')}</p>
 <p>Date: {metadata.get('Date', date_str)}</p>
 <hr/>"""
+        # Build a complete HTML document with CSS.
         full_html = f"""
 <html>
 <head>
     <meta charset="utf-8" />
     <style>
+        body {{ font-family: Helvetica, sans-serif; margin: 40px; }}
         h1 {{ font-size: 24pt; margin-bottom: 12px; }}
         h2 {{ font-size: 20pt; margin-bottom: 10px; }}
         h3 {{ font-size: 18pt; margin-bottom: 8px; }}
         p {{ font-size: 11pt; line-height: 1.5; margin-bottom: 10px; }}
+        ol {{ font-size: 11pt; margin-left: 20px; margin-top: 0; margin-bottom: 10px; line-height: 1.5; }}
+        ul {{ font-size: 11pt; margin-left: 20px; margin-top: 0; margin-bottom: 10px; line-height: 1.5; }}
         hr {{ border: 1px solid #ccc; margin: 20px 0; }}
+        table {{
+            border-collapse: collapse;
+            width: 100%;
+            margin-bottom: 10px;
+        }}
+        th, td {{
+            border: 1px solid #ccc;
+            padding: 8px;
+            text-align: left;
+        }}
+        th {{
+            background-color: #f2f2f2;
+        }}
     </style>
 </head>
 <body>
 </body>
 </html>
 """
+        # Generate PDF from HTML using xhtml2pdf (pisa)
         pdf_buffer = io.BytesIO()
         pisa_status = pisa.CreatePDF(full_html, dest=pdf_buffer)
         if pisa_status.err:
             final_report = compress_text(final_report, MAX_MESSAGE_LENGTH)
         pdf_bytes = report_generator.generate_report_pdf_html(solution_content=final_report,
                                                               metadata=metadata)
+        # Create a temporary file for PDF download
         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
             tmp_file.write(pdf_bytes)
             tmp_path = tmp_file.name
         return f"Error generating report: {str(e)}", None
 def extract_summary_from_crumbs(crumbs_list: list) -> str:
     """Aggregate the URL and summary of every crumb into one text block.

     Each crumb is a mapping that must provide the 'url' and 'summary' keys;
     any other key (such as 'full_content') is ignored. Entries are joined
     with newlines in the order given.
     """
     entries = []
     for crumb in crumbs_list:
         entries.append(f"URL: {crumb['url']}\nSummary: {crumb['summary']}")
     aggregated = "\n".join(entries)
     logging.info("extract_summary_from_crumbs: Aggregated crumb summary created.")
     return aggregated
                   "Formulate this as a new research query that could lead to innovative insights.")
     disruptive_query = openai_call(new_prompt, model="gpt-4o-mini", max_tokens_param=500)
     logging.info(f"generate_surprise_report: Disruptive new query generated: {disruptive_query}")
+    # Generate tailored clarification questions for the disruptive query
     clarifications_for_new = generate_tailored_questions(
         os.getenv("OPENAI_API_KEY"),
+        disruptive_query + "\n\n IMPORTANT NOTE: in this specific iteration, generate also the responses for the questions asked (simulated)",
         "", "", "", ""
     )
     logging.info(f"generate_surprise_report: Clarification questions for new query: {clarifications_for_new}")
+    # Run iterative deep research for the disruptive query
     generator = iterative_deep_research_gen(
         disruptive_query, reportstyle, breadth, depth, followup_clarifications,
         include_domains, exclude_keywords, additional_clarifications,
     appended_report = previous_report + "\n\n<div style='page-break-before: always;'></div>\n<h2>Surprise-Me Extension Report</h2>\n\n" + clarifications_for_new + "\n\n" + extension_report
     return appended_report
+def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: int, depth: int,
                                 followup_clarifications: str,
                                 include_domains: str,
                                 exclude_keywords: str,
     references_list = []
     followup_suggestions = []
     logging.info("iterative_deep_research_gen: Research started.")
+    for iteration in range(1, depth + 1):
+        process_log += f"\n--- Iteration {iteration} ---\n"
+        logging.info(f"iterative_deep_research_gen: Starting iteration {iteration}.")
+        combined_context = overall_context
+        if followup_suggestions:
+            # Deduplicate follow-up suggestions before adding them to context.
+            unique_suggestions = list(set(followup_suggestions))
+            combined_context += "\nFollow-up suggestions: " + ", ".join(unique_suggestions)
+        queries = generate_serp_queries(combined_context, breadth, depth, initial_query, selected_engines, results_per_query)
+        process_log += f"Generated queries: {queries}\n"
+        iteration_learnings = []
+        followup_suggestions = []  # reset for current iteration
+        for query_tuple in queries:
+            query_str, engine = query_tuple
+            mod_query = query_str
+            if include_domains.strip():
+                domains = [d.strip() for d in include_domains.split(",") if d.strip()]
+                domain_str = " OR ".join([f"site:{d}" for d in domains])
+                mod_query += f" ({domain_str})"
+            if exclude_keywords.strip():
+                for ex in [ex.strip() for ex in exclude_keywords.split(",") if ex.strip()]:
+                    mod_query += f" -{ex}"
+            process_log += f"\nPerforming SERPAPI search with query: {mod_query} using engine: {engine}\n"
+            results = perform_serpapi_search(mod_query, engine, results_per_query)
+            # Instead of processing all results one-by-one, first filter them
+            filtered_results = filter_search_results(results, visited_urls, initial_query, followup_clarifications)
+            process_log += f"After filtering, {len(filtered_results)} results remain for processing.\n"
+            for res in filtered_results:
+                url = res.get("link", "")
+                if not url:
+                    continue
+                content = ""
+                if url.lower().endswith(".pdf"):
+                    content = process_pdf(url)
+                    if "Error processing PDF" in content:
                         continue
+                    process_log += f"Extracted PDF content from {url}\n"
+                else:
+                    try:
+                        response = requests.get(url, headers=HEADERS)
+                        response.raise_for_status()
+                        content = response.text
+                        process_log += f"Extracted full page content from {url}\n"
+                    except Exception as e:
+                        logging.error(f"Error retrieving content from {url}: {e}")
+                        process_log += f"Error retrieving content from {url}: {e}\n"
+                        continue
+                analysis = analyze_with_gpt4o(initial_query, content)
+                analysis_summary = analysis.get("summary", "").strip()
+                process_log += (f"Summary: {analysis.get('summary')}, Follow-ups: {analysis.get('followups')}\n")
+                if not analysis_summary:
+                    analysis_summary = content[:200] + "..." if len(content) > 200 else content
+                crumbs_list.append({
+                    "url": url,
+                    "summary": analysis_summary,
+                    "full_content": content
+                })
+                if analysis.get("relevant", "no").lower() == "yes":
+                    if url.startswith("http://") or url.startswith("https://"):
+                        link_str = f" <a href='{url}'>[{ref_counter}]</a>"
                     else:
+                        link_str = f" [{ref_counter}]"
+                    summary_with_ref = analysis_summary + link_str
+                    iteration_learnings.append(summary_with_ref)
+                    references_list.append((ref_counter, url))
+                    ref_counter += 1
+                    if isinstance(analysis.get("followups"), list):
+                        followup_suggestions.extend(analysis.get("followups"))
+        process_log += f"Iteration {iteration} extracted {len(iteration_learnings)} learnings.\n"
+        logging.info(f"iterative_deep_research_gen: Iteration {iteration} extracted {len(iteration_learnings)} learnings.")
+        overall_learnings.extend(iteration_learnings)
+        overall_context += f"\nIteration {iteration} learnings:\n" + "\n".join(iteration_learnings) + "\n"
+        if additional_clarifications.strip():
+            overall_context += "\nAdditional Clarifications from user: " + additional_clarifications.strip() + "\n"
+            process_log += "Appended additional clarifications to the context.\n"
+        progress_pct = int((iteration / depth) * 100)
+        yield (f"Progress: {progress_pct}%", None, None, None)
+    aggregated_crumbs = "\n".join([f"URL: {c['url']}\nSummary: {c['summary']}" for c in crumbs_list])
+    final_report = generate_final_report(initial_query, reportstyle, overall_learnings, list(visited_urls), aggregated_crumbs, references_list, pages=go_deeper)
+    alignment_assessment = assess_report_alignment(final_report, initial_query, followup_clarifications)
+    final_report += "\n\n\n\n\n**Report alignment assessment:**\n" + alignment_assessment
+    logging.info("iterative_deep_research_gen: Final report generated.")
+    yield ("", final_report, process_log, crumbs_list)
 def assess_report_alignment(report: str, initial_query: str, clarifications: str) -> str:
     """Ask the model for a one-paragraph assessment of the report's alignment.

     The prompt is assembled from the initial query, the clarification text
     and the full report, then sent through ``openai_call``. The model reply
     is logged and returned unchanged.
     """
     instructions = "and the clarification Q&A provided. Ensure that the report covers key points of the topic.\n\n"
     query_section = "Initial Query: " + initial_query + "\nClarifications: " + clarifications + "\n\n"
     report_section = "Research Report:\n" + report + "\n\n"
     closing = "Provide a short assessment in one paragraph on how well the report aligns with these requirements."
     prompt = instructions + query_section + report_section + closing
     assessment = openai_call(prompt, model="gpt-3.5-turbo", max_tokens_param=200)
     logging.info(f"assess_report_alignment: Assessment result: {assessment}")
     return assessment
+def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query: str, reportstyle: str, breadth: int, depth: int,
+                      followup_clarifications: str, include_domains: str,
+                      exclude_keywords: str, additional_clarifications: str,
+                      results_per_query: int, selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
+                      pages: str, surprise_me: bool):
     if not openai_api_key or not serpapi_api_key:
+        logging.error("run_deep_research: Invalid API keys provided.")
+        return "Please input valid API keys", "", "", "", ""
     os.environ["OPENAI_API_KEY"] = openai_api_key
     os.environ["SERPAPI_API_KEY"] = serpapi_api_key
     if existing_crumbs:
         extra_context += f"Existing Crumbs:\n{existing_crumbs}\n"
+    final_progress = ""
     final_report = ""
+    final_process_log = ""
+    final_crumbs = ""
+    logging.info("run_deep_research: Starting deep research process.")
+    for progress, rep, proc_log, crumbs in iterative_deep_research_gen(
+            initial_query, reportstyle, breadth, depth, followup_clarifications,
+            include_domains, exclude_keywords, additional_clarifications,
+            extra_context, selected_engines, results_per_query, go_deeper=int(pages)):
         if rep is None:
+            final_progress = progress
+            yield final_progress, None, None, None, None
         else:
             final_report = rep
+            final_process_log = proc_log
+            final_crumbs = crumbs
             break
     if surprise_me:
+        extended_report = generate_surprise_report(
+            final_report, final_crumbs, initial_query, reportstyle, breadth, depth,
+            followup_clarifications, include_domains, exclude_keywords, additional_clarifications,
+            results_per_query, selected_engines
+        )
         final_report = extended_report
+        final_progress = "Progress: 100% (\"Surprise Me\" extension complete)"
+    logging.info("run_deep_research: Deep research process completed.")
+    yield (final_progress, final_report, final_report, final_process_log, final_crumbs)
 def load_example(example_choice: str) -> str:
     filename = ""
         logging.error(f"load_example: Error loading {filename}: {e}")
         return ""
 def main():
     custom_css = """
     /* Overall container customization */
             openai_api_key_input = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API Key here...", type="password")
             serpapi_api_key_input = gr.Textbox(label="SERPAPI API Key", placeholder="Enter your SERPAPI API Key here...", type="password")
             gr.Markdown("[Create OpenAI API Key](https://platform.openai.com/account/api-keys) | [Create SERPAPI API Key](https://serpapi.com/manage-api-key)")
+            gr.Markdown("You can check the open-source code - None of the user API keys are stored or logged.")
+        with gr.Accordion ("2] Research topic", open=False):
             with gr.Row():
                 research_query = gr.Textbox(label="Research Query", placeholder="Enter your research query here...", lines=2, elem_id="research-query", scale=4)
                 refine_query_button = gr.Button("Refine my Query", scale=1)
         with gr.Accordion("3] Q&A", open=False):
             with gr.Row():
+                clarification_text = gr.Textbox(label="Clarification / Follow-Up Questions", placeholder="Tailored clarifying suggestions will appear here...", lines=6, scale = 4)
                 gen_followups = gr.Button("Generate Tailored Clarification Questions", scale=1)
         with gr.Accordion("4] Search Parameters", open=False):
         with gr.Accordion("5] Report", open=False, elem_classes="folder"):
             progress_display = gr.Markdown("", elem_id="progress-display")
             run_btn = gr.Button("Generate report")
+            final_report = gr.Markdown(label="Final Report (Markdown)", height = 800, min_height = 50)
             with gr.Accordion("Generate PDF", open=False, elem_classes="folder"):
                 with gr.Column():
                     query_name = gr.Textbox(label="Query name", placeholder="Enter query name...", lines=1)