Spaces:

10gen
/

deepsearchitv2

Runtime error

App Files Community

Guiyom committed on Feb 13, 2025

Commit

f807498

verified ·

1 Parent(s): 9db83ee

Update app.py

Browse files

Files changed (1) hide show

app.py +186 -211

app.py CHANGED Viewed

@@ -12,6 +12,8 @@ import tempfile
 import logging
 import markdown
 import unicodedata
 from datetime import datetime
 from reportlab.lib.pagesizes import A4
 from xhtml2pdf import pisa
@@ -27,7 +29,7 @@ HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/
                          "(KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"}
 # =============================================================================
-# Helper functions for external APIs and PDF Processing
 # =============================================================================
 def display_image():
@@ -176,32 +178,24 @@ def generate_final_report(initial_query: str, reportstyle: str, learnings: list,
     prompt = (f"""
 Using the following learnings and merged reference details from a deep research process on '{initial_query}', produce a comprehensive research report in Markdown format.
 The report should be very detailed and lengthy — approximately the equivalent of {pages} pages (or {word_count} words) when printed.
-It must include inline citations (e.g., [1], [2], etc.).
-It must follow this writing style {reportstyle}.
-The report must include at least {round(pages/3,0)} tables from the sources used (add citations if necessary) and use facts and figures extensively to ground the analysis.
-The structure of the report should be:
 - Abstract
 - Table of contents
 - Introduction
-- [Sections and sub-sections, depending on the size and relevant topic]
 - Conclusion
-- References of the documents used in the inline citations
-Important: For the numbering of titles or numbered lists, use numbers (ex: 1.) and sub-units (1.1, 1.2... 1.1.1...,1.1.2...). This is to avoid issues when converting markdown to html.
-You should still use markdown for the stryling (titles levels, bold, italic), tables...
-Output the report directly without any introductory meta comments.
 Learnings:
 {json.dumps(learnings, indent=2)}
 Merged Reference Details:
 {aggregated_crumbs}"""
     )
     tokentarget = word_count * 3  # rough multiplier for token target
     report = openai_call(prompt, model="o3-mini", max_tokens_param=tokentarget)
-    # If the report is too long, compress it.
     if len(report) > MAX_MESSAGE_LENGTH:
         report = compress_text(report, MAX_MESSAGE_LENGTH)
     if report.startswith("Error calling OpenAI API"):
@@ -211,17 +205,21 @@ Merged Reference Details:
     return report
 def filter_search_results(results: list, visited_urls: set, query: str, clarifications: str) -> list:
-    # Filter out already seen results
     new_results = []
     candidate_indexes = []
     for idx, res in enumerate(results):
         url = res.get("link", "")
         if url and url not in visited_urls:
             new_results.append(res)
             candidate_indexes.append(idx)
     if not new_results:
         return []
-    # Build the prompt with relaxed criteria.
     results_text = ""
     for idx, res in enumerate(new_results):
         title = res.get("title", "No Title")
@@ -231,25 +229,18 @@ def filter_search_results(results: list, visited_urls: set, query: str, clarific
     prompt = (
         f"The following search results were obtained for the query '{query}' with clarifications:\n"
         f"{clarifications}\n\n"
-        "For each result, decide whether it might be of interest for deeper research. "
-        "Even if not completely certain, lean towards including more potential references. "
-        "Return your decision as a JSON object where each key is the result index (as an integer) and the value is either 'yes' or 'no'. "
-        "For example: {\"0\": \"yes\", \"1\": \"no\", \"2\": \"yes\"}.\n"
-        "Consider the title, snippet, and URL in your decision."
-        f"\nResults:{results_text}\n"
-        "Output only the JSON object."
     )
     llm_response = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=200)
     try:
         decision_map = json.loads(llm_response)
     except Exception as e:
         logging.error(f"filter_search_results: JSON decode error: {e}; Full response: {llm_response}")
-        # In case of error, default to no results selected.
         decision_map = {}
     filtered = []
     for idx, res in enumerate(new_results):
         url = res.get("link", "")
-        # Add each URL to visited regardless of decision.
         visited_urls.add(url)
         decision = decision_map.get(str(idx), "no").strip().lower()
         if decision == "yes":
@@ -258,12 +249,11 @@ def filter_search_results(results: list, visited_urls: set, query: str, clarific
     return filtered
 def make_multilingual_query(query: str, context: str, languagesdetected: str) -> str:
-    finalquery = f"({query})"  # original query is wrapped in parentheses
     languages_detected_list = languagesdetected.split(",")
     for lang in languages_detected_list:
         prompt2 = f"""The research query is: "{query}".
-Based on this query and context: "{context}", and with the detected language {lang}, provide the translated version of the query in that language.
-The translation must be less than 20 words and preserve search operators like AND, OR, parenthesis, quotation marks, and exclusion hyphens.
 Output only the translated query."""
         translatedquery = openai_call(prompt2, model="gpt-4o-mini", max_tokens_param=50)
         finalquery += f" OR ({translatedquery})"
@@ -271,11 +261,14 @@ Output only the translated query."""
     return finalquery
 def generate_query_tree(initial_query: str, breadth: int, depth: int) -> list:
     base_terms = initial_query.strip()
-    # Here you may add refinements if necessary to keep queries short.
-    queries = [base_terms]
-    # If topics are to be added, you can extend this list.
-    final_queries = queries[:min(len(queries), breadth)]
     logging.info(f"generate_query_tree: Generated queries: {final_queries}")
     return final_queries
@@ -283,13 +276,11 @@ def generate_serp_queries(context: str, breadth: int, depth: int, initial_query:
                           selected_engines=None, results_per_query: int = 10) -> list:
     queries = generate_query_tree(initial_query, breadth, depth)
     prompt = f"""The research query is: "{initial_query}".
-Based on this query and the context: "{context}", suggest one or several languages (other than English) that might be relevant.
 Output either:
 - "No local attributes detected"
-- One language (e.g., "Spanish")
-- Multiple languages comma separated (e.g., "Italian,Putonghua,Cantonese")
-Output only the result.
-"""
     languages_detected = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=20)
     if languages_detected != "No local attributes detected":
         queries = [make_multilingual_query(q, context, languages_detected) for q in queries]
@@ -297,17 +288,12 @@ Output only the result.
         prompt_engines = f"""
 Examine these queries:
 {queries}
-and considering the research context:
 {context}
-Identify among these search engines:
-google,google_jobs_listing,google_trends,google_news,google_scholar,google_ai_overview,bing,bing_news,baidu,baidu_news,yandex,youtube_video,linkedin,linkedin_profile,duckduckgo_news,yelp_reviews
-Which are most relevant? Output a comma separated list (e.g., "google,baidu").
-If none are found, output "google".
-"""
         identified_engines = openai_call(prompt_engines, model="gpt-4o-mini", max_tokens_param=20)
         selected_engines = identified_engines.split(",")
-    else:
-        selected_engines = selected_engines
     final_queries = []
     for q in queries:
         for engine in selected_engines:
@@ -393,6 +379,23 @@ def refine_query(query: str, openai_api_key: str) -> str:
     logging.info(f"refine_query: Refined query: {refined}")
     return refined
 class ReportGenerator:
     def __init__(self):
         pass
@@ -403,9 +406,9 @@ class ReportGenerator:
         solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
         # Remove markdown hyperlink syntax: replace [text](link) with just text.
         solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
-        # Convert markdown to HTML using the "extra" and "tables" extensions to support numbering and table syntax.
         html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
-        # Insert explicit page breaks before specific headings for main report sections.
         html_content = html_content.replace("<h2>Table of Contents</h2>",
             "<div style='page-break-before: always;'></div><h2>Table of Contents</h2>")
         html_content = html_content.replace("<h2>Introduction</h2>",
@@ -414,10 +417,8 @@ class ReportGenerator:
             "<div style='page-break-before: always;'></div><h2>Conclusion</h2>")
         html_content = html_content.replace("<h2>References</h2>",
             "<div style='page-break-before: always;'></div><h2>References</h2>")
-        # For the Surprise-Me section, ensure it starts on a new page.
         html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>",
             "<div style='page-break-before: always;'></div><h2>Surprise-Me Extension Report</h2>")
-        # Build header using metadata if provided.
         date_str = datetime.now().strftime("%Y-%m-%d")
         header = ""
         if metadata:
@@ -425,33 +426,21 @@ class ReportGenerator:
 <p>Author: {metadata.get('User name', 'N/A')}</p>
 <p>Date: {metadata.get('Date', date_str)}</p>
 <hr/>"""
-        # Build a complete HTML document with CSS.
         full_html = f"""
 <html>
 <head>
     <meta charset="utf-8" />
     <style>
-        body {{ font-family: Helvetica, sans-serif; margin: 40px; }}
         h1 {{ font-size: 24pt; margin-bottom: 12px; }}
         h2 {{ font-size: 20pt; margin-bottom: 10px; }}
         h3 {{ font-size: 18pt; margin-bottom: 8px; }}
         p {{ font-size: 11pt; line-height: 1.5; margin-bottom: 10px; }}
-        ol {{ font-size: 11pt; margin-left: 20px; margin-top: 0; margin-bottom: 10px; line-height: 1.5; }}
-        ul {{ font-size: 11pt; margin-left: 20px; margin-top: 0; margin-bottom: 10px; line-height: 1.5; }}
         hr {{ border: 1px solid #ccc; margin: 20px 0; }}
-        table {{
-            border-collapse: collapse;
-            width: 100%;
-            margin-bottom: 10px;
-        }}
-        th, td {{
-            border: 1px solid #ccc;
-            padding: 8px;
-            text-align: left;
-        }}
-        th {{
-            background-color: #f2f2f2;
-        }}
     </style>
 </head>
 <body>
@@ -460,7 +449,6 @@ class ReportGenerator:
 </body>
 </html>
 """
-        # Generate PDF from HTML using xhtml2pdf (pisa)
         pdf_buffer = io.BytesIO()
         pisa_status = pisa.CreatePDF(full_html, dest=pdf_buffer)
         if pisa_status.err:
@@ -481,7 +469,6 @@ def handle_generate_report(query_name: str, user_name: str, final_report: str):
             final_report = compress_text(final_report, MAX_MESSAGE_LENGTH)
         pdf_bytes = report_generator.generate_report_pdf_html(solution_content=final_report,
                                                               metadata=metadata)
-        # Create a temporary file for PDF download
         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
             tmp_file.write(pdf_bytes)
             tmp_path = tmp_file.name
@@ -492,10 +479,6 @@ def handle_generate_report(query_name: str, user_name: str, final_report: str):
         return f"Error generating report: {str(e)}", None
 def extract_summary_from_crumbs(crumbs_list: list) -> str:
-    """
-    Given a list of crumb records (each with 'url', 'summary', and 'full_content'),
-    extract and aggregate only the summary parts.
-    """
     aggregated = "\n".join([f"URL: {c['url']}\nSummary: {c['summary']}" for c in crumbs_list])
     logging.info("extract_summary_from_crumbs: Aggregated crumb summary created.")
     return aggregated
@@ -514,16 +497,12 @@ def generate_surprise_report(previous_report: str, crumbs_list: list, initial_qu
                   "Formulate this as a new research query that could lead to innovative insights.")
     disruptive_query = openai_call(new_prompt, model="gpt-4o-mini", max_tokens_param=500)
     logging.info(f"generate_surprise_report: Disruptive new query generated: {disruptive_query}")
-    # Generate tailored clarification questions for the disruptive query
     clarifications_for_new = generate_tailored_questions(
         os.getenv("OPENAI_API_KEY"),
-        disruptive_query + "\n\n IMPORTANT NOTE: in this specific iteration, generate also the responses for the questions asked (simulated)",
         "", "", "", ""
     )
     logging.info(f"generate_surprise_report: Clarification questions for new query: {clarifications_for_new}")
-    # Run iterative deep research for the disruptive query
     generator = iterative_deep_research_gen(
         disruptive_query, reportstyle, breadth, depth, followup_clarifications,
         include_domains, exclude_keywords, additional_clarifications,
@@ -538,7 +517,8 @@ def generate_surprise_report(previous_report: str, crumbs_list: list, initial_qu
     appended_report = previous_report + "\n\n<div style='page-break-before: always;'></div>\n<h2>Surprise-Me Extension Report</h2>\n\n" + clarifications_for_new + "\n\n" + extension_report
     return appended_report
-def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: int, depth: int,
                                 followup_clarifications: str,
                                 include_domains: str,
                                 exclude_keywords: str,
@@ -558,90 +538,93 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
     references_list = []
     followup_suggestions = []
     logging.info("iterative_deep_research_gen: Research started.")
-    for iteration in range(1, depth + 1):
-        process_log += f"\n--- Iteration {iteration} ---\n"
-        logging.info(f"iterative_deep_research_gen: Starting iteration {iteration}.")
-        combined_context = overall_context
-        if followup_suggestions:
-            # Deduplicate follow-up suggestions before adding them to context.
-            unique_suggestions = list(set(followup_suggestions))
-            combined_context += "\nFollow-up suggestions: " + ", ".join(unique_suggestions)
-        queries = generate_serp_queries(combined_context, breadth, depth, initial_query, selected_engines, results_per_query)
-        process_log += f"Generated queries: {queries}\n"
-        iteration_learnings = []
-        followup_suggestions = []  # reset for current iteration
-        for query_tuple in queries:
-            query_str, engine = query_tuple
-            mod_query = query_str
-            if include_domains.strip():
-                domains = [d.strip() for d in include_domains.split(",") if d.strip()]
-                domain_str = " OR ".join([f"site:{d}" for d in domains])
-                mod_query += f" ({domain_str})"
-            if exclude_keywords.strip():
-                for ex in [ex.strip() for ex in exclude_keywords.split(",") if ex.strip()]:
-                    mod_query += f" -{ex}"
-            process_log += f"\nPerforming SERPAPI search with query: {mod_query} using engine: {engine}\n"
-            results = perform_serpapi_search(mod_query, engine, results_per_query)
-            # Instead of processing all results one-by-one, first filter them
-            filtered_results = filter_search_results(results, visited_urls, initial_query, followup_clarifications)
-            process_log += f"After filtering, {len(filtered_results)} results remain for processing.\n"
-            for res in filtered_results:
-                url = res.get("link", "")
-                if not url:
-                    continue
-                content = ""
-                if url.lower().endswith(".pdf"):
-                    content = process_pdf(url)
-                    if "Error processing PDF" in content:
                         continue
-                    process_log += f"Extracted PDF content from {url}\n"
-                else:
-                    try:
-                        response = requests.get(url, headers=HEADERS)
-                        response.raise_for_status()
-                        content = response.text
-                        process_log += f"Extracted full page content from {url}\n"
-                    except Exception as e:
-                        logging.error(f"Error retrieving content from {url}: {e}")
-                        process_log += f"Error retrieving content from {url}: {e}\n"
-                        continue
-                analysis = analyze_with_gpt4o(initial_query, content)
-                analysis_summary = analysis.get("summary", "").strip()
-                process_log += (f"Summary: {analysis.get('summary')}, Follow-ups: {analysis.get('followups')}\n")
-                if not analysis_summary:
-                    analysis_summary = content[:200] + "..." if len(content) > 200 else content
-                crumbs_list.append({
-                    "url": url,
-                    "summary": analysis_summary,
-                    "full_content": content
-                })
-                if analysis.get("relevant", "no").lower() == "yes":
-                    if url.startswith("http://") or url.startswith("https://"):
-                        link_str = f" <a href='{url}'>[{ref_counter}]</a>"
                     else:
-                        link_str = f" [{ref_counter}]"
-                    summary_with_ref = analysis_summary + link_str
-                    iteration_learnings.append(summary_with_ref)
-                    references_list.append((ref_counter, url))
-                    ref_counter += 1
-                    if isinstance(analysis.get("followups"), list):
-                        followup_suggestions.extend(analysis.get("followups"))
-        process_log += f"Iteration {iteration} extracted {len(iteration_learnings)} learnings.\n"
-        logging.info(f"iterative_deep_research_gen: Iteration {iteration} extracted {len(iteration_learnings)} learnings.")
-        overall_learnings.extend(iteration_learnings)
-        overall_context += f"\nIteration {iteration} learnings:\n" + "\n".join(iteration_learnings) + "\n"
-        if additional_clarifications.strip():
-            overall_context += "\nAdditional Clarifications from user: " + additional_clarifications.strip() + "\n"
-            process_log += "Appended additional clarifications to the context.\n"
-        progress_pct = int((iteration / depth) * 100)
-        yield (f"Progress: {progress_pct}%", None, None, None)
-    aggregated_crumbs = "\n".join([f"URL: {c['url']}\nSummary: {c['summary']}" for c in crumbs_list])
-    final_report = generate_final_report(initial_query, reportstyle, overall_learnings, list(visited_urls), aggregated_crumbs, references_list, pages=go_deeper)
-    alignment_assessment = assess_report_alignment(final_report, initial_query, followup_clarifications)
-    final_report += "\n\n\n\n\n**Report alignment assessment:**\n" + alignment_assessment
-    logging.info("iterative_deep_research_gen: Final report generated.")
-    yield ("", final_report, process_log, crumbs_list)
 def assess_report_alignment(report: str, initial_query: str, clarifications: str) -> str:
     prompt = (
@@ -649,21 +632,21 @@ def assess_report_alignment(report: str, initial_query: str, clarifications: str
         "and the clarification Q&A provided. Ensure that the report covers key points of the topic.\n\n"
         "Initial Query: " + initial_query + "\nClarifications: " + clarifications + "\n\n"
         "Research Report:\n" + report + "\n\n"
-        "Provide a short assessment in one paragraph on how well the report aligns with these requirements."
     )
     assessment = openai_call(prompt, model="gpt-3.5-turbo", max_tokens_param=200)
     logging.info(f"assess_report_alignment: Assessment result: {assessment}")
     return assessment
-def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query: str, reportstyle: str, breadth: int, depth: int,
-                      followup_clarifications: str, include_domains: str,
-                      exclude_keywords: str, additional_clarifications: str,
-                      results_per_query: int, selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
-                      pages: str, surprise_me: bool):
     if not openai_api_key or not serpapi_api_key:
-        logging.error("run_deep_research: Invalid API keys provided.")
-        return "Please input valid API keys", "", "", "", ""
     os.environ["OPENAI_API_KEY"] = openai_api_key
     os.environ["SERPAPI_API_KEY"] = serpapi_api_key
@@ -675,50 +658,42 @@ def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query:
     if existing_crumbs:
         extra_context += f"Existing Crumbs:\n{existing_crumbs}\n"
-    final_progress = ""
     final_report = ""
-    final_process_log = ""
-    final_crumbs = ""
-    logging.info("run_deep_research: Starting deep research process.")
-    for progress, rep, proc_log, crumbs in iterative_deep_research_gen(
-            initial_query, reportstyle, breadth, depth, followup_clarifications,
-            include_domains, exclude_keywords, additional_clarifications,
-            extra_context, selected_engines, results_per_query, go_deeper=int(pages)):
         if rep is None:
-            final_progress = progress
-            yield final_progress, None, None, None, None
         else:
             final_report = rep
-            final_process_log = proc_log
-            final_crumbs = crumbs
             break
     if surprise_me:
-        extended_report = generate_surprise_report(
-            final_report, final_crumbs, initial_query, reportstyle, breadth, depth,
-            followup_clarifications, include_domains, exclude_keywords, additional_clarifications,
-            results_per_query, selected_engines
-        )
         final_report = extended_report
-        final_progress = "Progress: 100% (\"Surprise Me\" extension complete)"
-    logging.info("run_deep_research: Deep research process completed.")
-    yield (final_progress, final_report, final_report, final_process_log, final_crumbs)
-def load_example(example_choice: str) -> str:
-    filename = ""
-    if example_choice == "Implications of the release of advanced Deep Research solutions":
-        filename = "example1.txt"
-    elif example_choice == "AI regulation in finance":
-        filename = "example2.txt"
-    elif example_choice == "AI top voices":
-        filename = "example3.txt"
-    try:
-        with open(filename, "r", encoding="utf-8") as f:
-            content = f.read()
-        logging.info(f"load_example: Loaded content from {filename}")
-        return content
-    except Exception as e:
-        logging.error(f"load_example: Error loading {filename}: {e}")
-        return ""
 def main():
     custom_css = """
@@ -772,16 +747,16 @@ def main():
             openai_api_key_input = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API Key here...", type="password")
             serpapi_api_key_input = gr.Textbox(label="SERPAPI API Key", placeholder="Enter your SERPAPI API Key here...", type="password")
             gr.Markdown("[Create OpenAI API Key](https://platform.openai.com/account/api-keys) | [Create SERPAPI API Key](https://serpapi.com/manage-api-key)")
-            gr.Markdown("You can check the open-source code - None of the user API keys are stored or logged.")
-        with gr.Accordion ("2] Research topic", open=False):
             with gr.Row():
                 research_query = gr.Textbox(label="Research Query", placeholder="Enter your research query here...", lines=2, elem_id="research-query", scale=4)
                 refine_query_button = gr.Button("Refine my Query", scale=1)
         with gr.Accordion("3] Q&A", open=False):
             with gr.Row():
-                clarification_text = gr.Textbox(label="Clarification / Follow-Up Questions", placeholder="Tailored clarifying suggestions will appear here...", lines=6, scale = 4)
                 gen_followups = gr.Button("Generate Tailored Clarification Questions", scale=1)
         with gr.Accordion("4] Search Parameters", open=False):
@@ -822,7 +797,7 @@ def main():
         with gr.Accordion("5] Report", open=False, elem_classes="folder"):
             progress_display = gr.Markdown("", elem_id="progress-display")
             run_btn = gr.Button("Generate report")
-            final_report = gr.Markdown(label="Final Report (Markdown)", height = 800, min_height = 50)
             with gr.Accordion("Generate PDF", open=False, elem_classes="folder"):
                 with gr.Column():
                     query_name = gr.Textbox(label="Query name", placeholder="Enter query name...", lines=1)

 import logging
 import markdown
 import unicodedata
+import asyncio
+import aiohttp
 from datetime import datetime
 from reportlab.lib.pagesizes import A4
 from xhtml2pdf import pisa
                          "(KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"}
 # =============================================================================
+# Helper functions for external APIs, PDF Processing and Asynchronous Requests
 # =============================================================================
 def display_image():
     prompt = (f"""
 Using the following learnings and merged reference details from a deep research process on '{initial_query}', produce a comprehensive research report in Markdown format.
 The report should be very detailed and lengthy — approximately the equivalent of {pages} pages (or {word_count} words) when printed.
+It must include inline citations (e.g., [1], [2], etc.) and follow this writing style: {reportstyle}.
+Include at least {round(pages/3,0)} tables from the sources used (citations added if necessary).
+The structure should have:
 - Abstract
 - Table of contents
 - Introduction
+- [Sections and sub-sections as needed]
 - Conclusion
+- References
+Important: Number titles and lists as 1., 1.1, etc.
 Learnings:
 {json.dumps(learnings, indent=2)}
 Merged Reference Details:
 {aggregated_crumbs}"""
     )
     tokentarget = word_count * 3  # rough multiplier for token target
     report = openai_call(prompt, model="o3-mini", max_tokens_param=tokentarget)
     if len(report) > MAX_MESSAGE_LENGTH:
         report = compress_text(report, MAX_MESSAGE_LENGTH)
     if report.startswith("Error calling OpenAI API"):
     return report
 def filter_search_results(results: list, visited_urls: set, query: str, clarifications: str) -> list:
+    # Filter out already seen results by URL and domain (robust deduplication)
     new_results = []
     candidate_indexes = []
+    seen_domains = set()
     for idx, res in enumerate(results):
         url = res.get("link", "")
         if url and url not in visited_urls:
+            domain = url.split("/")[2] if "://" in url else url
+            if domain in seen_domains:
+                continue
             new_results.append(res)
             candidate_indexes.append(idx)
+            seen_domains.add(domain)
     if not new_results:
         return []
     results_text = ""
     for idx, res in enumerate(new_results):
         title = res.get("title", "No Title")
     prompt = (
         f"The following search results were obtained for the query '{query}' with clarifications:\n"
         f"{clarifications}\n\n"
+        "For each result, decide if it might be relevant for deeper research. Return a JSON object with keys as result indices and values as 'yes' or 'no'.\n"
+        f"Results:{results_text}\nOutput only the JSON object."
     )
     llm_response = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=200)
     try:
         decision_map = json.loads(llm_response)
     except Exception as e:
         logging.error(f"filter_search_results: JSON decode error: {e}; Full response: {llm_response}")
         decision_map = {}
     filtered = []
     for idx, res in enumerate(new_results):
         url = res.get("link", "")
         visited_urls.add(url)
         decision = decision_map.get(str(idx), "no").strip().lower()
         if decision == "yes":
     return filtered
 def make_multilingual_query(query: str, context: str, languagesdetected: str) -> str:
+    finalquery = f"({query})"  # original query in parentheses
     languages_detected_list = languagesdetected.split(",")
     for lang in languages_detected_list:
         prompt2 = f"""The research query is: "{query}".
+Based on this query and context: "{context}", and using the detected language {lang}, provide a translated version (less than 20 words) preserving search operators.
 Output only the translated query."""
         translatedquery = openai_call(prompt2, model="gpt-4o-mini", max_tokens_param=50)
         finalquery += f" OR ({translatedquery})"
     return finalquery
 def generate_query_tree(initial_query: str, breadth: int, depth: int) -> list:
+    # Generate several variants of the query based on the desired breadth.
     base_terms = initial_query.strip()
+    variants = [base_terms,
+                base_terms + " detailed analysis",
+                base_terms + " review",
+                base_terms + " case study"]
+    # Return only as many as needed (up to 'breadth')
+    final_queries = variants[:min(len(variants), breadth)]
     logging.info(f"generate_query_tree: Generated queries: {final_queries}")
     return final_queries
                           selected_engines=None, results_per_query: int = 10) -> list:
     queries = generate_query_tree(initial_query, breadth, depth)
     prompt = f"""The research query is: "{initial_query}".
+Based on the context: "{context}", suggest non-English languages (if any) relevant.
 Output either:
 - "No local attributes detected"
+- A comma-separated list (e.g., "Spanish,Italian")
+Output only the result."""
     languages_detected = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=20)
     if languages_detected != "No local attributes detected":
         queries = [make_multilingual_query(q, context, languages_detected) for q in queries]
         prompt_engines = f"""
 Examine these queries:
 {queries}
+Considering the context:
 {context}
+Identify among these search engines: google,google_jobs_listing,google_trends,google_news,google_scholar,google_ai_overview,bing,bing_news,baidu,baidu_news,yandex,youtube_video,linkedin,linkedin_profile,duckduckgo_news,yelp_reviews.
+Return a comma separated list (default "google" if none)."""
         identified_engines = openai_call(prompt_engines, model="gpt-4o-mini", max_tokens_param=20)
         selected_engines = identified_engines.split(",")
     final_queries = []
     for q in queries:
         for engine in selected_engines:
     logging.info(f"refine_query: Refined query: {refined}")
     return refined
+# --- New Asynchronous Helper for Parallel URL Fetching --- #
+async def async_fetch_url(session: aiohttp.ClientSession, url: str) -> str:
+    """Fetch the URL asynchronously using aiohttp."""
+    try:
+        async with session.get(url, headers=HEADERS, timeout=10) as response:
+            response.raise_for_status()
+            text = await response.text()
+            logging.info(f"async_fetch_url: Fetched content from {url}")
+            return text
+    except Exception as e:
+        logging.error(f"async_fetch_url: Error retrieving content from {url}: {e}")
+        return ""
+# =============================================================================
+# ReportGenerator and PDF generation (Enhanced CSS added)
+# =============================================================================
 class ReportGenerator:
     def __init__(self):
         """Create a report generator; no instance state is initialized here."""
         pass
         solution_content = re.sub(r'[\u2010\u2011\u2012\u2013\u2014\u2015]', "-", solution_content)
         # Remove markdown hyperlink syntax: replace [text](link) with just text.
         solution_content = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', solution_content)
+        # Convert markdown to HTML using the "extra" and "tables" extensions.
         html_content = markdown.markdown(solution_content, extensions=['extra', 'tables'])
+        # Insert explicit page breaks before key headings (with added CSS for dynamic styling).
         html_content = html_content.replace("<h2>Table of Contents</h2>",
             "<div style='page-break-before: always;'></div><h2>Table of Contents</h2>")
         html_content = html_content.replace("<h2>Introduction</h2>",
             "<div style='page-break-before: always;'></div><h2>Conclusion</h2>")
         html_content = html_content.replace("<h2>References</h2>",
             "<div style='page-break-before: always;'></div><h2>References</h2>")
         html_content = html_content.replace("<h2>Surprise-Me Extension Report</h2>",
             "<div style='page-break-before: always;'></div><h2>Surprise-Me Extension Report</h2>")
         date_str = datetime.now().strftime("%Y-%m-%d")
         header = ""
         if metadata:
 <p>Author: {metadata.get('User name', 'N/A')}</p>
 <p>Date: {metadata.get('Date', date_str)}</p>
 <hr/>"""
         full_html = f"""
 <html>
 <head>
     <meta charset="utf-8" />
     <style>
+        body {{ font-family: Helvetica, sans-serif; margin: 40px; background: #fefefe; }}
         h1 {{ font-size: 24pt; margin-bottom: 12px; }}
         h2 {{ font-size: 20pt; margin-bottom: 10px; }}
         h3 {{ font-size: 18pt; margin-bottom: 8px; }}
         p {{ font-size: 11pt; line-height: 1.5; margin-bottom: 10px; }}
+        ol, ul {{ font-size: 11pt; margin-left: 20px; line-height: 1.5; }}
         hr {{ border: 1px solid #ccc; margin: 20px 0; }}
+        table {{ border-collapse: collapse; width: 100%; margin-bottom: 10px; }}
+        th, td {{ border: 1px solid #ccc; padding: 8px; text-align: left; }}
+        th {{ background-color: #f2f2f2; }}
     </style>
 </head>
 <body>
 </body>
 </html>
 """
         pdf_buffer = io.BytesIO()
         pisa_status = pisa.CreatePDF(full_html, dest=pdf_buffer)
         if pisa_status.err:
             final_report = compress_text(final_report, MAX_MESSAGE_LENGTH)
         pdf_bytes = report_generator.generate_report_pdf_html(solution_content=final_report,
                                                               metadata=metadata)
         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
             tmp_file.write(pdf_bytes)
             tmp_path = tmp_file.name
         return f"Error generating report: {str(e)}", None
 def extract_summary_from_crumbs(crumbs_list: list) -> str:
     """Flatten crumb records into one newline-separated text digest.
     Every crumb contributes a "URL: ..." line followed by a "Summary: ..."
     line, read from its ``url`` and ``summary`` keys; an empty list gives "".
     """
     entries = []
     for crumb in crumbs_list:
         entries.append("URL: {}\nSummary: {}".format(crumb['url'], crumb['summary']))
     digest = "\n".join(entries)
     logging.info("extract_summary_from_crumbs: Aggregated crumb summary created.")
     return digest
                   "Formulate this as a new research query that could lead to innovative insights.")
     disruptive_query = openai_call(new_prompt, model="gpt-4o-mini", max_tokens_param=500)
     logging.info(f"generate_surprise_report: Disruptive new query generated: {disruptive_query}")
     clarifications_for_new = generate_tailored_questions(
         os.getenv("OPENAI_API_KEY"),
+        disruptive_query + "\n\n IMPORTANT NOTE: in this iteration, generate also simulated responses for the questions asked",
         "", "", "", ""
     )
     logging.info(f"generate_surprise_report: Clarification questions for new query: {clarifications_for_new}")
     generator = iterative_deep_research_gen(
         disruptive_query, reportstyle, breadth, depth, followup_clarifications,
         include_domains, exclude_keywords, additional_clarifications,
     appended_report = previous_report + "\n\n<div style='page-break-before: always;'></div>\n<h2>Surprise-Me Extension Report</h2>\n\n" + clarifications_for_new + "\n\n" + extension_report
     return appended_report
+# --- Adaptive and Parallel Organized Research (Dynamic Agent Orchestration) --- #
+async def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: int, depth: int,
                                 followup_clarifications: str,
                                 include_domains: str,
                                 exclude_keywords: str,
     references_list = []
     followup_suggestions = []
     logging.info("iterative_deep_research_gen: Research started.")
+    # Create a single aiohttp session for parallel page fetching
+    async with aiohttp.ClientSession() as session:
+        for iteration in range(1, depth + 1):
+            process_log += f"\n--- Iteration {iteration} ---\n"
+            logging.info(f"iterative_deep_research_gen: Starting iteration {iteration}.")
+            combined_context = overall_context
+            if followup_suggestions:
+                unique_suggestions = list(set(followup_suggestions))
+                combined_context += "\nFollow-up suggestions: " + ", ".join(unique_suggestions)
+            queries = generate_serp_queries(combined_context, breadth, depth, initial_query, selected_engines, results_per_query)
+            process_log += f"Generated queries: {queries}\n"
+            iteration_learnings = []
+            followup_suggestions = []  # reset for current iteration
+            # For each query, perform SERPAPI search and fetch pages concurrently:
+            for query_str, engine in queries:
+                mod_query = query_str
+                if include_domains.strip():
+                    domains = [d.strip() for d in include_domains.split(",") if d.strip()]
+                    domain_str = " OR ".join([f"site:{d}" for d in domains])
+                    mod_query += f" ({domain_str})"
+                if exclude_keywords.strip():
+                    for ex in [ex.strip() for ex in exclude_keywords.split(",") if ex.strip()]:
+                        mod_query += f" -{ex}"
+                process_log += f"\nPerforming SERPAPI search with query: {mod_query} using engine: {engine}\n"
+                results = perform_serpapi_search(mod_query, engine, results_per_query)
+                filtered_results = filter_search_results(results, visited_urls, initial_query, followup_clarifications)
+                process_log += f"After filtering, {len(filtered_results)} results remain for processing.\n"
+                async_tasks = []
+                for res in filtered_results:
+                    url = res.get("link", "")
+                    if not url:
                         continue
+                    if url.lower().endswith(".pdf"):
+                        content = process_pdf(url)
+                        process_log += f"Extracted PDF content from {url}\n"
+                        # Process synchronously for PDFs
+                        analysis = analyze_with_gpt4o(initial_query, content)
+                        analysis_summary = analysis.get("summary", "").strip() or (content[:200] + "..." if len(content) > 200 else content)
+                        crumbs_list.append({"url": url, "summary": analysis_summary, "full_content": content})
+                        if analysis.get("relevant", "no").lower() == "yes":
+                            link_str = f" <a href='{url}'>[{ref_counter}]</a>"
+                            summary_with_ref = analysis_summary + link_str
+                            iteration_learnings.append(summary_with_ref)
+                            references_list.append((ref_counter, url))
+                            ref_counter += 1
+                            if isinstance(analysis.get("followups"), list):
+                                followup_suggestions.extend(analysis.get("followups"))
                     else:
+                        # Schedule asynchronous fetching for non-PDF pages
+                        async_tasks.append(async_fetch_url(session, url))
+                # Wait for asynchronous fetches to complete
+                if async_tasks:
+                    fetched_contents = await asyncio.gather(*async_tasks)
+                    for content in fetched_contents:
+                        if not content:
+                            continue
+                        analysis = analyze_with_gpt4o(initial_query, content)
+                        analysis_summary = analysis.get("summary", "").strip() or (content[:200] + "..." if len(content) > 200 else content)
+                        # Here we do not re-fetch URL since it is already processed
+                        crumbs_list.append({"url": "async_url", "summary": analysis_summary, "full_content": content})
+                        if analysis.get("relevant", "no").lower() == "yes":
+                            link_str = f" [*]"  # Mark asynchronous fetched URLs.
+                            summary_with_ref = analysis_summary + link_str
+                            iteration_learnings.append(summary_with_ref)
+                            if isinstance(analysis.get("followups"), list):
+                                followup_suggestions.extend(analysis.get("followups"))
+            process_log += f"Iteration {iteration} extracted {len(iteration_learnings)} learnings.\n"
+            logging.info(f"iterative_deep_research_gen: Iteration {iteration} extracted {len(iteration_learnings)} learnings.")
+            overall_learnings.extend(iteration_learnings)
+            overall_context += f"\nIteration {iteration} learnings:\n" + "\n".join(iteration_learnings) + "\n"
+            if additional_clarifications.strip():
+                overall_context += "\nAdditional Clarifications from user: " + additional_clarifications.strip() + "\n"
+                process_log += "Appended additional clarifications to the context.\n"
+            # Adaptive follow-up: if new followup suggestions emerged, call tailored questions generator
+            if followup_suggestions:
+                extra_questions = generate_tailored_questions(os.getenv("OPENAI_API_KEY"), initial_query, "", "", "", "")
+                overall_context += "\nAdaptive Follow-Up Questions:\n" + extra_questions + "\n"
+            progress_pct = int((iteration / depth) * 100)
+            yield (f"Progress: {progress_pct}%", None, process_log, None)
+        aggregated_crumbs = "\n".join([f"URL: {c['url']}\nSummary: {c['summary']}" for c in crumbs_list])
+        final_report = generate_final_report(initial_query, reportstyle, overall_learnings, list(visited_urls), aggregated_crumbs, references_list, pages=go_deeper)
+        alignment_assessment = assess_report_alignment(final_report, initial_query, followup_clarifications)
+        final_report += "\n\n\n\n\n**Report alignment assessment:**\n" + alignment_assessment
+        logging.info("iterative_deep_research_gen: Final report generated.")
+        yield ("", final_report, process_log, crumbs_list)
 def assess_report_alignment(report: str, initial_query: str, clarifications: str) -> str:
     prompt = (
         "and the clarification Q&A provided. Ensure that the report covers key points of the topic.\n\n"
         "Initial Query: " + initial_query + "\nClarifications: " + clarifications + "\n\n"
         "Research Report:\n" + report + "\n\n"
+        "Provide a short paragraph assessment on how well the report aligns with these requirements."
     )
     assessment = openai_call(prompt, model="gpt-3.5-turbo", max_tokens_param=200)
     logging.info(f"assess_report_alignment: Assessment result: {assessment}")
     return assessment
+# --- Main Deep Research Orchestrator (Wrapper for async execution) --- #
+async def orchestrate_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query: str, reportstyle: str,
+                                    breadth: int, depth: int, followup_clarifications: str, include_domains: str,
+                                    exclude_keywords: str, additional_clarifications: str, results_per_query: int,
+                                    selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
+                                    pages: str, surprise_me: bool):
     if not openai_api_key or not serpapi_api_key:
+        logging.error("orchestrate_deep_research: Invalid API keys provided.")
+        return "Please input valid API keys", "", "", ""
     os.environ["OPENAI_API_KEY"] = openai_api_key
     os.environ["SERPAPI_API_KEY"] = serpapi_api_key
     if existing_crumbs:
         extra_context += f"Existing Crumbs:\n{existing_crumbs}\n"
+    loop = asyncio.get_event_loop()
+    researcher = iterative_deep_research_gen(initial_query, reportstyle, breadth, depth, followup_clarifications,
+                                               include_domains, exclude_keywords, additional_clarifications,
+                                               extra_context, selected_engines, results_per_query, go_deeper=int(pages))
     final_report = ""
+    process_log = ""
+    async for progress, rep, proc_log, crumbs in researcher:
         if rep is None:
+            current_progress = progress
+            # You could yield intermediate progress if needed.
         else:
             final_report = rep
+            process_log = proc_log
             break
     if surprise_me:
+        extended_report = generate_surprise_report(final_report, crumbs, initial_query, reportstyle, breadth, depth,
+                                                   followup_clarifications, include_domains, exclude_keywords,
+                                                   additional_clarifications, results_per_query, selected_engines)
         final_report = extended_report
+    return final_report, process_log, extra_context
+def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query: str, reportstyle: str, breadth: int, depth: int,
+                      followup_clarifications: str, include_domains: str, exclude_keywords: str, additional_clarifications: str,
+                      results_per_query: int, selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
+                      pages: str, surprise_me: bool):
+    final_report, proc_log, extra_context = asyncio.run(
+        orchestrate_deep_research(openai_api_key, serpapi_api_key, initial_query, reportstyle, breadth, depth,
+                                  followup_clarifications, include_domains, exclude_keywords, additional_clarifications,
+                                  results_per_query, selected_engines, existing_crumbs, existing_report, existing_log,
+                                  pages, surprise_me)
+    )
+    return ("Progress: 100%", final_report, existing_report, existing_log, existing_crumbs)
+# =============================================================================
+# Gradio Interface using gr.Blocks with Custom CSS
+# =============================================================================
 def main():
     custom_css = """
             openai_api_key_input = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API Key here...", type="password")
             serpapi_api_key_input = gr.Textbox(label="SERPAPI API Key", placeholder="Enter your SERPAPI API Key here...", type="password")
             gr.Markdown("[Create OpenAI API Key](https://platform.openai.com/account/api-keys) | [Create SERPAPI API Key](https://serpapi.com/manage-api-key)")
+            gr.Markdown("API keys are not stored or logged.")
+        with gr.Accordion("2] Research topic", open=False):
             with gr.Row():
                 research_query = gr.Textbox(label="Research Query", placeholder="Enter your research query here...", lines=2, elem_id="research-query", scale=4)
                 refine_query_button = gr.Button("Refine my Query", scale=1)
         with gr.Accordion("3] Q&A", open=False):
             with gr.Row():
+                clarification_text = gr.Textbox(label="Clarification / Follow-Up Questions", placeholder="Tailored clarifying suggestions will appear here...", lines=6, scale=4)
                 gen_followups = gr.Button("Generate Tailored Clarification Questions", scale=1)
         with gr.Accordion("4] Search Parameters", open=False):
         with gr.Accordion("5] Report", open=False, elem_classes="folder"):
             progress_display = gr.Markdown("", elem_id="progress-display")
             run_btn = gr.Button("Generate report")
+            final_report = gr.Markdown(label="Final Report (Markdown)", height=800, min_height=50)
             with gr.Accordion("Generate PDF", open=False, elem_classes="folder"):
                 with gr.Column():
                     query_name = gr.Textbox(label="Query name", placeholder="Enter query name...", lines=1)