Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -110,9 +110,11 @@ G --> H[Performance Evaluation - 45% Speed Improvement, 35% Risk Profiling, 50%
|
|
| 110 |
- Take a deep breath, think step by step and think it well.
|
| 111 |
|
| 112 |
// Examples
|
|
|
|
|
|
|
| 113 |
-- flowchart --
|
| 114 |
Important:
|
| 115 |
-
- If the flow is "broader" than deep, choose LR (Left Right)
|
| 116 |
- If the flow is "deeper" than broad (>3 levels), choose TD (Top Down)
|
| 117 |
|
| 118 |
Top Down:
|
|
@@ -393,6 +395,12 @@ def openai_call(prompt: str, messages: list = None, model: str = "o3-mini",
|
|
| 393 |
return err_msg
|
| 394 |
|
| 395 |
def analyze_with_gpt4o(query: str, snippet: str, breadth: int, temperature: float = 0.7, max_tokens: int = 8000) -> dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
|
| 397 |
prompt = (f"""Analyze the following content from a query result:
|
| 398 |
|
|
@@ -423,6 +431,9 @@ Note: General Optimization Guidelines:
|
|
| 423 |
For example: "Artificial intelligence" AND (mathematics OR geometry) -algebra,science AND history AND mathematics,...
|
| 424 |
Return the result as a JSON object with the keys 'relevant', 'structure', and 'followups'. The 'structure' value should itself be a JSON object with keys 'Key Facts', 'Key Figures', 'Key Arguments', 'Key Quotes' and 'Summary'.
|
| 425 |
|
|
|
|
|
|
|
|
|
|
| 426 |
Proceed."""
|
| 427 |
)
|
| 428 |
try:
|
|
@@ -533,7 +544,17 @@ def generate_final_report(initial_query: str, context: str, reportstyle: str, le
|
|
| 533 |
word_count = pages * 500
|
| 534 |
prompt = (f"""
|
| 535 |
// Instructions:
|
| 536 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 537 |
'{initial_query}'
|
| 538 |
Taking also into consideration the context:
|
| 539 |
{context}
|
|
@@ -854,7 +875,7 @@ def generate_tailored_questions(openai_api_key: str, query: str, existing_qa: st
|
|
| 854 |
def backup_fields(research_query: str,
|
| 855 |
include_domains: str, exclude_keywords: str, additional_clarifications: str,
|
| 856 |
selected_engines, results_per_query, breadth, depth, clarification_text: str,
|
| 857 |
-
existing_report: str, existing_log: str, crumbs_box: str, final_report: str) -> str:
|
| 858 |
data = {
|
| 859 |
"openai_api_key": "",
|
| 860 |
"serpapi_api_key": "",
|
|
@@ -870,7 +891,8 @@ def backup_fields(research_query: str,
|
|
| 870 |
"existing_report": existing_report,
|
| 871 |
"existing_log": existing_log,
|
| 872 |
"crumbs_box": crumbs_box,
|
| 873 |
-
"final_report": final_report
|
|
|
|
| 874 |
}
|
| 875 |
backup_json = json.dumps(data, indent=2)
|
| 876 |
logging.info(f"backup_fields: Data backed up: {backup_json}")
|
|
@@ -894,10 +916,11 @@ def load_fields(backup_json: str):
|
|
| 894 |
data.get("existing_report", ""),
|
| 895 |
data.get("existing_log", ""),
|
| 896 |
data.get("crumbs_box", ""),
|
| 897 |
-
data.get("final_report", "")
|
|
|
|
| 898 |
except Exception as e:
|
| 899 |
logging.error(f"load_fields error: {e}")
|
| 900 |
-
return ("", "", "", "", "", "", [], 10, 4, 2, "", "", "", "", "")
|
| 901 |
|
| 902 |
def refine_query(query: str, openai_api_key: str) -> str:
|
| 903 |
os.environ["OPENAI_API_KEY"] = openai_api_key
|
|
@@ -1192,7 +1215,7 @@ def generate_surprise_report(previous_report: str, crumbs_list: list, initial_qu
|
|
| 1192 |
generator = iterative_deep_research_gen(
|
| 1193 |
disruptive_query, reportstyle, breadth, depth, followup_clarifications,
|
| 1194 |
include_domains, exclude_keywords, additional_clarifications,
|
| 1195 |
-
extra_context="", selected_engines=selected_engines, results_per_query=results_per_query, go_deeper=1
|
| 1196 |
)
|
| 1197 |
extension_report = ""
|
| 1198 |
for progress, rep, proc_log, new_crumbs in generator:
|
|
@@ -1203,6 +1226,32 @@ def generate_surprise_report(previous_report: str, crumbs_list: list, initial_qu
|
|
| 1203 |
appended_report = previous_report + "\n\n<div style='page-break-before: always;'></div>\n<h2>Surprise-Me Extension Report</h2>\n\n" + clarifications_for_new + "\n\n" + extension_report
|
| 1204 |
return appended_report
|
| 1205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1206 |
def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: int, depth: int,
|
| 1207 |
followup_clarifications: str,
|
| 1208 |
include_domains: str,
|
|
@@ -1211,6 +1260,7 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 1211 |
extra_context: str = "",
|
| 1212 |
selected_engines=None,
|
| 1213 |
results_per_query: int = 10,
|
|
|
|
| 1214 |
go_deeper: int = 8):
|
| 1215 |
overall_context = extra_context + f"Initial Query: {initial_query}\n"
|
| 1216 |
if followup_clarifications.strip():
|
|
@@ -1218,6 +1268,12 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 1218 |
process_log = "Starting research with context:\n" + overall_context + "\n"
|
| 1219 |
overall_learnings = []
|
| 1220 |
visited_urls = set()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1221 |
crumbs_list = []
|
| 1222 |
ref_counter = 1
|
| 1223 |
references_list = []
|
|
@@ -1232,10 +1288,20 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 1232 |
unique_suggestions = list(set(followup_suggestions))
|
| 1233 |
combined_context += "\nFollow-up suggestions: " + ", ".join(unique_suggestions)
|
| 1234 |
queries = generate_serp_queries(combined_context, breadth, depth, initial_query, selected_engines, results_per_query)
|
| 1235 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1236 |
iteration_learnings = []
|
| 1237 |
followup_suggestions = [] # reset for current iteration
|
| 1238 |
-
for query_tuple in
|
| 1239 |
query_str, engine = query_tuple
|
| 1240 |
mod_query = query_str
|
| 1241 |
if include_domains.strip():
|
|
@@ -1272,9 +1338,13 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 1272 |
logging.error(f"Error retrieving content from {url}: {e}")
|
| 1273 |
process_log += f"Error retrieving content from {url}: {e}\n"
|
| 1274 |
continue
|
| 1275 |
-
|
| 1276 |
-
# Clean
|
| 1277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1278 |
|
| 1279 |
# Analyze the cleaned content with GPT-4o-mini
|
| 1280 |
analysis = analyze_with_gpt4o(initial_query, cleaned_content, breadth)
|
|
@@ -1317,7 +1387,17 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 1317 |
process_log += "Appended additional clarifications to the context.\n"
|
| 1318 |
progress_pct = int((iteration / depth) * 100)
|
| 1319 |
yield (f"Progress: {progress_pct}%", None, None, None)
|
| 1320 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1321 |
final_report = generate_final_report(initial_query, combined_context, reportstyle, overall_learnings, list(visited_urls), aggregated_crumbs, references_list, pages=go_deeper)
|
| 1322 |
|
| 1323 |
# --- NEW STEP: Post-process final_report to replace visual and focus placeholders ---
|
|
@@ -1330,8 +1410,49 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 1330 |
f"<p>---------</p><p><b>Report alignment assessment:</b> {alignment_assessment}</p> </div> </body></html>"
|
| 1331 |
)
|
| 1332 |
logging.info("iterative_deep_research_gen: Final report generated.")
|
| 1333 |
-
|
|
|
|
|
|
|
|
|
|
| 1334 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1335 |
def assess_report_alignment(report: str, initial_query: str, clarifications: str) -> str:
|
| 1336 |
prompt = (
|
| 1337 |
"Please assess the following research report in terms of its alignment with the initial user request "
|
|
@@ -1349,7 +1470,7 @@ def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query:
|
|
| 1349 |
followup_clarifications: str, include_domains: str,
|
| 1350 |
exclude_keywords: str, additional_clarifications: str,
|
| 1351 |
results_per_query: int, selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
|
| 1352 |
-
pages: str, surprise_me: bool):
|
| 1353 |
if not openai_api_key or not serpapi_api_key:
|
| 1354 |
logging.error("run_deep_research: Invalid API keys provided.")
|
| 1355 |
return "Please input valid API keys", "", "", "", ""
|
|
@@ -1370,13 +1491,13 @@ def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query:
|
|
| 1370 |
final_process_log = ""
|
| 1371 |
final_crumbs = ""
|
| 1372 |
logging.info("run_deep_research: Starting deep research process.")
|
| 1373 |
-
for progress, rep, proc_log, crumbs in iterative_deep_research_gen(
|
| 1374 |
initial_query, reportstyle, breadth, depth, followup_clarifications,
|
| 1375 |
include_domains, exclude_keywords, additional_clarifications,
|
| 1376 |
-
extra_context, selected_engines, results_per_query, go_deeper=int(pages)):
|
| 1377 |
if rep is None:
|
| 1378 |
final_progress = progress
|
| 1379 |
-
yield final_progress, None, None, None, None
|
| 1380 |
else:
|
| 1381 |
final_report = rep
|
| 1382 |
final_process_log = proc_log
|
|
@@ -1391,7 +1512,7 @@ def run_deep_research(openai_api_key: str, serpapi_api_key: str, initial_query:
|
|
| 1391 |
final_report = extended_report
|
| 1392 |
final_progress = "Progress: 100% (\"Surprise Me\" extension complete)"
|
| 1393 |
logging.info("run_deep_research: Deep research process completed.")
|
| 1394 |
-
yield (final_progress, final_report, final_report, final_process_log, final_crumbs)
|
| 1395 |
|
| 1396 |
def load_example(example_choice: str) -> str:
|
| 1397 |
filename = ""
|
|
@@ -1521,10 +1642,11 @@ def main():
|
|
| 1521 |
report_file = gr.File(label="Download Report", visible=False, interactive=False, file_types=[".pdf"])
|
| 1522 |
generate_button = gr.Button("Generate Report")
|
| 1523 |
|
| 1524 |
-
with gr.Accordion("6] Extra Context (Crumbs, Existing Report & Log)", open=False):
|
| 1525 |
-
existing_report = gr.Textbox(label="Existing Report (if any)",
|
| 1526 |
-
existing_log = gr.Textbox(label="Existing Process Log (if any)",
|
| 1527 |
-
crumbs_box = gr.Textbox(label="Existing Crumbs (All
|
|
|
|
| 1528 |
|
| 1529 |
with gr.Accordion("7] Backup / Restore Fields", open=False):
|
| 1530 |
backup_text = gr.Textbox(label="Backup JSON", placeholder="Backup output will appear here. You can also paste JSON here to load fields.", lines=6, interactive=True)
|
|
@@ -1550,9 +1672,9 @@ def main():
|
|
| 1550 |
run_btn.click(
|
| 1551 |
fn=run_deep_research,
|
| 1552 |
inputs=[openai_api_key_input, serpapi_api_key_input, research_query, reportstyle, breadth, depth, clarification_text, include_domains, exclude_keywords,
|
| 1553 |
-
additional_clarifications, results_per_query, selected_engines, existing_report, existing_log, crumbs_box,
|
| 1554 |
pages_dropdown, surprise_me_checkbox],
|
| 1555 |
-
outputs=[progress_display, final_report, existing_report, existing_log, crumbs_box],
|
| 1556 |
show_progress=True,
|
| 1557 |
api_name="deep_research"
|
| 1558 |
)
|
|
|
|
| 110 |
- Take a deep breath, think step by step and think it well.
|
| 111 |
|
| 112 |
// Examples
|
| 113 |
+
Note: Pay attention for each example to what type of parenthesis / bracket is used and respect it scrupulously
|
| 114 |
+
|
| 115 |
-- flowchart --
|
| 116 |
Important:
|
| 117 |
+
- If the flow is "broader" than deep (>3 branches at the same level), choose LR (Left Right)
|
| 118 |
- If the flow is "deeper" than broad (>3 levels), choose TD (Top Down)
|
| 119 |
|
| 120 |
Top Down:
|
|
|
|
| 395 |
return err_msg
|
| 396 |
|
| 397 |
def analyze_with_gpt4o(query: str, snippet: str, breadth: int, temperature: float = 0.7, max_tokens: int = 8000) -> dict:
|
| 398 |
+
# measure snippet length
|
| 399 |
+
snippet_words = len(snippet.split())
|
| 400 |
+
# decide a proportional max tokens (cap at 3000 for example)
|
| 401 |
+
# e.g. 1 token ~ ~0.75 words, so we do something simplistic:
|
| 402 |
+
dynamic_tokens = min(3000, max(250, int(snippet_words * 0.5)))
|
| 403 |
+
|
| 404 |
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
|
| 405 |
prompt = (f"""Analyze the following content from a query result:
|
| 406 |
|
|
|
|
| 431 |
For example: "Artificial intelligence" AND (mathematics OR geometry) -algebra,science AND history AND mathematics,...
|
| 432 |
Return the result as a JSON object with the keys 'relevant', 'structure', and 'followups'. The 'structure' value should itself be a JSON object with keys 'Key Facts', 'Key Figures', 'Key Arguments', 'Key Quotes' and 'Summary'.
|
| 433 |
|
| 434 |
+
4. Ensure that the summary length and level of detail is proportional to the source length.
|
| 435 |
+
Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
|
| 436 |
+
|
| 437 |
Proceed."""
|
| 438 |
)
|
| 439 |
try:
|
|
|
|
| 544 |
word_count = pages * 500
|
| 545 |
prompt = (f"""
|
| 546 |
// Instructions:
|
| 547 |
+
- We want to incorporate as many relevant numbers, statistics, factual references, quotes from the sources,
|
| 548 |
+
- Explicit mentions of organizations, tools, projects, or people from the crumb data as possible.
|
| 549 |
+
- In your writing, do the following:
|
| 550 |
+
1. Integrate numbers, quotes, and factual references systematically.
|
| 551 |
+
2. Whenever you mention a figure or quote, add an inline reference [x] matching its source from the references.
|
| 552 |
+
3. Specifically name relevant organizations, tools, project names, and people encountered in the crumbs or learnings.
|
| 553 |
+
4. This is for academic purposes, so thorough citations and referencing are essential.
|
| 554 |
+
Note: Do not be shy to use the names (organizations, people, project, application, tools...) mentioned in the sources, we need this for academic correctness
|
| 555 |
+
|
| 556 |
+
// Sources
|
| 557 |
+
Use the following learnings and merged reference details from a deep research process on:
|
| 558 |
'{initial_query}'
|
| 559 |
Taking also into consideration the context:
|
| 560 |
{context}
|
|
|
|
| 875 |
def backup_fields(research_query: str,
|
| 876 |
include_domains: str, exclude_keywords: str, additional_clarifications: str,
|
| 877 |
selected_engines, results_per_query, breadth, depth, clarification_text: str,
|
| 878 |
+
existing_report: str, existing_log: str, crumbs_box: str, final_report: str, existing_queries_box: str) -> str:
|
| 879 |
data = {
|
| 880 |
"openai_api_key": "",
|
| 881 |
"serpapi_api_key": "",
|
|
|
|
| 891 |
"existing_report": existing_report,
|
| 892 |
"existing_log": existing_log,
|
| 893 |
"crumbs_box": crumbs_box,
|
| 894 |
+
"final_report": final_report,
|
| 895 |
+
"existing_queries": existing_queries_box
|
| 896 |
}
|
| 897 |
backup_json = json.dumps(data, indent=2)
|
| 898 |
logging.info(f"backup_fields: Data backed up: {backup_json}")
|
|
|
|
| 916 |
data.get("existing_report", ""),
|
| 917 |
data.get("existing_log", ""),
|
| 918 |
data.get("crumbs_box", ""),
|
| 919 |
+
data.get("final_report", ""),
|
| 920 |
+
data.get("existing_queries",""))
|
| 921 |
except Exception as e:
|
| 922 |
logging.error(f"load_fields error: {e}")
|
| 923 |
+
return ("", "", "", "", "", "", [], 10, 4, 2, "", "", "", "", "", "")
|
| 924 |
|
| 925 |
def refine_query(query: str, openai_api_key: str) -> str:
|
| 926 |
os.environ["OPENAI_API_KEY"] = openai_api_key
|
|
|
|
| 1215 |
generator = iterative_deep_research_gen(
|
| 1216 |
disruptive_query, reportstyle, breadth, depth, followup_clarifications,
|
| 1217 |
include_domains, exclude_keywords, additional_clarifications,
|
| 1218 |
+
extra_context="", selected_engines=selected_engines, results_per_query=results_per_query, existing_queries, go_deeper=1
|
| 1219 |
)
|
| 1220 |
extension_report = ""
|
| 1221 |
for progress, rep, proc_log, new_crumbs in generator:
|
|
|
|
| 1226 |
appended_report = previous_report + "\n\n<div style='page-break-before: always;'></div>\n<h2>Surprise-Me Extension Report</h2>\n\n" + clarifications_for_new + "\n\n" + extension_report
|
| 1227 |
return appended_report
|
| 1228 |
|
| 1229 |
+
def extract_structured_insights(html_text: str, min_words: int = 30, fallback_chars: int = 2000) -> str:
    """Extract a concise snippet of facts, figures, arguments, and quotes from HTML.

    Parses *html_text* with BeautifulSoup and keeps only ``<p>`` paragraphs that
    contain digits or fact/argument/quote-style keywords, so the downstream LLM
    summarizer receives a semantically dense excerpt instead of the whole page.

    Args:
        html_text: Raw (or pre-cleaned) HTML of a retrieved page.
        min_words: If the curated snippet has fewer words than this, fall back
            to the plainly cleaned full text (default 30, the original threshold).
        fallback_chars: Maximum length of the fallback snippet in characters
            (default 2000, the original cap).

    Returns:
        A short plain-text snippet suitable for LLM summarization.
    """
    soup = BeautifulSoup(html_text, "html.parser")

    # One compiled pattern instead of two re.search calls per paragraph:
    # match a digit anywhere, or one of the "insight" keywords as a whole
    # word. Hoisted out of the loop so the pattern lookup happens once.
    insight_re = re.compile(
        r'\d+|\b(?:argument|fact|figure|study|quote)\b',
        re.IGNORECASE,
    )

    # Heuristic: paragraphs carrying numbers or argument/quote keywords are
    # the ones likely to contain citable facts for the report.
    curated_excerpts = []
    for p in soup.find_all('p'):
        text = p.get_text().strip()
        if insight_re.search(text):
            curated_excerpts.append(text)

    # Combine the kept paragraphs into a shorter snippet.
    snippet = "\n".join(curated_excerpts)
    # If the heuristic kept too little, fall back to the cleaned full text so
    # downstream analysis always has something to work with.
    if len(snippet.split()) < min_words:
        snippet = clean_content(html_text)[:fallback_chars]

    return snippet
|
| 1254 |
+
|
| 1255 |
def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: int, depth: int,
|
| 1256 |
followup_clarifications: str,
|
| 1257 |
include_domains: str,
|
|
|
|
| 1260 |
extra_context: str = "",
|
| 1261 |
selected_engines=None,
|
| 1262 |
results_per_query: int = 10,
|
| 1263 |
+
existing_queries: str,
|
| 1264 |
go_deeper: int = 8):
|
| 1265 |
overall_context = extra_context + f"Initial Query: {initial_query}\n"
|
| 1266 |
if followup_clarifications.strip():
|
|
|
|
| 1268 |
process_log = "Starting research with context:\n" + overall_context + "\n"
|
| 1269 |
overall_learnings = []
|
| 1270 |
visited_urls = set()
|
| 1271 |
+
# Parse previously processed queries from existing_queries if provided
|
| 1272 |
+
processed_queries = set()
|
| 1273 |
+
for q_line in existing_queries.splitlines():
|
| 1274 |
+
q_line = q_line.strip()
|
| 1275 |
+
if q_line:
|
| 1276 |
+
processed_queries.add(q_line)
|
| 1277 |
crumbs_list = []
|
| 1278 |
ref_counter = 1
|
| 1279 |
references_list = []
|
|
|
|
| 1288 |
unique_suggestions = list(set(followup_suggestions))
|
| 1289 |
combined_context += "\nFollow-up suggestions: " + ", ".join(unique_suggestions)
|
| 1290 |
queries = generate_serp_queries(combined_context, breadth, depth, initial_query, selected_engines, results_per_query)
|
| 1291 |
+
|
| 1292 |
+
# ===================================================================
|
| 1293 |
+
# Skip queries already in processed_queries
|
| 1294 |
+
filtered_query_tuples = []
|
| 1295 |
+
for q_tuple in queries:
|
| 1296 |
+
q_text, eng = q_tuple
|
| 1297 |
+
if q_text not in processed_queries:
|
| 1298 |
+
filtered_query_tuples.append(q_tuple)
|
| 1299 |
+
processed_queries.add(q_text) # remember we've processed it
|
| 1300 |
+
# ===================================================================
|
| 1301 |
+
process_log += f"\nWill run {len(filtered_query_tuples)} new queries this iteration instead of {len(queries)} total.\n"
|
| 1302 |
iteration_learnings = []
|
| 1303 |
followup_suggestions = [] # reset for current iteration
|
| 1304 |
+
for query_tuple in filtered_query_tuples:
|
| 1305 |
query_str, engine = query_tuple
|
| 1306 |
mod_query = query_str
|
| 1307 |
if include_domains.strip():
|
|
|
|
| 1338 |
logging.error(f"Error retrieving content from {url}: {e}")
|
| 1339 |
process_log += f"Error retrieving content from {url}: {e}\n"
|
| 1340 |
continue
|
| 1341 |
+
|
| 1342 |
+
# 1) Clean and do minimal parse
|
| 1343 |
+
cleaned_html = clean_content(raw_content)
|
| 1344 |
+
# 2) Extract structured data
|
| 1345 |
+
semantically_rich_snippet = extract_structured_insights(cleaned_html)
|
| 1346 |
+
# 3) Summarize with LLM
|
| 1347 |
+
analysis = analyze_with_gpt4o(initial_query, semantically_rich_snippet, breadth)
|
| 1348 |
|
| 1349 |
# Analyze the cleaned content with GPT-4o-mini
|
| 1350 |
analysis = analyze_with_gpt4o(initial_query, cleaned_content, breadth)
|
|
|
|
| 1387 |
process_log += "Appended additional clarifications to the context.\n"
|
| 1388 |
progress_pct = int((iteration / depth) * 100)
|
| 1389 |
yield (f"Progress: {progress_pct}%", None, None, None)
|
| 1390 |
+
|
| 1391 |
+
# chunk and filter all crumbs if breadth>3 and depth>2
|
| 1392 |
+
filtered_crumbs_list = crumbs_list
|
| 1393 |
+
if breadth > 3 and depth > 2:
|
| 1394 |
+
filtered_crumbs_list = filter_crumbs_in_batches(crumbs_list, initial_query, followup_clarifications)
|
| 1395 |
+
|
| 1396 |
+
# Now build aggregated crumb text from filtered_crumbs_list only
|
| 1397 |
+
aggregated_crumbs = "\n\n".join([
|
| 1398 |
+
f"Title: {c.get('title','No Title')}\nURL: {c['url']}\nSummary: {c['summary']}"
|
| 1399 |
+
for c in filtered_crumbs_list
|
| 1400 |
+
])
|
| 1401 |
final_report = generate_final_report(initial_query, combined_context, reportstyle, overall_learnings, list(visited_urls), aggregated_crumbs, references_list, pages=go_deeper)
|
| 1402 |
|
| 1403 |
# --- NEW STEP: Post-process final_report to replace visual and focus placeholders ---
|
|
|
|
| 1410 |
f"<p>---------</p><p><b>Report alignment assessment:</b> {alignment_assessment}</p> </div> </body></html>"
|
| 1411 |
)
|
| 1412 |
logging.info("iterative_deep_research_gen: Final report generated.")
|
| 1413 |
+
# We convert processed_queries to a string suitable for storing
|
| 1414 |
+
all_processed_queries_str = "\n".join(sorted(processed_queries))
|
| 1415 |
+
|
| 1416 |
+
yield ("", final_report, process_log, crumbs_list, all_processed_queries_str)
|
| 1417 |
|
| 1418 |
+
def filter_crumbs_in_batches(crumbs_list: list, initial_query: str, clarifications: str, batch_size: int = 20) -> list:
    """Filter crumbs with an LLM, in batches, keeping only the valuable ones.

    Splits *crumbs_list* into batches of *batch_size*, asks the model to mark
    each crumb 'yes' (keep) or 'no' (drop) as a JSON object keyed by the
    crumb's index within the batch, and returns the accepted crumbs in their
    original order. A crumb is dropped when the model answers 'no' or when
    the model's response cannot be parsed as JSON.

    Args:
        crumbs_list: Crumb dicts; each must have a 'summary' key.
        initial_query: The original research query (currently unused in the
            prompt; kept for interface stability).
        clarifications: User clarifications (currently unused in the prompt;
            kept for interface stability).
        batch_size: Number of crumbs per LLM call (default 20, as before).

    Returns:
        The sub-list of accepted crumbs.
    """
    accepted = []
    for i in range(0, len(crumbs_list), batch_size):
        batch = crumbs_list[i:i + batch_size]
        # Build a prompt describing each crumb (truncated to a short snippet).
        prompt = "We have a set of crumbs. For each crumb, decide if it significantly adds new facts, figures, references, or quotes.\n"
        prompt += "Mark 'yes' if it is valuable for the final report, otherwise 'no'. Output JSON.\n\n"
        listing = []
        for idx, c in enumerate(batch):
            snippet_for_prompt = c["summary"][:500]  # short snippet
            listing.append(f"Crumb {idx}: {snippet_for_prompt}")
        prompt += "\n".join(listing)

        prompt += """
Return a JSON object with structure:
{
"0": "yes" or "no",
"1": "yes" or "no",
...
}
"""
        decision_str = openai_call(prompt, model="o3-mini", max_tokens_param=1500)
        # Parse the model's JSON verdict. Narrowed from a bare `except:` (which
        # also swallowed KeyboardInterrupt/SystemExit) to the decode failures
        # we actually expect; on failure the whole batch is rejected, exactly
        # as before.
        try:
            decisions = json.loads(decision_str)
        except (json.JSONDecodeError, TypeError, ValueError):
            decisions = {}
        for idx, c in enumerate(batch):
            # str() guards against non-string JSON values (e.g. booleans),
            # which previously raised AttributeError on .lower().
            if str(decisions.get(str(idx), "no")).lower() == "yes":
                accepted.append(c)
    return accepted
|
| 1455 |
+
|
| 1456 |
def assess_report_alignment(report: str, initial_query: str, clarifications: str) -> str:
|
| 1457 |
prompt = (
|
| 1458 |
"Please assess the following research report in terms of its alignment with the initial user request "
|
|
|
|
| 1470 |
followup_clarifications: str, include_domains: str,
|
| 1471 |
exclude_keywords: str, additional_clarifications: str,
|
| 1472 |
results_per_query: int, selected_engines, existing_crumbs: str, existing_report: str, existing_log: str,
|
| 1473 |
+
existing_queries: str, pages: str, surprise_me: bool):
|
| 1474 |
if not openai_api_key or not serpapi_api_key:
|
| 1475 |
logging.error("run_deep_research: Invalid API keys provided.")
|
| 1476 |
return "Please input valid API keys", "", "", "", ""
|
|
|
|
| 1491 |
final_process_log = ""
|
| 1492 |
final_crumbs = ""
|
| 1493 |
logging.info("run_deep_research: Starting deep research process.")
|
| 1494 |
+
for progress, rep, proc_log, crumbs, all_processed_queries_str in iterative_deep_research_gen(
|
| 1495 |
initial_query, reportstyle, breadth, depth, followup_clarifications,
|
| 1496 |
include_domains, exclude_keywords, additional_clarifications,
|
| 1497 |
+
extra_context, selected_engines, results_per_query, existing_queries, go_deeper=int(pages)):
|
| 1498 |
if rep is None:
|
| 1499 |
final_progress = progress
|
| 1500 |
+
yield final_progress, None, None, None, None, all_processed_queries_str
|
| 1501 |
else:
|
| 1502 |
final_report = rep
|
| 1503 |
final_process_log = proc_log
|
|
|
|
| 1512 |
final_report = extended_report
|
| 1513 |
final_progress = "Progress: 100% (\"Surprise Me\" extension complete)"
|
| 1514 |
logging.info("run_deep_research: Deep research process completed.")
|
| 1515 |
+
yield (final_progress, final_report, final_report, final_process_log, final_crumbs, all_processed_queries_str)
|
| 1516 |
|
| 1517 |
def load_example(example_choice: str) -> str:
|
| 1518 |
filename = ""
|
|
|
|
| 1642 |
report_file = gr.File(label="Download Report", visible=False, interactive=False, file_types=[".pdf"])
|
| 1643 |
generate_button = gr.Button("Generate Report")
|
| 1644 |
|
| 1645 |
+
with gr.Accordion("6] Extra Context (Crumbs, Existing Report & Log, Processed Queries)", open=False):
|
| 1646 |
+
existing_report = gr.Textbox(label="Existing Report (if any)", ...)
|
| 1647 |
+
existing_log = gr.Textbox(label="Existing Process Log (if any)", ...)
|
| 1648 |
+
crumbs_box = gr.Textbox(label="Existing Crumbs (All sources, JSON)", ...)
|
| 1649 |
+
existing_queries_box = gr.Textbox(label="Existing Queries (processed queries)", placeholder="Paste processed queries here...", lines=4)
|
| 1650 |
|
| 1651 |
with gr.Accordion("7] Backup / Restore Fields", open=False):
|
| 1652 |
backup_text = gr.Textbox(label="Backup JSON", placeholder="Backup output will appear here. You can also paste JSON here to load fields.", lines=6, interactive=True)
|
|
|
|
| 1672 |
run_btn.click(
|
| 1673 |
fn=run_deep_research,
|
| 1674 |
inputs=[openai_api_key_input, serpapi_api_key_input, research_query, reportstyle, breadth, depth, clarification_text, include_domains, exclude_keywords,
|
| 1675 |
+
additional_clarifications, results_per_query, selected_engines, existing_report, existing_log, existing_queries, crumbs_box,
|
| 1676 |
pages_dropdown, surprise_me_checkbox],
|
| 1677 |
+
outputs=[progress_display, final_report, existing_report, existing_log, crumbs_box, existing_queries_box],
|
| 1678 |
show_progress=True,
|
| 1679 |
api_name="deep_research"
|
| 1680 |
)
|