Spaces:

10gen
/

deepsearchitv2

Running

App Files Community

Guiyom committed on Feb 21, 2025

Commit

02baeee

verified ·

1 Parent(s): 19cf986

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -19

app.py CHANGED Viewed

@@ -27,7 +27,8 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(
 # Global Settings
 # =============================================================================
 MAX_MESSAGE_LENGTH = 1048576
 # =============================================================================
 # Helper functions for external APIs and PDF Processing
 # =============================================================================
@@ -387,7 +388,7 @@ def openai_call(prompt: str, messages: list = None, model: str = "o3-mini",
         response = client.chat.completions.create(**params)
         result = response.choices[0].message.content.strip()
         result = result.strip().strip("json").strip("```").strip()
-        logging.info(f"openai_call completed with model {model}. Response preview: {result[:400]}")
         return result
     except Exception as e:
         err_msg = f"Error calling OpenAI API: {e}"
@@ -431,8 +432,10 @@ def summarize_large_text(text: str, target_length: int, chunk_size: int = 1000,
             "Summarize the following text, preserving all key details and ensuring that any tables or structured "
             "data are also summarized:\n\n" + chunk
         )
-        # Use a relatively small max_tokens value for each chunk summarization.
         summary_chunk = openai_call(prompt=chunk_prompt, model="gpt-4o-mini", max_tokens_param=500, temperature=0.7)
         summary_chunks.append(summary_chunk.strip())
     combined_summary = "\n".join(summary_chunks)
@@ -473,30 +476,32 @@ Research topic:
 {query}
 Instructions:
-1.  Relevance: Determine if the content is relevant to the research topic. Answer with a single word: 'yes' or 'no'.
 2.  Structure: If the content is relevant, provide a comprehensive summary structured into the following sections. Prioritize extreme conciseness and token efficiency while preserving all key information. Aim for the shortest possible summary that retains all essential facts, figures, arguments, and quotes. The total summary should not exceed 1000 words, but shorter is strongly preferred.
-    -   Key Facts (at least 5): List the core factual claims. Use short, declarative sentences or bullet points. Apply lemmatization, common abbreviations (e.g., vs., e.g., i.e., AI, LLM), and remove unnecessary words.
-    -   Key Figures (at least 5): Extract numerical data, statistics, dates, percentages. Use numerical representation and present concisely (list or table format). If the content includes tables or structured data, extract and summarize the critical information from them. If data available, collect enough so that the user can use it to generate new tables and graphs, visuals.
-    -   Key Arguments (at least 5): Identify main arguments/claims. Summarize supporting evidence and counter-arguments. Use lemmatization, abbreviations, and concise phrasing. Remove redundant phrases.
-    -   Key Quotes (at least 1 if any): Include significant quotes (with the name of the author in parentheses). Attribute quotes correctly. Choose quotes that are concise and impactful. If a quote can be paraphrased concisely without losing essential meaning, paraphrase it and note that it's a paraphrase. Use symbols instead of words (&, +, ->, =, ...).
-    -   Structured Summary (10 to 50 sentences depending on the length): Mention anecdotes, people, locations, and any additional context that will make the end report relatable and grounded.
 Note: General Optimization Guidelines:
-    -   Lemmatize: Use the root form of words (e.g., "running" -> "run").
-    -   Abbreviate: Use common abbreviations.
-    -   Remove Redundancy: Eliminate unnecessary words and phrases. Be concise.
-    -   Shorten Words (Carefully): If a shorter word conveys the same meaning (e.g., "information" -> "info"), use it, but avoid ambiguity.
-    -   Implicit Representation: Remove redundant terms.
-    -   Use Symbols: Use symbols instead of words (&, +, ->, =, ...).
-3.  Follow-up Search Queries: Generate at least {breadth} follow-up search queries. These should be relevant to the research topic and build upon the summarized content. Aim for deeper understanding by using search operators (AND, OR, quotation marks) where appropriate. Represent these queries as a Python list of strings, e.g., ["query1", "query2", ...].
 4. Ensure that the summary length and level of detail is proportional to the source length.
 Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
 Proceed."""
     )
     try:
         response = openai_call(prompt=prompt, model="gpt-4o-mini", max_tokens_param=max_tokens, temperature=temperature)
         res_text = response.strip()
@@ -586,9 +591,12 @@ def process_pdf(url: str) -> str:
         return err
 def compress_text(text: str, target_length: int) -> str:
     prompt = f"Summarize the following text in a way that preserves all valuable information, and output a compressed version not exceeding {target_length} characters:\n\n{text}"
     summary = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=100000)
-    logging.info(f"compress_text: Compressed text length: {len(summary)}")
     return summary
 def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
@@ -1382,7 +1390,6 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
                     raw_content = process_pdf(url)
                     if "Error processing PDF" in raw_content:
                         continue
-                    process_log += f"Extracted PDF content from {url}\n"
                 else:
                     try:
                         headers = {"User-Agent": get_random_header()}
@@ -1394,6 +1401,10 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
                         logging.error(f"Error retrieving content from {url}: {e}")
                         process_log += f"Error retrieving content from {url}: {e}\n"
                         continue
                 # 1) Clean and do minimal parse
                 cleaned_html = clean_content(raw_content)
@@ -1437,6 +1448,10 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
                             followup_suggestions.extend(analysis.get("followups"))
         process_log += f"Iteration {iteration} extracted {len(iteration_learnings)} learnings.\n"
         logging.info(f"iterative_deep_research_gen: Iteration {iteration} extracted {len(iteration_learnings)} learnings.")
         overall_learnings.extend(iteration_learnings)
         overall_context += f"\nIteration {iteration} learnings:\n" + "\n".join(iteration_learnings) + "\n"
         if additional_clarifications.strip():
@@ -1738,7 +1753,7 @@ def main():
         backup_button.click(
             fn=backup_fields,
             inputs=[research_query, include_domains, exclude_keywords,
-                    additional_clarifications, selected_engines, results_per_query, breadth, depth, clarification_text, existing_report, existing_log, crumbs_box, final_report],
             outputs=[backup_text]
         )
         load_button.click(

 # Global Settings
 # =============================================================================
 MAX_MESSAGE_LENGTH = 1048576
+SUMMARIZATION_REQUEST_COUNT = 0
+TOTAL_SUMMARIZED_WORDS = 0
 # =============================================================================
 # Helper functions for external APIs and PDF Processing
 # =============================================================================
         response = client.chat.completions.create(**params)
         result = response.choices[0].message.content.strip()
         result = result.strip().strip("json").strip("```").strip()
+        logging.info(f"openai_call completed with model {model}. Response preview: {result}")
         return result
     except Exception as e:
         err_msg = f"Error calling OpenAI API: {e}"
             "Summarize the following text, preserving all key details and ensuring that any tables or structured "
             "data are also summarized:\n\n" + chunk
         )
         summary_chunk = openai_call(prompt=chunk_prompt, model="gpt-4o-mini", max_tokens_param=500, temperature=0.7)
+        global SUMMARIZATION_REQUEST_COUNT, TOTAL_SUMMARIZED_WORDS
+        SUMMARIZATION_REQUEST_COUNT += 1
+        TOTAL_SUMMARIZED_WORDS += len(summary_chunk.split())
         summary_chunks.append(summary_chunk.strip())
     combined_summary = "\n".join(summary_chunks)
 {query}
 Instructions:
+1.  Relevance: Determine if the content is relevant to the research topic. Answer with a single word: "yes" or "no".
 2.  Structure: If the content is relevant, provide a comprehensive summary structured into the following sections. Prioritize extreme conciseness and token efficiency while preserving all key information. Aim for the shortest possible summary that retains all essential facts, figures, arguments, and quotes. The total summary should not exceed 1000 words, but shorter is strongly preferred.
+    -   Key Facts (at least 5): List the core factual claims using short, declarative sentences or bullet points. Apply lemmatization and standard abbreviations.
+    -   Key Figures (at least 5): Extract numerical data (statistics, dates, percentages) and include any necessary context (units, references, explanations) required to interpret these numbers. Present them concisely (list or table format).
+    -   Key Arguments (at least 5): Identify main arguments or claims. Summarize supporting evidence and counter-arguments concisely.
+    -   Key Quotes (at least 1 if any): Include significant quotes (with the author's name in parentheses). Attribute quotes correctly. Paraphrase if needed, indicating that it’s a paraphrase. Use symbols (e.g., &, +, ->, =) to conserve tokens.
+    -   Structured Summary (10 to 50 sentences): Provide a structured summary that includes anecdotes, people, and locations to ensure the report is relatable.
 Note: General Optimization Guidelines:
+    -   Lemmatize words (e.g., "running" -> "run").
+    -   Use common abbreviations.
+    -   Remove redundancy and unnecessary words.
+    -   Shorten words carefully (e.g., "information" -> "info") without causing ambiguity.
+    -   Use symbols where appropriate.
+3.  Follow-up Search Queries: Generate at least {breadth} follow-up search queries relevant to the research topic and the summarized content. Use search operators (AND, OR, quotation marks) as needed. Output the queries as a JSON list of strings (e.g., ["query1", "query2", ...]) with no additional formatting, extra text, or markdown (do not include the word "python" anywhere).
 4. Ensure that the summary length and level of detail is proportional to the source length.
 Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
+**Output Requirement: Output the queries as a JSON list of strings (e.g., ["query1", "query2", ...]) with no additional formatting, extra text, or markdown (do not include the word "python" anywhere before the result).
 Proceed."""
     )
     try:
         response = openai_call(prompt=prompt, model="gpt-4o-mini", max_tokens_param=max_tokens, temperature=temperature)
         res_text = response.strip()
         return err
 def compress_text(text: str, target_length: int) -> str:
+    global SUMMARIZATION_REQUEST_COUNT, TOTAL_SUMMARIZED_WORDS
     prompt = f"Summarize the following text in a way that preserves all valuable information, and output a compressed version not exceeding {target_length} characters:\n\n{text}"
     summary = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=100000)
+    SUMMARIZATION_REQUEST_COUNT += 1
+    TOTAL_SUMMARIZED_WORDS += len(summary.split())
+    logging.info(f"compress_text: Compressed text length: {len(summary)} -- Requests: {SUMMARIZATION_REQUEST_COUNT}, Total words: {TOTAL_SUMMARIZED_WORDS}")
     return summary
 def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
                     raw_content = process_pdf(url)
                     if "Error processing PDF" in raw_content:
                         continue
                 else:
                     try:
                         headers = {"User-Agent": get_random_header()}
                         logging.error(f"Error retrieving content from {url}: {e}")
                         process_log += f"Error retrieving content from {url}: {e}\n"
                         continue
+                # Skip processing if raw_content is empty or too short (< 1000 characters)
+                if not raw_content or len(raw_content) < 1000:
+                    process_log += f"Content from {url} is too short (<1000 characters), skipping.\n"
+                    continue
                 # 1) Clean and do minimal parse
                 cleaned_html = clean_content(raw_content)
                             followup_suggestions.extend(analysis.get("followups"))
         process_log += f"Iteration {iteration} extracted {len(iteration_learnings)} learnings.\n"
         logging.info(f"iterative_deep_research_gen: Iteration {iteration} extracted {len(iteration_learnings)} learnings.")
+        if len(iteration_learnings) == 0:
+            process_log += f"Iteration {iteration} extracted no learnings. Aborting further iterations to avoid freezing.\n"
+            logging.warning(f"iterative_deep_research_gen: Iteration {iteration} extracted no learnings. Aborting research.")
+            break  # Exit early if no learnings were extracted.
         overall_learnings.extend(iteration_learnings)
         overall_context += f"\nIteration {iteration} learnings:\n" + "\n".join(iteration_learnings) + "\n"
         if additional_clarifications.strip():
         backup_button.click(
             fn=backup_fields,
             inputs=[research_query, include_domains, exclude_keywords,
+                    additional_clarifications, selected_engines, results_per_query, breadth, depth, clarification_text, existing_report, existing_log, crumbs_box, final_report, existing_queries_box],
             outputs=[backup_text]
         )
         load_button.click(