Guiyom committed on
Commit
02baeee
·
verified ·
1 Parent(s): 19cf986

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -19
app.py CHANGED
@@ -27,7 +27,8 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(
27
  # Global Settings
28
  # =============================================================================
29
  MAX_MESSAGE_LENGTH = 1048576
30
-
 
31
  # =============================================================================
32
  # Helper functions for external APIs and PDF Processing
33
  # =============================================================================
@@ -387,7 +388,7 @@ def openai_call(prompt: str, messages: list = None, model: str = "o3-mini",
387
  response = client.chat.completions.create(**params)
388
  result = response.choices[0].message.content.strip()
389
  result = result.strip().strip("json").strip("```").strip()
390
- logging.info(f"openai_call completed with model {model}. Response preview: {result[:400]}")
391
  return result
392
  except Exception as e:
393
  err_msg = f"Error calling OpenAI API: {e}"
@@ -431,8 +432,10 @@ def summarize_large_text(text: str, target_length: int, chunk_size: int = 1000,
431
  "Summarize the following text, preserving all key details and ensuring that any tables or structured "
432
  "data are also summarized:\n\n" + chunk
433
  )
434
- # Use a relatively small max_tokens value for each chunk summarization.
435
  summary_chunk = openai_call(prompt=chunk_prompt, model="gpt-4o-mini", max_tokens_param=500, temperature=0.7)
 
 
 
436
  summary_chunks.append(summary_chunk.strip())
437
 
438
  combined_summary = "\n".join(summary_chunks)
@@ -473,30 +476,32 @@ Research topic:
473
  {query}
474
 
475
  Instructions:
476
- 1. Relevance: Determine if the content is relevant to the research topic. Answer with a single word: 'yes' or 'no'.
477
 
478
  2. Structure: If the content is relevant, provide a comprehensive summary structured into the following sections. Prioritize extreme conciseness and token efficiency while preserving all key information. Aim for the shortest possible summary that retains all essential facts, figures, arguments, and quotes. The total summary should not exceed 1000 words, but shorter is strongly preferred.
479
- - Key Facts (at least 5): List the core factual claims. Use short, declarative sentences or bullet points. Apply lemmatization, common abbreviations (e.g., vs., e.g., i.e., AI, LLM), and remove unnecessary words.
480
- - Key Figures (at least 5): Extract numerical data, statistics, dates, percentages. Use numerical representation and present concisely (list or table format). If the content includes tables or structured data, extract and summarize the critical information from them. If data available, collect enough so that the user can use it to generate new tables and graphs, visuals.
481
- - Key Arguments (at least 5): Identify main arguments/claims. Summarize supporting evidence and counter-arguments. Use lemmatization, abbreviations, and concise phrasing. Remove redundant phrases.
482
- - Key Quotes (at least 1 if any): Include significant quotes (with the name of the author in parentheses). Attribute quotes correctly. Choose quotes that are concise and impactful. If a quote can be paraphrased concisely without losing essential meaning, paraphrase it and note that it's a paraphrase. Use symbols instead of words (&, +, ->, =, ...).
483
- - Structured Summary (10 to 50 sentences depending on the length): Mention anecdotes, people, locations, and any additional context that will make the end report relatable and grounded.
484
 
485
  Note: General Optimization Guidelines:
486
- - Lemmatize: Use the root form of words (e.g., "running" -> "run").
487
- - Abbreviate: Use common abbreviations.
488
- - Remove Redundancy: Eliminate unnecessary words and phrases. Be concise.
489
- - Shorten Words (Carefully): If a shorter word conveys the same meaning (e.g., "information" -> "info"), use it, but avoid ambiguity.
490
- - Implicit Representation: Remove redundant terms.
491
- - Use Symbols: Use symbols instead of words (&, +, ->, =, ...).
492
 
493
- 3. Follow-up Search Queries: Generate at least {breadth} follow-up search queries. These should be relevant to the research topic and build upon the summarized content. Aim for deeper understanding by using search operators (AND, OR, quotation marks) where appropriate. Represent these queries as a Python list of strings, e.g., ["query1", "query2", ...].
494
 
495
  4. Ensure that the summary length and level of detail are proportional to the source length.
496
  Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
497
 
 
 
498
  Proceed."""
499
  )
 
500
  try:
501
  response = openai_call(prompt=prompt, model="gpt-4o-mini", max_tokens_param=max_tokens, temperature=temperature)
502
  res_text = response.strip()
@@ -586,9 +591,12 @@ def process_pdf(url: str) -> str:
586
  return err
587
 
588
  def compress_text(text: str, target_length: int) -> str:
 
589
  prompt = f"Summarize the following text in a way that preserves all valuable information, and output a compressed version not exceeding {target_length} characters:\n\n{text}"
590
  summary = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=100000)
591
- logging.info(f"compress_text: Compressed text length: {len(summary)}")
 
 
592
  return summary
593
 
594
  def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
@@ -1382,7 +1390,6 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
1382
  raw_content = process_pdf(url)
1383
  if "Error processing PDF" in raw_content:
1384
  continue
1385
- process_log += f"Extracted PDF content from {url}\n"
1386
  else:
1387
  try:
1388
  headers = {"User-Agent": get_random_header()}
@@ -1394,6 +1401,10 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
1394
  logging.error(f"Error retrieving content from {url}: {e}")
1395
  process_log += f"Error retrieving content from {url}: {e}\n"
1396
  continue
 
 
 
 
1397
 
1398
  # 1) Clean and do minimal parse
1399
  cleaned_html = clean_content(raw_content)
@@ -1437,6 +1448,10 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
1437
  followup_suggestions.extend(analysis.get("followups"))
1438
  process_log += f"Iteration {iteration} extracted {len(iteration_learnings)} learnings.\n"
1439
  logging.info(f"iterative_deep_research_gen: Iteration {iteration} extracted {len(iteration_learnings)} learnings.")
 
 
 
 
1440
  overall_learnings.extend(iteration_learnings)
1441
  overall_context += f"\nIteration {iteration} learnings:\n" + "\n".join(iteration_learnings) + "\n"
1442
  if additional_clarifications.strip():
@@ -1738,7 +1753,7 @@ def main():
1738
  backup_button.click(
1739
  fn=backup_fields,
1740
  inputs=[research_query, include_domains, exclude_keywords,
1741
- additional_clarifications, selected_engines, results_per_query, breadth, depth, clarification_text, existing_report, existing_log, crumbs_box, final_report],
1742
  outputs=[backup_text]
1743
  )
1744
  load_button.click(
 
27
  # Global Settings
28
  # =============================================================================
29
  MAX_MESSAGE_LENGTH = 1048576
30
+ SUMMARIZATION_REQUEST_COUNT = 0
31
+ TOTAL_SUMMARIZED_WORDS = 0
32
  # =============================================================================
33
  # Helper functions for external APIs and PDF Processing
34
  # =============================================================================
 
388
  response = client.chat.completions.create(**params)
389
  result = response.choices[0].message.content.strip()
390
  result = result.strip().strip("json").strip("```").strip()
391
+ logging.info(f"openai_call completed with model {model}. Response preview: {result}")
392
  return result
393
  except Exception as e:
394
  err_msg = f"Error calling OpenAI API: {e}"
 
432
  "Summarize the following text, preserving all key details and ensuring that any tables or structured "
433
  "data are also summarized:\n\n" + chunk
434
  )
 
435
  summary_chunk = openai_call(prompt=chunk_prompt, model="gpt-4o-mini", max_tokens_param=500, temperature=0.7)
436
+ global SUMMARIZATION_REQUEST_COUNT, TOTAL_SUMMARIZED_WORDS
437
+ SUMMARIZATION_REQUEST_COUNT += 1
438
+ TOTAL_SUMMARIZED_WORDS += len(summary_chunk.split())
439
  summary_chunks.append(summary_chunk.strip())
440
 
441
  combined_summary = "\n".join(summary_chunks)
 
476
  {query}
477
 
478
  Instructions:
479
+ 1. Relevance: Determine if the content is relevant to the research topic. Answer with a single word: "yes" or "no".
480
 
481
  2. Structure: If the content is relevant, provide a comprehensive summary structured into the following sections. Prioritize extreme conciseness and token efficiency while preserving all key information. Aim for the shortest possible summary that retains all essential facts, figures, arguments, and quotes. The total summary should not exceed 1000 words, but shorter is strongly preferred.
482
+ - Key Facts (at least 5): List the core factual claims using short, declarative sentences or bullet points. Apply lemmatization and standard abbreviations.
483
+ - Key Figures (at least 5): Extract numerical data (statistics, dates, percentages) and include any necessary context (units, references, explanations) required to interpret these numbers. Present them concisely (list or table format).
484
+ - Key Arguments (at least 5): Identify main arguments or claims. Summarize supporting evidence and counter-arguments concisely.
485
+ - Key Quotes (at least 1 if any): Include significant quotes (with the author's name in parentheses). Attribute quotes correctly. Paraphrase if needed, indicating that it's a paraphrase. Use symbols (e.g., &, +, ->, =) to conserve tokens.
486
+ - Structured Summary (10 to 50 sentences): Provide a structured summary that includes anecdotes, people, and locations to ensure the report is relatable.
487
 
488
  Note: General Optimization Guidelines:
489
+ - Lemmatize words (e.g., "running" -> "run").
490
+ - Use common abbreviations.
491
+ - Remove redundancy and unnecessary words.
492
+ - Shorten words carefully (e.g., "information" -> "info") without causing ambiguity.
493
+ - Use symbols where appropriate.
 
494
 
495
+ 3. Follow-up Search Queries: Generate at least {breadth} follow-up search queries relevant to the research topic and the summarized content. Use search operators (AND, OR, quotation marks) as needed. Output the queries as a JSON list of strings (e.g., ["query1", "query2", ...]) with no additional formatting, extra text, or markdown (do not include the word "python" anywhere).
496
 
497
  4. Ensure that the summary length and level of detail are proportional to the source length.
498
  Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
499
 
500
+ **Output Requirement:** Output the queries as a JSON list of strings (e.g., ["query1", "query2", ...]) with no additional formatting, extra text, or markdown (do not include the word "python" anywhere before the result).
501
+
502
  Proceed."""
503
  )
504
+
505
  try:
506
  response = openai_call(prompt=prompt, model="gpt-4o-mini", max_tokens_param=max_tokens, temperature=temperature)
507
  res_text = response.strip()
 
591
  return err
592
 
593
  def compress_text(text: str, target_length: int) -> str:
594
+ global SUMMARIZATION_REQUEST_COUNT, TOTAL_SUMMARIZED_WORDS
595
  prompt = f"Summarize the following text in a way that preserves all valuable information, and output a compressed version not exceeding {target_length} characters:\n\n{text}"
596
  summary = openai_call(prompt, model="gpt-4o-mini", max_tokens_param=100000)
597
+ SUMMARIZATION_REQUEST_COUNT += 1
598
+ TOTAL_SUMMARIZED_WORDS += len(summary.split())
599
+ logging.info(f"compress_text: Compressed text length: {len(summary)} -- Requests: {SUMMARIZATION_REQUEST_COUNT}, Total words: {TOTAL_SUMMARIZED_WORDS}")
600
  return summary
601
 
602
  def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
 
1390
  raw_content = process_pdf(url)
1391
  if "Error processing PDF" in raw_content:
1392
  continue
 
1393
  else:
1394
  try:
1395
  headers = {"User-Agent": get_random_header()}
 
1401
  logging.error(f"Error retrieving content from {url}: {e}")
1402
  process_log += f"Error retrieving content from {url}: {e}\n"
1403
  continue
1404
+ # Skip processing if raw_content is empty or too short (< 1000 characters)
1405
+ if not raw_content or len(raw_content) < 1000:
1406
+ process_log += f"Content from {url} is too short (<1000 characters), skipping.\n"
1407
+ continue
1408
 
1409
  # 1) Clean and do minimal parse
1410
  cleaned_html = clean_content(raw_content)
 
1448
  followup_suggestions.extend(analysis.get("followups"))
1449
  process_log += f"Iteration {iteration} extracted {len(iteration_learnings)} learnings.\n"
1450
  logging.info(f"iterative_deep_research_gen: Iteration {iteration} extracted {len(iteration_learnings)} learnings.")
1451
+ if len(iteration_learnings) == 0:
1452
+ process_log += f"Iteration {iteration} extracted no learnings. Aborting further iterations to avoid freezing.\n"
1453
+ logging.warning(f"iterative_deep_research_gen: Iteration {iteration} extracted no learnings. Aborting research.")
1454
+ break # Exit early if no learnings were extracted.
1455
  overall_learnings.extend(iteration_learnings)
1456
  overall_context += f"\nIteration {iteration} learnings:\n" + "\n".join(iteration_learnings) + "\n"
1457
  if additional_clarifications.strip():
 
1753
  backup_button.click(
1754
  fn=backup_fields,
1755
  inputs=[research_query, include_domains, exclude_keywords,
1756
+ additional_clarifications, selected_engines, results_per_query, breadth, depth, clarification_text, existing_report, existing_log, crumbs_box, final_report, existing_queries_box],
1757
  outputs=[backup_text]
1758
  )
1759
  load_button.click(