Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -27,7 +27,8 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(
|
|
| 27 |
# Global Settings
|
| 28 |
# =============================================================================
|
| 29 |
MAX_MESSAGE_LENGTH = 1048576
|
| 30 |
-
|
|
|
|
| 31 |
# =============================================================================
|
| 32 |
# Helper functions for external APIs and PDF Processing
|
| 33 |
# =============================================================================
|
|
@@ -387,7 +388,7 @@ def openai_call(prompt: str, messages: list = None, model: str = "o3-mini",
|
|
| 387 |
response = client.chat.completions.create(**params)
|
| 388 |
result = response.choices[0].message.content.strip()
|
| 389 |
result = result.strip().strip("json").strip("```").strip()
|
| 390 |
-
logging.info(f"openai_call completed with model {model}. Response preview: {result
|
| 391 |
return result
|
| 392 |
except Exception as e:
|
| 393 |
err_msg = f"Error calling OpenAI API: {e}"
|
|
@@ -431,8 +432,10 @@ def summarize_large_text(text: str, target_length: int, chunk_size: int = 1000,
|
|
| 431 |
"Summarize the following text, preserving all key details and ensuring that any tables or structured "
|
| 432 |
"data are also summarized:\n\n" + chunk
|
| 433 |
)
|
| 434 |
-
# Use a relatively small max_tokens value for each chunk summarization.
|
| 435 |
summary_chunk = openai_call(prompt=chunk_prompt, model="gpt-4o-mini", max_tokens_param=500, temperature=0.7)
|
|
|
|
|
|
|
|
|
|
| 436 |
summary_chunks.append(summary_chunk.strip())
|
| 437 |
|
| 438 |
combined_summary = "\n".join(summary_chunks)
|
|
@@ -473,30 +476,32 @@ Research topic:
|
|
| 473 |
{query}
|
| 474 |
|
| 475 |
Instructions:
|
| 476 |
-
1. Relevance: Determine if the content is relevant to the research topic. Answer with a single word:
|
| 477 |
|
| 478 |
2. Structure: If the content is relevant, provide a comprehensive summary structured into the following sections. Prioritize extreme conciseness and token efficiency while preserving all key information. Aim for the shortest possible summary that retains all essential facts, figures, arguments, and quotes. The total summary should not exceed 1000 words, but shorter is strongly preferred.
|
| 479 |
-
- Key Facts (at least 5): List the core factual claims
|
| 480 |
-
- Key Figures (at least 5): Extract numerical data
|
| 481 |
-
- Key Arguments (at least 5): Identify main arguments
|
| 482 |
-
- Key Quotes (at least 1 if any): Include significant quotes (with the
|
| 483 |
-
- Structured Summary (10 to 50 sentences
|
| 484 |
|
| 485 |
Note: General Optimization Guidelines:
|
| 486 |
-
- Lemmatize
|
| 487 |
-
-
|
| 488 |
-
- Remove
|
| 489 |
-
- Shorten
|
| 490 |
-
-
|
| 491 |
-
- Use Symbols: Use symbols instead of words (&, +, ->, =, ...).
|
| 492 |
|
| 493 |
-
3. Follow-up Search Queries: Generate at least {breadth} follow-up search queries
|
| 494 |
|
| 495 |
4. Ensure that the summary length and level of detail is proportional to the source length.
|
| 496 |
Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
|
| 497 |
|
|
|
|
|
|
|
| 498 |
Proceed."""
|
| 499 |
)
|
|
|
|
| 500 |
try:
|
| 501 |
response = openai_call(prompt=prompt, model="gpt-4o-mini", max_tokens_param=max_tokens, temperature=temperature)
|
| 502 |
res_text = response.strip()
|
|
@@ -586,9 +591,12 @@ def process_pdf(url: str) -> str:
|
|
| 586 |
return err
|
| 587 |
|
| 588 |
def compress_text(text: str, target_length: int) -> str:
    """Request a compressed rendition of ``text`` from the model.

    A summarization prompt asking for at most ``target_length`` characters is
    sent through ``openai_call`` with the ``gpt-4o-mini`` model, and the reply
    is returned unchanged. The character limit is only expressed in the prompt;
    nothing here truncates the reply.

    Args:
        text: The text to compress.
        target_length: Character budget quoted to the model in the prompt.

    Returns:
        The value returned by ``openai_call``.
    """
    instruction = (
        "Summarize the following text in a way that preserves all valuable information, "
        f"and output a compressed version not exceeding {target_length} characters:"
    )
    request = f"{instruction}\n\n{text}"
    return openai_call(request, model="gpt-4o-mini", max_tokens_param=100000)
|
| 593 |
|
| 594 |
def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
|
|
@@ -1382,7 +1390,6 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 1382 |
raw_content = process_pdf(url)
|
| 1383 |
if "Error processing PDF" in raw_content:
|
| 1384 |
continue
|
| 1385 |
-
process_log += f"Extracted PDF content from {url}\n"
|
| 1386 |
else:
|
| 1387 |
try:
|
| 1388 |
headers = {"User-Agent": get_random_header()}
|
|
@@ -1394,6 +1401,10 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 1394 |
logging.error(f"Error retrieving content from {url}: {e}")
|
| 1395 |
process_log += f"Error retrieving content from {url}: {e}\n"
|
| 1396 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1397 |
|
| 1398 |
# 1) Clean and do minimal parse
|
| 1399 |
cleaned_html = clean_content(raw_content)
|
|
@@ -1437,6 +1448,10 @@ def iterative_deep_research_gen(initial_query: str, reportstyle: str, breadth: i
|
|
| 1437 |
followup_suggestions.extend(analysis.get("followups"))
|
| 1438 |
process_log += f"Iteration {iteration} extracted {len(iteration_learnings)} learnings.\n"
|
| 1439 |
logging.info(f"iterative_deep_research_gen: Iteration {iteration} extracted {len(iteration_learnings)} learnings.")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1440 |
overall_learnings.extend(iteration_learnings)
|
| 1441 |
overall_context += f"\nIteration {iteration} learnings:\n" + "\n".join(iteration_learnings) + "\n"
|
| 1442 |
if additional_clarifications.strip():
|
|
@@ -1738,7 +1753,7 @@ def main():
|
|
| 1738 |
backup_button.click(
|
| 1739 |
fn=backup_fields,
|
| 1740 |
inputs=[research_query, include_domains, exclude_keywords,
|
| 1741 |
-
additional_clarifications, selected_engines, results_per_query, breadth, depth, clarification_text, existing_report, existing_log, crumbs_box, final_report],
|
| 1742 |
outputs=[backup_text]
|
| 1743 |
)
|
| 1744 |
load_button.click(
|
|
|
|
| 27 |
# Global Settings
|
| 28 |
# =============================================================================
|
| 29 |
MAX_MESSAGE_LENGTH = 1048576
|
| 30 |
+
SUMMARIZATION_REQUEST_COUNT = 0
|
| 31 |
+
TOTAL_SUMMARIZED_WORDS = 0
|
| 32 |
# =============================================================================
|
| 33 |
# Helper functions for external APIs and PDF Processing
|
| 34 |
# =============================================================================
|
|
|
|
| 388 |
response = client.chat.completions.create(**params)
|
| 389 |
result = response.choices[0].message.content.strip()
|
| 390 |
result = result.strip().strip("json").strip("```").strip()
|
| 391 |
+
logging.info(f"openai_call completed with model {model}. Response preview: {result}")
|
| 392 |
return result
|
| 393 |
except Exception as e:
|
| 394 |
err_msg = f"Error calling OpenAI API: {e}"
|
|
|
|
| 432 |
"Summarize the following text, preserving all key details and ensuring that any tables or structured "
|
| 433 |
"data are also summarized:\n\n" + chunk
|
| 434 |
)
|
|
|
|
| 435 |
summary_chunk = openai_call(prompt=chunk_prompt, model="gpt-4o-mini", max_tokens_param=500, temperature=0.7)
|
| 436 |
+
global SUMMARIZATION_REQUEST_COUNT, TOTAL_SUMMARIZED_WORDS
|
| 437 |
+
SUMMARIZATION_REQUEST_COUNT += 1
|
| 438 |
+
TOTAL_SUMMARIZED_WORDS += len(summary_chunk.split())
|
| 439 |
summary_chunks.append(summary_chunk.strip())
|
| 440 |
|
| 441 |
combined_summary = "\n".join(summary_chunks)
|
|
|
|
| 476 |
{query}
|
| 477 |
|
| 478 |
Instructions:
|
| 479 |
+
1. Relevance: Determine if the content is relevant to the research topic. Answer with a single word: "yes" or "no".
|
| 480 |
|
| 481 |
2. Structure: If the content is relevant, provide a comprehensive summary structured into the following sections. Prioritize extreme conciseness and token efficiency while preserving all key information. Aim for the shortest possible summary that retains all essential facts, figures, arguments, and quotes. The total summary should not exceed 1000 words, but shorter is strongly preferred.
|
| 482 |
+
- Key Facts (at least 5): List the core factual claims using short, declarative sentences or bullet points. Apply lemmatization and standard abbreviations.
|
| 483 |
+
- Key Figures (at least 5): Extract numerical data (statistics, dates, percentages) and include any necessary context (units, references, explanations) required to interpret these numbers. Present them concisely (list or table format).
|
| 484 |
+
- Key Arguments (at least 5): Identify main arguments or claims. Summarize supporting evidence and counter-arguments concisely.
|
| 485 |
+
- Key Quotes (at least 1 if any): Include significant quotes (with the author's name in parentheses). Attribute quotes correctly. Paraphrase if needed, indicating that it’s a paraphrase. Use symbols (e.g., &, +, ->, =) to conserve tokens.
|
| 486 |
+
- Structured Summary (10 to 50 sentences): Provide a structured summary that includes anecdotes, people, and locations to ensure the report is relatable.
|
| 487 |
|
| 488 |
Note: General Optimization Guidelines:
|
| 489 |
+
- Lemmatize words (e.g., "running" -> "run").
|
| 490 |
+
- Use common abbreviations.
|
| 491 |
+
- Remove redundancy and unnecessary words.
|
| 492 |
+
- Shorten words carefully (e.g., "information" -> "info") without causing ambiguity.
|
| 493 |
+
- Use symbols where appropriate.
|
|
|
|
| 494 |
|
| 495 |
+
3. Follow-up Search Queries: Generate at least {breadth} follow-up search queries relevant to the research topic and the summarized content. Use search operators (AND, OR, quotation marks) as needed. Output the queries as a JSON list of strings (e.g., ["query1", "query2", ...]) with no additional formatting, extra text, or markdown (do not include the word "python" anywhere).
|
| 496 |
|
| 497 |
4. Ensure that the summary length and level of detail is proportional to the source length.
|
| 498 |
Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
|
| 499 |
|
| 500 |
+
**Output Requirement: Output the queries as a JSON list of strings (e.g., ["query1", "query2", ...]) with no additional formatting, extra text, or markdown (do not include the word "python" anywhere before the result).
|
| 501 |
+
|
| 502 |
Proceed."""
|
| 503 |
)
|
| 504 |
+
|
| 505 |
try:
|
| 506 |
response = openai_call(prompt=prompt, model="gpt-4o-mini", max_tokens_param=max_tokens, temperature=temperature)
|
| 507 |
res_text = response.strip()
|
|
|
|
| 591 |
return err
|
| 592 |
|
| 593 |
def compress_text(text: str, target_length: int) -> str:
    """Request a compressed rendition of ``text`` and record usage counters.

    A summarization prompt asking for at most ``target_length`` characters is
    sent through ``openai_call`` with the ``gpt-4o-mini`` model, and the reply
    is returned unchanged. The character limit is only expressed in the prompt;
    nothing here truncates the reply.

    Side effects:
        Increments the module-level ``SUMMARIZATION_REQUEST_COUNT`` by one,
        adds the reply's whitespace-separated word count to
        ``TOTAL_SUMMARIZED_WORDS``, and logs the reply length together with
        both running totals at INFO level.

    Args:
        text: The text to compress.
        target_length: Character budget quoted to the model in the prompt.

    Returns:
        The value returned by ``openai_call``.
    """
    global SUMMARIZATION_REQUEST_COUNT, TOTAL_SUMMARIZED_WORDS

    instruction = (
        "Summarize the following text in a way that preserves all valuable information, "
        f"and output a compressed version not exceeding {target_length} characters:"
    )
    # NOTE(review): 100000 appears to exceed the documented output-token cap
    # for gpt-4o-mini -- confirm how openai_call forwards max_tokens_param.
    compressed = openai_call(
        f"{instruction}\n\n{text}",
        model="gpt-4o-mini",
        max_tokens_param=100000,
    )

    # Every call counts as one request, whatever the reply contains.
    SUMMARIZATION_REQUEST_COUNT += 1
    TOTAL_SUMMARIZED_WORDS += len(compressed.split())

    log_line = f"compress_text: Compressed text length: {len(compressed)} -- Requests: {SUMMARIZATION_REQUEST_COUNT}, Total words: {TOTAL_SUMMARIZED_WORDS}"
    logging.info(log_line)
    return compressed
|
| 601 |
|
| 602 |
def generate_final_report(initial_query: str, context: str, reportstyle: str, learnings: list, visited_urls: list,
|
|
|
|
| 1390 |
raw_content = process_pdf(url)
|
| 1391 |
if "Error processing PDF" in raw_content:
|
| 1392 |
continue
|
|
|
|
| 1393 |
else:
|
| 1394 |
try:
|
| 1395 |
headers = {"User-Agent": get_random_header()}
|
|
|
|
| 1401 |
logging.error(f"Error retrieving content from {url}: {e}")
|
| 1402 |
process_log += f"Error retrieving content from {url}: {e}\n"
|
| 1403 |
continue
|
| 1404 |
+
# Skip processing if raw_content is empty or too short (< 1000 characters)
|
| 1405 |
+
if not raw_content or len(raw_content) < 1000:
|
| 1406 |
+
process_log += f"Content from {url} is too short (<1000 characters), skipping.\n"
|
| 1407 |
+
continue
|
| 1408 |
|
| 1409 |
# 1) Clean and do minimal parse
|
| 1410 |
cleaned_html = clean_content(raw_content)
|
|
|
|
| 1448 |
followup_suggestions.extend(analysis.get("followups"))
|
| 1449 |
process_log += f"Iteration {iteration} extracted {len(iteration_learnings)} learnings.\n"
|
| 1450 |
logging.info(f"iterative_deep_research_gen: Iteration {iteration} extracted {len(iteration_learnings)} learnings.")
|
| 1451 |
+
if len(iteration_learnings) == 0:
|
| 1452 |
+
process_log += f"Iteration {iteration} extracted no learnings. Aborting further iterations to avoid freezing.\n"
|
| 1453 |
+
logging.warning(f"iterative_deep_research_gen: Iteration {iteration} extracted no learnings. Aborting research.")
|
| 1454 |
+
break # Exit early if no learnings were extracted.
|
| 1455 |
overall_learnings.extend(iteration_learnings)
|
| 1456 |
overall_context += f"\nIteration {iteration} learnings:\n" + "\n".join(iteration_learnings) + "\n"
|
| 1457 |
if additional_clarifications.strip():
|
|
|
|
| 1753 |
backup_button.click(
|
| 1754 |
fn=backup_fields,
|
| 1755 |
inputs=[research_query, include_domains, exclude_keywords,
|
| 1756 |
+
additional_clarifications, selected_engines, results_per_query, breadth, depth, clarification_text, existing_report, existing_log, crumbs_box, final_report, existing_queries_box],
|
| 1757 |
outputs=[backup_text]
|
| 1758 |
)
|
| 1759 |
load_button.click(
|