Guiyom committed on
Commit
ed43430
·
verified ·
1 Parent(s): 1eacd00

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -22
app.py CHANGED
@@ -394,16 +394,77 @@ def openai_call(prompt: str, messages: list = None, model: str = "o3-mini",
394
  logging.error(err_msg)
395
  return err_msg
396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  def analyze_with_gpt4o(query: str, snippet: str, breadth: int, temperature: float = 0.7, max_tokens: int = 8000) -> dict:
398
  # If snippet is a callable, call it to get the string.
399
  if callable(snippet):
400
  snippet = snippet()
401
  snippet_words = len(snippet.split())
402
- # decide a proportional max tokens (cap at 3000 for example)
403
- # e.g. 1 token ~ ~0.75 words, so we do something simplistic:
 
 
 
 
 
 
 
404
  dynamic_tokens = min(3000, max(250, int(snippet_words * 0.5)))
405
 
406
- client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
 
 
 
407
  prompt = (f"""Analyze the following content from a query result:
408
 
409
  {snippet}
@@ -412,40 +473,33 @@ Research topic:
412
  {query}
413
 
414
  Instructions:
415
- 1. Relevance: Determine if the content is relevant to the research topic. Answer with a single word: 'yes' or 'no'.
416
 
417
- 2. Structure: If the content is relevant, provide a comprehensive summary structured into the following sections. Prioritize extreme conciseness and token efficiency while preserving all key information. Aim for the shortest possible summary that retains all essential facts, figures, arguments, and quotes. The total summary should not exceed 1000 words, but shorter is strongly preferred.
418
- - Key Facts (at least 5): List the core factual claims. Use short, declarative sentences or bullet points. Apply lemmatization, common abbreviations (e.g., vs., e.g., i.e., AI, LLM), and remove unnecessary words.
419
- - Key Figures (at least 5): Extract numerical data, statistics, dates, percentages. Use numerical representation. Present concisely (list or table format).
420
  - Key Arguments (at least 5): Identify main arguments/claims. Summarize supporting evidence and counter-arguments. Use lemmatization, abbreviations, and concise phrasing. Remove redundant phrases.
421
- - Key Quotes (at least 1 f any): Include significant quotes (with the name of the author between parenthesis). Attribute quotes correctly. Choose quotes that are concise and impactful. If a quote can be paraphrased concisely without losing essential meaning, paraphrase it and note that it's a paraphrase. Use symbols instead of words (&, +, ->, =, ...).
422
- - Structured summary (10 to 50 sentences depending on the length): mention anecdotes, people, locations, anything that make will make the end report relatable and grounded
423
 
424
  Note: General Optimization Guidelines:
425
  - Lemmatize: Use the root form of words (e.g., "running" -> "run").
426
- - Abbreviate: Use common abbreviations
427
  - Remove Redundancy: Eliminate unnecessary words and phrases. Be concise.
428
  - Shorten Words (Carefully): If a shorter word conveys the same meaning (e.g., "information" -> "info"), use it, but avoid ambiguity.
429
  - Implicit Representation: Remove redundant terms.
430
- - Use Symbols: Use symbols instead of words (&, +, ->, =, ...).
431
 
432
- 3. Follow-up Search Queries: Generate at least {breadth} follow-up search queries. These should be relevant to the research topic but also developments from the content summarized, aim for deeper understanding, use search operators (AND, OR, quotation marks), and be represented as a Python list of strings.
433
- For example: "Artificial intelligence" AND (mathematics OR geometry) -algebra,science AND history AND mathematics,...
434
- Return the result as a JSON object with the keys 'relevant', 'structure', and 'followups'. The 'structure' value should itself be a JSON object with keys 'Key Facts', 'Key Figures', 'Key Arguments', 'Key Quotes' and 'Summary'.
435
 
436
- 4. Ensure that the summary length and level of detail is proportional to the source length.
437
  Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
438
 
439
  Proceed."""
440
  )
441
  try:
442
- response = client.chat.completions.create(
443
- model="gpt-4o-mini",
444
- messages=[{"role": "user", "content": prompt}],
445
- temperature=temperature,
446
- max_tokens=max_tokens
447
- )
448
- res_text = response.choices[0].message.content.strip()
449
  # Remove Markdown code fences if present
450
  if res_text.startswith("```"):
451
  res_text = re.sub(r"^```(json)?", "", res_text)
 
394
  logging.error(err_msg)
395
  return err_msg
396
 
397
def summarize_large_text(text: str, target_length: int, chunk_size: int = 1000, overlap: int = 200) -> str:
    """
    Summarize a large text via map-reduce over overlapping word chunks.

    The text is split into overlapping chunks, each chunk is summarized with an
    intermediate LLM call, and the intermediate summaries are then fused into a
    single final summary. The prompts for these calls explicitly instruct the
    model to preserve key details and to include any tables or structured data
    present.

    Parameters:
        text         : The input text to summarize.
        target_length: Approximate maximum number of tokens for the final summary.
        chunk_size   : The number of words to include in each chunk.
        overlap      : The number of overlapping words between consecutive chunks.

    Returns:
        The final combined summary as a string. If the text is at most
        `chunk_size` words, it is returned unchanged (no LLM call is made).
    """
    words = text.split()
    if len(words) <= chunk_size:
        # Short texts need no chunking — return as-is.
        return text

    # BUGFIX: the original loop advanced by (chunk_size - overlap), which is
    # zero or negative whenever overlap >= chunk_size — an infinite loop.
    # Clamp the step to at least 1 word so the loop always terminates.
    step = max(1, chunk_size - overlap)
    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]

    summary_chunks = []
    for chunk in chunks:
        chunk_prompt = (
            "Summarize the following text, preserving all key details and ensuring that any tables or structured "
            "data are also summarized:\n\n" + chunk
        )
        # Use a relatively small max_tokens value for each chunk summarization;
        # the fusion pass below gets the full target_length budget.
        summary_chunk = openai_call(prompt=chunk_prompt, model="gpt-4o-mini", max_tokens_param=500, temperature=0.7)
        summary_chunks.append(summary_chunk.strip())

    combined_summary = "\n".join(summary_chunks)
    # Produce one final summary that fuses all the intermediate summaries.
    final_prompt = (
        "Combine the following summaries into one concise summary that preserves all critical details, "
        "including any relevant table or structured data:\n\n" + combined_summary
    )
    final_summary = openai_call(prompt=final_prompt, model="gpt-4o-mini", max_tokens_param=target_length, temperature=0.7)
    return final_summary.strip()
446
+
447
+
448
  def analyze_with_gpt4o(query: str, snippet: str, breadth: int, temperature: float = 0.7, max_tokens: int = 8000) -> dict:
449
  # If snippet is a callable, call it to get the string.
450
  if callable(snippet):
451
  snippet = snippet()
452
  snippet_words = len(snippet.split())
453
+
454
+ # Define a word threshold after which we start the chunking summarization.
455
+ CHUNK_WORD_THRESHOLD = 1500
456
+ if snippet_words > CHUNK_WORD_THRESHOLD:
457
+ # Adjust the target_length as needed (here using 2000 tokens as an example).
458
+ snippet = summarize_large_text(snippet, target_length=2000, chunk_size=1000, overlap=200)
459
+ snippet_words = len(snippet.split())
460
+
461
+ # Decide a proportional dynamic token count (for reference; not used to limit the API call below)
462
  dynamic_tokens = min(3000, max(250, int(snippet_words * 0.5)))
463
 
464
+ client = os.getenv('OPENAI_API_KEY') # alternatively, pass your API key here if needed.
465
+ # (Assuming you use a client instance from your OpenAI library elsewhere.)
466
+ # Here, we assume that openai.OpenAI(api_key=...) is wrapped by openai_call.
467
+
468
  prompt = (f"""Analyze the following content from a query result:
469
 
470
  {snippet}
 
473
  {query}
474
 
475
  Instructions:
476
+ 1. Relevance: Determine if the content is relevant to the research topic. Answer with a single word: 'yes' or 'no'.
477
 
478
+ 2. Structure: If the content is relevant, provide a comprehensive summary structured into the following sections. Prioritize extreme conciseness and token efficiency while preserving all key information. Aim for the shortest possible summary that retains all essential facts, figures, arguments, and quotes. The total summary should not exceed 1000 words, but shorter is strongly preferred.
479
+ - Key Facts (at least 5): List the core factual claims. Use short, declarative sentences or bullet points. Apply lemmatization, common abbreviations (e.g., vs., e.g., i.e., AI, LLM), and remove unnecessary words.
480
+ - Key Figures (at least 5): Extract numerical data, statistics, dates, percentages. Use numerical representation and present concisely (list or table format). If the content includes tables or structured data, extract and summarize the critical information from them.
481
  - Key Arguments (at least 5): Identify main arguments/claims. Summarize supporting evidence and counter-arguments. Use lemmatization, abbreviations, and concise phrasing. Remove redundant phrases.
482
+ - Key Quotes (at least 1 if any): Include significant quotes (with the name of the author in parentheses). Attribute quotes correctly. Choose quotes that are concise and impactful. If a quote can be paraphrased concisely without losing essential meaning, paraphrase it and note that it's a paraphrase. Use symbols instead of words (&, +, ->, =, ...).
483
+ - Structured Summary (10 to 50 sentences depending on the length): Mention anecdotes, people, locations, and any additional context that will make the end report relatable and grounded.
484
 
485
  Note: General Optimization Guidelines:
486
  - Lemmatize: Use the root form of words (e.g., "running" -> "run").
487
+ - Abbreviate: Use common abbreviations.
488
  - Remove Redundancy: Eliminate unnecessary words and phrases. Be concise.
489
  - Shorten Words (Carefully): If a shorter word conveys the same meaning (e.g., "information" -> "info"), use it, but avoid ambiguity.
490
  - Implicit Representation: Remove redundant terms.
491
+ - Use Symbols: Use symbols instead of words (&, +, ->, =, ...).
492
 
493
+ 3. Follow-up Search Queries: Generate at least {breadth} follow-up search queries. These should be relevant to the research topic and build upon the summarized content. Aim for deeper understanding by using search operators (AND, OR, quotation marks) where appropriate. Represent these queries as a Python list of strings, e.g., ["query1", "query2", ...].
 
 
494
 
495
+ 4. Ensure that the summary length and level of detail is proportional to the source length.
496
  Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
497
 
498
  Proceed."""
499
  )
500
  try:
501
+ response = openai_call(prompt=prompt, model="gpt-4o-mini", max_tokens_param=max_tokens, temperature=temperature)
502
+ res_text = response.strip()
 
 
 
 
 
503
  # Remove Markdown code fences if present
504
  if res_text.startswith("```"):
505
  res_text = re.sub(r"^```(json)?", "", res_text)