Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -394,16 +394,77 @@ def openai_call(prompt: str, messages: list = None, model: str = "o3-mini",
|
|
| 394 |
logging.error(err_msg)
|
| 395 |
return err_msg
|
| 396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
def analyze_with_gpt4o(query: str, snippet: str, breadth: int, temperature: float = 0.7, max_tokens: int = 8000) -> dict:
|
| 398 |
# If snippet is a callable, call it to get the string.
|
| 399 |
if callable(snippet):
|
| 400 |
snippet = snippet()
|
| 401 |
snippet_words = len(snippet.split())
|
| 402 |
-
|
| 403 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
dynamic_tokens = min(3000, max(250, int(snippet_words * 0.5)))
|
| 405 |
|
| 406 |
-
client =
|
|
|
|
|
|
|
|
|
|
| 407 |
prompt = (f"""Analyze the following content from a query result:
|
| 408 |
|
| 409 |
{snippet}
|
|
@@ -412,40 +473,33 @@ Research topic:
|
|
| 412 |
{query}
|
| 413 |
|
| 414 |
Instructions:
|
| 415 |
-
1. Relevance: Determine if the content is relevant to the research topic.
|
| 416 |
|
| 417 |
-
2. Structure: If the content is relevant, provide a comprehensive summary structured into the following sections.
|
| 418 |
-
- Key Facts (at least 5):
|
| 419 |
-
- Key Figures (at least 5): Extract numerical data, statistics, dates, percentages. Use numerical representation
|
| 420 |
- Key Arguments (at least 5): Identify main arguments/claims. Summarize supporting evidence and counter-arguments. Use lemmatization, abbreviations, and concise phrasing. Remove redundant phrases.
|
| 421 |
-
- Key Quotes (at least 1
|
| 422 |
-
- Structured
|
| 423 |
|
| 424 |
Note: General Optimization Guidelines:
|
| 425 |
- Lemmatize: Use the root form of words (e.g., "running" -> "run").
|
| 426 |
-
- Abbreviate: Use common abbreviations
|
| 427 |
- Remove Redundancy: Eliminate unnecessary words and phrases. Be concise.
|
| 428 |
- Shorten Words (Carefully): If a shorter word conveys the same meaning (e.g., "information" -> "info"), use it, but avoid ambiguity.
|
| 429 |
- Implicit Representation: Remove redundant terms.
|
| 430 |
-
-
|
| 431 |
|
| 432 |
-
3. Follow-up Search Queries: Generate at least {breadth} follow-up search queries. These should be relevant to the research topic
|
| 433 |
-
For example: "Artificial intelligence" AND (mathematics OR geometry) -algebra,science AND history AND mathematics,...
|
| 434 |
-
Return the result as a JSON object with the keys 'relevant', 'structure', and 'followups'. The 'structure' value should itself be a JSON object with keys 'Key Facts', 'Key Figures', 'Key Arguments', 'Key Quotes' and 'Summary'.
|
| 435 |
|
| 436 |
-
4. Ensure that the summary length and level of detail is proportional to the source length.
|
| 437 |
Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
|
| 438 |
|
| 439 |
Proceed."""
|
| 440 |
)
|
| 441 |
try:
|
| 442 |
-
response =
|
| 443 |
-
|
| 444 |
-
messages=[{"role": "user", "content": prompt}],
|
| 445 |
-
temperature=temperature,
|
| 446 |
-
max_tokens=max_tokens
|
| 447 |
-
)
|
| 448 |
-
res_text = response.choices[0].message.content.strip()
|
| 449 |
# Remove Markdown code fences if present
|
| 450 |
if res_text.startswith("```"):
|
| 451 |
res_text = re.sub(r"^```(json)?", "", res_text)
|
|
|
|
| 394 |
logging.error(err_msg)
|
| 395 |
return err_msg
|
| 396 |
|
| 397 |
+
def summarize_large_text(text: str, target_length: int, chunk_size: int = 1000, overlap: int = 200) -> str:
    """
    Summarize a large text via map-reduce chunking.

    The text is split into overlapping word chunks, each chunk is summarized
    by an intermediate LLM call (via the module-level ``openai_call`` helper),
    and the intermediate summaries are fused into one final summary. The
    intermediate prompts explicitly instruct the model to preserve key details
    and to include any tables or structured data present.

    Parameters:
        text         : The input text to summarize.
        target_length: Approximate token budget for the final summary
                       (passed through as ``max_tokens_param``).
        chunk_size   : Number of words per chunk.
        overlap      : Number of words shared between consecutive chunks;
                       must be non-negative and smaller than chunk_size.

    Returns:
        The final combined summary as a string. Inputs of at most
        ``chunk_size`` words are returned unchanged, with no LLM calls.

    Raises:
        ValueError: if chunking is required but ``chunk_size <= 0`` or
                    ``overlap`` is negative or >= ``chunk_size`` — the
                    original step computation (``chunk_size - overlap``)
                    would otherwise loop forever.
    """
    words = text.split()
    if len(words) <= chunk_size:
        # Short enough to pass through untouched — skip all LLM calls.
        return text

    step = chunk_size - overlap
    if chunk_size <= 0 or overlap < 0 or step <= 0:
        # Guard: with a non-positive step the chunking loop below would
        # never advance and the function would hang indefinitely.
        raise ValueError("chunk_size must be > 0 and overlap must be in [0, chunk_size)")

    # Split into overlapping chunks so context is not lost at chunk boundaries.
    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]

    # Map phase: summarize each chunk with a small, fixed token budget.
    summary_chunks = []
    for chunk in chunks:
        chunk_prompt = (
            "Summarize the following text, preserving all key details and ensuring that any tables or structured "
            "data are also summarized:\n\n" + chunk
        )
        summary_chunk = openai_call(prompt=chunk_prompt, model="gpt-4o-mini", max_tokens_param=500, temperature=0.7)
        summary_chunks.append(summary_chunk.strip())

    # Reduce phase: fuse the intermediate summaries into one final summary.
    combined_summary = "\n".join(summary_chunks)
    final_prompt = (
        "Combine the following summaries into one concise summary that preserves all critical details, "
        "including any relevant table or structured data:\n\n" + combined_summary
    )
    final_summary = openai_call(prompt=final_prompt, model="gpt-4o-mini", max_tokens_param=target_length, temperature=0.7)
    return final_summary.strip()
|
| 446 |
+
|
| 447 |
+
|
| 448 |
def analyze_with_gpt4o(query: str, snippet: str, breadth: int, temperature: float = 0.7, max_tokens: int = 8000) -> dict:
|
| 449 |
# If snippet is a callable, call it to get the string.
|
| 450 |
if callable(snippet):
|
| 451 |
snippet = snippet()
|
| 452 |
snippet_words = len(snippet.split())
|
| 453 |
+
|
| 454 |
+
# Define a word threshold after which we start the chunking summarization.
|
| 455 |
+
CHUNK_WORD_THRESHOLD = 1500
|
| 456 |
+
if snippet_words > CHUNK_WORD_THRESHOLD:
|
| 457 |
+
# Adjust the target_length as needed (here using 2000 tokens as an example).
|
| 458 |
+
snippet = summarize_large_text(snippet, target_length=2000, chunk_size=1000, overlap=200)
|
| 459 |
+
snippet_words = len(snippet.split())
|
| 460 |
+
|
| 461 |
+
# Decide a proportional dynamic token count (for reference; not used to limit the API call below)
|
| 462 |
dynamic_tokens = min(3000, max(250, int(snippet_words * 0.5)))
|
| 463 |
|
| 464 |
+
client = os.getenv('OPENAI_API_KEY') # alternatively, pass your API key here if needed.
|
| 465 |
+
# (Assuming you use a client instance from your OpenAI library elsewhere.)
|
| 466 |
+
# Here, we assume that openai.OpenAI(api_key=...) is wrapped by openai_call.
|
| 467 |
+
|
| 468 |
prompt = (f"""Analyze the following content from a query result:
|
| 469 |
|
| 470 |
{snippet}
|
|
|
|
| 473 |
{query}
|
| 474 |
|
| 475 |
Instructions:
|
| 476 |
+
1. Relevance: Determine if the content is relevant to the research topic. Answer with a single word: 'yes' or 'no'.
|
| 477 |
|
| 478 |
+
2. Structure: If the content is relevant, provide a comprehensive summary structured into the following sections. Prioritize extreme conciseness and token efficiency while preserving all key information. Aim for the shortest possible summary that retains all essential facts, figures, arguments, and quotes. The total summary should not exceed 1000 words, but shorter is strongly preferred.
|
| 479 |
+
- Key Facts (at least 5): List the core factual claims. Use short, declarative sentences or bullet points. Apply lemmatization, common abbreviations (e.g., vs., e.g., i.e., AI, LLM), and remove unnecessary words.
|
| 480 |
+
- Key Figures (at least 5): Extract numerical data, statistics, dates, percentages. Use numerical representation and present concisely (list or table format). If the content includes tables or structured data, extract and summarize the critical information from them.
|
| 481 |
- Key Arguments (at least 5): Identify main arguments/claims. Summarize supporting evidence and counter-arguments. Use lemmatization, abbreviations, and concise phrasing. Remove redundant phrases.
|
| 482 |
+
- Key Quotes (at least 1 if any): Include significant quotes (with the name of the author in parentheses). Attribute quotes correctly. Choose quotes that are concise and impactful. If a quote can be paraphrased concisely without losing essential meaning, paraphrase it and note that it's a paraphrase. Use symbols instead of words (&, +, ->, =, ...).
|
| 483 |
+
- Structured Summary (10 to 50 sentences depending on the length): Mention anecdotes, people, locations, and any additional context that will make the end report relatable and grounded.
|
| 484 |
|
| 485 |
Note: General Optimization Guidelines:
|
| 486 |
- Lemmatize: Use the root form of words (e.g., "running" -> "run").
|
| 487 |
+
- Abbreviate: Use common abbreviations.
|
| 488 |
- Remove Redundancy: Eliminate unnecessary words and phrases. Be concise.
|
| 489 |
- Shorten Words (Carefully): If a shorter word conveys the same meaning (e.g., "information" -> "info"), use it, but avoid ambiguity.
|
| 490 |
- Implicit Representation: Remove redundant terms.
|
| 491 |
+
- Use Symbols: Use symbols instead of words (&, +, ->, =, ...).
|
| 492 |
|
| 493 |
+
3. Follow-up Search Queries: Generate at least {breadth} follow-up search queries. These should be relevant to the research topic and build upon the summarized content. Aim for deeper understanding by using search operators (AND, OR, quotation marks) where appropriate. Represent these queries as a Python list of strings, e.g., ["query1", "query2", ...].
|
|
|
|
|
|
|
| 494 |
|
| 495 |
+
4. Ensure that the summary length and level of detail is proportional to the source length.
|
| 496 |
Source length: {snippet_words} words. You may produce a more detailed summary if the text is long.
|
| 497 |
|
| 498 |
Proceed."""
|
| 499 |
)
|
| 500 |
try:
|
| 501 |
+
response = openai_call(prompt=prompt, model="gpt-4o-mini", max_tokens_param=max_tokens, temperature=temperature)
|
| 502 |
+
res_text = response.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 503 |
# Remove Markdown code fences if present
|
| 504 |
if res_text.startswith("```"):
|
| 505 |
res_text = re.sub(r"^```(json)?", "", res_text)
|