# NOTE: The lines that stood here were web file-viewer residue (page status
# text, "File size: 7,045 Bytes", revision "ec3d86e", and the line-number
# gutter 1-162); they are not part of the module source.
# pragent/backend/text_processor.py
import re
from typing import List, Tuple
from langchain_openai import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import AsyncOpenAI, BadRequestError
from tqdm.asyncio import tqdm
# Texts whose length (in characters) is at or below this value are returned
# unchanged by summarize_long_text instead of being summarized.
SUMMARIZATION_THRESHOLD = 4_000

# Number of leading characters treated as the "header" when
# split_text_by_structure cannot locate the Abstract/Introduction keywords.
FALLBACK_HEADER_SIZE = 3_000
def create_llm(model: str, client: AsyncOpenAI, disable_qwen_thinking: bool = False):
    """Build a LangChain ``ChatOpenAI`` model configured from an existing client.

    The API key and base URL are copied from ``client``.

    Args:
        model: Model name forwarded to ``ChatOpenAI``.
        client: Client whose ``api_key`` and ``base_url`` are reused.
        disable_qwen_thinking: When true and ``model`` contains "qwen3"
            (case-insensitive), ``{"enable_thinking": False}`` is passed as
            ``chat_template_kwargs`` inside ``extra_body``.

    Returns:
        The configured ``ChatOpenAI`` instance.

    Raises:
        ValueError: If ``client`` is falsy.
    """
    if not client:
        raise ValueError("API client is not initialized.")

    extra_model_kwargs = {}
    targets_qwen3 = "qwen3" in model.lower()
    if targets_qwen3 and disable_qwen_thinking:
        tqdm.write("[*] Summarizer: Enabling 'disable_thinking' for Qwen3 model.")
        extra_model_kwargs["extra_body"] = {
            "chat_template_kwargs": {"enable_thinking": False},
        }

    api_key = client.api_key
    api_base = str(client.base_url)
    return ChatOpenAI(
        model_name=model,
        openai_api_key=api_key,
        openai_api_base=api_base,
        model_kwargs=extra_model_kwargs,  # carries the optional extra_body
    )
def split_text_by_structure(long_text: str) -> Tuple[str, str]:
    """Split a paper's text into a ``(header, body)`` pair.

    The header is everything up to the first "Introduction" (optionally
    preceded by a newline and a section number such as ``1`` or ``I.``) that
    follows the first standalone word "Abstract"; both searches are
    case-insensitive. If either keyword is missing, the first
    ``FALLBACK_HEADER_SIZE`` characters are used as the header instead.

    Concatenating the two returned strings always yields ``long_text``.
    """

    def _fixed_size_split() -> Tuple[str, str]:
        # Fallback used whenever the keyword-based split is not possible.
        return long_text[:FALLBACK_HEADER_SIZE], long_text[FALLBACK_HEADER_SIZE:]

    abstract_hit = re.search(r'\bAbstract\b', long_text, re.IGNORECASE)
    if abstract_hit is None:
        tqdm.write("[!] 'Abstract' keyword not found. Falling back to fixed character count for splitting.")
        return _fixed_size_split()

    # Only text after the "Abstract" keyword is searched for the introduction,
    # so match offsets are relative to this position.
    search_from = abstract_hit.end()
    intro_hit = re.search(
        r'(\n\s*(\d+|I|II|III|IV|V)\.?\s*)?Introduction',
        long_text[search_from:],
        re.IGNORECASE,
    )
    if intro_hit is None:
        tqdm.write("[!] 'Introduction' keyword not found after 'Abstract'. Falling back to fixed character count for splitting.")
        return _fixed_size_split()

    cut = search_from + intro_hit.start()
    header, body = long_text[:cut], long_text[cut:]
    tqdm.write(f"[*] Successfully separated header via keywords ({len(header)} characters).")
    return header, body
# Chunking parameters (in characters) for the map-reduce fallback.
_MAP_REDUCE_CHUNK_SIZE = 30000
_MAP_REDUCE_CHUNK_OVERLAP = 3000

# Prompt used when the whole body is summarized in a single call.
_STUFF_PROMPT_TEMPLATE = """
# INSTRUCTION
You are a senior editor. Your task is to read the following body text of a research paper and synthesize it into a single, coherent, and detailed summary.
This summary needs to cover all the essential aspects of the provided text.
# PAPER BODY TEXT:
---
{text}
---
# YOUR DETAILED SYNTHESIZED SUMMARY:
"""

# Prompt applied to each chunk in the map step of the fallback.
_MAP_PROMPT_TEMPLATE = """
# INSTRUCTION
You are a research analyst. Your task is to read the following text segment from a scientific paper and generate a concise summary.
Focus only on the most critical information: the research question, the proposed method, key results, and the main conclusion.
The language must be refined and to the point.
# TEXT SEGMENT:
---
{text}
---
# YOUR CONCISE SUMMARY:
"""

# Prompt that merges the per-chunk summaries in the reduce step.
_COMBINE_PROMPT_TEMPLATE = """
# INSTRUCTION
You are a senior editor. You have received several summaries extracted from different parts of the same research paper.
Your task is to synthesize these summaries into a single, coherent final summary.
# LIST OF SUMMARIES:
---
{text}
---
# YOUR SYNTHESIZED FINAL DETAILED SUMMARY:
"""


def _is_context_length_error(error: Exception) -> bool:
    """Return True if the error message indicates the prompt was too long."""
    message = str(error).lower()
    # "maximum context length" is already covered by the "context length" check.
    return "context_length_exceeded" in message or "context length" in message


async def _summarize_with_map_reduce(llm, body_text: str) -> str:
    """Summarize ``body_text`` chunk by chunk and merge the partial summaries."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=_MAP_REDUCE_CHUNK_SIZE,
        chunk_overlap=_MAP_REDUCE_CHUNK_OVERLAP,
    )
    docs = [Document(page_content=chunk) for chunk in splitter.split_text(body_text)]
    tqdm.write(f"[*] Body text has been split into {len(docs)} chunks for summarization.")

    chain = load_summarize_chain(
        llm,
        chain_type="map_reduce",
        map_prompt=PromptTemplate(template=_MAP_PROMPT_TEMPLATE, input_variables=["text"]),
        combine_prompt=PromptTemplate(template=_COMBINE_PROMPT_TEMPLATE, input_variables=["text"]),
        verbose=True,
    )
    return await chain.arun(docs)


async def summarize_long_text(long_text: str, model: str, client: AsyncOpenAI, disable_qwen_thinking: bool = False) -> str:
    """Asynchronously condense a long paper text while keeping its header verbatim.

    The text is split into header and body with ``split_text_by_structure``.
    The body is first summarized in one call ("stuff" strategy); if the API
    rejects that call because of context length, it is retried chunk-wise
    ("map_reduce" strategy).

    Args:
        long_text: Full text to process.
        model: Model name passed to ``create_llm``.
        client: API client passed to ``create_llm``.
        disable_qwen_thinking: Forwarded to ``create_llm``.

    Returns:
        - ``""`` if ``long_text`` is empty.
        - ``long_text`` unchanged if it is no longer than
          ``SUMMARIZATION_THRESHOLD`` characters.
        - Otherwise the header, a ``[--- Body Summary ---]`` marker and the
          body summary.
        - A string starting with ``"Error: "`` if the LLM cannot be created or
          summarization fails; failures are reported this way rather than
          raised.
    """
    if not long_text:
        return ""

    if len(long_text) <= SUMMARIZATION_THRESHOLD:
        tqdm.write(f"[*] Total text length ({len(long_text)} chars) is below threshold {SUMMARIZATION_THRESHOLD}. Skipping summarization.")
        return long_text

    header_text, body_text = split_text_by_structure(long_text)
    if not body_text:
        # With an empty body the header is the complete original text.
        tqdm.write("[!] Could not separate the body text. Returning the full original text.")
        return header_text

    tqdm.write(f"[*] Summarizing the identified body text ({len(body_text)} characters)...")
    try:
        llm = create_llm(model, client, disable_qwen_thinking=disable_qwen_thinking)
    except ValueError as e:
        return f"Error: {e}"

    body_summary = ""
    tqdm.write("[*] Attempting high-speed 'stuff' summarization strategy for the body text...")
    try:
        stuff_prompt = PromptTemplate(template=_STUFF_PROMPT_TEMPLATE, input_variables=["text"])
        stuff_chain = load_summarize_chain(llm, chain_type="stuff", prompt=stuff_prompt, verbose=True)
        # NOTE(review): `arun` appears to be deprecated in recent LangChain
        # releases in favour of `ainvoke` - confirm the pinned version before
        # migrating.
        body_summary = await stuff_chain.arun([Document(page_content=body_text)])
        tqdm.write("[β] 'Stuff' strategy for the body text was successful!")
    except BadRequestError as e:
        if not _is_context_length_error(e):
            tqdm.write(f"[!] Unexpected API error with 'stuff' strategy: {e}")
            return f"Error: API call failed - {e}"

        tqdm.write("[!] Body text is too long for the 'stuff' strategy. Falling back to 'map_reduce'.")
        # The sibling `except Exception` below does not cover errors raised
        # inside this handler, so the whole fallback (setup and execution)
        # is guarded here to keep the "return an error string" contract.
        try:
            body_summary = await _summarize_with_map_reduce(llm, body_text)
            tqdm.write("[β] 'Map-Reduce' summarization for the body text is complete.")
        except Exception as chain_error:
            tqdm.write(f"[!] 'Map-Reduce' chain execution failed: {chain_error}")
            return f"Error: 'Map-Reduce' summarization failed - {chain_error}"
    except Exception as e:
        tqdm.write(f"[!] An unknown error occurred during the summarization process: {e}")
        return f"Error: Summarization failed - {e}"

    return f"{header_text}\n\n[--- Body Summary ---]\n\n{body_summary}"