# pragent/backend/text_processor.py
import re
from typing import List, Tuple
from langchain_openai import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import AsyncOpenAI, BadRequestError
from tqdm.asyncio import tqdm

# Texts at or below this many characters are returned without summarization.
SUMMARIZATION_THRESHOLD = 4000
# Number of leading characters treated as the header when keyword splitting fails.
FALLBACK_HEADER_SIZE = 3000

def create_llm(model: str, client: AsyncOpenAI, disable_qwen_thinking: bool = False):
    """Creates a LangChain LLM object from the provided client."""
    if not client:
        raise ValueError("API client is not initialized.")
    
    model_kwargs = {}
    if "qwen3" in model.lower() and disable_qwen_thinking:
        tqdm.write("[*] Summarizer: Enabling 'disable_thinking' for Qwen3 model.")
        model_kwargs["extra_body"] = {"chat_template_kwargs": {"enable_thinking": False}}
    
    return ChatOpenAI(
        model_name=model,
        openai_api_key=client.api_key,
        openai_api_base=str(client.base_url),
        model_kwargs=model_kwargs  # Extra request arguments (e.g. the Qwen3 extra_body flag)
    )
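
# Note (informational): with disable_qwen_thinking=True and a Qwen3 model,
# every chat completion request carries
#     extra_body={"chat_template_kwargs": {"enable_thinking": False}}
# which OpenAI-compatible servers such as vLLM interpret as "skip the model's
# 'thinking' phase". Other backends may ignore or reject this field.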

def split_text_by_structure(long_text: str) -> Tuple[str, str]:
    """
    Intelligently splits the text into a "header" (title, authors, abstract) and "body".
    It looks for keywords like "Abstract" and "Introduction" to determine the split point.
    """
    abstract_match = re.search(r'\bAbstract\b', long_text, re.IGNORECASE)
    if not abstract_match:
        tqdm.write("[!] 'Abstract' keyword not found. Falling back to fixed character count for splitting.")
        return long_text[:FALLBACK_HEADER_SIZE], long_text[FALLBACK_HEADER_SIZE:]

    # Look for a section heading such as "1. Introduction" or "I. Introduction"
    # (or a bare "Introduction") in the text following the "Abstract" keyword.
    intro_match = re.search(r'(\n\s*(\d+|I|II|III|IV|V)\.?\s*)?Introduction', long_text[abstract_match.end():], re.IGNORECASE)
    
    if not intro_match:
        tqdm.write("[!] 'Introduction' keyword not found after 'Abstract'. Falling back to fixed character count for splitting.")
        return long_text[:FALLBACK_HEADER_SIZE], long_text[FALLBACK_HEADER_SIZE:]
        
    split_point = abstract_match.end() + intro_match.start()
    
    header_text = long_text[:split_point]
    body_text = long_text[split_point:]
    
    tqdm.write(f"[*] Successfully separated header via keywords ({len(header_text)} characters).")
    return header_text, body_text
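
# Illustrative self-check for the splitter (hypothetical sample text; run it
# manually, e.g. in a REPL, rather than at import time):
#
#     sample = "Deep Widgets\nA. Author\n\nAbstract\nWe propose...\n\n1. Introduction\nWidgets..."
#     header, body = split_text_by_structure(sample)
#     # header ends just before the "1. Introduction" heading; body begins with it.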

async def summarize_long_text(long_text: str, model: str, client: AsyncOpenAI, disable_qwen_thinking: bool = False) -> str:
    """
    Asynchronously summarizes long text using a structure-aware hybrid strategy.
    """
    if not long_text:
        return ""

    if len(long_text) <= SUMMARIZATION_THRESHOLD:
        tqdm.write(f"[*] Total text length ({len(long_text)} chars) is below threshold {SUMMARIZATION_THRESHOLD}. Skipping summarization.")
        return long_text

    header_text, body_text = split_text_by_structure(long_text)
    
    if not body_text:
        tqdm.write("[!] Could not separate the body text. Returning the full original text.")
        return header_text

    tqdm.write(f"[*] Summarizing the identified body text ({len(body_text)} characters)...")

    try:
        # Pass the flag down to the LLM creator
        llm = create_llm(model, client, disable_qwen_thinking=disable_qwen_thinking)
    except ValueError as e:
        return f"Error: {e}"

    body_summary = ""

    tqdm.write("[*] Attempting high-speed 'stuff' summarization strategy for the body text...")
    try:
        stuff_prompt_template = """
        # INSTRUCTION
        You are a senior editor. Your task is to read the following body text of a research paper and synthesize it into a single, coherent, and detailed summary.
        This summary needs to cover all the essential aspects of the provided text.

        # PAPER BODY TEXT:
        ---
        {text}
        ---

        # YOUR DETAILED SYNTHESIZED SUMMARY:
        """
        STUFF_PROMPT = PromptTemplate(template=stuff_prompt_template, input_variables=["text"])
        stuff_chain = load_summarize_chain(llm, chain_type="stuff", prompt=STUFF_PROMPT, verbose=True)
        
        docs = [Document(page_content=body_text)]
        body_summary = await stuff_chain.arun(docs)
        tqdm.write("[βœ“] 'Stuff' strategy for the body text was successful!")

    except BadRequestError as e:
        # "maximum context length" messages also contain the substring
        # "context length", so two checks cover the common error variants.
        error_text = str(e).lower()
        if "context_length_exceeded" not in error_text and "context length" not in error_text:
            tqdm.write(f"[!] Unexpected API error with 'stuff' strategy: {e}")
            return f"Error: API call failed - {e}"
        tqdm.write("[!] Body text is too long for the 'stuff' strategy. Falling back to 'map_reduce'.")
        
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=30000,   # measured in characters, not tokens
            chunk_overlap=3000
        )
        chunks = text_splitter.split_text(body_text)
        docs = [Document(page_content=t) for t in chunks]
        tqdm.write(f"[*] Body text has been split into {len(docs)} chunks for summarization.")

        map_prompt_template = """
        # INSTRUCTION
        You are a research analyst. Your task is to read the following text segment from a scientific paper and generate a concise summary.
        Focus only on the most critical information: the research question, the proposed method, key results, and the main conclusion.
        The language must be refined and to the point.

        # TEXT SEGMENT:
        ---
        {text}
        ---

        # YOUR CONCISE SUMMARY:
        """
        MAP_PROMPT = PromptTemplate(template=map_prompt_template, input_variables=["text"])

        combine_prompt_template = """
        # INSTRUCTION
        You are a senior editor. You have received several summaries extracted from different parts of the same research paper.
        Your task is to synthesize these summaries into a single, coherent final summary.

        # LIST OF SUMMARIES:
        ---
        {text}
        ---

        # YOUR SYNTHESIZED FINAL DETAILED SUMMARY:
        """
        COMBINE_PROMPT = PromptTemplate(template=combine_prompt_template, input_variables=["text"])

        map_reduce_chain = load_summarize_chain(llm, chain_type="map_reduce", map_prompt=MAP_PROMPT, combine_prompt=COMBINE_PROMPT, verbose=True)
        
        try:
            body_summary = await map_reduce_chain.arun(docs)
            tqdm.write("[βœ“] 'Map-Reduce' summarization for the body text is complete.")
        except Exception as chain_error:
            tqdm.write(f"[!] 'Map-Reduce' chain execution failed: {chain_error}")
            return f"Error: 'Map-Reduce' summarization failed - {chain_error}"

    except Exception as e:
        tqdm.write(f"[!] An unknown error occurred during the summarization process: {e}")
        return f"Error: Summarization failed - {e}"

    final_text = f"{header_text}\n\n[--- Body Summary ---]\n\n{body_summary}"
    return final_text
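
# --- Usage sketch (illustrative only) ---
# A minimal driver showing how this module might be wired up. The environment
# variable names, base URL, model name, and input path are placeholders, not
# part of this module's contract.
if __name__ == "__main__":
    import asyncio
    import os

    async def _demo():
        client = AsyncOpenAI(
            api_key=os.environ.get("OPENAI_API_KEY", "sk-placeholder"),
            base_url=os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1"),
        )
        with open("paper.txt", "r", encoding="utf-8") as f:
            paper_text = f.read()
        summary = await summarize_long_text(paper_text, model="gpt-4o-mini", client=client)
        print(summary)

    asyncio.run(_demo())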