Spaces:
Running
Running
| # pragent/backend/text_processor.py | |
| import re | |
| from typing import List, Tuple | |
| from langchain_openai import ChatOpenAI | |
| from langchain.chains.summarize import load_summarize_chain | |
| from langchain.docstore.document import Document | |
| from langchain.prompts import PromptTemplate | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from openai import AsyncOpenAI, BadRequestError | |
| from tqdm.asyncio import tqdm | |
# Inputs whose length (in characters) does not exceed this value are returned
# verbatim by summarize_long_text instead of being summarized.
SUMMARIZATION_THRESHOLD: int = 4000

# Number of leading characters split_text_by_structure treats as the header
# when the "Abstract"/"Introduction" keywords cannot be located.
FALLBACK_HEADER_SIZE: int = 3000
def create_llm(model: str, client: AsyncOpenAI, disable_qwen_thinking: bool = False):
    """Build a LangChain ``ChatOpenAI`` model that reuses the credentials of *client*.

    Args:
        model: Model name forwarded to ``ChatOpenAI``.
        client: OpenAI client whose ``api_key`` and ``base_url`` are copied
            into the LangChain wrapper.
        disable_qwen_thinking: When true and *model* contains "qwen3"
            (case-insensitive), ``enable_thinking=False`` is sent through
            ``extra_body.chat_template_kwargs``.

    Returns:
        The configured ``ChatOpenAI`` instance.

    Raises:
        ValueError: If *client* is falsy.
    """
    if not client:
        raise ValueError("API client is not initialized.")

    is_qwen3 = "qwen3" in model.lower()
    if is_qwen3 and disable_qwen_thinking:
        tqdm.write("[*] Summarizer: Enabling 'disable_thinking' for Qwen3 model.")
        extra_request_args = {
            "extra_body": {"chat_template_kwargs": {"enable_thinking": False}}
        }
    else:
        extra_request_args = {}

    llm_settings = dict(
        model_name=model,
        openai_api_key=client.api_key,
        openai_api_base=str(client.base_url),
        # Extra request arguments travel to the API via model_kwargs.
        model_kwargs=extra_request_args,
    )
    return ChatOpenAI(**llm_settings)
def split_text_by_structure(long_text: str) -> Tuple[str, str]:
    """Split a paper's text into a "header" (title, authors, abstract) and a "body".

    The split point is the start of the "Introduction" section that follows
    the first "Abstract" keyword. Because the word "introduction" may also
    appear inside the abstract's prose, a heading-like occurrence is
    preferred: either a numbered heading on its own line (e.g.
    "1. Introduction", "II Introduction") or the word at the start of a line.
    If no occurrence looks like a heading, the first occurrence is used.

    When "Abstract" is missing, or no "Introduction" follows it, the first
    ``FALLBACK_HEADER_SIZE`` characters are used as the header instead.

    Args:
        long_text: Full text of the document.

    Returns:
        ``(header_text, body_text)``. The two parts are consecutive slices of
        ``long_text``, so concatenating them reproduces the input.
    """
    abstract_match = re.search(r'\bAbstract\b', long_text, re.IGNORECASE)
    if not abstract_match:
        tqdm.write("[!] 'Abstract' keyword not found. Falling back to fixed character count for splitting.")
        return long_text[:FALLBACK_HEADER_SIZE], long_text[FALLBACK_HEADER_SIZE:]

    tail_offset = abstract_match.end()
    tail = long_text[tail_offset:]
    intro_pattern = r'(\n\s*(\d+|I|II|III|IV|V)\.?\s*)?Introduction'

    first_match = None
    heading_match = None
    for candidate in re.finditer(intro_pattern, tail, re.IGNORECASE):
        if first_match is None:
            first_match = candidate
        # Group 1 is only set when a section number preceded by a newline was
        # consumed, i.e. the match is a numbered heading on its own line.
        numbered_heading = candidate.group(1) is not None
        line_start = tail.rfind('\n', 0, candidate.start()) + 1
        starts_line = not tail[line_start:candidate.start()].strip()
        if numbered_heading or starts_line:
            heading_match = candidate
            break

    if first_match is None:
        tqdm.write("[!] 'Introduction' keyword not found after 'Abstract'. Falling back to fixed character count for splitting.")
        return long_text[:FALLBACK_HEADER_SIZE], long_text[FALLBACK_HEADER_SIZE:]

    # Without a heading-like hit, keep the historical behavior of splitting at
    # the first occurrence of the word.
    intro_match = heading_match if heading_match is not None else first_match

    split_point = tail_offset + intro_match.start()
    header_text = long_text[:split_point]
    body_text = long_text[split_point:]
    tqdm.write(f"[*] Successfully separated header via keywords ({len(header_text)} characters).")
    return header_text, body_text
# Prompt used when the whole body fits into a single request.
_STUFF_PROMPT_TEMPLATE = """
# INSTRUCTION
You are a senior editor. Your task is to read the following body text of a research paper and synthesize it into a single, coherent, and detailed summary.
This summary needs to cover all the essential aspects of the provided text.
# PAPER BODY TEXT:
---
{text}
---
# YOUR DETAILED SYNTHESIZED SUMMARY:
"""

# Prompt applied to each chunk in the map step of the map-reduce fallback.
_MAP_PROMPT_TEMPLATE = """
# INSTRUCTION
You are a research analyst. Your task is to read the following text segment from a scientific paper and generate a concise summary.
Focus only on the most critical information: the research question, the proposed method, key results, and the main conclusion.
The language must be refined and to the point.
# TEXT SEGMENT:
---
{text}
---
# YOUR CONCISE SUMMARY:
"""

# Prompt that merges the per-chunk summaries in the reduce step.
_COMBINE_PROMPT_TEMPLATE = """
# INSTRUCTION
You are a senior editor. You have received several summaries extracted from different parts of the same research paper.
Your task is to synthesize these summaries into a single, coherent final summary.
# LIST OF SUMMARIES:
---
{text}
---
# YOUR SYNTHESIZED FINAL DETAILED SUMMARY:
"""


def _is_context_length_error(error: Exception) -> bool:
    """Return True when the error message says the context length was exceeded."""
    message = str(error).lower()
    # "maximum context length" is covered by the "context length" check.
    return "context_length_exceeded" in message or "context length" in message


async def _summarize_with_stuff(llm, body_text: str) -> str:
    """Summarize *body_text* in one request using the 'stuff' chain."""
    prompt = PromptTemplate(template=_STUFF_PROMPT_TEMPLATE, input_variables=["text"])
    chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt, verbose=True)
    return await chain.arun([Document(page_content=body_text)])


async def _summarize_with_map_reduce(llm, body_text: str) -> str:
    """Summarize *body_text* chunk by chunk, then merge the partial summaries."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=30000,
        chunk_overlap=3000,
    )
    docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(body_text)]
    tqdm.write(f"[*] Body text has been split into {len(docs)} chunks for summarization.")

    map_prompt = PromptTemplate(template=_MAP_PROMPT_TEMPLATE, input_variables=["text"])
    combine_prompt = PromptTemplate(template=_COMBINE_PROMPT_TEMPLATE, input_variables=["text"])
    chain = load_summarize_chain(
        llm,
        chain_type="map_reduce",
        map_prompt=map_prompt,
        combine_prompt=combine_prompt,
        verbose=True,
    )
    return await chain.arun(docs)


async def summarize_long_text(long_text: str, model: str, client: AsyncOpenAI, disable_qwen_thinking: bool = False) -> str:
    """Asynchronously summarize long text with a structure-aware hybrid strategy.

    The text is split into a header and a body. The header is kept verbatim
    and only the body is summarized: first with a single-request 'stuff'
    chain, then, if the API rejects that request because the context length
    was exceeded, with a chunked 'map_reduce' chain.

    Args:
        long_text: Text to condense.
        model: Model name passed to ``create_llm``.
        client: OpenAI client passed to ``create_llm``.
        disable_qwen_thinking: Passed to ``create_llm``.

    Returns:
        * ``""`` when *long_text* is empty.
        * *long_text* unchanged when it is at most ``SUMMARIZATION_THRESHOLD``
          characters long or no body could be separated.
        * Otherwise the header, a ``[--- Body Summary ---]`` marker and the
          body summary.
        * A string starting with ``"Error: "`` when the LLM cannot be created
          or summarization fails; such failures are reported through the
          return value rather than raised.
    """
    if not long_text:
        return ""

    if len(long_text) <= SUMMARIZATION_THRESHOLD:
        tqdm.write(f"[*] Total text length ({len(long_text)} chars) is below threshold {SUMMARIZATION_THRESHOLD}. Skipping summarization.")
        return long_text

    header_text, body_text = split_text_by_structure(long_text)
    if not body_text:
        tqdm.write("[!] Could not separate the body text. Returning the full original text.")
        # Return the input itself so the result matches the logged message.
        return long_text

    tqdm.write(f"[*] Summarizing the identified body text ({len(body_text)} characters)...")
    try:
        llm = create_llm(model, client, disable_qwen_thinking=disable_qwen_thinking)
    except ValueError as e:
        return f"Error: {e}"

    tqdm.write("[*] Attempting high-speed 'stuff' summarization strategy for the body text...")
    try:
        body_summary = await _summarize_with_stuff(llm, body_text)
        tqdm.write("[β] 'Stuff' strategy for the body text was successful!")
    except BadRequestError as e:
        if not _is_context_length_error(e):
            tqdm.write(f"[!] Unexpected API error with 'stuff' strategy: {e}")
            return f"Error: API call failed - {e}"
        # Context overflow: run the map-reduce fallback below, outside this
        # handler, so that failures while setting it up are reported as an
        # error string too instead of escaping as an exception.
    except Exception as e:
        tqdm.write(f"[!] An unknown error occurred during the summarization process: {e}")
        return f"Error: Summarization failed - {e}"
    else:
        return f"{header_text}\n\n[--- Body Summary ---]\n\n{body_summary}"

    tqdm.write("[!] Body text is too long for the 'stuff' strategy. Falling back to 'map_reduce'.")
    try:
        body_summary = await _summarize_with_map_reduce(llm, body_text)
    except Exception as chain_error:
        tqdm.write(f"[!] 'Map-Reduce' chain execution failed: {chain_error}")
        return f"Error: 'Map-Reduce' summarization failed - {chain_error}"
    tqdm.write("[β] 'Map-Reduce' summarization for the body text is complete.")

    return f"{header_text}\n\n[--- Body Summary ---]\n\n{body_summary}"