Spaces:

yzweak
/

AutoPR

Running

App Files Files Community

AutoPR / pragent /backend /text_processor.py

yzweak

Initial commit

ec3d86e 3 months ago

raw

history blame contribute delete

7.05 kB

	# pragent/backend/text_processor.py
	import re
	from typing import List, Tuple
	from langchain_openai import ChatOpenAI
	from langchain.chains.summarize import load_summarize_chain
	from langchain.docstore.document import Document
	from langchain.prompts import PromptTemplate
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from openai import AsyncOpenAI, BadRequestError
	from tqdm.asyncio import tqdm

	# Character-count cutoff: summarize_long_text returns inputs of at most this
	# many characters unchanged instead of summarizing them.
	SUMMARIZATION_THRESHOLD = 4000
	# Number of leading characters split_text_by_structure uses as the "header"
	# when it cannot locate the "Abstract" / "Introduction" keywords.
	FALLBACK_HEADER_SIZE = 3000

	def create_llm(model: str, client: AsyncOpenAI, disable_qwen_thinking: bool = False):
	    """
	    Build a LangChain `ChatOpenAI` object that reuses the credentials of `client`.

	    Args:
	        model: Model name forwarded to `ChatOpenAI` as `model_name`.
	        client: OpenAI client whose `api_key` and `base_url` are copied.
	        disable_qwen_thinking: When true and the model name contains "qwen3"
	            (case-insensitive), an `extra_body` entry setting
	            `enable_thinking` to False is added to `model_kwargs`.

	    Raises:
	        ValueError: If `client` is falsy.
	    """
	    if not client:
	        raise ValueError("API client is not initialized.")

	    extra_model_kwargs = {}
	    targets_qwen3 = "qwen3" in model.lower()
	    if targets_qwen3 and disable_qwen_thinking:
	        tqdm.write("[*] Summarizer: Enabling 'disable_thinking' for Qwen3 model.")
	        thinking_off = {"chat_template_kwargs": {"enable_thinking": False}}
	        extra_model_kwargs["extra_body"] = thinking_off

	    # The extra request arguments travel to the API through model_kwargs.
	    llm = ChatOpenAI(
	        model_name=model,
	        openai_api_key=client.api_key,
	        openai_api_base=str(client.base_url),
	        model_kwargs=extra_model_kwargs,
	    )
	    return llm

	def split_text_by_structure(long_text: str) -> Tuple[str, str]:
	    """
	    Split paper text into a "header" (title, authors, abstract) and a "body".

	    The split point is the first "Introduction" found after the first
	    "Abstract" keyword (both matched case-insensitively). A section number
	    directly preceding the heading on its own line start ("1", "1.", "I.",
	    ... "V.") is treated as part of the body rather than the header. When
	    either keyword is missing, the first FALLBACK_HEADER_SIZE characters
	    become the header instead.

	    Args:
	        long_text: Full text to split.

	    Returns:
	        A (header_text, body_text) tuple; concatenating the two parts
	        reproduces `long_text`.
	    """
	    abstract_match = re.search(r'\bAbstract\b', long_text, re.IGNORECASE)
	    if not abstract_match:
	        tqdm.write("[!] 'Abstract' keyword not found. Falling back to fixed character count for splitting.")
	        return long_text[:FALLBACK_HEADER_SIZE], long_text[FALLBACK_HEADER_SIZE:]

	    # The `|` separators must be unescaped to act as alternation; written as
	    # `\|` they only match a literal pipe, so the numbering prefix could never
	    # match. `\s*` also accepts a number that directly follows the newline.
	    intro_pattern = r'(\n\s*(\d+|I|II|III|IV|V)\.?\s)?Introduction'
	    text_after_abstract = long_text[abstract_match.end():]
	    intro_match = re.search(intro_pattern, text_after_abstract, re.IGNORECASE)

	    if not intro_match:
	        tqdm.write("[!] 'Introduction' keyword not found after 'Abstract'. Falling back to fixed character count for splitting.")
	        return long_text[:FALLBACK_HEADER_SIZE], long_text[FALLBACK_HEADER_SIZE:]

	    # intro_match offsets are relative to the text after "Abstract".
	    split_point = abstract_match.end() + intro_match.start()

	    header_text = long_text[:split_point]
	    body_text = long_text[split_point:]

	    tqdm.write(f"[*] Successfully separated header via keywords ({len(header_text)} characters).")
	    return header_text, body_text

	# --- MODIFIED: Added disable_qwen_thinking parameter ---
	async def summarize_long_text(long_text: str, model: str, client: AsyncOpenAI, disable_qwen_thinking: bool = False) -> str:
	    """
	    Asynchronously summarize long text using a structure-aware hybrid strategy.

	    The text is split into a header and a body; the header is kept verbatim
	    and only the body is summarized. A single-call "stuff" summarization is
	    tried first. If the API rejects it because the context window is
	    exceeded, the body is summarized chunk by chunk with "map_reduce".

	    Args:
	        long_text: Text to summarize.
	        model: Model name passed to `create_llm`.
	        client: OpenAI client passed to `create_llm`.
	        disable_qwen_thinking: Passed through to `create_llm`.

	    Returns:
	        - "" for empty input.
	        - The input unchanged when it is at most SUMMARIZATION_THRESHOLD
	          characters long.
	        - Otherwise the header followed by the body summary.
	        - On failure, a string starting with "Error:" (no exception is
	          raised for LLM or chain failures).
	    """
	    if not long_text:
	        return ""

	    if len(long_text) <= SUMMARIZATION_THRESHOLD:
	        tqdm.write(f"[*] Total text length ({len(long_text)} chars) is below threshold {SUMMARIZATION_THRESHOLD}. Skipping summarization.")
	        return long_text

	    header_text, body_text = split_text_by_structure(long_text)

	    if not body_text:
	        # With an empty body the header already holds the entire input.
	        tqdm.write("[!] Could not separate the body text. Returning the full original text.")
	        return header_text

	    tqdm.write(f"[*] Summarizing the identified body text ({len(body_text)} characters)...")

	    try:
	        # Pass the flag down to the LLM creator
	        llm = create_llm(model, client, disable_qwen_thinking=disable_qwen_thinking)
	    except ValueError as e:
	        return f"Error: {e}"

	    tqdm.write("[*] Attempting high-speed 'stuff' summarization strategy for the body text...")
	    try:
	        body_summary = await _summarize_body_with_stuff(llm, body_text)
	        tqdm.write("[✓] 'Stuff' strategy for the body text was successful!")

	    except BadRequestError as e:
	        if not _is_context_length_error(e):
	            tqdm.write(f"[!] Unexpected API error with 'stuff' strategy: {e}")
	            return f"Error: API call failed - {e}"
	        tqdm.write("[!] Body text is too long for the 'stuff' strategy. Falling back to 'map_reduce'.")

	        # The whole fallback (splitting, chain construction and execution) is
	        # guarded so that a setup failure is reported as an "Error:" string
	        # like every other failure path instead of escaping as an exception.
	        try:
	            body_summary = await _summarize_body_with_map_reduce(llm, body_text)
	            tqdm.write("[✓] 'Map-Reduce' summarization for the body text is complete.")
	        except Exception as chain_error:
	            tqdm.write(f"[!] 'Map-Reduce' chain execution failed: {chain_error}")
	            return f"Error: 'Map-Reduce' summarization failed - {chain_error}"

	    except Exception as e:
	        tqdm.write(f"[!] An unknown error occurred during the summarization process: {e}")
	        return f"Error: Summarization failed - {e}"

	    return f"{header_text}\n\n[--- Body Summary ---]\n\n{body_summary}"


	def _is_context_length_error(error: Exception) -> bool:
	    """Return True if the error message indicates the context window was exceeded."""
	    message = str(error).lower()
	    # "context length" also covers messages saying "maximum context length".
	    markers = ("context_length_exceeded", "context length")
	    return any(marker in message for marker in markers)


	async def _summarize_body_with_stuff(llm, body_text: str) -> str:
	    """Summarize the whole body in one call using the 'stuff' chain type."""
	    stuff_prompt_template = """
	# INSTRUCTION
	You are a senior editor. Your task is to read the following body text of a research paper and synthesize it into a single, coherent, and detailed summary.
	This summary needs to cover all the essential aspects of the provided text.

	# PAPER BODY TEXT:
	---
	{text}
	---

	# YOUR DETAILED SYNTHESIZED SUMMARY:
	"""
	    stuff_prompt = PromptTemplate(template=stuff_prompt_template, input_variables=["text"])
	    stuff_chain = load_summarize_chain(llm, chain_type="stuff", prompt=stuff_prompt, verbose=True)

	    docs = [Document(page_content=body_text)]
	    # NOTE(review): `arun` is deprecated in newer LangChain releases in favor
	    # of `ainvoke` - confirm the pinned version before migrating.
	    return await stuff_chain.arun(docs)


	async def _summarize_body_with_map_reduce(llm, body_text: str) -> str:
	    """Summarize the body chunk by chunk, then combine the partial summaries."""
	    text_splitter = RecursiveCharacterTextSplitter(
	        chunk_size=30000,
	        chunk_overlap=3000,
	    )
	    docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(body_text)]
	    tqdm.write(f"[*] Body text has been split into {len(docs)} chunks for summarization.")

	    map_prompt_template = """
	# INSTRUCTION
	You are a research analyst. Your task is to read the following text segment from a scientific paper and generate a concise summary.
	Focus only on the most critical information: the research question, the proposed method, key results, and the main conclusion.
	The language must be refined and to the point.

	# TEXT SEGMENT:
	---
	{text}
	---

	# YOUR CONCISE SUMMARY:
	"""
	    map_prompt = PromptTemplate(template=map_prompt_template, input_variables=["text"])

	    combine_prompt_template = """
	# INSTRUCTION
	You are a senior editor. You have received several summaries extracted from different parts of the same research paper.
	Your task is to synthesize these summaries into a single, coherent final summary.

	# LIST OF SUMMARIES:
	---
	{text}
	---

	# YOUR SYNTHESIZED FINAL DETAILED SUMMARY:
	"""
	    combine_prompt = PromptTemplate(template=combine_prompt_template, input_variables=["text"])

	    map_reduce_chain = load_summarize_chain(
	        llm,
	        chain_type="map_reduce",
	        map_prompt=map_prompt,
	        combine_prompt=combine_prompt,
	        verbose=True,
	    )
	    return await map_reduce_chain.arun(docs)