Spaces:

DrSyedFaizan
/

First_Aid_Assistant

Sleeping

App Files Files Community

First_Aid_Assistant / src /utils /summarizer.py

DrSyedFaizan

Upload folder using huggingface_hub

f8bf7df verified 12 months ago

raw

history blame contribute delete

4.67 kB


	from langchain_community.document_loaders import PyPDFLoader
	from utils.utilities import count_num_tokens
	import openai


	class Summarizer:
	    """
	    Summarize PDF documents with an OpenAI chat model.

	    Attributes:
	        None

	    Methods:
	        summarize_the_pdf:
	            Summarizes every page of a PDF and then condenses the page
	            summaries into one final summary.

	        get_llm_response:
	            Sends a system role plus a user prompt to the chat model and
	            returns the text of the reply.

	    Note: Ensure that you have the required dependencies installed and
	    configured, including the OpenAI API key.
	    """

	    @staticmethod
	    def summarize_the_pdf(
	        file_dir: str,
	        max_final_token: int,
	        token_threshold: int,
	        gpt_model: str,
	        temperature: float,
	        summarizer_llm_system_role: str,
	        final_summarizer_llm_system_role: str,
	        character_overlap: int
	    ):
	        """
	        Summarize the content of a PDF file.

	        Multi-page documents are summarized page by page (each page prompt is
	        padded with a few characters of its neighbours), the page summaries
	        are concatenated, and the result is summarized once more. A
	        single-page document is sent straight to the final summarization
	        step.

	        Args:
	            file_dir (str): The path to the PDF file.
	            max_final_token (int): The maximum number of tokens in the final summary.
	            token_threshold (int): Safety margin subtracted from each page's token budget.
	            gpt_model (str): The chat model (engine) name.
	            temperature (float): The sampling temperature for response generation.
	            summarizer_llm_system_role (str): System role template for the
	                per-page summarizer; it is formatted with the per-page token
	                budget as its single positional argument.
	            final_summarizer_llm_system_role (str): System role for the final
	                summarization pass.
	            character_overlap (int): Number of characters borrowed from the
	                previous and next page when building each page prompt.

	        Returns:
	            str: The final summarized content.

	        Raises:
	            ValueError: If no pages could be loaded from the PDF.
	        """
	        docs = list(PyPDFLoader(file_dir).load())
	        print(f"Document length: {len(docs)}")
	        if not docs:
	            # Without this guard the budget computation below fails with an
	            # opaque ZeroDivisionError.
	            raise ValueError(f"No pages could be loaded from '{file_dir}'.")

	        page_texts = [doc.page_content for doc in docs]
	        # Split the final budget evenly across pages, keeping a safety margin.
	        max_summarizer_output_token = int(
	            max_final_token / len(page_texts)) - token_threshold

	        print("Generating the summary..")
	        if len(page_texts) > 1:
	            # Format the template exactly once: re-formatting its own output
	            # on every iteration is redundant and breaks on escaped braces.
	            page_system_role = summarizer_llm_system_role.format(
	                max_summarizer_output_token)
	            # NOTE: This part can be optimized by considering a better
	            # technique for creating the prompt (e.g. langchain "chunksize"
	            # and "chunkoverlap" arguments).
	            page_prompts = Summarizer._build_page_prompts(
	                page_texts, character_overlap)
	            page_summaries = []
	            for page_number, prompt in enumerate(page_prompts, start=1):
	                page_summaries.append(
	                    Summarizer.get_llm_response(
	                        gpt_model,
	                        temperature,
	                        page_system_role,
	                        prompt=prompt
	                    )
	                )
	                print(f"Page {page_number} was summarized. ", end="")
	            full_summary = "".join(page_summaries)
	        else:
	            # A single page needs no intermediate summaries.
	            full_summary = page_texts[0]

	        print("\nFull summary token length:", count_num_tokens(
	            full_summary, model=gpt_model))
	        return Summarizer.get_llm_response(
	            gpt_model,
	            temperature,
	            final_summarizer_llm_system_role,
	            prompt=full_summary
	        )

	    @staticmethod
	    def _build_page_prompts(page_texts: list, character_overlap: int) -> list:
	        """
	        Build one prompt per page, padded with text from neighbouring pages.

	        Args:
	            page_texts (list): The text of each page, in document order.
	            character_overlap (int): Number of characters taken from the end
	                of the previous page and from the start of the next page.
	                Values below zero are treated as zero.

	        Returns:
	            list: One prompt string per entry of ``page_texts``.
	        """
	        overlap = max(character_overlap, 0)
	        last_index = len(page_texts) - 1
	        prompts = []
	        for index, text in enumerate(page_texts):
	            # ``s[-0:]`` is the whole string, so a zero overlap must be
	            # special-cased for the leading context.
	            if index > 0 and overlap:
	                leading = page_texts[index - 1][-overlap:]
	            else:
	                leading = ""
	            if index < last_index:
	                trailing = page_texts[index + 1][:overlap]
	            else:
	                trailing = ""
	            prompts.append(leading + text + trailing)
	        return prompts

	    @staticmethod
	    def get_llm_response(gpt_model: str, temperature: float, llm_system_role: str, prompt: str):
	        """
	        Retrieve the chat model's response for a given prompt.

	        Args:
	            gpt_model (str): The chat model (engine) name.
	            temperature (float): The sampling temperature for response generation.
	            llm_system_role (str): The content of the system message.
	            prompt (str): The content of the user message.

	        Returns:
	            str: The content of the first choice in the model's response.
	        """
	        # NOTE(review): ``openai.ChatCompletion`` and the ``engine`` keyword
	        # look like the pre-1.0 openai SDK interface -- confirm the pinned
	        # SDK version before upgrading the dependency.
	        response = openai.ChatCompletion.create(
	            engine=gpt_model,
	            messages=[
	                {"role": "system", "content": llm_system_role},
	                {"role": "user", "content": prompt}
	            ],
	            temperature=temperature,
	        )
	        return response.choices[0].message.content