Spaces:

sid-0313
/

rag-based-search

No application file

App Files Files Community

rag-based-search / src /utils /summarizer.py

sid-0313

Create summarizer.py

48eace2 verified almost 2 years ago

raw

history blame contribute delete

2.98 kB

	from langchain_community.document_loaders import PyPDFLoader
	from utils.utilities import count_num_tokens
	from openai import OpenAI

	# Module-level OpenAI client, created once at import time and used by
	# Summarizer.get_llm_response for every chat-completion request.
	# NOTE(review): it is constructed with no arguments, so credentials are
	# presumably taken from the environment (e.g. OPENAI_API_KEY) -- confirm.
	client = OpenAI()

	class Summarizer:
	    """Summarize a PDF page by page, then condense the result with an LLM.

	    All methods are static; the class only acts as a namespace.
	    """

	    @staticmethod
	    def summarize_the_pdf(
	        file_dir: str,
	        max_final_token: int,
	        token_threshold: int,
	        gpt_model: str,
	        temperature: float,
	        summarizer_llm_system_role: str,
	        final_summarizer_llm_system_role: str,
	        character_overlap: int
	    ):
	        """Load a PDF and return an LLM-generated summary of it.

	        The entries returned by ``PyPDFLoader(file_dir).load()`` are treated
	        as pages. When there is more than one page, every page is summarized
	        on its own (with some text from the neighbouring pages attached for
	        context) and the concatenated page summaries are summarized once
	        more. A single-page document skips the per-page step and its text is
	        sent directly to the final summarization call.

	        Args:
	            file_dir: Path of the PDF file handed to ``PyPDFLoader``.
	            max_final_token: Token budget that is divided evenly between the
	                pages to derive the per-page summary budget.
	            token_threshold: Safety margin subtracted from each per-page
	                budget.
	            gpt_model: Model name passed to the chat completion call and to
	                ``count_num_tokens``.
	            temperature: Sampling temperature passed to the chat completion
	                call.
	            summarizer_llm_system_role: System prompt template for the
	                per-page summaries. It is formatted with one positional
	                argument: the per-page token budget.
	            final_summarizer_llm_system_role: System prompt for the final
	                summarization call (used as-is).
	            character_overlap: Number of characters borrowed from the end of
	                the previous page and from the start of the next page when
	                building each page prompt. Values ``<= 0`` disable the
	                overlap.

	        Returns:
	            Whatever ``get_llm_response`` returns for the final call, i.e.
	            the message content of the first completion choice.

	        Raises:
	            ValueError: If the loader returns no pages for ``file_dir``.
	        """
	        docs = list(PyPDFLoader(file_dir).load())
	        print(f"Document length: {len(docs)}")
	        if not docs:
	            # Without this guard the budget computation below would fail
	            # with an opaque ZeroDivisionError.
	            raise ValueError(f"No pages could be loaded from {file_dir!r}.")

	        # NOTE(review): this budget becomes zero or negative when the PDF has
	        # many pages relative to max_final_token; it is inserted into the
	        # prompt unchanged.
	        max_summarizer_output_token = int(
	            max_final_token / len(docs)) - token_threshold

	        print("Generating the summary..")
	        if len(docs) > 1:
	            # Format the template exactly once. Re-formatting the already
	            # formatted string on every iteration is at best a no-op and
	            # fails if the template contains escaped braces.
	            page_system_role = summarizer_llm_system_role.format(
	                max_summarizer_output_token)
	            # NOTE: This part can be optimized by considering a better
	            # technique for creating the prompt (e.g. langchain "chunksize"
	            # and "chunkoverlap" arguments).
	            page_prompts = Summarizer._build_page_prompts(
	                [doc.page_content for doc in docs], character_overlap)

	            page_summaries = []
	            for page_number, prompt in enumerate(page_prompts, start=1):
	                page_summary = Summarizer.get_llm_response(
	                    gpt_model,
	                    temperature,
	                    page_system_role,
	                    prompt=prompt
	                )
	                # The completion content may be None; treat that as an
	                # empty summary so the join below cannot fail.
	                page_summaries.append(page_summary or "")
	                print(f"Page {page_number} was summarized. ", end="")
	            full_summary = "".join(page_summaries)
	        else:
	            # A single page is passed verbatim to the final summarizer.
	            full_summary = docs[0].page_content

	        # The leading newline terminates the per-page progress line above.
	        print("\nFull summary token length:", count_num_tokens(
	            full_summary, model=gpt_model))
	        final_summary = Summarizer.get_llm_response(
	            gpt_model,
	            temperature,
	            final_summarizer_llm_system_role,
	            prompt=full_summary
	        )
	        return final_summary

	    @staticmethod
	    def _build_page_prompts(page_texts: list, character_overlap: int) -> list:
	        """Return one prompt per page, padded with neighbouring-page text.

	        Each prompt is the last ``character_overlap`` characters of the
	        previous page (if any), the page itself, and the first
	        ``character_overlap`` characters of the next page (if any).

	        Args:
	            page_texts: Page texts (strings) in document order.
	            character_overlap: Characters to borrow from each neighbour;
	                values ``<= 0`` mean no overlap.

	        Returns:
	            A list of prompt strings with the same length as ``page_texts``.
	        """
	        overlap = max(character_overlap, 0)
	        last_index = len(page_texts) - 1
	        prompts = []
	        for index, text in enumerate(page_texts):
	            # Guard on ``overlap``: text[-0:] is the WHOLE string, so a zero
	            # overlap must not go through the negative slice.
	            if overlap and index > 0:
	                leading = page_texts[index - 1][-overlap:]
	            else:
	                leading = ""
	            if index < last_index:
	                trailing = page_texts[index + 1][:overlap]
	            else:
	                trailing = ""
	            prompts.append(leading + text + trailing)
	        return prompts

	    @staticmethod
	    def get_llm_response(gpt_model: str, temperature: float, llm_system_role: str, prompt: str):
	        """Send one system + user message pair to the chat model.

	        Args:
	            gpt_model: Model name passed as ``model``.
	            temperature: Sampling temperature passed as ``temperature``.
	            llm_system_role: Content of the ``system`` message.
	            prompt: Content of the ``user`` message.

	        Returns:
	            The message content of the first choice in the response.
	        """
	        response = client.chat.completions.create(
	            model=gpt_model,
	            messages=[
	                {"role": "system", "content": llm_system_role},
	                {"role": "user", "content": prompt}
	            ],
	            temperature=temperature,
	        )
	        return response.choices[0].message.content