from langchain_community.document_loaders import PyPDFLoader
from utils.utilities import count_num_tokens
import openai


class Summarizer:
    """
    Summarize PDF documents using OpenAI's chat-completion engine.

    The class is a stateless namespace: both methods are ``@staticmethod``
    and hold no instance state.

    Methods:
        summarize_the_pdf: Summarize the content of a PDF file page by page,
            then condense the per-page summaries into one final summary.
        get_llm_response: Send one system-role + user prompt pair to the
            chat engine and return the response text.

    Note:
        Requires the ``langchain_community`` and ``openai`` dependencies and
        a configured OpenAI API key.
    """

    @staticmethod
    def summarize_the_pdf(
        file_dir: str,
        max_final_token: int,
        token_threshold: int,
        gpt_model: str,
        temperature: float,
        summarizer_llm_system_role: str,
        final_summarizer_llm_system_role: str,
        character_overlap: int
    ) -> str:
        """
        Summarize the content of a PDF file using OpenAI's chat engine.

        Each page is summarized with a slice of neighboring-page text
        prepended/appended (``character_overlap`` characters) so summaries
        do not lose context at page boundaries. The concatenated per-page
        summaries are then condensed by a final LLM call.

        Args:
            file_dir (str): The path to the PDF file.
            max_final_token (int): The maximum number of tokens in the
                final summary; divided across pages to budget each
                per-page summary.
            token_threshold (int): Safety margin subtracted from each
                page's token budget.
            gpt_model (str): The chat engine model (deployment) name.
            temperature (float): Sampling temperature for response
                generation.
            summarizer_llm_system_role (str): System role for the per-page
                summarizer; must contain one ``{}`` placeholder for the
                per-page token budget.
            final_summarizer_llm_system_role (str): System role for the
                final summarization pass.
            character_overlap (int): Number of characters shared with each
                neighboring page when building a page's prompt.

        Returns:
            str: The final summarized content.
        """
        docs = []
        docs.extend(PyPDFLoader(file_dir).load())
        print(f"Document length: {len(docs)}")
        # Per-page output budget so the concatenated page summaries stay
        # within the final budget, minus a safety threshold.
        max_summarizer_output_token = int(
            max_final_token / len(docs)) - token_threshold
        full_summary = ""
        counter = 1
        print("Generating the summary..")
        # If the document has more than one page, summarize page by page.
        if len(docs) > 1:
            # Fill the token budget into the system role ONCE before the
            # loop: the original re-ran .format() every iteration, which is
            # a no-op after the first pass (the placeholder is consumed).
            summarizer_llm_system_role = summarizer_llm_system_role.format(
                max_summarizer_output_token)
            for i in range(len(docs)):
                # NOTE: This prompt construction could be replaced by
                # langchain text splitters ("chunk_size"/"chunk_overlap").
                if i == 0:
                    # First page: overlap with the next page only.
                    prompt = docs[i].page_content + \
                        docs[i + 1].page_content[:character_overlap]
                elif i < len(docs) - 1:
                    # Middle pages: overlap with both neighbors.
                    prompt = docs[i - 1].page_content[-character_overlap:] + \
                        docs[i].page_content + \
                        docs[i + 1].page_content[:character_overlap]
                else:
                    # Last page: overlap with the previous page only.
                    prompt = docs[i - 1].page_content[-character_overlap:] + \
                        docs[i].page_content
                full_summary += Summarizer.get_llm_response(
                    gpt_model,
                    temperature,
                    summarizer_llm_system_role,
                    prompt=prompt
                )
                # BUGFIX: progress report belongs inside the loop so every
                # page is announced and `counter` actually advances; outside
                # the loop it would fire once and always print "Page 1".
                print(f"Page {counter} was summarized. ", end="")
                counter += 1
        else:
            # Single-page document: use the page text directly; only the
            # final summarization pass below is applied.
            full_summary = docs[0].page_content
        print("\nFull summary token length:", count_num_tokens(
            full_summary, model=gpt_model))
        final_summary = Summarizer.get_llm_response(
            gpt_model,
            temperature,
            final_summarizer_llm_system_role,
            prompt=full_summary
        )
        return final_summary

    @staticmethod
    def get_llm_response(gpt_model: str, temperature: float,
                         llm_system_role: str, prompt: str) -> str:
        """
        Retrieve a single chat-completion response for a given prompt.

        Args:
            gpt_model (str): The chat engine model (deployment) name.
            temperature (float): Sampling temperature for response
                generation.
            llm_system_role (str): The system-role message content.
            prompt (str): The user message content.

        Returns:
            str: The response content from the chat engine.
        """
        # NOTE(review): uses the legacy pre-1.0 openai SDK surface with the
        # Azure-style `engine=` argument; kept as-is because upgrading to
        # the 1.x client would change the module's runtime requirements.
        response = openai.ChatCompletion.create(
            engine=gpt_model,
            messages=[
                {"role": "system", "content": llm_system_role},
                {"role": "user", "content": prompt}
            ],
            temperature=temperature,
        )
        return response.choices[0].message.content