Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import PyPDFLoader | |
| from utils.utilities import count_num_tokens | |
| import openai | |
class Summarizer:
    """
    Summarize PDF documents with an OpenAI chat model.

    All methods are static, so the class acts as a namespace and does not
    need to be instantiated.

    Attributes:
        None

    Methods:
        summarize_the_pdf:
            Summarizes a PDF page by page and then condenses the page
            summaries into one final summary.
        get_llm_response:
            Sends a system role and a user prompt to the chat model and
            returns the text of the reply.

    Note: Ensure that the required dependencies are installed and configured,
    including the OpenAI API key.
    """

    @staticmethod
    def summarize_the_pdf(
        file_dir: str,
        max_final_token: int,
        token_threshold: int,
        gpt_model: str,
        temperature: float,
        summarizer_llm_system_role: str,
        final_summarizer_llm_system_role: str,
        character_overlap: int
    ) -> str:
        """
        Summarize the content of a PDF file.

        A multi-page document is summarized one page at a time (each page
        prompt is padded with a few characters of the neighbouring pages),
        the page summaries are concatenated, and the concatenation is sent
        through a final summarization pass. A single-page document skips the
        per-page step: its raw text goes straight to the final pass.

        Args:
            file_dir (str): The path to the PDF file.
            max_final_token (int): Token budget that is divided evenly
                between the pages to obtain the per-page summary budget.
            token_threshold (int): Number of tokens subtracted from each
                per-page budget as a safety margin.
            gpt_model (str): The model name passed to the chat endpoint.
            temperature (float): Sampling temperature for the model.
            summarizer_llm_system_role (str): System role used for the
                per-page summaries. It is passed through ``str.format`` with
                the per-page token budget as the single positional argument,
                so it may contain one ``{}`` placeholder.
            final_summarizer_llm_system_role (str): System role used for the
                final pass over the concatenated page summaries.
            character_overlap (int): Number of characters borrowed from the
                end of the previous page and the start of the next page when
                building each page prompt. Values <= 0 disable the overlap.

        Returns:
            str: The final summarized content.

        Raises:
            ValueError: If no pages could be loaded from ``file_dir``.
        """
        docs = list(PyPDFLoader(file_dir).load())
        print(f"Document length: {len(docs)}")
        if not docs:
            # Fail with a clear message instead of the ZeroDivisionError the
            # budget computation below would otherwise raise.
            raise ValueError(f"No pages could be loaded from '{file_dir}'.")

        max_summarizer_output_token = int(
            max_final_token / len(docs)) - token_threshold
        page_texts = [doc.page_content for doc in docs]
        print("Generating the summary..")

        if len(page_texts) > 1:
            # Fill the template exactly once: formatting the already
            # formatted string again on every page is redundant and raises
            # if the template contains escaped braces.
            page_system_role = summarizer_llm_system_role.format(
                max_summarizer_output_token)
            page_prompts = Summarizer._build_page_prompts(
                page_texts, character_overlap)
            page_summaries = []
            for page_number, prompt in enumerate(page_prompts, start=1):
                page_summaries.append(
                    Summarizer.get_llm_response(
                        gpt_model,
                        temperature,
                        page_system_role,
                        prompt=prompt
                    )
                )
                print(f"Page {page_number} was summarized. ", end="")
            full_summary = "".join(page_summaries)
        else:
            # Only one page: hand its raw text to the final pass.
            full_summary = page_texts[0]
            print("Page 1 was summarized. ", end="")

        print("\nFull summary token length:", count_num_tokens(
            full_summary, model=gpt_model))
        final_summary = Summarizer.get_llm_response(
            gpt_model,
            temperature,
            final_summarizer_llm_system_role,
            prompt=full_summary
        )
        return final_summary

    @staticmethod
    def _build_page_prompts(page_texts: list, character_overlap: int) -> list:
        """
        Build one prompt per page, padded with text from adjacent pages.

        Args:
            page_texts (list): The text of every page, in document order.
            character_overlap (int): Number of characters to take from the
                end of the previous page and from the start of the next
                page. Values <= 0 add no overlap.

        Returns:
            list: One prompt string per entry of ``page_texts``.
        """
        # A slice such as text[-0:] returns the whole string, so the overlap
        # must be skipped explicitly when it is not positive.
        use_overlap = character_overlap > 0
        last_index = len(page_texts) - 1
        prompts = []
        for index, text in enumerate(page_texts):
            before = ""
            after = ""
            if use_overlap and index > 0:
                before = page_texts[index - 1][-character_overlap:]
            if use_overlap and index < last_index:
                after = page_texts[index + 1][:character_overlap]
            prompts.append(before + text + after)
        return prompts

    @staticmethod
    def get_llm_response(gpt_model: str, temperature: float, llm_system_role: str, prompt: str) -> str:
        """
        Retrieve the chat model's response for a given prompt.

        Args:
            gpt_model (str): The model name, passed as ``engine``.
            temperature (float): Sampling temperature for the model.
            llm_system_role (str): Content of the system message.
            prompt (str): Content of the user message.

        Returns:
            str: The content of the first choice in the model's response.
        """
        # NOTE(review): `openai.ChatCompletion` with the `engine` keyword is
        # the pre-1.0 SDK interface - confirm the pinned openai version
        # before upgrading the dependency.
        response = openai.ChatCompletion.create(
            engine=gpt_model,
            messages=[
                {"role": "system", "content": llm_system_role},
                {"role": "user", "content": prompt}
            ],
            temperature=temperature,
        )
        return response.choices[0].message.content