from langchain_community.document_loaders import PyPDFLoader
from utils.utilities import count_num_tokens
from openai import OpenAI

client = OpenAI()

class Summarizer:
    """Summarize a PDF page by page, then merge the page summaries."""

    @staticmethod
    def summarize_the_pdf(
            file_dir: str,
            max_final_token: int,
            token_threshold: int,
            gpt_model: str,
            temperature: float,
            summarizer_llm_system_role: str,
            final_summarizer_llm_system_role: str,
            character_overlap: int
    ) -> str:
        docs = []
        docs.extend(PyPDFLoader(file_dir).load())
        print(f"Document length: {len(docs)} pages")
        # Budget each page summary so the concatenated page summaries fit
        # within the final summarization call.
        max_summarizer_output_token = int(
            max_final_token / len(docs)) - token_threshold
        # Fill the {} placeholder with the per-page token budget once, up
        # front (re-formatting the same string on every loop iteration was
        # redundant).
        summarizer_llm_system_role = summarizer_llm_system_role.format(
            max_summarizer_output_token)
        full_summary = ""
        counter = 1
        print("Generating the summary...")
        if len(docs) > 1:  # the document has more than one page
            for i in range(len(docs)):
                # NOTE: This part can be optimized with a proper text
                # splitter, e.g. LangChain's chunk_size and chunk_overlap
                # arguments (see the chunk_documents sketch below).
                if i == 0:  # first page: append the start of the next page
                    prompt = docs[i].page_content + \
                        docs[i+1].page_content[:character_overlap]
                elif i < len(docs) - 1:  # middle pages: overlap on both sides
                    prompt = docs[i-1].page_content[-character_overlap:] + \
                        docs[i].page_content + \
                        docs[i+1].page_content[:character_overlap]
                else:  # last page: prepend the end of the previous page
                    prompt = docs[i-1].page_content[-character_overlap:] + \
                        docs[i].page_content
                full_summary += Summarizer.get_llm_response(
                    gpt_model,
                    temperature,
                    summarizer_llm_system_role,
                    prompt=prompt
                )
                print(f"Page {counter} was summarized. ", end="")
                counter += 1
        else:  # a single-page document needs no per-page pass
            full_summary = docs[0].page_content
| print("\nFull summary token length:", count_num_tokens( | |
| full_summary, model=gpt_model)) | |
| final_summary = Summarizer.get_llm_response( | |
| gpt_model, | |
| temperature, | |
| final_summarizer_llm_system_role, | |
| prompt=full_summary | |
| ) | |
| return final_summary | |
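
    # A hedged sketch of the optimization suggested in the NOTE above,
    # assuming LangChain's RecursiveCharacterTextSplitter: its chunk_size /
    # chunk_overlap arguments replace the manual character_overlap slicing.
    # The method name and defaults are illustrative, not part of the
    # original class.
    @staticmethod
    def chunk_documents(docs, chunk_size: int = 2000, chunk_overlap: int = 200):
        from langchain_text_splitters import RecursiveCharacterTextSplitter
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,       # max characters per chunk
            chunk_overlap=chunk_overlap  # characters shared between chunks
        )
        # Returns Documents whose page_content already overlaps, so each
        # chunk can be summarized independently.
        return splitter.split_documents(docs)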
    @staticmethod
    def get_llm_response(gpt_model: str, temperature: float,
                         llm_system_role: str, prompt: str) -> str:
        """Send one system+user exchange to the chat model and return the reply."""
        response = client.chat.completions.create(
            model=gpt_model,
            messages=[
                {"role": "system", "content": llm_system_role},
                {"role": "user", "content": prompt}
            ],
            temperature=temperature,
        )
        return response.choices[0].message.content
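
# For reference, a minimal sketch of what the imported count_num_tokens
# helper plausibly does, assuming it wraps tiktoken; the name is suffixed
# with _sketch to avoid shadowing the real import, and the body is
# illustrative only.
def count_num_tokens_sketch(text: str, model: str) -> int:
    import tiktoken  # assumed dependency of the real helper
    return len(tiktoken.encoding_for_model(model).encode(text))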
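
# A hedged usage sketch. Assumptions: "docs/sample.pdf" is a hypothetical
# path, OPENAI_API_KEY is set in the environment, and both role strings
# below are illustrative, not the project's own prompts. Note the {}
# placeholder in the per-page role: summarize_the_pdf fills it with the
# per-page token budget.
if __name__ == "__main__":
    summary = Summarizer.summarize_the_pdf(
        file_dir="docs/sample.pdf",
        max_final_token=3000,
        token_threshold=100,
        gpt_model="gpt-3.5-turbo",
        temperature=0.0,
        summarizer_llm_system_role=(
            "Summarize the following page in at most {} tokens."
        ),
        final_summarizer_llm_system_role=(
            "Combine the page summaries below into one coherent summary."
        ),
        character_overlap=100,
    )
    print(summary)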