sid-0313 committed on
Commit
48eace2
·
verified ·
1 Parent(s): 6e1d6e0

Create summarizer.py

Browse files
Files changed (1) hide show
  1. src/utils/summarizer.py +76 -0
src/utils/summarizer.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFLoader
2
+ from utils.utilities import count_num_tokens
3
+ from openai import OpenAI
4
+
5
+ client = OpenAI()
6
+
class Summarizer:
    """Summarize a PDF with an LLM.

    Each page is summarized individually (with some character overlap
    between neighboring pages for context), then the concatenated page
    summaries are condensed into one final summary.
    """

    @staticmethod
    def summarize_the_pdf(
        file_dir: str,
        max_final_token: int,
        token_threshold: int,
        gpt_model: str,
        temperature: float,
        summarizer_llm_system_role: str,
        final_summarizer_llm_system_role: str,
        character_overlap: int
    ) -> str:
        """Summarize the PDF at `file_dir` and return the final summary text.

        Args:
            file_dir: Path to the PDF file to summarize.
            max_final_token: Target token budget for the combined page summaries.
            token_threshold: Per-page safety margin subtracted from the budget.
            gpt_model: Chat model name passed to the OpenAI client.
            temperature: Sampling temperature for both summarization passes.
            summarizer_llm_system_role: System prompt template for per-page
                summaries; must contain one positional `{}` placeholder for
                the per-page token budget.
            final_summarizer_llm_system_role: System prompt for the final
                condensing pass.
            character_overlap: Number of characters shared with adjacent
                pages when building each page's prompt.

        Returns:
            The final summary produced by the LLM.

        Raises:
            ValueError: If no pages could be loaded from `file_dir`.
        """
        docs = []
        docs.extend(PyPDFLoader(file_dir).load())
        # Fix: guard the division below — an empty/unreadable PDF would
        # otherwise raise ZeroDivisionError.
        if not docs:
            raise ValueError(f"No pages could be loaded from: {file_dir}")
        print(f"Document length: {len(docs)}")
        # Per-page output budget so that the concatenated page summaries
        # stay near max_final_token overall.
        max_summarizer_output_token = int(
            max_final_token / len(docs)) - token_threshold
        full_summary = ""
        print("Generating the summary..")
        # Fix: format the system role ONCE, outside the loop. The original
        # reassigned the parameter on every iteration, so the `{}`
        # placeholder was consumed after the first pass and later calls
        # formatted an already-formatted string.
        page_system_role = summarizer_llm_system_role.format(
            max_summarizer_output_token)
        # If the document has more than one page, summarize page by page.
        if len(docs) > 1:
            # NOTE: prompt construction could be improved with langchain's
            # text splitter ("chunk_size" / "chunk_overlap" arguments).
            for i in range(len(docs)):
                if i == 0:  # first page: overlap with the next page only
                    prompt = docs[i].page_content + \
                        docs[i + 1].page_content[:character_overlap]
                elif i < len(docs) - 1:  # middle pages: overlap both sides
                    prompt = docs[i - 1].page_content[-character_overlap:] + \
                        docs[i].page_content + \
                        docs[i + 1].page_content[:character_overlap]
                else:  # last page: overlap with the previous page only
                    prompt = docs[i - 1].page_content[-character_overlap:] + \
                        docs[i].page_content
                full_summary += Summarizer.get_llm_response(
                    gpt_model,
                    temperature,
                    page_system_role,
                    prompt=prompt
                )
                # Fix: the progress report belongs inside the loop so every
                # page is reported; the original printed only once (always
                # "Page 1"). The unused `counter` local is replaced by i+1.
                print(f"Page {i + 1} was summarized. ", end="")
        else:
            # Single-page document: the page itself serves as the "full
            # summary" fed to the final condensing pass.
            full_summary = docs[0].page_content
            print("Page 1 was summarized. ", end="")
        print("\nFull summary token length:", count_num_tokens(
            full_summary, model=gpt_model))
        final_summary = Summarizer.get_llm_response(
            gpt_model,
            temperature,
            final_summarizer_llm_system_role,
            prompt=full_summary
        )
        return final_summary

    @staticmethod
    def get_llm_response(gpt_model: str, temperature: float, llm_system_role: str, prompt: str) -> str:
        """Send one system+user chat-completion request and return the
        assistant's reply text.

        Args:
            gpt_model: Chat model name.
            temperature: Sampling temperature.
            llm_system_role: Content of the system message.
            prompt: Content of the user message.

        Returns:
            The text content of the first completion choice.
        """
        response = client.chat.completions.create(
            model=gpt_model,
            messages=[
                {"role": "system", "content": llm_system_role},
                {"role": "user", "content": prompt}
            ],
            temperature=temperature,
        )
        return response.choices[0].message.content