juniorjukeko committed on
Commit
990b17a
·
1 Parent(s): 02f3795

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -4
app.py CHANGED
@@ -1,6 +1,9 @@
1
  import os
 
2
  from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
 
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
4
  from langchain.chat_models import ChatOpenAI
5
  from langchain.llms import OpenAI
6
  from langchain import PromptTemplate
@@ -53,13 +56,38 @@ model_list = {'gpt-3.5-turbo':'chat',
53
 
54
  text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=250)
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  def summarize_pdf(pdf_file, api_key,
57
  model_name, temperature, llm_max_tokens,
58
  custom_map_prompt, custom_combine_prompt):
59
- global pdf_docs
60
  # Read PDF
61
- loader = OnlinePDFLoader(pdf_file.name)
62
- pdf_docs = loader.load_and_split(text_splitter)
 
63
  file_check(pdf_file)
64
 
65
  # Build LLM Model
@@ -94,7 +122,7 @@ def summarize_pdf(pdf_file, api_key,
94
  def file_check(pdf_file):
95
  if os.path.getsize(pdf_file.name)/1024 **2 > 1:
96
  raise gr.Error("Maximum File Size is 1MB!")
97
- elif len(pdf_docs) > 15:
98
  raise gr.Error("Maximum File Length is 15 Pages!")
99
  else:
100
  pass
 
1
  import os
2
+ from pypdf import PdfReader
3
  from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
4
+ from langchain.docstore.document import Document
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+
7
  from langchain.chat_models import ChatOpenAI
8
  from langchain.llms import OpenAI
9
  from langchain import PromptTemplate
 
56
 
57
  text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=250)
58
 
59
def parse_pdf(file_path):
    """Extract the text of every page of a PDF.

    Args:
        file_path: Path (or file-like object) accepted by ``pypdf.PdfReader``.

    Returns:
        tuple[list[str], int]: one text string per page (empty string for
        pages with no extractable text, e.g. scanned images), and the
        total page count.
    """
    pdf = PdfReader(file_path)
    # ``extract_text()`` may return None/"" for image-only pages; normalize
    # to "" so downstream Document construction never receives None.
    # (Debug ``print(file_path)`` from the original removed.)
    pages_text = [(page.extract_text() or "") for page in pdf.pages]
    return pages_text, len(pdf.pages)
69
+
70
def preprocess_pdf_text(list_of_text):
    """Split raw per-page text into small Document sections.

    Args:
        list_of_text: one string per PDF page.

    Returns:
        list[Document]: all sections from all pages, in page order,
        each at most 250 characters (50-char overlap).
    """
    # NOTE: intentionally uses a finer-grained splitter than the
    # module-level ``text_splitter`` (250 vs 10000 chunk size).
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n"], chunk_size=250, chunk_overlap=50
    )

    doc_sections = []
    for page_text in list_of_text:
        page_doc = Document(page_content=page_text)
        for section_text in splitter.split_text(page_doc.page_content):
            doc_sections.append(Document(page_content=section_text))
    return doc_sections
83
  def summarize_pdf(pdf_file, api_key,
84
  model_name, temperature, llm_max_tokens,
85
  custom_map_prompt, custom_combine_prompt):
86
+ global page_num
87
  # Read PDF
88
+ pdf_txt, page_num = parse_pdf(pdf_file)
89
+ pdf_doc = preprocess_pdf_text(pdf_txt)
90
+
91
  file_check(pdf_file)
92
 
93
  # Build LLM Model
 
122
def file_check(pdf_file, num_pages=None):
    """Validate an uploaded PDF's size and page count.

    Args:
        pdf_file: object exposing a ``.name`` filesystem path
            (e.g. a Gradio file wrapper).
        num_pages: page count to check. When omitted, falls back to the
            module-global ``page_num`` set by ``summarize_pdf`` — this
            preserves the original implicit-global contract.

    Raises:
        gr.Error: if the file exceeds 1 MB or 15 pages.
    """
    # ``**`` binds tighter than ``/``: this is size_bytes / (1024**2) == MB.
    if os.path.getsize(pdf_file.name) / 1024 ** 2 > 1:
        raise gr.Error("Maximum File Size is 1MB!")
    if num_pages is None:
        # Look the global up lazily so the size check above still works
        # even before summarize_pdf() has parsed a PDF.
        num_pages = page_num
    if num_pages > 15:
        raise gr.Error("Maximum File Length is 15 Pages!")