jackycedar committed on
Commit
24545c3
·
1 Parent(s): 2a03a3b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -7
app.py CHANGED
@@ -23,6 +23,11 @@ subprocess.run(["pip", "install", "pdf2image"])
23
  import sys
24
  import os
25
 
 
 
 
 
 
26
  # os.environ["OPENAI_API_KEY"] = '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'
27
  os.environ["OPENAI_API_KEY"] = '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'
28
  # os.environ["OPENAI_API_KEY"] = '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' # from CHGPT
@@ -107,7 +112,7 @@ def extractScannedPDF(filePath, chainType):
107
  ocr_dict = pytesseract.image_to_data(pil_im, lang='eng', output_type=Output.DICT)
108
  text += " ".join(ocr_dict['text']) + "\n"
109
 
110
- folder_path = "/content/doc"
111
 
112
  print('Save to output2.txt')
113
  if not os.path.exists(folder_path):
@@ -154,7 +159,7 @@ def extractPDF(filePath, chainType):
154
  text += txt.extract_text() + "\n"
155
  print('Total No. of pages = ', counter)
156
 
157
- folder_path = "/content/doc"
158
 
159
  print('Save to output1.txt')
160
  if not os.path.exists(folder_path):
@@ -163,12 +168,12 @@ def extractPDF(filePath, chainType):
163
  else:
164
  print(f"Folder {folder_path} already exists.")
165
 
166
- with open('/content/doc/output1.txt', 'w') as f:
167
  f.write(text)
168
- with open("/content/doc/output1.txt") as f:
169
  docRead = f.read()
170
 
171
- documents = SimpleDirectoryReader('/content/doc/').load_data()
172
  index = GPTSimpleVectorIndex.from_documents(documents)
173
  index.save_to_disk('index1.json')
174
 
@@ -210,7 +215,7 @@ def on_token_change(user_token):
210
 
211
  def pdfv1(files, chainType):
212
 
213
- newPath = "/content/"
214
  new_name = "t1"
215
  # Separate file name and extension
216
  name, ext = os.path.splitext(files.name)
@@ -225,7 +230,7 @@ def pdfv1(files, chainType):
225
 
226
  def pdfv2(files, chainType):
227
 
228
- newPath = "/content/"
229
  new_name = "t2"
230
  # Separate file name and extension
231
  name, ext = os.path.splitext(files.name)
 
23
  import sys
24
  import os
25
 
26
+ # folder_path = "/content/doc"
27
+ home_path = "/home/user/app/"
28
+ folder_path = "/home/user/app/doc/"
29
+
30
+
31
  # os.environ["OPENAI_API_KEY"] = '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'
32
  os.environ["OPENAI_API_KEY"] = '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'
33
  # os.environ["OPENAI_API_KEY"] = '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' # from CHGPT
 
112
  ocr_dict = pytesseract.image_to_data(pil_im, lang='eng', output_type=Output.DICT)
113
  text += " ".join(ocr_dict['text']) + "\n"
114
 
115
+ # folder_path = "/content/doc"
116
 
117
  print('Save to output2.txt')
118
  if not os.path.exists(folder_path):
 
159
  text += txt.extract_text() + "\n"
160
  print('Total No. of pages = ', counter)
161
 
162
+ # folder_path = "/content/doc"
163
 
164
  print('Save to output1.txt')
165
  if not os.path.exists(folder_path):
 
168
  else:
169
  print(f"Folder {folder_path} already exists.")
170
 
171
+ with open(folder_path + 'output1.txt', 'w') as f:
172
  f.write(text)
173
+ with open(folder_path + 'output1.txt') as f:
174
  docRead = f.read()
175
 
176
+ documents = SimpleDirectoryReader(folder_path).load_data()
177
  index = GPTSimpleVectorIndex.from_documents(documents)
178
  index.save_to_disk('index1.json')
179
 
 
215
 
216
  def pdfv1(files, chainType):
217
 
218
+ newPath = home_path
219
  new_name = "t1"
220
  # Separate file name and extension
221
  name, ext = os.path.splitext(files.name)
 
230
 
231
  def pdfv2(files, chainType):
232
 
233
+ newPath = home_path
234
  new_name = "t2"
235
  # Separate file name and extension
236
  name, ext = os.path.splitext(files.name)