Spaces:

jackycedar
/

pdfs

Runtime error

App Files Files Community

jackycedar committed on Apr 1, 2023

Commit

24545c3

1 Parent(s): 2a03a3b

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -7

app.py CHANGED Viewed

@@ -23,6 +23,11 @@ subprocess.run(["pip", "install", "pdf2image"])
 import sys
 import os
 # os.environ["OPENAI_API_KEY"] = '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'
 os.environ["OPENAI_API_KEY"] = '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'
 # os.environ["OPENAI_API_KEY"] = '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'      # from CHGPT
@@ -107,7 +112,7 @@ def extractScannedPDF(filePath, chainType):
         ocr_dict = pytesseract.image_to_data(pil_im, lang='eng', output_type=Output.DICT)
         text += " ".join(ocr_dict['text']) + "\n"
-    folder_path = "/content/doc"
     print('Save to output2.txt')
     if not os.path.exists(folder_path):
@@ -154,7 +159,7 @@ def extractPDF(filePath, chainType):
     text += txt.extract_text() + "\n"
   print('Total No. of pages = ', counter)
-  folder_path = "/content/doc"
   print('Save to output1.txt')
   if not os.path.exists(folder_path):
@@ -163,12 +168,12 @@ def extractPDF(filePath, chainType):
   else:
       print(f"Folder {folder_path} already exists.")
-  with open('/content/doc/output1.txt', 'w') as f:
       f.write(text)
-  with open("/content/doc/output1.txt") as f:
       docRead = f.read()
-  documents = SimpleDirectoryReader('/content/doc/').load_data()
   index = GPTSimpleVectorIndex.from_documents(documents)
   index.save_to_disk('index1.json')
@@ -210,7 +215,7 @@ def on_token_change(user_token):
 def pdfv1(files, chainType):
-    newPath = "/content/"
     new_name = "t1"
     # Separate file name and extension
     name, ext = os.path.splitext(files.name)
@@ -225,7 +230,7 @@ def pdfv1(files, chainType):
 def pdfv2(files, chainType):
-    newPath = "/content/"
     new_name = "t2"
     # Separate file name and extension
     name, ext = os.path.splitext(files.name)

 import sys
 import os
+# folder_path = "/content/doc"
+home_path = "/home/user/app/"
+folder_path = "/home/user/app/doc/"
 # os.environ["OPENAI_API_KEY"] = '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'
 os.environ["OPENAI_API_KEY"] = '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'
 # os.environ["OPENAI_API_KEY"] = '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'      # from CHGPT
         ocr_dict = pytesseract.image_to_data(pil_im, lang='eng', output_type=Output.DICT)
         text += " ".join(ocr_dict['text']) + "\n"
+    # folder_path = "/content/doc"
     print('Save to output2.txt')
     if not os.path.exists(folder_path):
     text += txt.extract_text() + "\n"
   print('Total No. of pages = ', counter)
+  # folder_path = "/content/doc"
   print('Save to output1.txt')
   if not os.path.exists(folder_path):
   else:
       print(f"Folder {folder_path} already exists.")
+  with open(folder_path + 'output1.txt', 'w') as f:
       f.write(text)
+  with open(folder_path + 'output1.txt') as f:
       docRead = f.read()
+  documents = SimpleDirectoryReader(folder_path).load_data()
   index = GPTSimpleVectorIndex.from_documents(documents)
   index.save_to_disk('index1.json')
 def pdfv1(files, chainType):
+    newPath = home_path
     new_name = "t1"
     # Separate file name and extension
     name, ext = os.path.splitext(files.name)
 def pdfv2(files, chainType):
+    newPath = home_path
     new_name = "t2"
     # Separate file name and extension
     name, ext = os.path.splitext(files.name)