Spaces:
Runtime error
Runtime error
Commit ·
24545c3
1
Parent(s): 2a03a3b
Update app.py
Browse files
app.py
CHANGED
|
@@ -23,6 +23,11 @@ subprocess.run(["pip", "install", "pdf2image"])
|
|
| 23 |
import sys
|
| 24 |
import os
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
# os.environ["OPENAI_API_KEY"] = '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'
|
| 27 |
os.environ["OPENAI_API_KEY"] = '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'
|
| 28 |
# os.environ["OPENAI_API_KEY"] = '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' # from CHGPT
|
|
@@ -107,7 +112,7 @@ def extractScannedPDF(filePath, chainType):
|
|
| 107 |
ocr_dict = pytesseract.image_to_data(pil_im, lang='eng', output_type=Output.DICT)
|
| 108 |
text += " ".join(ocr_dict['text']) + "\n"
|
| 109 |
|
| 110 |
-
folder_path = "/content/doc"
|
| 111 |
|
| 112 |
print('Save to output2.txt')
|
| 113 |
if not os.path.exists(folder_path):
|
|
@@ -154,7 +159,7 @@ def extractPDF(filePath, chainType):
|
|
| 154 |
text += txt.extract_text() + "\n"
|
| 155 |
print('Total No. of pages = ', counter)
|
| 156 |
|
| 157 |
-
folder_path = "/content/doc"
|
| 158 |
|
| 159 |
print('Save to output1.txt')
|
| 160 |
if not os.path.exists(folder_path):
|
|
@@ -163,12 +168,12 @@ def extractPDF(filePath, chainType):
|
|
| 163 |
else:
|
| 164 |
print(f"Folder {folder_path} already exists.")
|
| 165 |
|
| 166 |
-
with open('
|
| 167 |
f.write(text)
|
| 168 |
-
with open(
|
| 169 |
docRead = f.read()
|
| 170 |
|
| 171 |
-
documents = SimpleDirectoryReader(
|
| 172 |
index = GPTSimpleVectorIndex.from_documents(documents)
|
| 173 |
index.save_to_disk('index1.json')
|
| 174 |
|
|
@@ -210,7 +215,7 @@ def on_token_change(user_token):
|
|
| 210 |
|
| 211 |
def pdfv1(files, chainType):
|
| 212 |
|
| 213 |
-
newPath =
|
| 214 |
new_name = "t1"
|
| 215 |
# Separate file name and extension
|
| 216 |
name, ext = os.path.splitext(files.name)
|
|
@@ -225,7 +230,7 @@ def pdfv1(files, chainType):
|
|
| 225 |
|
| 226 |
def pdfv2(files, chainType):
|
| 227 |
|
| 228 |
-
newPath =
|
| 229 |
new_name = "t2"
|
| 230 |
# Separate file name and extension
|
| 231 |
name, ext = os.path.splitext(files.name)
|
|
|
|
| 23 |
import sys
|
| 24 |
import os
|
| 25 |
|
| 26 |
+
# folder_path = "/content/doc"
|
| 27 |
+
home_path = "/home/user/app/"
|
| 28 |
+
folder_path = "/home/user/app/doc/"
|
| 29 |
+
|
| 30 |
+
|
| 31 |
# os.environ["OPENAI_API_KEY"] = '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'
|
| 32 |
os.environ["OPENAI_API_KEY"] = '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'
|
| 33 |
# os.environ["OPENAI_API_KEY"] = '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' # from CHGPT
|
|
|
|
| 112 |
ocr_dict = pytesseract.image_to_data(pil_im, lang='eng', output_type=Output.DICT)
|
| 113 |
text += " ".join(ocr_dict['text']) + "\n"
|
| 114 |
|
| 115 |
+
# folder_path = "/content/doc"
|
| 116 |
|
| 117 |
print('Save to output2.txt')
|
| 118 |
if not os.path.exists(folder_path):
|
|
|
|
| 159 |
text += txt.extract_text() + "\n"
|
| 160 |
print('Total No. of pages = ', counter)
|
| 161 |
|
| 162 |
+
# folder_path = "/content/doc"
|
| 163 |
|
| 164 |
print('Save to output1.txt')
|
| 165 |
if not os.path.exists(folder_path):
|
|
|
|
| 168 |
else:
|
| 169 |
print(f"Folder {folder_path} already exists.")
|
| 170 |
|
| 171 |
+
with open(folder_path + 'output1.txt', 'w') as f:
|
| 172 |
f.write(text)
|
| 173 |
+
with open(folder_path + 'output1.txt') as f:
|
| 174 |
docRead = f.read()
|
| 175 |
|
| 176 |
+
documents = SimpleDirectoryReader(folder_path).load_data()
|
| 177 |
index = GPTSimpleVectorIndex.from_documents(documents)
|
| 178 |
index.save_to_disk('index1.json')
|
| 179 |
|
|
|
|
| 215 |
|
| 216 |
def pdfv1(files, chainType):
|
| 217 |
|
| 218 |
+
newPath = home_path
|
| 219 |
new_name = "t1"
|
| 220 |
# Separate file name and extension
|
| 221 |
name, ext = os.path.splitext(files.name)
|
|
|
|
| 230 |
|
| 231 |
def pdfv2(files, chainType):
|
| 232 |
|
| 233 |
+
newPath = home_path
|
| 234 |
new_name = "t2"
|
| 235 |
# Separate file name and extension
|
| 236 |
name, ext = os.path.splitext(files.name)
|