Spaces:
Runtime error
Runtime error
PPTx Feature
Browse files
app.py
CHANGED
|
@@ -10,6 +10,12 @@ from cnocr import CnOcr
|
|
| 10 |
|
| 11 |
# from langchain.document_loaders import PyPDFLoader
|
| 12 |
from langchain.document_loaders import UnstructuredWordDocumentLoader
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
from sentence_transformers import SentenceTransformer, models, util
|
| 14 |
word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2', do_lower_case=True)
|
| 15 |
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='cls')
|
|
@@ -109,32 +115,54 @@ def get_response(msg, bot, doc_text_list, doc_embeddings):
|
|
| 109 |
|
| 110 |
def up_file(fls):
|
| 111 |
doc_text_list = []
|
| 112 |
-
|
| 113 |
|
|
|
|
| 114 |
for i in fls:
|
| 115 |
names.append(str(i.name))
|
|
|
|
| 116 |
|
| 117 |
-
|
| 118 |
docs = []
|
|
|
|
|
|
|
| 119 |
for i in names:
|
| 120 |
|
| 121 |
if(i[-3:] == "pdf"):
|
| 122 |
-
|
| 123 |
-
|
| 124 |
docs.append(i)
|
|
|
|
|
|
|
|
|
|
| 125 |
|
|
|
|
| 126 |
for i in docs:
|
| 127 |
-
loader =
|
| 128 |
data = loader.load()
|
| 129 |
content = str(data).split("'")
|
| 130 |
cnt = content[1]
|
| 131 |
-
c = cnt.split('\\n\\n')
|
| 132 |
-
final = "".join(c)
|
| 133 |
-
|
| 134 |
-
|
| 135 |
|
| 136 |
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
print("11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111")
|
| 139 |
#print(file.name)
|
| 140 |
with pdfplumber.open(file) as pdf:
|
|
|
|
| 10 |
|
| 11 |
# from langchain.document_loaders import PyPDFLoader
|
| 12 |
from langchain.document_loaders import UnstructuredWordDocumentLoader
|
| 13 |
+
from langchain.document_loaders import UnstructuredPowerPointLoader
|
| 14 |
+
from langchain.document_loaders.image import UnstructuredImageLoader
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
from sentence_transformers import SentenceTransformer, models, util
|
| 20 |
word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2', do_lower_case=True)
|
| 21 |
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='cls')
|
|
|
|
| 115 |
|
| 116 |
def up_file(fls):
|
| 117 |
doc_text_list = []
|
| 118 |
+
|
| 119 |
|
| 120 |
+
names = []
|
| 121 |
for i in fls:
|
| 122 |
names.append(str(i.name))
|
| 123 |
+
|
| 124 |
|
| 125 |
+
pdf = []
|
| 126 |
docs = []
|
| 127 |
+
pptx = []
|
| 128 |
+
|
| 129 |
for i in names:
|
| 130 |
|
| 131 |
if(i[-3:] == "pdf"):
|
| 132 |
+
pdf.append(i)
|
| 133 |
+
elif(i[-4:] == "docx"):
|
| 134 |
docs.append(i)
|
| 135 |
+
else:
|
| 136 |
+
pptx.append(i)
|
| 137 |
+
|
| 138 |
|
| 139 |
+
#pptx Extracting
|
| 140 |
for i in docs:
|
| 141 |
+
loader = UnstructuredPowerPointLoader(i)
|
| 142 |
data = loader.load()
|
| 143 |
content = str(data).split("'")
|
| 144 |
cnt = content[1]
|
| 145 |
+
# c = cnt.split('\\n\\n')
|
| 146 |
+
# final = "".join(c)
|
| 147 |
+
c = cnt.replace('\\n\\n',"").replace("<PAGE BREAK>","").replace("\t","")
|
| 148 |
+
doc_text_list.append(c)
|
| 149 |
|
| 150 |
|
| 151 |
+
|
| 152 |
+
#Doc Extracting
|
| 153 |
+
for i in docs:
|
| 154 |
+
loader = UnstructuredWordDocumentLoader(i)
|
| 155 |
+
data = loader.load()
|
| 156 |
+
content = str(data).split("'")
|
| 157 |
+
cnt = content[1]
|
| 158 |
+
# c = cnt.split('\\n\\n')
|
| 159 |
+
# final = "".join(c)
|
| 160 |
+
c = cnt.replace('\\n\\n',"").replace("<PAGE BREAK>","").replace("\t","")
|
| 161 |
+
doc_text_list.append(c)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
#Pdf Extracting
|
| 165 |
+
for idx, file in enumerate(pdf):
|
| 166 |
print("11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111")
|
| 167 |
#print(file.name)
|
| 168 |
with pdfplumber.open(file) as pdf:
|