Raghav001 committed on
Commit
7558fdd
·
1 Parent(s): 8bb5a1a

PPTx Feature

Browse files
Files changed (1) hide show
  1. app.py +38 -10
app.py CHANGED
@@ -10,6 +10,12 @@ from cnocr import CnOcr
10
 
11
  # from langchain.document_loaders import PyPDFLoader
12
  from langchain.document_loaders import UnstructuredWordDocumentLoader
 
 
 
 
 
 
13
  from sentence_transformers import SentenceTransformer, models, util
14
  word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2', do_lower_case=True)
15
  pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='cls')
@@ -109,32 +115,54 @@ def get_response(msg, bot, doc_text_list, doc_embeddings):
109
 
110
  def up_file(fls):
111
  doc_text_list = []
112
- names = []
113
 
 
114
  for i in fls:
115
  names.append(str(i.name))
 
116
 
117
- files = []
118
  docs = []
 
 
119
  for i in names:
120
 
121
  if(i[-3:] == "pdf"):
122
- files.append(i)
123
- else:
124
  docs.append(i)
 
 
 
125
 
 
126
  for i in docs:
127
- loader = UnstructuredWordDocumentLoader(i, mode="elements")
128
  data = loader.load()
129
  content = str(data).split("'")
130
  cnt = content[1]
131
- c = cnt.split('\\n\\n')
132
- final = "".join(c)
133
- doc_text_list.append(final)
134
-
135
 
136
 
137
- for idx, file in enumerate(files):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  print("11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111")
139
  #print(file.name)
140
  with pdfplumber.open(file) as pdf:
 
10
 
11
  # from langchain.document_loaders import PyPDFLoader
12
  from langchain.document_loaders import UnstructuredWordDocumentLoader
13
+ from langchain.document_loaders import UnstructuredPowerPointLoader
14
+ from langchain.document_loaders.image import UnstructuredImageLoader
15
+
16
+
17
+
18
+
19
  from sentence_transformers import SentenceTransformer, models, util
20
  word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2', do_lower_case=True)
21
  pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='cls')
 
115
 
116
  def up_file(fls):
117
  doc_text_list = []
118
+
119
 
120
+ names = []
121
  for i in fls:
122
  names.append(str(i.name))
123
+
124
 
125
+ pdf = []
126
  docs = []
127
+ pptx = []
128
+
129
  for i in names:
130
 
131
  if(i[-3:] == "pdf"):
132
+ pdf.append(i)
133
+ elif(i[-4:] == "docx"):
134
  docs.append(i)
135
+ else:
136
+ pptx.append(i)
137
+
138
 
139
+ #pptx Extracting
140
  for i in docs:
141
+ loader = UnstructuredPowerPointLoader(i)
142
  data = loader.load()
143
  content = str(data).split("'")
144
  cnt = content[1]
145
+ # c = cnt.split('\\n\\n')
146
+ # final = "".join(c)
147
+ c = cnt.replace('\\n\\n',"").replace("<PAGE BREAK>","").replace("\t","")
148
+ doc_text_list.append(c)
149
 
150
 
151
+
152
+ #Doc Extracting
153
+ for i in docs:
154
+ loader = UnstructuredWordDocumentLoader(i)
155
+ data = loader.load()
156
+ content = str(data).split("'")
157
+ cnt = content[1]
158
+ # c = cnt.split('\\n\\n')
159
+ # final = "".join(c)
160
+ c = cnt.replace('\\n\\n',"").replace("<PAGE BREAK>","").replace("\t","")
161
+ doc_text_list.append(c)
162
+
163
+
164
+ #Pdf Extracting
165
+ for idx, file in enumerate(pdf):
166
  print("11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111")
167
  #print(file.name)
168
  with pdfplumber.open(file) as pdf: