Raghav001 committed on
Commit
547bc24
·
1 Parent(s): 377bd9b

Image Extraction Feature

Browse files
Files changed (1) hide show
  1. app.py +41 -27
app.py CHANGED
@@ -126,6 +126,7 @@ def up_file(fls):
126
  pdf = []
127
  docs = []
128
  pptx = []
 
129
 
130
  for i in names:
131
 
@@ -133,35 +134,12 @@ def up_file(fls):
133
  pdf.append(i)
134
  elif i[-4:] == "docx":
135
  docs.append(i)
136
- else:
137
  pptx.append(i)
 
 
138
 
139
 
140
- #pptx Extracting
141
- for i in pptx:
142
- loader = UnstructuredPowerPointLoader(i)
143
- data = loader.load()
144
- content = str(data).split("'")
145
- cnt = content[1]
146
- # c = cnt.split('\\n\\n')
147
- # final = "".join(c)
148
- c = cnt.replace('\\n\\n',"").replace("<PAGE BREAK>","").replace("\t","")
149
- doc_text_list.append(c)
150
-
151
-
152
-
153
- #Doc Extracting
154
- for i in docs:
155
- loader = UnstructuredWordDocumentLoader(i)
156
- data = loader.load()
157
- content = str(data).split("'")
158
- cnt = content[1]
159
- # c = cnt.split('\\n\\n')
160
- # final = "".join(c)
161
- c = cnt.replace('\\n\\n',"").replace("<PAGE BREAK>","").replace("\t","")
162
- doc_text_list.append(c)
163
-
164
-
165
  #Pdf Extracting
166
  for idx, file in enumerate(pdf):
167
  print("11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111")
@@ -198,6 +176,42 @@ def up_file(fls):
198
  res_list.append(str(df))
199
 
200
  doc_text_list += res_list
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  doc_text_list = [str(text).strip() for text in doc_text_list if len(str(text).strip()) > 0]
202
  # print(doc_text_list)
203
  return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
@@ -211,7 +225,7 @@ def up_file(fls):
211
  with gr.Blocks(css=".gradio-container {background-color: #f7f377}, footer {visibility: hidden}") as demo:
212
  with gr.Row():
213
  with gr.Column():
214
- file = gr.File(file_types=['.pptx','.docx','.pdf'], label='Click to upload Document', file_count='multiple')
215
  doc_bu = gr.Button(value='Submit', visible=False)
216
 
217
 
 
126
  pdf = []
127
  docs = []
128
  pptx = []
129
+ jpg = []
130
 
131
  for i in names:
132
 
 
134
  pdf.append(i)
135
  elif i[-4:] == "docx":
136
  docs.append(i)
137
+ elif i[-4:] == "pptx":
138
  pptx.append(i)
139
+ else:
140
+ jpg.append(i)
141
 
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  #Pdf Extracting
144
  for idx, file in enumerate(pdf):
145
  print("11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111")
 
176
  res_list.append(str(df))
177
 
178
  doc_text_list += res_list
179
+
180
+ #pptx Extracting
181
+ for i in pptx:
182
+ loader = UnstructuredPowerPointLoader(i)
183
+ # data = loader.load()
184
+ # content = str(data).split("'")
185
+ # cnt = content[1]
186
+ # # c = cnt.split('\\n\\n')
187
+ # # final = "".join(c)
188
+ # c = cnt.replace('\\n\\n',"").replace("<PAGE BREAK>","").replace("\t","")
189
+ doc_text_list.append(data)
190
+
191
+
192
+
193
+ #Doc Extracting
194
+ for i in docs:
195
+ loader = UnstructuredWordDocumentLoader(i)
196
+ # data = loader.load()
197
+ # content = str(data).split("'")
198
+ # cnt = content[1]
199
+ # # c = cnt.split('\\n\\n')
200
+ # # final = "".join(c)
201
+ # c = cnt.replace('\\n\\n',"").replace("<PAGE BREAK>","").replace("\t","")
202
+ doc_text_list.append(data)
203
+
204
+ #Image Extraction
205
+ for i in jpg:
206
+ loader = UnstructuredImageLoader(i)
207
+ # data = loader.load()
208
+ # content = str(data).split("'")
209
+ # cnt = content[1]
210
+ # # c = cnt.split('\\n\\n')
211
+ # # final = "".join(c)
212
+ # c = cnt.replace('\\n\\n',"").replace("<PAGE BREAK>","").replace("\t","")
213
+ doc_text_list.append(data)
214
+
215
  doc_text_list = [str(text).strip() for text in doc_text_list if len(str(text).strip()) > 0]
216
  # print(doc_text_list)
217
  return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
 
225
  with gr.Blocks(css=".gradio-container {background-color: #f7f377}, footer {visibility: hidden}") as demo:
226
  with gr.Row():
227
  with gr.Column():
228
+ file = gr.File(file_types=['.jpeg','jpg','.pptx','.docx','.pdf'], label='Click to upload Document', file_count='multiple')
229
  doc_bu = gr.Button(value='Submit', visible=False)
230
 
231