shaheerawan3 committed on
Commit
4646582
·
verified ·
1 Parent(s): 65487da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -0
app.py CHANGED
@@ -21,6 +21,42 @@ from typing import Optional, List, Dict, Tuple
21
  from bs4 import BeautifulSoup
22
  import requests
23
  from io import BytesIO
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  class ImageScraper:
26
  def __init__(self):
 
21
  from bs4 import BeautifulSoup
22
  import requests
23
  from io import BytesIO
24
+ import docx
25
+ import PyPDF2
26
+ import pptx
27
+ import cv2
28
+
29
class FileProcessor:
    """Extract plain text from uploaded files of several document formats.

    Every reader is a static method that takes a file-like object and
    returns the extracted text as a single string.
    """

    @staticmethod
    def read_txt(file) -> str:
        """Return the contents of a text file as a string.

        Args:
            file: File-like object whose ``read()`` returns the raw content.

        Returns:
            The decoded text. Bytes are decoded as UTF-8; a leading UTF-8
            byte-order mark, if present, is stripped.

        Raises:
            UnicodeDecodeError: If the bytes are not valid UTF-8.
        """
        data = file.read()
        # A handle opened in text mode already yields str; nothing to decode.
        if isinstance(data, str):
            return data
        # 'utf-8-sig' behaves like 'utf-8' but drops a leading BOM, which
        # would otherwise leak into the text as '\ufeff'.
        return data.decode('utf-8-sig')

    @staticmethod
    def read_pdf(file) -> str:
        """Return the text of every page of a PDF, one page per line group.

        Args:
            file: Path or file-like object accepted by ``PyPDF2.PdfReader``.

        Returns:
            The extracted text of each page, each followed by a newline.
        """
        pdf_reader = PyPDF2.PdfReader(file)
        # Guard against a page yielding no text (None/empty) so one blank
        # or image-only page cannot abort the whole extraction.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf_reader.pages
        )

    @staticmethod
    def read_docx(file) -> str:
        """Return the text of the paragraphs of a Word document.

        Args:
            file: Path or file-like object accepted by ``docx.Document``.

        Returns:
            The text of each entry in ``doc.paragraphs``, each followed by
            a newline.
        """
        doc = docx.Document(file)
        return "".join(para.text + "\n" for para in doc.paragraphs)

    @staticmethod
    def read_pptx(file) -> str:
        """Return the text of the shapes on every slide of a presentation.

        Args:
            file: Path or file-like object accepted by ``pptx.Presentation``.

        Returns:
            The text of each shape that exposes a ``text`` attribute, in
            slide order, each followed by a newline. Shapes without a
            ``text`` attribute are skipped.
        """
        prs = pptx.Presentation(file)
        return "".join(
            shape.text + "\n"
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )
59
+
60
 
61
  class ImageScraper:
62
  def __init__(self):