kxx-kkk committed on
Commit
fb4537e
·
verified ·
1 Parent(s): 9f72432

Initial changes

Browse files
Files changed (2) hide show
  1. app.py +46 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ from pdf2image import convert_from_path
3
+ import streamlit as st
4
+ import pytesseract
5
+ from PIL import Image
6
+
7
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Filesystem path to the PDF, or an already-open binary
            file-like object (anything exposing ``read``).

    Returns:
        The text of all pages joined in page order. A page that yields
        no text contributes an empty string.
    """

    def _read_all_pages(stream):
        # PdfReader / .pages / extract_text() replace the
        # PdfFileReader / numPages / getPage / extractText names that
        # were removed in PyPDF2 3.0.
        reader = PyPDF2.PdfReader(stream)
        return "".join(page.extract_text() or "" for page in reader.pages)

    # A file-like object is owned by the caller, so read it but do not close it.
    if hasattr(pdf_path, "read"):
        return _read_all_pages(pdf_path)

    # The context manager closes the file even when extraction raises.
    with open(pdf_path, "rb") as pdf_file:
        return _read_all_pages(pdf_file)
20
+
21
def extract_text_from_image(image):
    """Return the text that pytesseract recognises in *image*.

    Args:
        image: Forwarded unchanged to ``pytesseract.image_to_string``.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    return pytesseract.image_to_string(image)
24
+
25
def main():
    """Render the Streamlit page.

    Lets the user upload a PDF, shows the text extracted from it, then
    shows each page rendered as an image together with the OCR text
    recognised in that image.
    """
    # Function-scope standard-library imports keep this block self-contained.
    import os
    import tempfile

    st.title("PDF Text and Image Extractor")
    uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

    if uploaded_file is None:
        return

    st.write("File uploaded successfully!")

    # The uploader hands back an in-memory file object, not a path, while
    # extract_text_from_pdf and convert_from_path both work on a filesystem
    # path. Spill the bytes to a temporary file and pass that path to both.
    # delete=False lets the file be reopened by name after this handle
    # closes; it is removed explicitly below.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
        tmp_pdf.write(uploaded_file.getvalue())
        pdf_path = tmp_pdf.name

    try:
        text = extract_text_from_pdf(pdf_path)

        st.header("Extracted Text:")
        st.write(text)

        pages = convert_from_path(pdf_path)
        st.header("Extracted Images:")
        for page_num, page_img in enumerate(pages, start=1):
            st.subheader(f"Page {page_num}")
            st.image(page_img, use_column_width=True)
            image_text = extract_text_from_image(page_img)
            st.write("Image Text:")
            st.write(image_text)
    finally:
        # Always remove the temporary copy, even if extraction or OCR fails.
        os.remove(pdf_path)


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ PyPDF2
2
+ streamlit
3
+ pdf2image
4
+ pytesseract
5
+ pillow