AzizWazir committed on
Commit
c8e848b
·
verified ·
1 Parent(s): 94b0e1a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -24
app.py CHANGED
@@ -1,37 +1,66 @@
1
  import streamlit as st
2
  import fitz # PyMuPDF
 
 
 
3
  import pandas as pd
 
 
4
 
5
def extract_tables_from_pdf(uploaded_file):
    """Extract whitespace-delimited rows of text from every page of a PDF.

    Args:
        uploaded_file: Binary file-like object holding the PDF; the result
            of its ``read()`` is handed to PyMuPDF as an in-memory stream.

    Returns:
        A list with one entry per page that produced any text. Each entry
        is a list of rows, and each row is the list of whitespace-separated
        tokens found on one line of that page.
    """
    tables = []
    # An in-memory stream has no file extension, so the type is stated
    # explicitly. The context manager closes the document when done.
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        for page in doc:
            lines = page.get_text("text").split("\n")
            # Skip blank and whitespace-only lines: splitting them would
            # yield an empty row.
            table_data = [line.split() for line in lines if line.strip()]
            if table_data:
                tables.append(table_data)
    return tables
 
 
 
 
 
19
 
20
def main():
    """Render the page: upload a PDF and display the tables extracted from it."""
    st.title("PDF Table Extraction Tool")

    # File uploader widget in Streamlit
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        # Nothing uploaded yet; there is nothing further to render.
        return

    # Process the uploaded PDF and show one DataFrame per extracted table.
    tables = extract_tables_from_pdf(uploaded_file)
    if not tables:
        st.write("No tables found in the PDF.")
        return

    st.write("Extracted Tables:")
    for table in tables:
        st.write(pd.DataFrame(table))
 
 
 
 
 
 
 
 
 
 
35
 
36
  if __name__ == "__main__":
37
  main()
 
1
  import streamlit as st
2
  import fitz # PyMuPDF
3
+ import pytesseract
4
+ from pdf2image import convert_from_path
5
+ from PIL import Image
6
  import pandas as pd
7
+ from docx import Document
8
+ import io
9
 
10
+ # OCR function to convert image-based PDF to text
11
def extract_text_from_image_pdf(uploaded_file):
    """Run OCR over every page of an image-based PDF and return its text.

    Args:
        uploaded_file: The PDF to read. May be a filesystem path
            (``str`` / ``os.PathLike``), raw ``bytes``, or a binary
            file-like object such as the value returned by
            ``st.file_uploader``.

    Returns:
        The OCR text of all pages, joined with newlines.
    """
    import os

    from pdf2image import convert_from_bytes

    if isinstance(uploaded_file, (str, os.PathLike)):
        pages = convert_from_path(uploaded_file)
    else:
        # convert_from_path only accepts a filesystem path; an uploaded
        # file lives in memory, so its bytes are rendered directly.
        if isinstance(uploaded_file, (bytes, bytearray)):
            pdf_bytes = bytes(uploaded_file)
        elif hasattr(uploaded_file, "getvalue"):
            # getvalue() returns the whole buffer regardless of the current
            # read position, so an earlier read() cannot leave it empty.
            pdf_bytes = uploaded_file.getvalue()
        else:
            pdf_bytes = uploaded_file.read()
        pages = convert_from_bytes(pdf_bytes)

    # Use pytesseract to do OCR on each rendered page image.
    return "\n".join(pytesseract.image_to_string(page) for page in pages)
22
 
23
+ # Save text to Word document
24
def save_to_word(text, output_filename):
    """Write *text* as a single paragraph of a new Word document.

    Args:
        text: The text to store in the document.
        output_filename: Destination handed unchanged to ``Document.save``.
    """
    import re

    # OCR output can contain control characters (Tesseract typically ends
    # each page with a form feed) that are illegal in XML and make
    # python-docx raise ValueError. Tab, newline and carriage return are
    # legal and are kept.
    cleaned = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text)

    doc = Document()
    doc.add_paragraph(cleaned)
    doc.save(output_filename)
28
+
29
+ # Save text to Excel document
30
def save_to_excel(text, output_filename):
    """Write *text* to an Excel sheet, one row per line, one cell per token.

    The split is deliberately simple: each non-blank line becomes a row and
    each whitespace-separated token becomes a cell. Adjust the splitting if
    the data needs real column detection.

    Args:
        text: The text to tabulate.
        output_filename: Destination handed unchanged to
            ``DataFrame.to_excel``.
    """
    # Skip blank and whitespace-only lines (including a lone form feed left
    # by OCR): split() would turn them into an empty spreadsheet row.
    table_data = [line.split() for line in text.split("\n") if line.strip()]

    df = pd.DataFrame(table_data)
    df.to_excel(output_filename, index=False)
37
+
38
+ # Main function
39
def _offer_download(label, writer, text, file_name, mime):
    """Render a download button serving *text* as serialized by *writer*."""
    buffer = io.BytesIO()
    try:
        writer(text, buffer)
    except Exception as exc:
        # Top-level UI boundary: report the problem instead of crashing the page.
        st.error(f"Could not create {file_name}: {exc}")
        return
    st.download_button(label, data=buffer.getvalue(), file_name=file_name, mime=mime)


def main():
    """Streamlit entry point: OCR an uploaded PDF and offer Word/Excel exports.

    The exports are built in memory and served through download buttons.
    Writing them to a fixed filename on the server's disk would leave them
    unreachable for the user and shared between concurrent sessions.
    """
    st.title("PDF (Image-based) to Text-based Document Converter")

    # File uploader widget in Streamlit
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    # Streamlit re-runs the whole script on every widget interaction. The
    # OCR result is cached per upload so a button click does not repeat it.
    cache_key = (
        getattr(uploaded_file, "file_id", None),
        uploaded_file.name,
        uploaded_file.size,
    )
    if st.session_state.get("ocr_key") != cache_key:
        try:
            # Convert image-based PDF to text using OCR
            ocr_text = extract_text_from_image_pdf(uploaded_file)
        except Exception as exc:
            # Top-level UI boundary: show the failure rather than a traceback.
            st.error(f"Could not extract text from the PDF: {exc}")
            return
        st.session_state["ocr_key"] = cache_key
        st.session_state["ocr_text"] = ocr_text
    extracted_text = st.session_state["ocr_text"]

    st.write("Extracted Text:")
    st.text_area("Text from PDF", extracted_text, height=300)

    # Convert the extracted text to Word or Excel
    _offer_download(
        "Save as Word",
        save_to_word,
        extracted_text,
        "extracted_text.docx",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )
    _offer_download(
        "Save as Excel",
        save_to_excel,
        extracted_text,
        "extracted_text.xlsx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
64
 
65
  if __name__ == "__main__":
66
  main()