AzizWazir committed on
Commit
145992a
·
verified ·
1 Parent(s): 529e2f8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -75
app.py CHANGED
@@ -1,82 +1,31 @@
1
- import streamlit as st
2
- import pytesseract
3
- from PIL import Image
4
- import docx
5
- import pdf2image
6
- import camelot
7
 
8
- # Set Tesseract path if not set already
9
- pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
10
-
11
- def pdf_to_docx(pdf_file):
12
- """Converts a PDF file to a Word document (.docx) using OCR.
13
-
14
- Args:
15
- pdf_file: The path to the PDF file.
16
-
17
- Returns:
18
- A Word document object.
19
- """
20
-
21
- # Extract images from the PDF file
22
- pages = pdf2image.convert_from_path(pdf_file, dpi=200)
23
-
24
- # Create a Word document
25
- doc = docx.Document()
26
-
27
- # Iterate over the extracted images and perform OCR
28
- for page in pages:
29
- text = pytesseract.image_to_string(page)
30
- doc.add_paragraph(text)
31
-
32
- return doc
33
-
34
- def pdf_to_xlsx(pdf_file):
35
- """Converts a PDF file to an Excel spreadsheet (.xlsx) using Camelot.
36
-
37
- Args:
38
- pdf_file: The path to the PDF file.
39
-
40
- Returns:
41
- A list of Excel tables extracted from the PDF.
42
- """
43
-
44
- tables = camelot.read_pdf(pdf_file, flavor='streamlit')
45
- return tables
46
-
47
- def main():
48
- """Streamlit app for converting PDF files to Word and Excel."""
49
-
50
- # Title and description
51
- st.title("PDF Converter App")
52
- st.subheader("Convert your PDFs to editable Word documents and Excel spreadsheets.")
53
-
54
- # Upload PDF file
55
- uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
56
-
57
- if uploaded_file is not None:
58
- # Convert PDF to Word and Excel
59
  try:
60
- doc = pdf_to_docx(uploaded_file)
61
- tables = pdf_to_xlsx(uploaded_file)
62
 
63
- # Download options
64
- if st.button("Download Word document"):
65
- with open("output.docx", "wb") as f:
66
- doc.save(f)
67
- st.success("Word document downloaded!")
68
-
69
- if tables:
70
- st.header("Extracted Excel Tables")
71
- for i, table in enumerate(tables):
72
- st.subheader(f"Table {i+1}")
73
- st.dataframe(table.df)
74
- if st.button(f"Download Excel table {i+1}"):
75
- table.df.to_excel(f"table_{i+1}.xlsx", index=False)
76
- st.success(f"Excel table {i+1} downloaded!")
77
 
78
  except Exception as e:
79
- st.error(f"Error converting PDF: {e}")
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  if __name__ == "__main__":
82
- main()
 
1
+ from docx import Document
 
 
 
 
 
2
 
3
+ def extract_text_from_docx(file_path):
4
+ """Extracts all text from a .docx file"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  try:
6
+ # Open the .docx file
7
+ doc = Document(file_path)
8
 
9
+ # Extract text from each paragraph in the document
10
+ text = ""
11
+ for paragraph in doc.paragraphs:
12
+ text += paragraph.text + '\n'
13
+
14
+ return text
 
 
 
 
 
 
 
 
15
 
16
  except Exception as e:
17
+ print(f"Error processing the document: {e}")
18
+ return None
19
+
20
+ def main():
21
+ file_path = "your_document.docx" # Replace with your actual file path
22
+ text = extract_text_from_docx(file_path)
23
+
24
+ if text:
25
+ print("Extracted Text:")
26
+ print(text)
27
+ else:
28
+ print("Failed to extract text.")
29
 
30
  if __name__ == "__main__":
31
+ main()