AzizWazir committed on
Commit
da16d5a
·
verified ·
1 Parent(s): 09938bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -29
app.py CHANGED
@@ -1,38 +1,61 @@
1
- import streamlit as st
 
2
  from docx import Document
3
  import io
 
4
 
5
- def extract_text_from_docx(file):
6
- """Extracts all text from an uploaded .docx file"""
7
- try:
8
- # Open the uploaded .docx file
9
- doc = Document(io.BytesIO(file.read()))
10
 
11
- # Extract text from each paragraph in the document
12
- text = ""
13
- for paragraph in doc.paragraphs:
14
- text += paragraph.text + '\n'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- return text
 
 
 
17
 
18
- except Exception as e:
19
- st.error(f"Error processing the document: {e}")
20
- return None
21
 
22
- def main():
23
- st.title("Extract Text from DOCX")
 
24
 
25
- # File upload
26
- uploaded_file = st.file_uploader("Choose a DOCX file", type=["docx"])
 
 
 
 
 
 
 
 
 
27
 
28
- if uploaded_file is not None:
29
- text = extract_text_from_docx(uploaded_file)
30
-
31
- if text:
32
- st.subheader("Extracted Text")
33
- st.text(text)
34
- else:
35
- st.error("Failed to extract text.")
36
-
37
- if __name__ == "__main__":
38
- main()
 
1
import io
import os
import tempfile

import fitz  # PyMuPDF
import pytesseract
from docx import Document
from pdf2image import convert_from_path
6
 
7
# OCR setup.
# Only point pytesseract at the Windows installer location when that file
# actually exists; everywhere else keep pytesseract's default, which looks
# the `tesseract` executable up on PATH. Update the path below if Tesseract
# is installed somewhere else on Windows.
_WINDOWS_TESSERACT = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(_WINDOWS_TESSERACT):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT
 
 
 
9
 
10
+ # Function to extract images from a PDF
11
def extract_images_from_pdf(pdf_path):
    """Render every page of a PDF to an encoded image.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list with one ``bytes`` object per page, in page order. Each item
        is the page's pixmap serialised with ``Pixmap.tobytes()`` using its
        default output format (PNG per the PyMuPDF documentation).
    """
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_path) as pdf:
        return [page.get_pixmap().tobytes() for page in pdf]
22
+
23
+ # Function to perform OCR on images and extract text
24
def ocr_from_images(images):
    """Run Tesseract OCR over a sequence of images and concatenate the text.

    Args:
        images: Iterable of images. Items may be encoded image bytes (as
            returned by ``extract_images_from_pdf``) or anything
            ``pytesseract.image_to_string`` accepts directly (PIL image,
            NumPy array, or a file path string).

    Returns:
        The recognised text of every image, each followed by a newline.
        An empty input yields an empty string.
    """
    page_texts = []
    for img in images:
        if isinstance(img, (bytes, bytearray)):
            # pytesseract rejects raw bytes, so spool them to a file and pass
            # the path. mkstemp (not NamedTemporaryFile) is used so the file
            # is closed before Tesseract opens it, which Windows requires.
            fd, tmp_path = tempfile.mkstemp(suffix=".png")
            try:
                with os.fdopen(fd, "wb") as tmp:
                    tmp.write(img)
                page_texts.append(pytesseract.image_to_string(tmp_path))
            finally:
                os.remove(tmp_path)
        else:
            page_texts.append(pytesseract.image_to_string(img))
    return "".join(text + "\n" for text in page_texts)
30
 
31
+ # Function to convert PDF with images to a Word document
32
def _xml_safe(text):
    """Return *text* without characters that XML 1.0 (and so .docx) forbids."""
    return "".join(
        ch for ch in text
        if ch in "\t\n\r"
        or 0x20 <= ord(ch) <= 0xD7FF
        or 0xE000 <= ord(ch) <= 0xFFFD
        or ord(ch) >= 0x10000
    )


def pdf_to_word(pdf_path, word_output_path):
    """Convert a PDF into a Word document holding its text and OCR output.

    The resulting document contains a heading, the text layer extracted from
    the PDF, a label paragraph, and the text recognised by OCR on the
    rendered pages.

    Args:
        pdf_path: Path of the PDF file to convert.
        word_output_path: Path where the .docx file is written.
    """
    # Render the pages and OCR them.
    ocr_text = ocr_from_images(extract_images_from_pdf(pdf_path))

    # Extract the PDF's own text layer (non-image content). The PDF handle
    # gets its own name so it cannot clobber the Word document below.
    with fitz.open(pdf_path) as pdf:
        pdf_text = "".join(page.get_text() for page in pdf)

    word_doc = Document()
    word_doc.add_heading('Converted PDF Text', 0)

    # Control characters (e.g. the form feed Tesseract uses as its default
    # page separator) are not XML-compatible, so strip them before writing.
    word_doc.add_paragraph(_xml_safe(pdf_text))
    word_doc.add_paragraph("Extracted Text from Images (OCR):")
    word_doc.add_paragraph(_xml_safe(ocr_text))

    word_doc.save(word_output_path)
    print(f"Word document saved as: {word_output_path}")
57
+
58
# Example usage. Guarded so that importing this module does not trigger a
# conversion of the placeholder path below.
if __name__ == "__main__":
    pdf_path = "your_pdf_file.pdf"  # Provide the path to your PDF file
    word_output_path = "output.docx"  # Provide the desired output Word file path
    pdf_to_word(pdf_path, word_output_path)
+ pdf_to_word(pdf_path, word_output_path)