AzizWazir committed on
Commit
ea1977e
·
verified ·
1 Parent(s): da16d5a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -54
app.py CHANGED
@@ -1,61 +1,30 @@
1
- import pytesseract
2
- from pdf2image import convert_from_path
3
- from docx import Document
4
- import io
5
- import fitz # PyMuPDF
6
 
7
- # OCR Setup
8
- pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # Update this path based on your installation
9
-
10
- # Function to extract images from a PDF
11
- def extract_images_from_pdf(pdf_path):
12
- images = []
13
- doc = fitz.open(pdf_path)
14
 
15
- for page_num in range(len(doc)):
16
- page = doc.load_page(page_num)
17
- pix = page.get_pixmap()
18
- img = pix.tobytes()
19
- images.append(img)
20
 
21
- return images
22
-
23
- # Function to perform OCR on images and extract text
24
- def ocr_from_images(images):
25
- extracted_text = ""
26
- for img in images:
27
- text = pytesseract.image_to_string(img)
28
- extracted_text += text + "\n"
29
- return extracted_text
30
-
31
- # Function to convert PDF with images to a Word document
32
- def pdf_to_word(pdf_path, word_output_path):
33
- # Extract images from PDF
34
- images = extract_images_from_pdf(pdf_path)
35
-
36
- # Perform OCR on the images
37
- ocr_text = ocr_from_images(images)
38
-
39
- # Convert PDF text to Word
40
- doc = Document()
41
- doc.add_heading('Converted PDF Text', 0)
42
 
43
- # Extract PDF text (non-image content)
44
- pdf_text = ""
45
- with open(pdf_path, 'rb') as file:
46
- doc = fitz.open(file)
47
- for page in doc:
48
- pdf_text += page.get_text()
49
-
50
- # Add both PDF text and OCR extracted text to Word
51
- doc.add_paragraph(pdf_text)
52
- doc.add_paragraph("Extracted Text from Images (OCR):")
53
- doc.add_paragraph(ocr_text)
54
 
55
- doc.save(word_output_path)
56
- print(f"Word document saved as: {word_output_path}")
57
 
58
  # Example usage
59
- pdf_path = "your_pdf_file.pdf" # Provide the path to your PDF file
60
- word_output_path = "output.docx" # Provide the desired output Word file path
61
- pdf_to_word(pdf_path, word_output_path)
 
1
+ import pandas as pd
 
 
 
 
2
 
3
+ def pdf_to_excel(pdf_path, excel_output_path):
4
+ # Example: If your PDF has structured data that can be parsed into a table
5
+ # (You can use libraries like pdfplumber for extracting tables)
 
 
 
 
6
 
7
+ tables = [] # List to store the extracted tables
 
 
 
 
8
 
9
+ # Example of extracting a table (this part would depend on your PDF content)
10
+ # Extract tables using pdfplumber, PyMuPDF, or a similar library
11
+ # Example with pdfplumber (if tables are present in your PDF)
12
+ import pdfplumber
13
+ with pdfplumber.open(pdf_path) as pdf:
14
+ for page in pdf.pages:
15
+ table = page.extract_table()
16
+ if table:
17
+ tables.append(table)
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
+ # Write the extracted tables to an Excel file
20
+ with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
21
+ for i, table in enumerate(tables):
22
+ df = pd.DataFrame(table[1:], columns=table[0]) # Converting to DataFrame
23
+ df.to_excel(writer, sheet_name=f"Sheet{i+1}", index=False)
 
 
 
 
 
 
24
 
25
+ print(f"Excel file saved as: {excel_output_path}")
 
26
 
27
  # Example usage
28
+ pdf_path = "your_pdf_file.pdf"
29
+ excel_output_path = "output.xlsx"
30
+ pdf_to_excel(pdf_path, excel_output_path)