Surya152002 committed on
Commit
3dc4059
·
1 Parent(s): f4aec90

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -0
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tabula
2
+ from docx import Document
3
+ import cv2
4
+ import pytesseract
5
+ import pandas as pd
6
# Tell pytesseract which Tesseract executable to launch. The relative path is
# resolved against the current working directory; change it to match where
# Tesseract lives on your machine (the ".exe" suffix suggests a Windows build).
pytesseract.pytesseract.tesseract_cmd = "./tesseract.exe"
7
+
8
+
9
+
10
def extract_tables_from_pdf(file_path):
    """Extract every table tabula can find in a PDF.

    Args:
        file_path: Path of the PDF, forwarded unchanged to ``tabula.read_pdf``.

    Returns:
        The result of ``tabula.read_pdf`` called with ``pages="all"`` and
        ``multiple_tables=True`` (the rest of this script iterates over it
        as a sequence of DataFrames).
    """
    read_options = {"pages": "all", "multiple_tables": True}
    return tabula.read_pdf(file_path, **read_options)
12
+
13
+
14
def _text_to_dataframe(text):
    """Turn whitespace-delimited OCR text into a DataFrame, or None if it is blank.

    The first non-blank line supplies the column names; each later non-blank
    line becomes one row. OCR output is frequently ragged, so rows are fitted
    to the header width: short rows are padded with empty strings and surplus
    tokens are folded into the last cell instead of being dropped.
    """
    # ``line.strip()`` also discards whitespace-only lines, which would
    # otherwise become empty token lists.
    token_rows = [line.split() for line in text.splitlines() if line.strip()]
    if not token_rows:
        return None

    header, *body = token_rows
    width = len(header)  # >= 1 because blank lines were filtered out above

    fitted = []
    for cells in body:
        if len(cells) > width:
            # Keep all the text: merge the overflow into the final column.
            cells = cells[:width - 1] + [" ".join(cells[width - 1:])]
        else:
            cells = cells + [""] * (width - len(cells))
        fitted.append(cells)

    return pd.DataFrame(fitted, columns=header)


def extract_tables_from_image(image_path):
    """OCR an image and interpret the recognised text as a single table.

    The image is loaded in grayscale, passed through Tesseract, and the
    resulting text is split on whitespace. The logic assumes one table per
    image and may need adapting to the nature of your table.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A list holding one DataFrame (a list, to stay consistent with
        ``extract_tables_from_pdf``), or an empty list when OCR finds no text.

    Raises:
        ValueError: If OpenCV cannot load the image.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise ValueError(f"Could not read image: {image_path}")

    text = pytesseract.image_to_string(image)

    df = _text_to_dataframe(text)
    return [] if df is None else [df]
25
+
26
+
27
# Decide extractor based on file extension
file_path = "./1234.jpg"  # Change the extension to test
file_extension = file_path.split('.')[-1].lower()

if file_extension == "pdf":
    tables = extract_tables_from_pdf(file_path)
elif file_extension in ["jpg", "jpeg", "png"]:
    tables = extract_tables_from_image(file_path)
else:
    raise ValueError(f"Unsupported file format: {file_extension}")


# Create a new Word document
doc = Document()

# Write each extracted DataFrame as its own Word table
for table_df in tables:
    column_count = table_df.shape[1]
    if column_count == 0:
        # Nothing to render; avoid asking python-docx for a zero-column table.
        continue

    # The first row of the Word table carries the column names.
    t = doc.add_table(rows=1, cols=column_count)
    for cell, column in zip(t.rows[0].cells, table_df.columns):
        cell.text = str(column)

    # itertuples() yields each value with its column's dtype, whereas
    # iterrows() turns every row into a Series and can upcast ints to floats
    # (so 3 would be written as "3.0").
    for row in table_df.itertuples(index=False, name=None):
        cells = t.add_row().cells
        for cell, value in zip(cells, row):
            # Missing values (NaN / None) become empty cells instead of the
            # literal text "nan" / "None".
            cell.text = "" if pd.isna(value) else str(value)

# Save the Word document
doc.save("output.docx")

print("Tables exported to output.docx!")