Spaces:

Surya152002
/

appa

Sleeping

App Files Files Community

Surya152002 committed on Oct 23, 2023

Commit

3dc4059

1 Parent(s): f4aec90

Create app.py

Browse files

Files changed (1) hide show

app.py +58 -0

app.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import tabula
+from docx import Document
+import cv2
+import pytesseract
+import pandas as pd
+pytesseract.pytesseract.tesseract_cmd = r'./tesseract.exe'  # Path to the Tesseract executable that pytesseract will invoke; change it to match your installation
+def extract_tables_from_pdf(file_path):
+    """Return the tables tabula extracts from the PDF at ``file_path``.
+
+    Every page is scanned and multiple tables are requested; the result is
+    passed through unchanged from ``tabula.read_pdf``.
+    """
+    read_options = {"pages": "all", "multiple_tables": True}
+    return tabula.read_pdf(file_path, **read_options)
+def extract_tables_from_image(image_path):
+    """OCR the image at ``image_path`` and return its text as a table.
+
+    The first non-blank line of recognised text supplies the column names
+    and every later line becomes one row. Cells are split on whitespace, so
+    this assumes the image holds a single table whose cells contain no
+    spaces; the logic may need adapting to other layouts.
+
+    Returns a list holding one DataFrame (a list, to be consistent with the
+    PDF extraction function), or an empty list when OCR finds no text.
+
+    Raises ValueError if OpenCV cannot load the image.
+    """
+    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
+    if image is None:
+        # cv2.imread signals a missing or undecodable file by returning None
+        # instead of raising; fail here with a clear message rather than
+        # passing None on to the OCR call.
+        raise ValueError(f"Could not read image: {image_path}")
+    text = pytesseract.image_to_string(image)
+    # Filter on the split result, not the raw line: a whitespace-only line
+    # (for example a trailing form-feed page separator) splits to [] and
+    # would otherwise become an empty row.
+    split_lines = (line.split() for line in text.split('\n'))
+    data = [cells for cells in split_lines if cells]
+    if not data:
+        # Nothing recognised: there is no header row to build a table from.
+        return []
+    header, body = data[0], data[1:]
+    # A row with more cells than the header makes pandas raise, so extend
+    # the header with placeholder names. Shorter rows are left as they are;
+    # pandas pads them with missing values.
+    width = max(len(cells) for cells in data)
+    header = header + [f"Column {i + 1}" for i in range(len(header), width)]
+    df = pd.DataFrame(body, columns=header)
+    return [df]  # Returning as a list to be consistent with the PDF extraction function
+# Decide extractor based on file extension
+file_path = "./1234.jpg"  # Change the extension to test
+file_extension = file_path.split('.')[-1].lower()
+if file_extension == "pdf":
+    tables = extract_tables_from_pdf(file_path)
+elif file_extension in ["jpg", "jpeg", "png"]:
+    tables = extract_tables_from_image(file_path)
+else:
+    raise ValueError(f"Unsupported file format: {file_extension}")
+# Create a new Word document
+doc = Document()
+# Copy each extracted DataFrame into its own Word table
+for table_df in tables:
+    # Start with a single header row; data rows are appended one by one below
+    t = doc.add_table(rows=1, cols=table_df.shape[1])
+    hdr_cells = t.rows[0].cells
+    for i, column in enumerate(table_df.columns):
+        hdr_cells[i].text = str(column)
+    for index, row in table_df.iterrows():
+        cells = t.add_row().cells
+        for i, value in enumerate(row):
+            # Missing values (NaN or None, e.g. from empty or ragged rows)
+            # would otherwise be written as the literal text "nan" / "None";
+            # leave those cells empty instead.
+            cells[i].text = "" if pd.isna(value) else str(value)
+# Save the Word document
+doc.save("output.docx")
+print("Tables exported to output.docx!")