"""Extract tables from a PDF or an image and export them to a Word document.

PDF files are parsed with tabula; images (jpg/jpeg/png) are run through
Tesseract OCR and the recognised text is split into a single table.  Every
extracted table is appended to ``output.docx``.
"""

from pathlib import Path
from typing import List

import cv2
import pandas as pd
import pytesseract
import tabula
from docx import Document

pytesseract.pytesseract.tesseract_cmd = r'./tesseract.exe'  # Change the path accordingly

# File extensions (lower-case, without the dot) handled by the OCR extractor.
IMAGE_EXTENSIONS = ("jpg", "jpeg", "png")


def extract_tables_from_pdf(file_path):
    """Return every table tabula finds in the PDF at *file_path*.

    All pages are scanned; the result is whatever ``tabula.read_pdf`` returns
    for ``multiple_tables=True`` (one entry per detected table).
    """
    return tabula.read_pdf(file_path, pages="all", multiple_tables=True)


def _rows_to_dataframe(rows):
    """Build a DataFrame from tokenised text rows, using the first as header.

    OCR output is frequently ragged, so every row is normalised to the width
    of the widest one: short rows are padded with empty strings and a short
    header is extended with generated ``Column N`` names.  Without this,
    a data row wider than the header makes ``pd.DataFrame`` raise ValueError.
    """
    width = max(len(row) for row in rows)
    header, *body = rows
    columns = list(header) + [
        f"Column {i + 1}" for i in range(len(header), width)
    ]
    padded = [list(row) + [""] * (width - len(row)) for row in body]
    return pd.DataFrame(padded, columns=columns)


def extract_tables_from_image(image_path):
    """OCR the image at *image_path* and return its table as ``[DataFrame]``.

    The recognised text is assumed to contain one whitespace-separated table
    whose first non-blank line is the header; the logic may need adapting to
    the nature of your table.  A list is returned to stay consistent with
    :func:`extract_tables_from_pdf`; it is empty when no text is recognised.

    Raises:
        FileNotFoundError: if OpenCV cannot read the image.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    text = pytesseract.image_to_string(image)

    # Skip whitespace-only lines (Tesseract typically ends its output with a
    # form feed) so they do not turn into empty table rows.
    rows = [line.split() for line in text.splitlines() if line.strip()]
    if not rows:
        return []
    return [_rows_to_dataframe(rows)]


def _extract_tables(file_path):
    """Pick the extractor matching *file_path*'s extension and run it.

    Raises:
        ValueError: if the extension is neither ``pdf`` nor a known image type.
    """
    # Path.suffix handles names without an extension (yields "") where a
    # plain split('.') would return part of the directory/file name.
    file_extension = Path(file_path).suffix.lower().lstrip(".")
    if file_extension == "pdf":
        return extract_tables_from_pdf(file_path)
    if file_extension in IMAGE_EXTENSIONS:
        return extract_tables_from_image(file_path)
    raise ValueError(f"Unsupported file format: {file_extension}")


def _cell_text(value):
    """Return the text to write into a Word cell; missing values become ''."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


def _write_tables_to_docx(tables, output_path):
    """Write each DataFrame in *tables* as a Word table and save the document."""
    doc = Document()

    for table_df in tables:
        # One header row up front; data rows are appended one by one.
        t = doc.add_table(rows=1, cols=table_df.shape[1])
        hdr_cells = t.rows[0].cells
        for i, column in enumerate(table_df.columns):
            hdr_cells[i].text = str(column)

        # itertuples keeps each column's dtype, whereas iterrows upcasts
        # mixed rows (e.g. ints would be written as "1.0").
        for row in table_df.itertuples(index=False, name=None):
            cells = t.add_row().cells
            for i, value in enumerate(row):
                cells[i].text = _cell_text(value)

    doc.save(output_path)


def main():
    """Extract the tables from the configured input file into output.docx."""
    file_path = "./1234.jpg"  # Change the extension to test

    tables = _extract_tables(file_path)
    _write_tables_to_docx(tables, "output.docx")
    print("Tables exported to output.docx!")


if __name__ == "__main__":
    main()