Spaces:
Sleeping
Sleeping
| import tabula | |
| from docx import Document | |
| import cv2 | |
| import pytesseract | |
| import pandas as pd | |
# Tell pytesseract which Tesseract executable to launch.
# Change this path to match the local installation.
pytesseract.pytesseract.tesseract_cmd = r"./tesseract.exe"
def extract_tables_from_pdf(file_path):
    """Read every table tabula can find in a PDF.

    Args:
        file_path: Path of the PDF file to scan.

    Returns:
        Whatever ``tabula.read_pdf`` returns when asked for all pages with
        ``multiple_tables=True``; the rest of this script iterates over it
        as a sequence of DataFrames.
    """
    pdf_tables = tabula.read_pdf(
        file_path,
        pages="all",
        multiple_tables=True,
    )
    return pdf_tables
def extract_tables_from_image(image_path):
    """Extract one whitespace-delimited table from an image via OCR.

    The image is loaded in grayscale and passed to Tesseract. The recognised
    text is split into lines and each line into whitespace-separated cells.
    The first non-blank line is used as the header row and the remaining
    lines as data rows. This assumes a single table per image; the parsing
    logic may need adapting to the layout of the actual table.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A list holding one DataFrame (list-wrapped to be consistent with the
        PDF extraction function), or an empty list when OCR yields no text.

    Raises:
        ValueError: If OpenCV cannot load the image.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # rather than raising, so fail here with a clear message.
        raise ValueError(f"Could not read image: {image_path}")

    text = pytesseract.image_to_string(image)

    # Filter on the split result, not the raw line: a whitespace-only line
    # (for example a trailing form feed) is truthy but splits to no cells.
    data = [cells for cells in (line.split() for line in text.splitlines()) if cells]
    if not data:
        return []

    # OCR output is often ragged. Bring every row to the same width so that
    # pandas does not reject rows that disagree with the header length.
    width = max(len(cells) for cells in data)
    header, *body = data
    header = header + [f"column_{i + 1}" for i in range(len(header), width)]
    body = [cells + [None] * (width - len(cells)) for cells in body]

    df = pd.DataFrame(body, columns=header)
    return [df]  # Returned as a list to be consistent with the PDF extractor
# Pick the extractor that matches the input file's extension.
file_path = "./1234.jpg"  # Change the extension to test
# Everything after the last dot, lower-cased.
file_extension = file_path.rpartition(".")[2].lower()

if file_extension in ("jpg", "jpeg", "png"):
    tables = extract_tables_from_image(file_path)
elif file_extension == "pdf":
    tables = extract_tables_from_pdf(file_path)
else:
    raise ValueError(f"Unsupported file format: {file_extension}")
# Create a new Word document
doc = Document()

# Write each extracted DataFrame as its own Word table: one header row
# holding the column labels, then one row per DataFrame record.
for table_df in tables:
    t = doc.add_table(rows=1, cols=table_df.shape[1])
    hdr_cells = t.rows[0].cells
    for i, column in enumerate(table_df.columns):
        hdr_cells[i].text = str(column)

    # itertuples keeps each column's own dtype; iterrows would coerce a row
    # to a single dtype, turning integers in mixed rows into "1.0".
    for record in table_df.itertuples(index=False, name=None):
        cells = t.add_row().cells
        for i, value in enumerate(record):
            # Leave missing cells (None/NaN) blank instead of writing the
            # literal text "None" or "nan" into the document.
            cells[i].text = "" if pd.isna(value) else str(value)

# Save the Word document
doc.save("output.docx")
print("Tables exported to output.docx!")