Spaces:

Surya152002
/

appa

Sleeping

File size: 1,774 Bytes

3dc4059

import tabula
from docx import Document
import cv2
import pytesseract
import pandas as pd
# Tell pytesseract which Tesseract executable to launch for OCR. The value is
# a relative path (looked up from the current working directory) with a
# Windows-style ".exe" name, so it must be adjusted for other setups.
pytesseract.pytesseract.tesseract_cmd = r'./tesseract.exe'  # Change the path accordingly



def extract_tables_from_pdf(file_path):
    """Extract the tables tabula finds in the PDF at ``file_path``.

    Every page is scanned with multi-table detection enabled, and whatever
    ``tabula.read_pdf`` produces is handed back unchanged.
    """
    read_options = {"pages": "all", "multiple_tables": True}
    extracted = tabula.read_pdf(file_path, **read_options)
    return extracted


def extract_tables_from_image(image_path):
    """Run OCR on an image and parse the recognised text as one table.

    The image is loaded in grayscale and passed to Tesseract. Each non-blank
    line of OCR output is split on whitespace; the first line supplies the
    column names and the remaining lines supply the rows.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A list holding a single ``pandas.DataFrame`` (a list is used to match
        the PDF extractor), or an empty list when OCR finds no text.

    Raises:
        ValueError: If the image cannot be loaded from ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread reports a missing or unreadable file by returning None
        # instead of raising, which would otherwise surface as an obscure
        # error inside pytesseract.
        raise ValueError(f"Could not read image file: {image_path}")

    text = pytesseract.image_to_string(image)

    # Convert the extracted text to a dataframe (assuming one table in the image)
    # The logic may vary based on the nature of your table.
    # Whitespace-only lines are dropped as well as empty ones: Tesseract
    # commonly ends its output with a form-feed page separator, which would
    # otherwise become a spurious empty row.
    data = [line.split() for line in text.splitlines() if line.strip()]
    if not data:
        return []

    header, *body = data

    # OCR lines rarely split into exactly the same number of tokens, so make
    # the table rectangular: extra columns get placeholder names and short
    # rows are padded with None (the value pandas itself uses for gaps).
    width = max(len(row) for row in data)
    header = header + [f"Column {i + 1}" for i in range(len(header), width)]
    body = [row + [None] * (width - len(row)) for row in body]

    df = pd.DataFrame(body, columns=header)

    return [df]  # Returning as a list to be consistent with the PDF extraction function


# Choose the extraction routine from the text after the last dot in the path
file_path = "./1234.jpg"  # Change the extension to test
file_extension = file_path.rpartition('.')[2].lower()

if file_extension in ("jpg", "jpeg", "png"):
    # Raster images go through OCR
    tables = extract_tables_from_image(file_path)
elif file_extension == "pdf":
    # PDFs are parsed directly for tables
    tables = extract_tables_from_pdf(file_path)
else:
    raise ValueError(f"Unsupported file format: {file_extension}")


# Create a new Word document
doc = Document()

# Write each extracted table into the document as its own Word table
for table_df in tables:
    # Start with a single header row sized to the DataFrame's column count
    t = doc.add_table(rows=1, cols=len(table_df.columns))
    for hdr_cell, column in zip(t.rows[0].cells, table_df.columns):
        hdr_cell.text = str(column)

    # itertuples keeps each column's own dtype; iterrows would coerce every
    # row into a single Series, so integers sitting next to floats would be
    # written as "1.0" instead of "1".
    for row in table_df.itertuples(index=False, name=None):
        cells = t.add_row().cells
        for cell, value in zip(cells, row):
            # Missing values (None/NaN) become blank cells rather than the
            # literal text "None" or "nan".
            cell.text = "" if pd.isna(value) else str(value)

# Save the Word document
doc.save("output.docx")

print("Tables exported to output.docx!")