Spaces:

Surya152002
/

appa

Sleeping

App Files Files Community

appa / app.py

Surya152002

Create app.py

3dc4059 over 2 years ago

raw

history blame contribute delete

1.77 kB

	import tabula
	from docx import Document
	import cv2
	import pytesseract
	import pandas as pd
	# Tell pytesseract which Tesseract executable to launch. The path is
	# relative, so it is resolved against the current working directory.
	# NOTE(review): the ".exe" suffix suggests a Windows build - confirm it
	# matches the host this script actually runs on.
	pytesseract.pytesseract.tesseract_cmd = r'./tesseract.exe' # Change the path accordingly



	def extract_tables_from_pdf(file_path):
	    """Read the tables tabula detects in the PDF at *file_path*.

	    Every page is scanned with multiple-table detection switched on, and
	    whatever ``tabula.read_pdf`` produces is handed back unchanged.
	    """
	    read_options = {"pages": "all", "multiple_tables": True}
	    return tabula.read_pdf(file_path, **read_options)


	def extract_tables_from_image(image_path):
	    """Extract a whitespace-delimited table from an image via OCR.

	    The image is loaded in grayscale and passed to Tesseract. Each
	    non-blank line of the recognised text is split on whitespace into
	    cells; the first such line supplies the column names. This assumes
	    the image holds a single table, so the parsing may need adapting for
	    other layouts.

	    Args:
	        image_path: Path of the image file to read.

	    Returns:
	        A one-element list holding the resulting DataFrame (a list, to be
	        consistent with the PDF extraction function), or an empty list
	        when OCR yields no text at all.

	    Raises:
	        ValueError: If OpenCV cannot load the image.
	    """
	    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
	    if image is None:
	        # cv2.imread reports a missing or undecodable file by returning
	        # None instead of raising; stop here with a clear message rather
	        # than letting the OCR call fail on a None input.
	        raise ValueError(f"Could not read image: {image_path}")

	    text = pytesseract.image_to_string(image)

	    # Filter on the split result, not on the raw line: a line holding only
	    # whitespace is truthy yet splits to [], which would otherwise become
	    # a blank row (or an empty header).
	    data = [cells for cells in (line.split() for line in text.split('\n')) if cells]
	    if not data:
	        return []

	    header, *body = data

	    # OCR rows rarely all have the same number of tokens, and pandas
	    # raises ValueError when the data width and the column count disagree.
	    # Widen the header with placeholder names and pad short rows with None
	    # so that no recognised token is dropped.
	    width = max(len(row) for row in data)
	    columns = header + [f"column_{n}" for n in range(len(header) + 1, width + 1)]
	    body = [row + [None] * (width - len(row)) for row in body]

	    df = pd.DataFrame(body, columns=columns)
	    return [df]


	# Decide extractor based on file extension
	file_path = "./1234.jpg"  # Change the extension to test
	file_extension = file_path.split('.')[-1].lower()

	if file_extension == "pdf":
	    tables = extract_tables_from_pdf(file_path)
	elif file_extension in ("jpg", "jpeg", "png"):
	    tables = extract_tables_from_image(file_path)
	else:
	    raise ValueError(f"Unsupported file format: {file_extension}")


	def _cell_text(value):
	    """Return the text to write into a Word cell for *value*.

	    Missing values (None, NaN, and other scalars pandas treats as NA)
	    become an empty string so the document does not show a literal
	    "nan" or "None"; everything else is converted with str().
	    """
	    if pd.api.types.is_scalar(value) and pd.isna(value):
	        return ""
	    return str(value)


	# Create a new Word document
	doc = Document()

	# Write each extracted table: one header row, then one row per record
	for table_number, table_df in enumerate(tables):
	    if table_number:
	        # Put an empty paragraph between consecutive tables; tables with
	        # nothing between them are shown by Word as one fused table.
	        doc.add_paragraph()

	    t = doc.add_table(rows=1, cols=table_df.shape[1])
	    for cell, column in zip(t.rows[0].cells, table_df.columns):
	        cell.text = str(column)

	    # itertuples keeps each column's own dtype. iterrows would build a
	    # Series per row and upcast it to one dtype, so an integer next to a
	    # float column would be written as "3.0" instead of "3".
	    for record in table_df.itertuples(index=False, name=None):
	        for cell, value in zip(t.add_row().cells, record):
	            cell.text = _cell_text(value)

	# Save the Word document
	doc.save("output.docx")

	print("Tables exported to output.docx!")