Spaces:

happy4040
/

contextIQ

Sleeping

satheeshbhukya

first commit

bd91918 8 days ago

2.99 kB

	import fitz # PyMuPDF
	import docx
	import pandas as pd
	from pptx import Presentation
	from typing import BinaryIO


	class DocumentParser:
	    """Extract plain text from binary document streams.

	    Each ``extract_text_from_*`` method handles one format and wraps any
	    failure in a ``ValueError`` whose message names the format.
	    ``parse_document`` dispatches to the right extractor from a file-type
	    string.
	    """

	    @staticmethod
	    def extract_text_from_pdf(file: BinaryIO) -> str:
	        """Return the text of every page of a PDF, joined by newlines.

	        Args:
	            file: Binary stream holding the PDF; it is read in full.

	        Raises:
	            ValueError: If the stream cannot be read or parsed.
	        """
	        try:
	            # Open in a ``with`` block so the document is closed even when
	            # text extraction of some page fails.
	            with fitz.open(stream=file.read(), filetype="pdf") as pdf_document:
	                return "\n".join(page.get_text() for page in pdf_document)
	        except Exception as e:
	            raise ValueError(f"Error extracting text from PDF: {e}") from e

	    @staticmethod
	    def extract_text_from_word(file: BinaryIO) -> str:
	        """Return the text of a Word document, one entry per line.

	        Non-blank paragraphs come first, followed by one line per table
	        row in which the non-blank cell texts are joined with `` | ``.

	        Args:
	            file: Binary stream holding the .docx document.

	        Raises:
	            ValueError: If the document cannot be opened or read.
	        """
	        try:
	            doc = docx.Document(file)

	            # Paragraphs: skip whitespace-only ones, keep the text as-is.
	            full_text = [
	                paragraph.text
	                for paragraph in doc.paragraphs
	                if paragraph.text.strip()
	            ]

	            # Tables: one output line per row; rows with no text are dropped.
	            for table in doc.tables:
	                for row in table.rows:
	                    cells = [cell.text.strip() for cell in row.cells]
	                    row_text = " | ".join(cell for cell in cells if cell)
	                    if row_text:
	                        full_text.append(row_text)

	            return "\n".join(full_text)
	        except Exception as e:
	            raise ValueError(f"Error extracting text from Word: {e}") from e

	    @staticmethod
	    def extract_text_from_excel(file: BinaryIO) -> str:
	        """Return an Excel sheet rendered as a plain-text table.

	        The workbook is loaded with ``pandas.read_excel`` using its default
	        sheet selection and rendered without the index column.

	        Args:
	            file: Binary stream holding the workbook.

	        Raises:
	            ValueError: If the workbook cannot be read.
	        """
	        try:
	            df = pd.read_excel(file)
	            return df.to_string(index=False)
	        except Exception as e:
	            raise ValueError(f"Error extracting text from Excel: {e}") from e

	    @staticmethod
	    def extract_text_from_ppt(file: BinaryIO) -> str:
	        """Return the text of all slide paragraphs, joined by newlines.

	        Only shapes that expose a ``text_frame`` attribute contribute text.

	        Args:
	            file: Binary stream holding the .pptx presentation.

	        Raises:
	            ValueError: If the presentation cannot be opened or read.
	        """
	        try:
	            presentation = Presentation(file)
	            text = []
	            for slide in presentation.slides:
	                for shape in slide.shapes:
	                    if not hasattr(shape, "text_frame"):
	                        continue
	                    text.extend(
	                        paragraph.text
	                        for paragraph in shape.text_frame.paragraphs
	                    )
	            return "\n".join(text)
	        except Exception as e:
	            raise ValueError(
	                f"Error extracting text from PowerPoint: {e}"
	            ) from e

	    @staticmethod
	    def extract_text_from_txt(file: BinaryIO) -> str:
	        """Return the contents of a UTF-8 text stream as a string.

	        Args:
	            file: Binary stream holding UTF-8 encoded text.

	        Raises:
	            ValueError: If the stream cannot be read or is not valid UTF-8.
	        """
	        try:
	            # "utf-8-sig" decodes plain UTF-8 identically but also drops a
	            # leading byte-order mark, which would otherwise leak into the
	            # extracted text as an invisible U+FEFF character.
	            return file.read().decode("utf-8-sig")
	        except Exception as e:
	            raise ValueError(f"Error extracting text from TXT: {e}") from e

	    @classmethod
	    def parse_document(cls, file: BinaryIO, file_type: str) -> str:
	        """Extract text from ``file`` using the extractor for ``file_type``.

	        Args:
	            file: Binary stream holding the document.
	            file_type: One of ``pdf``, ``docx``, ``xlsx``, ``pptx`` or
	                ``txt``. Matching ignores case, surrounding whitespace and
	                leading dots, so ``".PDF"`` is accepted.

	        Returns:
	            The extracted text.

	        Raises:
	            ValueError: If ``file_type`` is not supported (including
	                non-string values) or if extraction fails.
	        """
	        parsers = {
	            "pdf": cls.extract_text_from_pdf,
	            "docx": cls.extract_text_from_word,
	            "xlsx": cls.extract_text_from_excel,
	            "pptx": cls.extract_text_from_ppt,
	            "txt": cls.extract_text_from_txt,
	        }

	        # Normalise so ".pdf" / " PDF " resolve; a non-string type maps to a
	        # key that is never registered and falls through to the error below.
	        if isinstance(file_type, str):
	            key = file_type.strip().lower().lstrip(".")
	        else:
	            key = ""

	        parser = parsers.get(key)
	        if parser is None:
	            raise ValueError(f"Unsupported file type: {file_type}")

	        return parser(file)