Spaces:

Vashishta-S-2141
/

LLM_Powered_Database_Chatbot

Sleeping

LLM_Powered_Database_Chatbot / backend /document_parser.py

SVashishta1

Initial commit

6950cd1 9 months ago

4.51 kB

	import os
	import fitz # PyMuPDF
	import pandas as pd
	from typing import List
	import docx

	class SimpleDocumentParser:
	    """Convert PDF, DOCX, plain-text and CSV/Excel files into lists of text chunks.

	    Every ``parse_*`` method is best-effort: a failure is printed and
	    reported to the caller as a single-element list holding an error
	    message, so no exception escapes to the caller.
	    """

	    def __init__(self):
	        """Initialize simple document parser for various file types"""
	        pass

	    def parse_document(self, file_path: str) -> List[str]:
	        """Parse a document and return its text chunks.

	        The parser is chosen from the lower-cased file extension. Any
	        extension that is not recognised is read as plain text.

	        Args:
	            file_path: Path of the file to parse.

	        Returns:
	            The chunks produced by the selected parser.
	        """
	        file_ext = os.path.splitext(file_path)[1].lower()

	        if file_ext == '.pdf':
	            return self.parse_pdf(file_path)
	        if file_ext == '.docx':
	            return self.parse_docx(file_path)
	        if file_ext in ('.csv', '.xlsx', '.xls'):
	            return self.parse_tabular(file_path)
	        # '.txt' and every unknown extension share the plain-text parser.
	        return self.parse_text(file_path)

	    @staticmethod
	    def _split_paragraphs(text: str) -> List[str]:
	        """Split *text* on blank lines and return the stripped, non-empty parts."""
	        stripped = (part.strip() for part in text.split('\n\n'))
	        return [part for part in stripped if part]

	    def parse_pdf(self, file_path: str) -> List[str]:
	        """Parse a PDF with PyMuPDF, chunking each page by paragraphs.

	        Args:
	            file_path: Path of the PDF file.

	        Returns:
	            One chunk per non-empty paragraph, in page order, or a
	            single error-message chunk if the file cannot be parsed.
	        """
	        chunks: List[str] = []
	        try:
	            doc = fitz.open(file_path)
	            try:
	                for page in doc:
	                    chunks.extend(self._split_paragraphs(page.get_text()))
	            finally:
	                # Close the document even when a page fails to extract,
	                # so the file handle is never leaked.
	                doc.close()
	        except Exception as e:
	            print(f"Error parsing PDF {file_path}: {e}")
	            chunks = [f"Error parsing PDF: {e}"]

	        return chunks

	    def parse_text(self, file_path: str) -> List[str]:
	        """Parse a plain-text file, chunking it by paragraphs.

	        The file is decoded as UTF-8 and undecodable bytes are dropped.

	        Args:
	            file_path: Path of the text file.

	        Returns:
	            One chunk per non-empty paragraph, or a single
	            error-message chunk if the file cannot be read.
	        """
	        try:
	            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
	                return self._split_paragraphs(f.read())
	        except Exception as e:
	            print(f"Error parsing text file {file_path}: {e}")
	            return [f"Error parsing text file: {e}"]

	    def parse_docx(self, file_path: str) -> List[str]:
	        """Parse a DOCX file with python-docx.

	        Args:
	            file_path: Path of the DOCX file.

	        Returns:
	            The stripped text of every non-empty paragraph, or a single
	            error-message chunk if the file cannot be parsed.
	        """
	        try:
	            doc = docx.Document(file_path)
	            stripped = (para.text.strip() for para in doc.paragraphs)
	            return [text for text in stripped if text]
	        except Exception as e:
	            print(f"Error parsing DOCX {file_path}: {e}")
	            return [f"Error parsing DOCX: {e}"]

	    def parse_tabular(self, file_path: str, max_rows: int = 50) -> List[str]:
	        """Parse a CSV or Excel file with pandas.

	        Args:
	            file_path: Path of the table; ``.csv`` is read with
	                ``pd.read_csv``, anything else with ``pd.read_excel``.
	            max_rows: Number of leading rows turned into row chunks
	                (expected to be non-negative).

	        Returns:
	            A summary chunk, a column-details chunk, then one
	            ``"col: value | col: value"`` chunk per row for the first
	            ``max_rows`` rows, or a single error-message chunk if the
	            file cannot be parsed.
	        """
	        try:
	            file_ext = os.path.splitext(file_path)[1].lower()

	            if file_ext == '.csv':
	                df = pd.read_csv(file_path)
	            else:  # Excel files
	                df = pd.read_excel(file_path)

	            # Column labels are not always strings (e.g. numeric Excel
	            # headers), so convert them before joining.
	            column_names = ', '.join(str(col) for col in df.columns)
	            chunks = [
	                f"Table with {len(df)} rows and {len(df.columns)} columns. "
	                f"Columns: {column_names}"
	            ]

	            # One line per column: dtype plus up to 3 distinct non-null samples.
	            col_lines = []
	            for col, dtype in df.dtypes.items():
	                sample_values = df[col].dropna().unique()[:3]
	                sample_str = ", ".join(str(v) for v in sample_values)
	                col_lines.append(
	                    f"- {col} (Type: {dtype}): Sample values: {sample_str}\n"
	                )
	            chunks.append("Column details:\n" + "".join(col_lines))

	            # Only the leading rows are indexed, to bound the chunk count.
	            for _, row in df.head(max_rows).iterrows():
	                chunks.append(
	                    " | ".join(f"{col}: {val}" for col, val in row.items())
	                )
	        except Exception as e:
	            print(f"Error parsing tabular file {file_path}: {e}")
	            chunks = [f"Error parsing tabular file: {e}"]

	        return chunks