# SVashishta1 - Initial commit (6950cd1)
import os
import fitz # PyMuPDF
import pandas as pd
from typing import List
import docx
class SimpleDocumentParser:
    """Turn documents of several file types into lists of text chunks.

    Supported inputs are PDF, plain text, DOCX and tabular files (CSV and
    Excel).  Every ``parse_*`` method is best-effort: on failure it prints a
    diagnostic and returns a single-element list holding an error message
    instead of raising, so callers always receive a ``List[str]``.
    """

    # Upper bound on data rows emitted per table (rows beyond it are skipped).
    MAX_TABLE_ROWS = 50
    # Number of distinct non-null example values listed for each column.
    SAMPLE_VALUES_PER_COLUMN = 3

    def __init__(self):
        """Initialize simple document parser for various file types"""

    def parse_document(self, file_path: str) -> List[str]:
        """Parse a document and return text chunks.

        The parser is chosen from the lower-cased file extension.  Files with
        an unrecognised (or missing) extension are read as plain text.

        Args:
            file_path: Path of the document to parse.

        Returns:
            The text chunks produced by the selected parser.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        parsers = {
            '.pdf': self.parse_pdf,
            '.txt': self.parse_text,
            '.docx': self.parse_docx,
            '.csv': self.parse_tabular,
            '.xlsx': self.parse_tabular,
            '.xls': self.parse_tabular,
        }
        # Unknown extensions deliberately fall back to the plain-text parser.
        return parsers.get(file_ext, self.parse_text)(file_path)

    @staticmethod
    def _split_paragraphs(text: str) -> List[str]:
        """Split *text* on blank lines and drop whitespace-only paragraphs."""
        stripped = (para.strip() for para in text.split('\n\n'))
        return [para for para in stripped if para]

    def parse_pdf(self, file_path: str) -> List[str]:
        """Parse PDF using PyMuPDF.

        Text is extracted page by page and split into paragraphs on blank
        lines.

        Args:
            file_path: Path of the PDF file.

        Returns:
            The non-empty, stripped paragraphs of all pages in order, or a
            single error-message chunk if anything goes wrong.
        """
        chunks: List[str] = []
        try:
            doc = fitz.open(file_path)
            try:
                for page in doc:
                    chunks.extend(self._split_paragraphs(page.get_text()))
            finally:
                # Release the file handle even when a page fails to extract.
                doc.close()
        except Exception as e:  # best-effort boundary: report, never raise
            print(f"Error parsing PDF {file_path}: {e}")
            chunks = [f"Error parsing PDF: {str(e)}"]
        return chunks

    def parse_text(self, file_path: str) -> List[str]:
        """Parse plain text file.

        The file is decoded as UTF-8; undecodable bytes are ignored.

        Args:
            file_path: Path of the text file.

        Returns:
            The non-empty, stripped paragraphs (separated by blank lines), or
            a single error-message chunk if the file cannot be read.
        """
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
            return self._split_paragraphs(text)
        except Exception as e:  # best-effort boundary: report, never raise
            print(f"Error parsing text file {file_path}: {e}")
            return [f"Error parsing text file: {str(e)}"]

    def parse_docx(self, file_path: str) -> List[str]:
        """Parse DOCX using python-docx.

        Args:
            file_path: Path of the DOCX file.

        Returns:
            One chunk per non-empty paragraph of the document body, or a
            single error-message chunk if the file cannot be parsed.
        """
        try:
            document = docx.Document(file_path)
            stripped = (para.text.strip() for para in document.paragraphs)
            return [text for text in stripped if text]
        except Exception as e:  # best-effort boundary: report, never raise
            print(f"Error parsing DOCX {file_path}: {e}")
            return [f"Error parsing DOCX: {str(e)}"]

    def parse_tabular(self, file_path: str) -> List[str]:
        """Parse CSV or Excel files using pandas.

        ``.csv`` files are read with ``pandas.read_csv``; every other
        extension is handed to ``pandas.read_excel``.

        Args:
            file_path: Path of the CSV or Excel file.

        Returns:
            A table summary chunk, a column-details chunk, then one chunk per
            row for at most ``MAX_TABLE_ROWS`` rows; or a single
            error-message chunk if the file cannot be parsed.
        """
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            if file_ext == '.csv':
                df = pd.read_csv(file_path)
            else:  # Excel files
                df = pd.read_excel(file_path)
            return self._frame_to_chunks(df)
        except Exception as e:  # best-effort boundary: report, never raise
            print(f"Error parsing tabular file {file_path}: {e}")
            return [f"Error parsing tabular file: {str(e)}"]

    def _frame_to_chunks(self, df) -> List[str]:
        """Describe a DataFrame as a summary, column details and row chunks."""
        # Column labels are not guaranteed to be strings (e.g. numeric
        # headers), so convert them explicitly before joining.
        column_names = ', '.join(str(col) for col in df.columns)
        chunks = [
            f"Table with {len(df)} rows and {len(df.columns)} columns. "
            f"Columns: {column_names}"
        ]

        # One line per column: dtype plus the first few distinct non-null values.
        detail_lines = ["Column details:"]
        for col, dtype in df.dtypes.items():
            samples = df[col].dropna().unique()[:self.SAMPLE_VALUES_PER_COLUMN]
            sample_str = ", ".join(str(value) for value in samples)
            detail_lines.append(
                f"- {col} (Type: {dtype}): Sample values: {sample_str}"
            )
        chunks.append("\n".join(detail_lines) + "\n")

        # Each row becomes "col: value | col: value | ..." (capped in count).
        for _, row in df.head(self.MAX_TABLE_ROWS).iterrows():
            chunks.append(
                " | ".join(f"{col}: {val}" for col, val in row.items())
            )
        return chunks