|
|
import os |
|
|
import fitz |
|
|
import pandas as pd |
|
|
from typing import List |
|
|
import docx |
|
|
|
|
|
class SimpleDocumentParser:
    """Turn PDF, plain-text, DOCX and tabular files into lists of text chunks.

    Every ``parse_*`` method is best-effort: if anything goes wrong the error
    is printed and returned as a single-element list containing the error
    message, so callers always receive a ``List[str]`` and never an exception.
    """

    def __init__(self):
        """Initialize simple document parser for various file types"""
        pass

    def parse_document(self, file_path: str) -> List[str]:
        """Parse a document and return text chunks.

        The parser is selected from the lower-cased file extension:
        ``.pdf``, ``.txt``, ``.docx`` and ``.csv``/``.xlsx``/``.xls`` have
        dedicated parsers; any other extension is read as plain text.

        Args:
            file_path: Path of the file to parse.

        Returns:
            The chunks produced by the selected parser.
        """
        extension = os.path.splitext(file_path)[1].lower()
        handlers = {
            '.pdf': self.parse_pdf,
            '.txt': self.parse_text,
            '.docx': self.parse_docx,
            '.csv': self.parse_tabular,
            '.xlsx': self.parse_tabular,
            '.xls': self.parse_tabular,
        }
        # Unknown extensions fall back to the plain-text reader.
        handler = handlers.get(extension, self.parse_text)
        return handler(file_path)

    @staticmethod
    def _split_paragraphs(text: str) -> List[str]:
        """Split text on blank lines and return the stripped, non-empty pieces."""
        stripped = (piece.strip() for piece in text.split('\n\n'))
        return [piece for piece in stripped if piece]

    def parse_pdf(self, file_path: str) -> List[str]:
        """Parse PDF using PyMuPDF.

        Each page's text is split on blank lines; the non-empty paragraphs of
        all pages are returned in page order.

        Args:
            file_path: Path of the PDF file.

        Returns:
            Paragraph chunks, or a single error-message chunk on failure.
        """
        chunks: List[str] = []
        try:
            doc = fitz.open(file_path)
            try:
                for page in doc:
                    chunks.extend(self._split_paragraphs(page.get_text()))
            finally:
                # Release the document even when a page fails to extract.
                doc.close()
        except Exception as e:
            print(f"Error parsing PDF {file_path}: {e}")
            chunks = [f"Error parsing PDF: {str(e)}"]

        return chunks

    def parse_text(self, file_path: str) -> List[str]:
        """Parse plain text file.

        The file is decoded as UTF-8 with undecodable bytes dropped, then
        split on blank lines into stripped, non-empty paragraphs.

        Args:
            file_path: Path of the text file.

        Returns:
            Paragraph chunks, or a single error-message chunk on failure.
        """
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
            chunks = self._split_paragraphs(text)
        except Exception as e:
            print(f"Error parsing text file {file_path}: {e}")
            chunks = [f"Error parsing text file: {str(e)}"]

        return chunks

    def parse_docx(self, file_path: str) -> List[str]:
        """Parse DOCX using python-docx.

        Only the entries of ``doc.paragraphs`` are read; each non-blank
        paragraph becomes one stripped chunk.

        Args:
            file_path: Path of the DOCX file.

        Returns:
            Paragraph chunks, or a single error-message chunk on failure.
        """
        try:
            doc = docx.Document(file_path)
            texts = (para.text.strip() for para in doc.paragraphs)
            chunks = [text for text in texts if text]
        except Exception as e:
            print(f"Error parsing DOCX {file_path}: {e}")
            chunks = [f"Error parsing DOCX: {str(e)}"]

        return chunks

    def parse_tabular(self, file_path: str) -> List[str]:
        """Parse CSV or Excel files using pandas.

        ``.csv`` files go through ``pd.read_csv``; every other extension goes
        through ``pd.read_excel``. The result holds, in order:

        1. a summary with the row count, column count and column names,
        2. a "Column details" chunk listing each column's dtype and up to
           three distinct non-null sample values,
        3. one ``"col: value | col: value"`` chunk per row for the first
           50 rows.

        Args:
            file_path: Path of the CSV or Excel file.

        Returns:
            The chunks described above, or a single error-message chunk on
            failure.
        """
        chunks: List[str] = []
        try:
            extension = os.path.splitext(file_path)[1].lower()
            if extension == '.csv':
                df = pd.read_csv(file_path)
            else:
                df = pd.read_excel(file_path)

            # Column labels are not guaranteed to be strings (for example
            # numeric header cells), so convert them before joining.
            column_names = ', '.join(str(name) for name in df.columns)
            chunks.append(
                f"Table with {len(df)} rows and {len(df.columns)} columns. "
                f"Columns: {column_names}"
            )

            detail_lines = []
            # Iterating items() yields each column as a Series, which also
            # works when two columns happen to share a label.
            for col, series in df.items():
                sample_values = series.dropna().unique()[:3]
                sample_str = ", ".join(str(v) for v in sample_values)
                detail_lines.append(
                    f"- {col} (Type: {series.dtype}): Sample values: {sample_str}\n"
                )
            chunks.append("Column details:\n" + "".join(detail_lines))

            # Only the first 50 rows are emitted as individual chunks.
            for _, row in df.head(50).iterrows():
                chunks.append(
                    " | ".join(f"{col}: {val}" for col, val in row.items())
                )
        except Exception as e:
            print(f"Error parsing tabular file {file_path}: {e}")
            chunks = [f"Error parsing tabular file: {str(e)}"]

        return chunks