# MANN-Engram-Showcase / mann_engram_en / document_parser.py
# (repository page residue: "wuff-mann's picture", "Upload 12 files", "927b6c2 verified")
import os
import re
import json
from typing import List
import pandas as pd
import PyPDF2
import docx
class DocumentParser:
    """Read supported document types and split their text into word-window chunks.

    Both methods are static; the class acts purely as a namespace.
    """

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 400, overlap: int = 50) -> List[str]:
        """Split *text* into overlapping windows of whitespace-separated words.

        Args:
            text: The text to split.
            chunk_size: Maximum number of words per chunk; must be positive.
            overlap: Number of words shared by consecutive chunks; must satisfy
                ``0 <= overlap < chunk_size`` so the window always advances.

        Returns:
            ``[]`` when *text* contains no words; ``[text]`` (unmodified) when
            it fits in a single chunk; otherwise the list of windows, each with
            its words joined by single spaces.

        Raises:
            ValueError: If *chunk_size* or *overlap* is out of range.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= overlap < chunk_size:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
            )

        words = text.split()
        if not words:
            # Nothing to index: avoid handing callers an empty-string chunk.
            return []
        if len(words) <= chunk_size:
            return [text]

        step = chunk_size - overlap
        chunks = []
        for start in range(0, len(words), step):
            chunks.append(" ".join(words[start:start + chunk_size]))
            # This window already reached the last word; any further window
            # would only repeat a tail of it.
            if start + chunk_size >= len(words):
                break
        return chunks

    @staticmethod
    def parse_file(file_path: str) -> List[str]:
        """Read a file based on its extension and return its text as chunks.

        Supported formats: txt, md, pdf, docx, xlsx, xls, csv, json.

        - txt/md are read as UTF-8 text.
        - pdf: the extracted text of every page that yields any text, one page
          per line group.
        - docx: the text of each paragraph, one per line.
        - xlsx/xls/csv: loaded with pandas defaults; every row becomes one line
          of its cell values joined by ``' | '`` (column headers are not
          included).
        - json: parsed and re-serialised with 2-space indentation.

        Runs of newlines are collapsed to a single newline before chunking with
        the default ``chunk_text`` parameters.

        Args:
            file_path: Path of the file to parse.

        Returns:
            The list of text chunks, or ``[]`` when the extension is
            unsupported, the file has no text, or parsing fails. Failures are
            reported on stdout rather than raised.
        """
        ext = os.path.splitext(file_path)[1].lower()
        extracted_text = ""
        try:
            if ext in ('.txt', '.md'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    extracted_text = f.read()
            elif ext == '.pdf':
                with open(file_path, 'rb') as f:
                    reader = PyPDF2.PdfReader(f)
                    # Extract each page once; extraction is the expensive step.
                    page_texts = (page.extract_text() for page in reader.pages)
                    extracted_text = "\n".join(t for t in page_texts if t)
            elif ext == '.docx':
                doc = docx.Document(file_path)
                extracted_text = "\n".join(para.text for para in doc.paragraphs)
            elif ext in ('.xlsx', '.xls', '.csv'):
                df = pd.read_csv(file_path) if ext == '.csv' else pd.read_excel(file_path)
                # One line per row; iterating plain tuples avoids a per-row
                # Python callback and is well-defined for frames with no rows.
                rows = df.astype(str).itertuples(index=False, name=None)
                extracted_text = "\n".join(' | '.join(row) for row in rows)
            elif ext == '.json':
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                extracted_text = json.dumps(data, indent=2, ensure_ascii=False)
            else:
                print(f"[MANN-Engram Warning] Unsupported file format: {ext}")
                return []

            # Clean and apply sliding window chunking
            extracted_text = re.sub(r'\n+', '\n', extracted_text).strip()
            return DocumentParser.chunk_text(extracted_text)
        except Exception as e:
            # Deliberate best-effort boundary: one unreadable file must not
            # abort the caller, so report it and return no chunks.
            print(f"[MANN-Engram Error] Failed to parse {file_path}: {str(e)}")
            return []