Spaces:

Arif-Badhon
/

Generative_AI_Project

Sleeping

Generative_AI_Project / app /services /document_processor.py

Arif

Demo data added for query test

704b133 3 months ago

2.15 kB

	from typing import List, Dict
	from pathlib import Path
	import pypdf
	from docx import Document
	from langchain_text_splitters import RecursiveCharacterTextSplitter

	class DocumentProcessor:
	    """Load PDF, DOCX or TXT files and split their text into chunks.

	    Each chunk is returned together with metadata naming the source file
	    and the chunk's position within that file.
	    """

	    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
	        """Create the text splitter used by ``process_document``.

	        Args:
	            chunk_size: Forwarded to the splitter as ``chunk_size``.
	            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
	        """
	        self.text_splitter = RecursiveCharacterTextSplitter(
	            chunk_size=chunk_size,
	            chunk_overlap=chunk_overlap,
	            # Listed from the coarsest boundary (blank line) down to the
	            # finest (empty string).
	            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
	        )

	    def load_pdf(self, file_path: str) -> str:
	        """Return the text of every page of a PDF, one page per line group.

	        Pages are joined with a newline so that the last word of one page
	        is not fused with the first word of the next, mirroring how
	        ``load_docx`` joins paragraphs.

	        Args:
	            file_path: Path of the PDF file to read.

	        Returns:
	            The extracted text of all pages joined with ``"\\n"``.
	        """
	        with open(file_path, "rb") as file:
	            reader = pypdf.PdfReader(file)
	            # `or ""` keeps the join safe should a page yield no text
	            # (e.g. an image-only page) -- defensive, costs nothing.
	            return "\n".join(
	                page.extract_text() or "" for page in reader.pages
	            )

	    def load_docx(self, file_path: str) -> str:
	        """Return the text of every paragraph of a DOCX, newline-joined.

	        Args:
	            file_path: Path of the DOCX file to read.
	        """
	        doc = Document(file_path)
	        return "\n".join(paragraph.text for paragraph in doc.paragraphs)

	    def load_txt(self, file_path: str) -> str:
	        """Return the full contents of a UTF-8 encoded text file.

	        Args:
	            file_path: Path of the text file to read.
	        """
	        with open(file_path, "r", encoding="utf-8") as file:
	            return file.read()

	    def process_document(self, file_path: str) -> List[Dict]:
	        """Load a document, split it into chunks and attach metadata.

	        The loader is chosen from the file extension, compared
	        case-insensitively so that e.g. ``REPORT.PDF`` is accepted.

	        Args:
	            file_path: Path of a ``.pdf``, ``.docx`` or ``.txt`` file.

	        Returns:
	            One dict per chunk with the keys ``"text"`` (the chunk) and
	            ``"metadata"`` (``"source"``: file name, ``"chunk_index"``:
	            zero-based position, ``"total_chunks"``: number of chunks).

	        Raises:
	            ValueError: If the file extension is not supported.
	        """
	        path = Path(file_path)

	        # Dispatch on the lower-cased extension.
	        loaders = {
	            ".pdf": self.load_pdf,
	            ".docx": self.load_docx,
	            ".txt": self.load_txt,
	        }
	        loader = loaders.get(path.suffix.lower())
	        if loader is None:
	            # Report the suffix exactly as the caller supplied it.
	            raise ValueError(f"Unsupported file type: {path.suffix}")

	        chunks = self.text_splitter.split_text(loader(file_path))
	        total_chunks = len(chunks)

	        return [
	            {
	                "text": chunk,
	                "metadata": {
	                    "source": path.name,
	                    "chunk_index": idx,
	                    "total_chunks": total_chunks,
	                },
	            }
	            for idx, chunk in enumerate(chunks)
	        ]