Spaces:

AnuragShirke
/

knowledge-assistant-backend

Sleeping

fix: Add langchain-text-splitters and update import

df842c7 3 months ago

1.65 kB

	import fitz # PyMuPDF
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from sentence_transformers import SentenceTransformer
	import docx # Added for docx parsing

	def parse_pdf(file_path: str) -> str:
	    """Extract the plain text of every page in a PDF file.

	    Args:
	        file_path: Path of the PDF to open with PyMuPDF (``fitz``).

	    Returns:
	        The text of all pages concatenated in page order, with no
	        separator inserted between pages.
	    """
	    # The context manager closes the document even if text extraction
	    # raises part-way through; a trailing close() call would be skipped.
	    with fitz.open(file_path) as doc:
	        # join() assembles the result in one pass instead of repeated +=.
	        return "".join(page.get_text() for page in doc)

	def parse_txt(file_path: str) -> str:
	    """Read a UTF-8 encoded text file and return its whole contents."""
	    with open(file_path, mode="r", encoding="utf-8") as handle:
	        return handle.read()

	def parse_docx(file_path: str) -> str:
	    """Return the paragraph text of a DOCX file, one paragraph per line.

	    Only ``document.paragraphs`` is read; the paragraph texts are joined
	    with a newline character.
	    """
	    paragraphs = docx.Document(file_path).paragraphs
	    return '\n'.join(para.text for para in paragraphs)

	def parse_document(file_path: str, file_extension: str) -> str:
	    """Dispatch to the parser that matches the file extension.

	    Args:
	        file_path: Path of the document to parse.
	        file_extension: Extension including the leading dot, e.g. ".pdf".
	            Matching is case-insensitive, so ".PDF" and ".pdf" are
	            treated the same.

	    Returns:
	        The text extracted by the selected parser.

	    Raises:
	        ValueError: If the extension is not ".pdf", ".txt" or ".docx".
	    """
	    # Extensions taken from real file names keep their original case
	    # (e.g. "REPORT.PDF"), so normalize before comparing.
	    normalized = file_extension.lower()
	    if normalized == ".pdf":
	        return parse_pdf(file_path)
	    if normalized == ".txt":
	        return parse_txt(file_path)
	    if normalized == ".docx":
	        return parse_docx(file_path)
	    # Report the extension exactly as the caller supplied it.
	    raise ValueError(f"Unsupported file type: {file_extension}")

	def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[str]:
	    """Split text into overlapping chunks with a recursive character splitter.

	    Args:
	        text: The text to split.
	        chunk_size: Maximum chunk length, measured with ``len`` (characters).
	            Defaults to 1000.
	        chunk_overlap: Length of the overlap between consecutive chunks,
	            measured the same way. Defaults to 200.

	    Returns:
	        The list of chunks produced by ``RecursiveCharacterTextSplitter``.
	    """
	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	        length_function=len,
	    )
	    return splitter.split_text(text)

	def get_embedding_model(model_name: str = 'all-MiniLM-L6-v2'):
	    """Construct and return a SentenceTransformer for ``model_name``.

	    A new ``SentenceTransformer`` instance is created on every call.
	    """
	    embedding_model = SentenceTransformer(model_name)
	    return embedding_model