Spaces:

happy4040
/

contextIQ

Sleeping

satheeshbhukya

first commit

bd91918 8 days ago

1.29 kB

	from sentence_transformers import SentenceTransformer
	import numpy as np
	from typing import List, Tuple
	from langchain_text_splitters import RecursiveCharacterTextSplitter


	class TextProcessor:
	    """Split raw text into overlapping chunks and embed each chunk.

	    Chunking is delegated to ``RecursiveCharacterTextSplitter`` and
	    embedding to a ``SentenceTransformer`` model that is loaded once in
	    the constructor and reused for every call.
	    """

	    def __init__(
	        self,
	        chunk_size: int = 500,
	        chunk_overlap: int = 100,
	        model_name: str = 'all-MiniLM-L6-v2',
	    ):
	        """Store the chunking settings and load the embedding model.

	        Args:
	            chunk_size: Maximum chunk length, measured with ``len`` (characters).
	            chunk_overlap: Number of characters shared by neighbouring chunks.
	            model_name: Name or path handed to ``SentenceTransformer``. The
	                default keeps the model this class has always used.
	        """
	        self.chunk_size = chunk_size
	        self.chunk_overlap = chunk_overlap
	        self.embedding_model = SentenceTransformer(model_name)

	    def chunk_text(self, text: str) -> List[str]:
	        """Return ``text`` split into chunks using the configured size and overlap.

	        The splitter is built on every call so that later changes to
	        ``self.chunk_size`` / ``self.chunk_overlap`` take effect.
	        """
	        splitter = RecursiveCharacterTextSplitter(
	            chunk_size=self.chunk_size,
	            chunk_overlap=self.chunk_overlap,
	            length_function=len,
	            # Tried in order: paragraphs, lines, words, then single characters.
	            separators=["\n\n", "\n", " ", ""],
	        )
	        return splitter.split_text(text)

	    def get_embeddings(self, texts: List[str]) -> np.ndarray:
	        """Embed ``texts`` and return the result as a float32 NumPy array.

	        Args:
	            texts: The strings to embed.

	        Returns:
	            The array produced by the model's ``encode`` (called with
	            ``normalize_embeddings=True``), cast to ``float32``. For an
	            empty list or tuple, an empty ``(0, dim)`` float32 array.
	        """
	        if isinstance(texts, (list, tuple)) and not texts:
	            # encode() on an empty batch may return an array without an
	            # embedding axis (shape (0,)); give callers a well-formed 2-D
	            # result instead. `or 0` covers models that report no dimension.
	            dim = self.embedding_model.get_sentence_embedding_dimension() or 0
	            return np.empty((0, dim), dtype=np.float32)

	        embeddings = self.embedding_model.encode(
	            texts,
	            convert_to_numpy=True,
	            normalize_embeddings=True,
	            show_progress_bar=True,
	        )
	        return embeddings.astype(np.float32)

	    def process_text(self, text: str) -> Tuple[List[str], np.ndarray]:
	        """Chunk ``text`` and embed every chunk.

	        Returns:
	            A ``(chunks, embeddings)`` pair, where ``embeddings`` is the
	            result of ``get_embeddings(chunks)``.
	        """
	        chunks = self.chunk_text(text)
	        return chunks, self.get_embeddings(chunks)