# contextIQ/scripts/text_processor.py
# satheeshbhukya
# first commit
# bd91918
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Tuple
from langchain_text_splitters import RecursiveCharacterTextSplitter
class TextProcessor:
    """Split raw text into overlapping chunks and embed each chunk.

    Chunking is delegated to ``RecursiveCharacterTextSplitter`` and embedding
    to a ``SentenceTransformer`` model that is loaded once, at construction.
    """

    def __init__(
        self,
        chunk_size: int = 500,
        chunk_overlap: int = 100,
        model_name: str = 'all-MiniLM-L6-v2',
    ):
        """Store the chunking settings and load the embedding model.

        Args:
            chunk_size: Target maximum chunk length handed to the splitter,
                measured with ``len`` (i.e. in characters).
            chunk_overlap: Overlap between consecutive chunks handed to the
                splitter, in the same unit as ``chunk_size``.
            model_name: Name or path passed to ``SentenceTransformer``.
                Defaults to the model this class originally hard-coded.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Loaded eagerly so every later call reuses the same model instance.
        self.embedding_model = SentenceTransformer(model_name)

    def chunk_text(self, text: str) -> List[str]:
        """Split *text* into chunks using the configured size and overlap.

        A new splitter is built on every call, so changes made to
        ``self.chunk_size`` / ``self.chunk_overlap`` after construction are
        picked up.

        Args:
            text: The text to split.

        Returns:
            The chunks produced by the splitter.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            # Coarsest separator first: blank line, newline, space, then "".
            separators=["\n\n", "\n", " ", ""],
        )
        return splitter.split_text(text)

    def get_embeddings(
        self, texts: List[str], *, show_progress_bar: bool = True
    ) -> np.ndarray:
        """Embed *texts* and return the vectors as a float32 array.

        Args:
            texts: The strings to embed.
            show_progress_bar: Forwarded to the model's ``encode`` call.

        Returns:
            The model's output cast to ``np.float32``. Normalized embeddings
            are requested via ``normalize_embeddings=True``. For empty input
            an empty ``(0, dim)`` array is returned, where ``dim`` is the
            dimension reported by the model (0 if it reports none).
        """
        if not texts:
            # Skip the model entirely: with nothing to encode the result
            # would not be a 2-D matrix, which breaks callers that stack or
            # index the embeddings. Return an explicit 2-D empty array.
            dim = self.embedding_model.get_sentence_embedding_dimension() or 0
            return np.empty((0, dim), dtype=np.float32)

        embeddings = self.embedding_model.encode(
            texts,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=show_progress_bar,
        )
        return embeddings.astype(np.float32)

    def process_text(self, text: str) -> Tuple[List[str], np.ndarray]:
        """Chunk *text* and embed every chunk.

        Args:
            text: The text to process.

        Returns:
            A ``(chunks, embeddings)`` tuple: the output of ``chunk_text``
            and the result of ``get_embeddings`` on those chunks.
        """
        chunks = self.chunk_text(text)
        embeddings = self.get_embeddings(chunks)
        return chunks, embeddings