qa-rag-fastapi / embeddings.py
vansh27's picture
Deploy Fastapi RAG System
a86c572
raw
history blame contribute delete
965 Bytes
import re
import numpy as np
from sentence_transformers import SentenceTransformer
# Shared sentence-embedding model, constructed once when this module is
# imported and reused by every call that needs sentence vectors.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
def cosine_similarity(a, b):
    """Return the cosine similarity between vectors *a* and *b*.

    The result is ``dot(a, b) / (||a|| * ||b||)``.

    If either vector has zero norm the similarity is undefined; in that
    case 0.0 is returned instead of letting the division produce ``nan``
    (and a NumPy runtime warning), so that threshold comparisons made by
    callers behave predictably.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero-length vector has no direction: treat it as "not similar".
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator
def semantic_chunking(text, similarity_threshold=0.75):
    """Group the sentences of *text* into chunks of similar neighbours.

    The text is split on whitespace that follows ``.``, ``!`` or ``?``.
    Each piece is stripped, and pieces of 20 characters or fewer are
    discarded. Every remaining sentence is embedded with the module-level
    ``embedding_model`` and compared with the sentence just before it: if
    their cosine similarity is at least *similarity_threshold* it joins
    the current chunk, otherwise a new chunk is started.

    Returns a list of strings, each chunk being its sentences joined by a
    single space. When zero or one sentence survives the length filter,
    that list of sentences is returned directly without any embedding.
    """
    pieces = (part.strip() for part in re.split(r'(?<=[.!?])\s+', text))
    sentences = [piece for piece in pieces if len(piece) > 20]

    # Nothing to compare: skip the embedding model entirely.
    if len(sentences) <= 1:
        return sentences

    embeddings = embedding_model.encode(sentences)

    chunks = []
    group = [sentences[0]]
    # Walk adjacent (previous, current) embedding pairs together with the
    # sentence that belongs to the "current" embedding.
    for previous_vec, current_vec, sentence in zip(
        embeddings, embeddings[1:], sentences[1:]
    ):
        if cosine_similarity(previous_vec, current_vec) < similarity_threshold:
            # Similarity dropped below the threshold: close the running chunk.
            chunks.append(" ".join(group))
            group = [sentence]
        else:
            group.append(sentence)

    # Flush the final, still-open chunk.
    chunks.append(" ".join(group))
    return chunks