Spaces:
Sleeping
Sleeping
| import re | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
# Module-level sentence encoder: constructed once when this module is executed
# and reused by semantic_chunking() for every call, rather than per call.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors.

    Args:
        a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero
        norm the similarity is undefined; ``0.0`` is returned in that case
        instead of ``nan`` so that threshold comparisons stay well-behaved
        and NumPy does not emit a divide-by-zero RuntimeWarning.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        # At least one vector is all zeros: avoid 0/0.
        return 0.0
    return np.dot(a, b) / denom
def semantic_chunking(
    text,
    similarity_threshold=0.75,
    *,
    min_sentence_length=20,
):
    """Split text into chunks of consecutive, semantically similar sentences.

    The text is split into sentences after ``.``, ``!`` or ``?`` followed by
    whitespace. Each kept sentence is embedded with the module-level
    ``embedding_model``; a sentence joins the current chunk when its cosine
    similarity to the *immediately preceding* sentence is at least
    ``similarity_threshold``, otherwise it starts a new chunk.

    Args:
        text: Input text. Empty or ``None`` yields an empty list.
        similarity_threshold: Minimum cosine similarity between neighbouring
            sentences for them to stay in the same chunk.
        min_sentence_length: Sentences whose stripped length is not strictly
            greater than this many characters are discarded before chunking
            (default 20, matching the previous hard-coded value).

    Returns:
        A list of chunk strings, each the kept sentences of that chunk joined
        with single spaces. If zero or one sentence survives filtering, the
        list of surviving sentences is returned as-is without computing
        embeddings.
    """
    if not text:
        return []

    stripped = (part.strip() for part in re.split(r'(?<=[.!?])\s+', text))
    sentences = [s for s in stripped if len(s) > min_sentence_length]

    # Nothing to compare against: skip the (expensive) embedding step.
    if len(sentences) <= 1:
        return sentences

    embeddings = embedding_model.encode(sentences)

    chunks = []
    current_chunk = [sentences[0]]
    # Walk consecutive (previous, current) embedding pairs alongside the
    # sentence that belongs to the "current" embedding.
    for sentence, prev_vec, vec in zip(sentences[1:], embeddings, embeddings[1:]):
        if cosine_similarity(prev_vec, vec) >= similarity_threshold:
            current_chunk.append(sentence)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]

    # Flush the trailing chunk, which the loop never closes.
    chunks.append(" ".join(current_chunk))
    return chunks