Spaces:

Vanshcc
/

qa-rag-mysql

Runtime error

App Files Files Community

Vanshcc committed on Jan 13

Commit

9c46730

verified ·

1 Parent(s): de0c065

Update embeddings.py

Browse files

Files changed (1) hide show

embeddings.py +84 -39

embeddings.py CHANGED Viewed

@@ -1,39 +1,84 @@
-import re
-import numpy as np
-from sentence_transformers import SentenceTransformer
-embedding_model = SentenceTransformer(
-    "sentence-transformers/all-MiniLM-L6-v2"
-)
-# -------------------------
-# Custom cosine similarity
-# -------------------------
-def cosine_similarity(a, b):
-    dot = np.dot(a, b)
-    norm_a = np.linalg.norm(a)
-    norm_b = np.linalg.norm(b)
-    if norm_a == 0 or norm_b == 0:
-        return 0.0
-    return dot / (norm_a * norm_b)
-# -------------------------
-# Custom semantic chunking
-# -------------------------
-def semantic_chunking(text, max_sentences=3):
-    sentences = re.split(r'(?<=[.!?])\s+', text)
-    sentences = [s.strip() for s in sentences if len(s.strip()) > 30]
-    chunks = []
-    current = []
-    for sentence in sentences:
-        current.append(sentence)
-        if len(current) >= max_sentences:
-            chunks.append(" ".join(current))
-            current = []
-    if current:
-        chunks.append(" ".join(current))
-    return chunks

+import re
+import numpy as np
+from sentence_transformers import SentenceTransformer
+# -------------------------------------------------
+# Load embedding model
+# Instantiated once at module level (i.e. when this module is imported);
+# semantic_chunking() uses this shared instance to encode sentences.
+# -------------------------------------------------
+embedding_model = SentenceTransformer(
+    "sentence-transformers/all-MiniLM-L6-v2"
+)
+# -------------------------------------------------
+# Custom cosine similarity function
+# -------------------------------------------------
+def cosine_similarity(a, b):
+    """
+    Return the cosine of the angle between vectors ``a`` and ``b``.
+
+    The result is the dot product divided by the product of the two
+    Euclidean norms. If either vector has zero norm, 0.0 is returned
+    instead of dividing by zero.
+    """
+    numerator = np.dot(a, b)
+    length_a, length_b = np.linalg.norm(a), np.linalg.norm(b)
+    # A zero-length vector has no direction, so treat it as "not similar".
+    is_degenerate = length_a == 0 or length_b == 0
+    return 0.0 if is_degenerate else numerator / (length_a * length_b)
+# -------------------------------------------------
+# Custom semantic chunking function
+# -------------------------------------------------
+def semantic_chunking(
+    text,
+    similarity_threshold=0.75,
+    max_sentences=5,
+    min_sentence_length=30
+):
+    """
+    Split text into chunks of adjacent, semantically related sentences.
+
+    Steps:
+    1. Split text into sentences at whitespace that follows '.', '!' or '?'
+    2. Discard sentences shorter than min_sentence_length characters
+    3. Generate an embedding for each remaining sentence
+    4. Walk the sentences in order, comparing each one with the sentence
+       immediately before it; start a new chunk when the similarity drops
+       below the threshold or the current chunk is already full
+
+    Args:
+        text: Input text. None or an empty string yields an empty list.
+        similarity_threshold: Minimum cosine similarity to the previous
+            sentence for a sentence to stay in the current chunk.
+        max_sentences: Maximum number of sentences per chunk.
+        min_sentence_length: Minimum stripped length (in characters) for a
+            sentence to be kept.
+
+    Returns:
+        A list of strings, one per chunk; each chunk is its sentences
+        joined by single spaces. Empty when no sentence survives filtering.
+    """
+    # Nothing to chunk (also avoids a TypeError from re.split on None)
+    if not text:
+        return []
+    # 1. Sentence segmentation, dropping fragments that are too short
+    candidates = (s.strip() for s in re.split(r'(?<=[.!?])\s+', text))
+    sentences = [s for s in candidates if len(s) >= min_sentence_length]
+    if not sentences:
+        return []
+    # 2. Generate sentence embeddings (one batch call for all sentences)
+    sentence_embeddings = embedding_model.encode(sentences)
+    chunks = []
+    current_chunk = [sentences[0]]
+    # Only the immediately preceding sentence is ever compared against,
+    # so a single reference is enough; no per-chunk embedding list needed.
+    prev_embedding = sentence_embeddings[0]
+    # 3. Semantic comparison loop over sentences 2..n
+    for sentence, curr_embedding in zip(sentences[1:], sentence_embeddings[1:]):
+        similarity = cosine_similarity(prev_embedding, curr_embedding)
+        # 4. Chunk decision
+        if similarity >= similarity_threshold and len(current_chunk) < max_sentences:
+            current_chunk.append(sentence)
+        else:
+            chunks.append(" ".join(current_chunk))
+            current_chunk = [sentence]
+        prev_embedding = curr_embedding
+    # 5. Add last chunk (current_chunk always holds at least one sentence)
+    chunks.append(" ".join(current_chunk))
+    return chunks