Vanshcc committed on
Commit
9c46730
·
verified ·
1 Parent(s): de0c065

Update embeddings.py

Browse files
Files changed (1) hide show
  1. embeddings.py +84 -39
embeddings.py CHANGED
@@ -1,39 +1,84 @@
1
- import re
2
- import numpy as np
3
- from sentence_transformers import SentenceTransformer
4
-
5
- embedding_model = SentenceTransformer(
6
- "sentence-transformers/all-MiniLM-L6-v2"
7
- )
8
-
9
- # -------------------------
10
- # Custom cosine similarity
11
- # -------------------------
12
- def cosine_similarity(a, b):
13
- dot = np.dot(a, b)
14
- norm_a = np.linalg.norm(a)
15
- norm_b = np.linalg.norm(b)
16
- if norm_a == 0 or norm_b == 0:
17
- return 0.0
18
- return dot / (norm_a * norm_b)
19
-
20
- # -------------------------
21
- # Custom semantic chunking
22
- # -------------------------
23
- def semantic_chunking(text, max_sentences=3):
24
- sentences = re.split(r'(?<=[.!?])\s+', text)
25
- sentences = [s.strip() for s in sentences if len(s.strip()) > 30]
26
-
27
- chunks = []
28
- current = []
29
-
30
- for sentence in sentences:
31
- current.append(sentence)
32
- if len(current) >= max_sentences:
33
- chunks.append(" ".join(current))
34
- current = []
35
-
36
- if current:
37
- chunks.append(" ".join(current))
38
-
39
- return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import numpy as np
3
+ from sentence_transformers import SentenceTransformer
4
+
5
+ # -------------------------------------------------
6
+ # Load embedding model
7
+ # -------------------------------------------------
8
+ embedding_model = SentenceTransformer(
9
+ "sentence-transformers/all-MiniLM-L6-v2"
10
+ )
11
+
12
+ # -------------------------------------------------
13
+ # Custom cosine similarity function
14
+ # -------------------------------------------------
15
def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors as a builtin float.

    Args:
        a: First vector (a 1-D NumPy array or any array-like NumPy accepts).
        b: Second vector, with the same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)`` as a Python ``float``, or ``0.0``
        when either vector has zero norm, where the similarity is undefined.
    """
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)

    # A zero-length vector has no direction; avoid dividing by zero.
    if norm_a == 0 or norm_b == 0:
        return 0.0

    # NumPy returns its own scalar type here (e.g. np.float32 for float32
    # inputs); convert so both return paths yield a builtin float.
    return float(dot_product / (norm_a * norm_b))
27
+
28
+ # -------------------------------------------------
29
+ # Custom semantic chunking function
30
+ # -------------------------------------------------
31
def semantic_chunking(
    text,
    similarity_threshold=0.75,
    max_sentences=5,
    min_sentence_length=30
):
    """Group consecutive, semantically similar sentences of ``text`` into chunks.

    Steps:
        1. Split the text into sentences at whitespace that follows
           ``.``, ``!`` or ``?``, dropping sentences shorter than
           ``min_sentence_length`` characters after stripping.
        2. Encode every remaining sentence with the module-level
           ``embedding_model``.
        3. Compare each sentence with the one immediately before it using
           ``cosine_similarity``.
        4. Start a new chunk when the similarity falls below
           ``similarity_threshold`` or the current chunk already holds
           ``max_sentences`` sentences.

    Args:
        text: Input text to chunk. ``None`` is treated as empty input.
        similarity_threshold: Minimum similarity between adjacent sentences
            for them to stay in the same chunk.
        max_sentences: Maximum number of sentences per chunk.
        min_sentence_length: Minimum stripped length for a sentence to be kept.

    Returns:
        A list of chunk strings, each made of its sentences joined by a
        single space. Empty when no sentence survives filtering.
    """
    # re.split raises TypeError on None; treat missing text as "no chunks".
    if text is None:
        return []

    # 1. Sentence segmentation
    sentences = [
        piece.strip()
        for piece in re.split(r'(?<=[.!?])\s+', text)
        if len(piece.strip()) >= min_sentence_length
    ]
    if not sentences:
        return []

    # 2. Generate sentence embeddings (one per sentence, in order)
    sentence_embeddings = embedding_model.encode(sentences)

    chunks = []
    current_chunk = [sentences[0]]
    # Only the previous sentence's embedding is ever compared against, so a
    # single variable is enough; no per-chunk embedding list is needed.
    prev_embedding = sentence_embeddings[0]

    # 3. Semantic comparison of each sentence with its predecessor
    for sentence, embedding in zip(sentences[1:], sentence_embeddings[1:]):
        similarity = cosine_similarity(prev_embedding, embedding)

        # 4. Chunk decision
        if similarity >= similarity_threshold and len(current_chunk) < max_sentences:
            current_chunk.append(sentence)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]

        prev_embedding = embedding

    # 5. Add the last chunk (always non-empty at this point)
    chunks.append(" ".join(current_chunk))

    return chunks