cryogenic22 committed on
Commit
5af14f8
·
verified ·
1 Parent(s): a7d713c

Update core/embeddings.py

Browse files
Files changed (1) hide show
  1. core/embeddings.py +14 -6
core/embeddings.py CHANGED
@@ -1,5 +1,5 @@
1
- # core/embeddings.py
2
- from langchain_community.embeddings import HuggingFaceEmbeddings
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from transformers import AutoTokenizer
5
  import faiss
@@ -12,8 +12,13 @@ class DocumentEmbedder:
12
  self.model_name = model_name
13
  self.embedding_model = HuggingFaceEmbeddings(
14
  model_name=model_name,
15
- model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
16
- encode_kwargs={"normalize_embeddings": True} # For cosine similarity
 
 
 
 
 
17
  )
18
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
19
  self.text_splitter = self._initialize_splitter()
@@ -34,7 +39,7 @@ class DocumentEmbedder:
34
 
35
  return RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
36
  self.tokenizer,
37
- chunk_size=500, # Adjusted for better semantic units
38
  chunk_overlap=50,
39
  add_start_index=True,
40
  strip_whitespace=True,
@@ -50,7 +55,10 @@ class DocumentEmbedder:
50
  for doc in documents:
51
  doc_chunks = self.text_splitter.split_text(doc["content"])
52
  chunks.extend(doc_chunks)
53
- metadatas.extend([{"source": doc["source"]} for _ in doc_chunks])
 
 
 
54
 
55
  # Generate embeddings
56
  embeddings = self.embedding_model.embed_documents(chunks)
 
1
+ # src/core/embeddings.py
2
+ from langchain_huggingface import HuggingFaceEmbeddings
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from transformers import AutoTokenizer
5
  import faiss
 
12
  self.model_name = model_name
13
  self.embedding_model = HuggingFaceEmbeddings(
14
  model_name=model_name,
15
+ model_kwargs={
16
+ "device": "cuda" if torch.cuda.is_available() else "cpu"
17
+ },
18
+ encode_kwargs={
19
+ "normalize_embeddings": True, # For cosine similarity
20
+ "batch_size": 32
21
+ }
22
  )
23
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
24
  self.text_splitter = self._initialize_splitter()
 
39
 
40
  return RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
41
  self.tokenizer,
42
+ chunk_size=500,
43
  chunk_overlap=50,
44
  add_start_index=True,
45
  strip_whitespace=True,
 
55
  for doc in documents:
56
  doc_chunks = self.text_splitter.split_text(doc["content"])
57
  chunks.extend(doc_chunks)
58
+ metadatas.extend([{
59
+ "source": doc["source"],
60
+ "chunk_index": i
61
+ } for i in range(len(doc_chunks))])
62
 
63
  # Generate embeddings
64
  embeddings = self.embedding_model.embed_documents(chunks)