cryogenic22 commited on
Commit
8665f8c
·
verified ·
1 Parent(s): f9867d9

Create core/embeddings.py

Browse files
Files changed (1) hide show
  1. core/embeddings.py +58 -0
core/embeddings.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/embeddings.py
2
+ from langchain_community.embeddings import HuggingFaceEmbeddings
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from transformers import AutoTokenizer
5
+ import faiss
6
+ import numpy as np
7
+ from typing import List, Dict
8
+ import torch
9
+
10
class DocumentEmbedder:
    """Split documents into token-bounded chunks and embed every chunk.

    The same Hugging Face model name is used to load both the embedding
    model and the tokenizer that measures chunk length for the splitter.
    """

    def __init__(self, model_name: str = "thenlper/gte-small"):
        """Load the embedding model and tokenizer, then build the splitter.

        Args:
            model_name: Hugging Face model identifier passed to both
                ``HuggingFaceEmbeddings`` and ``AutoTokenizer``.
        """
        self.model_name = model_name
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=model_name,
            # Run on the GPU when one is available, otherwise on the CPU.
            model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
            encode_kwargs={"normalize_embeddings": True},  # For cosine similarity
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # The splitter needs self.tokenizer, so it must be built last.
        self.text_splitter = self._initialize_splitter()

    def _initialize_splitter(self) -> "RecursiveCharacterTextSplitter":
        """Build the text splitter used by ``process_documents``.

        Chunk length is measured with ``self.tokenizer``; chunks are capped
        at 500 units with an overlap of 50.

        Returns:
            A ``RecursiveCharacterTextSplitter`` configured with
            markdown-oriented separators, tried in the order listed.
        """
        # Using markdown-optimized separators, from coarsest to finest.
        # Several of these are regular expressions (heading markers and
        # horizontal rules), so the splitter must be told to treat them as
        # patterns rather than literal strings.
        MARKDOWN_SEPARATORS = [
            "\n#{1,6} ",
            "```\n",
            "\n\\*\\*\\*+\n",
            "\n---+\n",
            "\n___+\n",
            "\n\n",
            "\n",
            " ",
            "",
        ]

        return RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
            self.tokenizer,
            chunk_size=500,  # Adjusted for better semantic units
            chunk_overlap=50,
            # NOTE(review): start indices are, per the LangChain docs, only
            # recorded when producing Document objects; process_documents
            # uses split_text, which returns plain strings - confirm whether
            # this flag is still wanted.
            add_start_index=True,
            strip_whitespace=True,
            separators=MARKDOWN_SEPARATORS,
            # Without this flag the patterns above are escaped and matched
            # literally, so heading/rule boundaries would never be used.
            is_separator_regex=True,
        )

    def process_documents(self, documents: List[Dict]) -> tuple:
        """Split documents into chunks and embed each chunk.

        Args:
            documents: Dicts that each provide a ``"content"`` string to
                split and a ``"source"`` value copied into the metadata of
                every chunk produced from that document.

        Returns:
            A ``(chunks, embeddings, metadatas)`` tuple of three lists that
            are index-aligned: ``embeddings[i]`` and ``metadatas[i]`` belong
            to ``chunks[i]``. All three are empty when no chunks are
            produced.

        Raises:
            KeyError: If a document has no ``"content"`` key, or produces
                chunks but has no ``"source"`` key.
        """
        chunks = []
        metadatas = []

        for doc in documents:
            doc_chunks = self.text_splitter.split_text(doc["content"])
            chunks.extend(doc_chunks)
            # One independent metadata dict per chunk keeps the lists aligned.
            metadatas.extend({"source": doc["source"]} for _ in doc_chunks)

        # Nothing to embed: skip the model call entirely instead of handing
        # the embedding backend an empty batch.
        if not chunks:
            return chunks, [], metadatas

        embeddings = self.embedding_model.embed_documents(chunks)

        return chunks, embeddings, metadatas