ajoy0071998 committed on
Commit
026a1e4
·
verified ·
1 Parent(s): 1188884

Delete embeddings.py

Browse files
Files changed (1) hide show
  1. embeddings.py +0 -50
embeddings.py DELETED
@@ -1,50 +0,0 @@
1
- ## Return the embedding vector for a given text
2
- ## Uses sentence-based embeddings
3
- from langchain_text_splitters import CharacterTextSplitter
4
- from langchain_huggingface import HuggingFaceEmbeddings
5
- from langchain_core.documents import Document
6
- from langchain_chroma import Chroma
7
-
8
# Sentence-embedding model configuration (CPU, L2-normalized vectors).
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

# Shared HuggingFace embedding function used by the Chroma store below.
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Persistent Chroma vector store backing set_embedding / get_chunks.
vector_store = Chroma(
    collection_name="collection",
    embedding_function=hf,
    persist_directory="chroma_langchain_db",
)
21
-
22
-
23
def set_embedding(text: str, doc_id: str, user_id: str) -> None:
    """Split *text* into token-sized chunks and index them in the vector store.

    Each chunk becomes a Document tagged with ``doc_id``/``user_id`` metadata
    and a deterministic id of the form ``<user_id><doc_id><index>``.

    Args:
        text: Raw document text to embed.
        doc_id: Identifier of the source document.
        user_id: Owner of the document; used later to scope searches.
    """
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=300, chunk_overlap=40
    )
    texts = text_splitter.split_text(text)  # list of chunk strings

    # Build all Document objects locally (the original stored each one in
    # globals(), which pollutes the module namespace and leaks memory).
    documents = []
    for i, chunk in enumerate(texts):
        vector_id = f"{user_id}{doc_id}{i}"
        documents.append(
            Document(
                page_content=chunk,
                metadata={"doc_id": doc_id, "user_id": user_id},
                id=vector_id,
            )
        )

    # Guard against empty input, then add in a single batched call instead
    # of one round-trip per chunk.
    if documents:
        vector_store.add_documents(documents)
        for i, doc in enumerate(documents):
            print(f"Added document {i} with id {doc.id}")
40
-
41
def get_chunks(query: str, user_id: str, doc_id: str) -> list:
    """Return the page contents of the top-5 chunks most similar to *query*.

    Args:
        query: Free-text search query.
        user_id: Restrict results to this user's chunks.
        doc_id: Restrict results to this document's chunks.

    Returns:
        A list of chunk strings (at most 5), most similar first.
    """
    # Fix: the original accepted doc_id but never used it, so chunks from
    # every document owned by the user leaked into the results. Chroma
    # requires $and to combine multiple metadata conditions.
    results = vector_store.similarity_search(
        query,
        k=5,
        filter={"$and": [{"user_id": user_id}, {"doc_id": doc_id}]},
    )
    return [res.page_content for res in results]