Spaces:
No application file
No application file
Commit ·
3c30949
1
Parent(s): a56af3d
Estudando o DB redis
Browse files
Index_Creation/1_index_creation_and_query.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Scientist.: Dr.Eddy Giusepe Chirinos Isidro
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
Similaridade do Vetor
|
| 6 |
+
=====================
|
| 7 |
+
Usamos os vetores (Embeddings) em sistemas de recomendação, pesquisa de
|
| 8 |
+
imagens e vídeos, recuperação de documentos e Q&A 🤗.
|
| 9 |
+
|
| 10 |
+
Versão: 1.0.0
|
| 11 |
+
Data: 29/07/2023
|
| 12 |
+
Autor: Dr.Eddy Giusepe
|
| 13 |
+
|
| 14 |
+
Método de execução:
|
| 15 |
+
$ python 1_index_creation_and_query.py
|
| 16 |
+
"""
|
| 17 |
+
import redis
|
| 18 |
+
from redis.commands.search.field import TagField, VectorField
|
| 19 |
+
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
|
| 20 |
+
from redis.commands.search.query import Query
|
| 21 |
+
|
| 22 |
+
# Client for the local Redis instance; every command below goes through it.
r = redis.Redis(host="localhost", port=6379)

# Name of the RediSearch vector index.
INDEX_NAME = "index"
# Only hashes whose key starts with this prefix are picked up by the index.
DOC_PREFIX = "doc:"
|
| 26 |
+
|
| 27 |
+
def create_index(vector_dimensions: int) -> None:
    """Create the RediSearch vector index if it does not already exist.

    Args:
        vector_dimensions: dimensionality of the embeddings stored in the
            ``vector`` field (1536 for OpenAI ``text-embedding-ada-002``).
    """
    try:
        # FT.INFO raises ResponseError when the index is missing.
        r.ft(INDEX_NAME).info()
        print("Index already exists!")
    # Narrowed from a bare `except:` so real failures (connection errors,
    # KeyboardInterrupt, ...) are not silently treated as "index missing".
    except redis.exceptions.ResponseError:
        # Schema: two tag fields plus one FLAT float32 cosine vector field.
        schema = (
            TagField("tag"),
            TagField("id"),
            VectorField(
                "vector",              # vector field name
                "FLAT",                # vector index type: FLAT (exact) or HNSW
                {
                    "TYPE": "FLOAT32",            # FLOAT32 or FLOAT64
                    "DIM": vector_dimensions,     # number of vector dimensions
                    "DISTANCE_METRIC": "COSINE",  # search distance metric
                },
            ),
        )

        # Index only HASH keys under DOC_PREFIX.
        definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.HASH)

        # Create the index.
        r.ft(INDEX_NAME).create_index(fields=schema, definition=definition)
|
| 51 |
+
|
| 52 |
+
# Vector dimensionality of text-embedding-ada-002 embeddings.
VECTOR_DIMENSIONS = 1536

# Sample documents to embed and index.
texts = [
    "Hoje é realmente um ótimo dia!",
    "O cachorro da porta ao lado late muito alto.",
    "Meu gato escapou e saiu antes que eu pudesse fechar a porta.",
    "Amanhã deve chover e trovejar."
]

# Uncomment to drop the index (and its documents) and rebuild from scratch.
#r.ft(INDEX_NAME).dropindex(delete_documents=True)

# Create the index if it does not exist yet.
create_index(vector_dimensions=VECTOR_DIMENSIONS)


import openai
import os
import numpy as np
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())  # read local .env file

openai.api_key = os.environ['OPENAI_API_KEY']


# Create embeddings with OpenAI text-embedding-ada-002.
# https://openai.com/blog/new-and-improved-embedding-model
response = openai.Embedding.create(input=texts, engine="text-embedding-ada-002")
# Loop variable renamed from `r` to `rec`: the original shadowed the
# module-level Redis client name inside the comprehension.
embeddings = np.array([rec["embedding"] for rec in response["data"]], dtype=np.float32)

# Write each text + embedding to Redis as a hash under the indexed prefix.
pipe = r.pipeline()
for i, embedding in enumerate(embeddings):
    pipe.hset(f"doc:{i}", mapping={
        "vector": embedding.tobytes(),  # RediSearch wants raw float32 bytes
        "content": texts[i],
        "tag": "Eddy"
    })
res = pipe.execute()

# Query text.
text = "Eu gosto muito de pets, especificamente de gatos"

# Create the query embedding.
response = openai.Embedding.create(input=[text], engine="text-embedding-ada-002")
query_embedding = np.array([rec["embedding"] for rec in response["data"]], dtype=np.float32)[0]

filter_query = "*"  # no pre-filter: rank every indexed document
# KNN query: 3 nearest neighbours; COSINE distance bound to `score`,
# smallest (most similar) first. DIALECT 2 is required for vector queries.
query = (
    Query(f"{filter_query}=>[KNN 3 @vector $vec as score]")
    .sort_by("score")
    .return_fields("content", "tag", "score")
    .paging(0, 3)
    .dialect(2)
)

query_params = {"vec": query_embedding.tobytes()}
response = r.ft(INDEX_NAME).search(query, query_params).docs
print("🤗🤗🤗")
print(response)
|
Index_Creation/query_by_index.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Scientist.: Dr.Eddy Giusepe Chirinos Isidro
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
Similaridade do Vetor
|
| 6 |
+
=====================
|
| 7 |
+
Usamos os vetores (Embeddings) em sistemas de recomendação, pesquisa de
|
| 8 |
+
imagens e vídeos, recuperação de documentos e Q&A 🤗.
|
| 9 |
+
|
| 10 |
+
Versão: 1.0.0
|
| 11 |
+
Data: 29/07/2023
|
| 12 |
+
Autor: Dr.Eddy Giusepe
|
| 13 |
+
|
| 14 |
+
Método de execução:
|
| 15 |
+
$ python query_by_index.py
|
| 16 |
+
"""
|
| 17 |
+
import redis
|
| 18 |
+
from redis.commands.search.query import Query # Para fazer a pesquisa dos K vizinhos mais próximos
|
| 19 |
+
import numpy as np
|
| 20 |
+
import openai
|
| 21 |
+
import os
|
| 22 |
+
from dotenv import load_dotenv, find_dotenv
|
| 23 |
+
# Connect to the local Redis server first ...
redis_host = "localhost"
redis_port = 6379
r = redis.Redis(host=redis_host, port=redis_port)

# ... then load the OpenAI credentials from the local .env file.
_ = load_dotenv(find_dotenv())
openai.api_key = os.environ['OPENAI_API_KEY']
|
| 30 |
+
|
| 31 |
+
# Question text.
text = "Qual é o horário de atendimento no codhab?"

# Embed the question; RediSearch expects the vector as raw float32 bytes.
response = openai.Embedding.create(input=text, engine="text-embedding-ada-002")
embedding_text = response['data'][0]['embedding']
array_embedding = np.array(embedding_text, dtype=np.float32)
query_embedding = array_embedding.tobytes()

# Indexes to consult, in priority order.
index_list = ["secretaria_de_saude_do_df", "detran", "codhab"]

# BUG FIX: the previous version built a syntactically invalid RediSearch
# filter string and passed a Query object through
# execute_command("FT.SEARCH", ...), which stringified it incorrectly and
# never bound $vec. A KNN query must go through r.ft(<index>).search()
# with DIALECT 2 and a params dict.
query = (
    Query("*=>[KNN 3 @vector $vec as score]")
    .sort_by("score")
    .paging(0, 3)
    .return_fields("score")
    .dialect(2)
)
query_params = {"vec": query_embedding}

# Try each index in priority order; keep the first non-empty result set.
result = None
for index_name in index_list:
    try:
        docs = r.ft(index_name).search(query, query_params).docs
    except redis.exceptions.ResponseError:
        # Index does not exist on this server — fall through to the next one.
        continue
    if docs:
        result = docs
        break

print(result)
|
| 62 |
+
|
| 63 |
+
# Inspect one stored document hash directly by key.
doc_hash = r.hgetall("doc:codhab:86ce630aa248409191e01b33a5f9fedf")
print(doc_hash.keys())
print(doc_hash[b'content_vector'])  # raw embedding bytes — no utf-8 decode
metadata_raw = doc_hash[b'metadata'].decode('utf-8')
print(metadata_raw)
print(doc_hash[b'content'].decode('utf-8'))

import json
metadata = json.loads(metadata_raw)

# Print the three metadata fields stored with the document.
for field in ("Pergunta", "Resposta", "Fonte"):
    print(metadata[field])


# Count how many documents exist under this prefix.
keys = r.keys("doc:codhab:*")
print(len(keys))
|
| 80 |
+
# if len(result["data"]) == 0:
|
| 81 |
+
# # Se não houver resultados nos índices, faça uma busca em todos eles usando "*"
|
| 82 |
+
# query = (
|
| 83 |
+
# Query(f"* =>[KNN 3 @vector $vec as score]")
|
| 84 |
+
# .sort_by("@score")
|
| 85 |
+
# .paging(0, 3)
|
| 86 |
+
# .return_fields("content", "tag", "score")
|
| 87 |
+
# .dialect(2)
|
| 88 |
+
# )
|
| 89 |
+
# result = query.execute(set={"vec": query_embedding.tolist()})
|
| 90 |
+
|
| 91 |
+
# # Resultados
|
| 92 |
+
# print(result["data"])
|