File size: 5,217 Bytes
cbdf795
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env python
import json
import logging
import os
import sys

import psycopg2
import s3fs
import torch
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import (ServiceContext, SimpleDirectoryReader, StorageContext,
                         SummaryIndex, get_response_synthesizer,
                         set_global_service_context)
from llama_index.indices.document_summary import DocumentSummaryIndex
from llama_index.indices.vector_store import VectorStoreIndex
from llama_index.llms import OpenAI
from llama_index.schema import IndexNode
from llama_index.vector_stores import PGVectorStore
from sqlalchemy import make_url


# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
def get_embed_model():
    """Load the ``thenlper/gte-small`` embedding model on the best device.

    The device defaults to ``cpu``, is switched to ``cuda`` when torch reports
    CUDA as available, and to ``mps`` when torch reports MPS as available
    (MPS is checked last, so it wins if both are reported).

    Returns:
        The ``HuggingFaceEmbeddings`` instance, configured to normalize the
        embeddings it produces.

    Raises:
        SystemExit: with a non-zero status if the model cannot be
            constructed. There is no fallback model.
    """
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'
    # ``torch.backends.mps`` is absent on some torch builds; treat a missing
    # backend as "not available" rather than raising AttributeError.
    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend is not None and mps_backend.is_available():
        device = 'mps'

    model_kwargs = {'device': device}
    encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity
    print("Loading model...")
    try:
        model_norm = HuggingFaceEmbeddings(
            model_name="thenlper/gte-small",
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
        )
    except Exception as exception:
        # Nothing can proceed without the embedding model. sys.exit with a
        # string writes it to stderr and exits with status 1, so callers and
        # shells can tell the run failed (the bare exit() reported success).
        sys.exit(f"Failed to load embedding model: {exception}")
    print("Model loaded.")
    return model_norm

def create_table(db_name, connection_string):
  """Drop and recreate the PostgreSQL database named ``db_name``.

  Despite the function name, this acts on a whole database, not a table:
  an existing database called ``db_name`` is dropped together with its
  contents and an empty one is created in its place.

  Args:
    db_name: Name of the database to recreate. It is quoted as an SQL
      identifier, so it is matched exactly (case-sensitively).
    connection_string: psycopg2 connection string used to issue the
      statements.

  Returns:
    None.
  """
  # Local import: the ``sql`` helpers ship with psycopg2, which this file
  # already depends on.
  from psycopg2 import sql

  conn = psycopg2.connect(connection_string)
  try:
    # DROP/CREATE DATABASE cannot run inside a transaction block, so the
    # connection must be in autocommit mode.
    conn.autocommit = True
    database = sql.Identifier(db_name)
    with conn.cursor() as c:
      c.execute(sql.SQL("DROP DATABASE IF EXISTS {}").format(database))
      c.execute(sql.SQL("CREATE DATABASE {}").format(database))
  finally:
    # The original never closed the connection; release it even on failure.
    conn.close()

def create_vector_store():
  """Recreate the ``helm`` database and return a PGVectorStore bound to it.

  The database is dropped and recreated through ``create_table`` on every
  call. The returned store uses the ``f150_manual`` table, 384-dimensional
  embeddings, and hybrid search with the ``english`` text search config.
  Host, port, user and password are taken from the hard-coded connection
  string.
  """
  database = "helm"
  admin_dsn = "postgresql://adrian@localhost:5432/postgres"

  # Start from an empty database before the store is attached to it.
  create_table(database, admin_dsn)

  parsed = make_url(admin_dsn)
  store_params = dict(
      database=database,
      host=parsed.host,
      password=parsed.password,
      port=parsed.port,
      user=parsed.username,
      table_name="f150_manual",
      embed_dim=384,
      hybrid_search=True,
      text_search_config="english",
  )
  return PGVectorStore.from_params(**store_params)

def get_remote_filesystem():
  """Return an ``s3fs.S3FileSystem`` used to persist indexes to S3.

  Credentials are read from the ``AWS_ACCESS_KEY_ID`` and
  ``AWS_SECRET_ACCESS_KEY`` environment variables. When either is missing,
  the filesystem is created without explicit credentials so the underlying
  AWS client can resolve them itself (shared credentials file, instance
  role, ...).

  SECURITY: an access key pair used to be hard-coded in this function. It
  has been removed from the code; because it was committed to version
  control, it must be treated as compromised and rotated.
  """
  access_key = os.environ.get("AWS_ACCESS_KEY_ID")
  secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
  if not (access_key and secret_key):
    # No complete key pair in the environment: defer to the default
    # credential resolution instead of passing a half-configured pair.
    return s3fs.S3FileSystem()
  return s3fs.S3FileSystem(key=access_key, secret=secret_key)

def create_vector_index():
  """Index everything under ``docs/chapters`` into the Postgres vector store.

  Loads the documents, creates the vector store via
  ``create_vector_store``, and builds a ``VectorStoreIndex`` on top of it.

  Returns:
    The resulting ``VectorStoreIndex``.
  """
  chapter_docs = SimpleDirectoryReader(input_dir="docs/chapters").load_data()
  pg_context = StorageContext.from_defaults(vector_store=create_vector_store())
  # NOTE(review): embedding_model, chunk_size and chunk_overlap are passed
  # through as extra keyword arguments; confirm the installed llama_index
  # version actually honours them rather than silently ignoring them.
  build_options = {
    "storage_context": pg_context,
    "embedding_model": None,
    "show_progress": True,
    "chunk_size": 1024,
    "chunk_overlap": 20,
  }
  return VectorStoreIndex.from_documents(chapter_docs, **build_options)

def _load_chapter_documents(doc_dir):
  """Load each regular, non-hidden file in ``doc_dir``, keyed by its title."""
  chapters = {}
  # Sorted so the processing (and node) order is the same on every run;
  # os.listdir makes no ordering guarantee.
  for filename in sorted(os.listdir(doc_dir)):
    path = os.path.join(doc_dir, filename)
    # Skip sub-directories and dot-files (e.g. editor/OS metadata): they are
    # not chapters and would otherwise get an empty or bogus title.
    if filename.startswith(".") or not os.path.isfile(path):
      continue
    print(filename)
    # Title is everything before the FIRST dot; it is also used in the S3
    # key below, so this rule is kept as-is.
    title = filename.split(".")[0]
    docs = SimpleDirectoryReader(input_files=[path]).load_data()
    if not docs:
      continue
    # Only the first loaded document carries the title as its id.
    # NOTE(review): if a file yields several documents, the summary looked
    # up by title later presumably covers only this first one — confirm.
    docs[0].doc_id = title
    chapters[title] = docs
  return chapters


def create_recursive_index():
  """Build per-chapter indexes plus a top-level index of chapter summaries.

  For every chapter file in ``./docs/chapters/``:
    * a ``VectorStoreIndex`` is built and persisted to S3 under
      ``f150-user-manual/recursive-agent/<title>/vector_index``;
    * a ``DocumentSummaryIndex`` is built and its summary for the chapter
      is wrapped in an ``IndexNode`` whose ``index_id`` is the title.

  The collected summary nodes are then indexed in one more
  ``VectorStoreIndex`` persisted under
  ``f150-user-manual/recursive-agent/vector_index``.

  The summary indexes themselves are not persisted by this function.

  Returns:
    None.
  """
  doc_dir = "./docs/chapters/"
  chapters = _load_chapter_documents(doc_dir)

  context_window = 4096
  embed_model = get_embed_model()
  chatgpt = OpenAI(temperature=0, model="gpt-3.5-turbo-16k")
  service_context = ServiceContext.from_defaults(
    llm=chatgpt,
    embed_model=embed_model,
    chunk_size=1024,
    context_window=context_window)

  s3 = get_remote_filesystem()
  nodes = []
  for title, docs in chapters.items():
    print(title)
    # build vector index
    storage_context = StorageContext.from_defaults()
    vector_index = VectorStoreIndex.from_documents(
      docs,
      service_context=service_context,
      verbose=True,
      storage_context=storage_context,
      show_progress=True,
    )
    vector_index.storage_context.persist(f"f150-user-manual/recursive-agent/{title}/vector_index", fs=s3)
    # build summary index
    response_synthesizer = get_response_synthesizer(
      response_mode="compact_accumulate", use_async=False
    )
    storage_context = StorageContext.from_defaults()
    summary_index = DocumentSummaryIndex.from_documents(
      docs,
      service_context=service_context,
      response_synthesizer=response_synthesizer,
      verbose=True,
      storage_context=storage_context,
      show_progress=True,
    )
    # Fetch the summary once and reuse it for both the log line and the node.
    summary = summary_index.get_document_summary(title)
    print(summary)
    nodes.append(IndexNode(text=summary, index_id=title))

  storage_context = StorageContext.from_defaults()
  vector_index = VectorStoreIndex(
    nodes,
    service_context=service_context,
    verbose=True,
    storage_context=storage_context,
    show_progress=True,)
  vector_index.storage_context.persist("f150-user-manual/recursive-agent/vector_index", fs=s3)

def main():
  """Install the global service context, then build both index flavours.

  The embedding model is loaded and registered as the global llama_index
  service context before the Postgres-backed vector index and the
  S3-backed recursive index are created, in that order.
  """
  set_global_service_context(
    ServiceContext.from_defaults(embed_model=get_embed_model())
  )
  create_vector_index()
  create_recursive_index()

# Run the index build only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
  main()