NickNYU commited on
Commit
bd5788b
·
1 Parent(s): e448dd7
dataset/graph_store.json CHANGED
@@ -1 +1,3 @@
1
- {"graph_dict": {}}
 
 
 
1
+ {
2
+ "graph_dict": {}
3
+ }
llama/context.py CHANGED
@@ -1,16 +1,26 @@
1
- from llama_index import ServiceContext, LLMPredictor, LangchainEmbedding
2
- from type import Optional
 
 
 
3
  from core.lifecycle import Lifecycle
4
- from langchain.manager import BaseLangChainManager
 
 
 
 
 
 
 
5
 
6
 
7
- class ServiceContextManager(Lifecycle):
8
- service_context: Optional[ServiceContext]
 
9
 
10
- def __init__(self, manager: BaseLangChainManager) -> None:
11
  super().__init__()
12
- self.manager = manager
13
- self.service_context = None
14
 
15
  def get_service_context(self) -> ServiceContext:
16
  if self.lifecycle_state.is_started():
@@ -25,37 +35,75 @@ class ServiceContextManager(Lifecycle):
25
 
26
  def do_init(self) -> None:
27
  # define embedding
28
- embedding = LangchainEmbedding(self.manager.get_embedding())
29
  # define LLM
30
- llm_predictor = LLMPredictor(llm=self.manager.get_llm())
31
  # configure service context
32
  self.service_context = ServiceContext.from_defaults(
33
  llm_predictor=llm_predictor, embed_model=embedding
34
  )
35
 
36
  def do_start(self) -> None:
37
- pass
 
 
 
38
 
39
  def do_stop(self) -> None:
40
- pass
 
 
 
41
 
42
  def do_dispose(self) -> None:
 
 
 
 
 
 
 
43
  pass
44
 
45
 
46
- class StorageContextManager(Lifecycle):
47
- def __init__(self, dataset_path: Optional[str] = "./dataset") -> None:
 
 
 
 
48
  super().__init__()
49
  self.dataset_path = dataset_path
 
 
 
 
50
 
51
  def do_init(self) -> None:
52
- pass
 
 
 
 
 
53
 
54
  def do_start(self) -> None:
55
- pass
56
 
57
  def do_stop(self) -> None:
58
- pass
59
 
60
  def do_dispose(self) -> None:
61
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from abc import ABC, abstractmethod
from typing import List, Optional

from llama_index import (
    Document,
    LangchainEmbedding,
    LLMPredictor,
    ServiceContext,
    StorageContext,
)

from core.lifecycle import Lifecycle
from langchain_manager.manager import BaseLangChainManager
8
+
9
+
10
class ServiceContextManager(Lifecycle, ABC):
    """Abstract lifecycle component that supplies a llama_index ServiceContext.

    Concrete subclasses decide how the underlying LLM and embedding model
    are constructed.
    """

    @abstractmethod
    def get_service_context(self) -> ServiceContext:
        """Return the managed ServiceContext instance."""
        ...
15
 
16
 
17
+ class AzureServiceContextManager(ServiceContextManager):
18
+ lc_manager: BaseLangChainManager
19
+ service_context: ServiceContext
20
 
21
+ def __init__(self, lc_manager: BaseLangChainManager):
22
  super().__init__()
23
+ self.lc_manager = lc_manager
 
24
 
25
  def get_service_context(self) -> ServiceContext:
26
  if self.lifecycle_state.is_started():
 
35
 
36
  def do_init(self) -> None:
37
  # define embedding
38
+ embedding = LangchainEmbedding(self.lc_manager.get_embedding())
39
  # define LLM
40
+ llm_predictor = LLMPredictor(llm=self.lc_manager.get_llm())
41
  # configure service context
42
  self.service_context = ServiceContext.from_defaults(
43
  llm_predictor=llm_predictor, embed_model=embedding
44
  )
45
 
46
  def do_start(self) -> None:
47
+ self.logger.info("[do_start][embedding] last used usage: %d",
48
+ self.service_context.embed_model.total_tokens_used)
49
+ self.logger.info("[do_start][predict] last used usage: %d",
50
+ self.service_context.llm_predictor.total_tokens_used)
51
 
52
  def do_stop(self) -> None:
53
+ self.logger.info("[do_stop][embedding] last used usage: %d",
54
+ self.service_context.embed_model.total_tokens_used)
55
+ self.logger.info("[do_stop][predict] last used usage: %d",
56
+ self.service_context.llm_predictor.total_tokens_used)
57
 
58
  def do_dispose(self) -> None:
59
+ self.logger.info("[do_dispose] total used token: %d", self.service_context.llm_predictor.total_tokens_used)
60
+
61
+
62
class StorageContextManager(Lifecycle, ABC):
    """Abstract lifecycle component that supplies a llama_index StorageContext.

    Concrete subclasses decide where index data is persisted and loaded from.
    """

    @abstractmethod
    def get_storage_context(self) -> StorageContext:
        """Return the managed StorageContext instance."""
        ...
67
 
68
 
69
class LocalStorageContextManager(StorageContextManager):
    """StorageContextManager backed by the local filesystem.

    On init it loads a previously persisted llama_index StorageContext from
    ``dataset_path`` when the files are present; otherwise it downloads the
    source documents and builds (and persists) a fresh vector-store index.
    """

    # Populated in do_init(); reading it before init is an error.
    storage_context: StorageContext

    def __init__(
        self,
        dataset_path: str = "./dataset",
        # Optional[...] — the original annotated a plain type with a None default.
        service_context_manager: Optional[ServiceContextManager] = None,
    ) -> None:
        super().__init__()
        self.dataset_path = dataset_path
        self.service_context_manager = service_context_manager

    def get_storage_context(self) -> StorageContext:
        """Return the storage context built by do_init()."""
        return self.storage_context

    def do_init(self) -> None:
        """Load the persisted index if present, otherwise build a new one."""
        # Local import — presumably avoids a circular import at module load;
        # kept function-scoped as in the original.
        from llama.utils import is_local_storage_files_ready

        if is_local_storage_files_ready(self.dataset_path):
            # Reuse the previously persisted index files.
            self.storage_context = StorageContext.from_defaults(
                persist_dir=self.dataset_path
            )
        else:
            # First run: fetch the source documents and build the index
            # (_indexing also sets self.storage_context and persists it).
            docs = self._download()
            self._indexing(docs)

    def do_start(self) -> None:
        # BUG FIX: the original called
        #   self.logger.info("[do_start]%", **self.storage_context.to_dict())
        # which raises TypeError (Logger.info accepts no such kwargs) and
        # uses a dangling "%" format; log the dict as a lazy %-style arg.
        self.logger.info(
            "[do_start] storage context: %s", self.storage_context.to_dict()
        )

    def do_stop(self) -> None:
        # BUG FIX: same broken **kwargs logging call as do_start.
        self.logger.info(
            "[do_stop] storage context: %s", self.storage_context.to_dict()
        )

    def do_dispose(self) -> None:
        """Flush any in-memory index state back to disk."""
        self.storage_context.persist(self.dataset_path)

    def _download(self) -> List[Document]:
        """Fetch the wiki documents via the GitHub loader."""
        from llama.data_loader import GithubLoader

        loader = GithubLoader()
        return loader.load()

    def _indexing(self, docs: List[Document]) -> None:
        """Build a vector-store index over *docs* and persist it locally.

        NOTE(review): assumes service_context_manager was supplied (non-None)
        whenever indexing is needed — confirm against callers.
        """
        from llama_index import GPTVectorStoreIndex

        index = GPTVectorStoreIndex.from_documents(
            docs,
            service_context=self.service_context_manager.get_service_context(),
        )
        index.storage_context.persist(persist_dir=self.dataset_path)
        self.storage_context = index.storage_context
llama/data_loader.py CHANGED
@@ -16,10 +16,10 @@ class WikiLoader(ABC):
16
 
17
  class GithubLoader(WikiLoader):
18
  def __init__(
19
- self,
20
- github_owner: Optional[str] = None,
21
- repo: Optional[str] = None,
22
- dirs: Optional[Sequence[str]] = None,
23
  ):
24
  super().__init__()
25
  self.owner = (
 
16
 
17
  class GithubLoader(WikiLoader):
18
  def __init__(
19
+ self,
20
+ github_owner: Optional[str] = None,
21
+ repo: Optional[str] = None,
22
+ dirs: Optional[Sequence[str]] = None,
23
  ):
24
  super().__init__()
25
  self.owner = (