File size: 2,814 Bytes
6628fd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97

from llama_index import StorageContext
from typing import List
from abc import abstractmethod, ABC

from llama_index import Document

from core.lifecycle import Lifecycle
from llama.data_loader import DirectoryLoader
from llama.service_context import ServiceContextManager


class StorageContextManager(Lifecycle, ABC):
    """Abstract base for components that own a llama_index ``StorageContext``.

    The ``do_*`` methods are presumably the hooks invoked by ``Lifecycle``
    (defined in ``core.lifecycle``; confirm there). Concrete subclasses supply
    the five abstract hooks declared at the bottom of this class.
    """

    # Assigned by ``do_init`` (through the ``storage_context`` setter); there is
    # no default, so reading it before initialisation raises AttributeError.
    _storage_context: StorageContext

    @property
    def storage_context(self) -> StorageContext:
        """The storage context currently held by this manager."""
        return self._storage_context

    @storage_context.setter
    def storage_context(self, value: StorageContext) -> None:
        self._storage_context = value

    # ------------------------------------------------------------------
    # Lifecycle hooks
    # ------------------------------------------------------------------
    def do_init(self) -> None:
        """Obtain the storage context, either by building it or by loading it.

        When ``_is_embedding_ready()`` is false, the documents are loaded,
        indexed into a new storage context and persisted right away.
        Otherwise the existing storage context is loaded.
        """
        if not self._is_embedding_ready():
            documents = self._load_data()
            self.storage_context = self._indexing_embedding(documents)
            self._persist()
            return
        self.storage_context = self._load_storage_context()

    def do_start(self) -> None:
        """No-op; logging of the storage context here is currently disabled."""

    def do_stop(self) -> None:
        """No-op; logging of the storage context here is currently disabled."""

    def do_dispose(self) -> None:
        """Persist the storage context, but only if embeddings are ready."""
        if not self._is_embedding_ready():
            return
        self._persist()

    # ------------------------------------------------------------------
    # Hooks implemented by subclasses
    # ------------------------------------------------------------------
    @abstractmethod
    def _is_embedding_ready(self) -> bool:
        """Return True if ``do_init`` should load instead of re-index."""

    @abstractmethod
    def _load_data(self) -> List[Document]:
        """Return the documents that ``do_init`` passes to the indexer."""

    @abstractmethod
    def _indexing_embedding(self, docs: List[Document]) -> StorageContext:
        """Index ``docs`` and return the resulting storage context."""

    @abstractmethod
    def _load_storage_context(self) -> StorageContext:
        """Return an already existing storage context."""

    @abstractmethod
    def _persist(self) -> None:
        """Save the current storage context."""


class LocalStorageContextManager(StorageContextManager):
    """Storage context manager backed by a directory on the local filesystem.

    Source documents are read from ``docs_path``; the resulting storage
    context is persisted to, and later reloaded from, ``dataset_path``.
    """

    def __init__(
        self,
        service_context_manager: ServiceContextManager,
        dataset_path: str = "./dataset",
        docs_path: str = "./docs/faq",
    ) -> None:
        """Create the manager.

        Args:
            service_context_manager: Provides the service context used when
                building the vector index.
            dataset_path: Directory the storage context is persisted to and
                loaded from.
            docs_path: Directory scanned for source ``.pdf`` documents. The
                default is the location that was previously hard-coded, so
                existing callers behave exactly as before.
        """
        super().__init__()
        self._dataset_path = dataset_path
        self._docs_path = docs_path
        self._service_context_manager = service_context_manager

    def _is_embedding_ready(self) -> bool:
        """Return the result of the local-storage readiness check.

        Delegates to ``llama.utils.is_local_storage_files_ready`` with
        ``dataset_path``; presumably this checks that the persisted files
        exist (confirm in ``llama.utils``).
        """
        # Imported at call time, as in the original code.
        from llama.utils import is_local_storage_files_ready

        return is_local_storage_files_ready(self._dataset_path)

    def _load_data(self) -> List[Document]:
        """Load the PDF documents found under ``docs_path``."""
        return DirectoryLoader(
            dir_path=self._docs_path, required_exts=[".pdf"], exclude_glob=[]
        ).load()

    def _indexing_embedding(self, docs: List[Document]) -> StorageContext:
        """Build a vector index over ``docs`` and return its storage context."""
        # Imported at call time, as in the original code.
        from llama_index import GPTVectorStoreIndex

        index = GPTVectorStoreIndex.from_documents(
            docs, service_context=self._service_context_manager.get_service_context()
        )
        return index.storage_context

    def _load_storage_context(self) -> StorageContext:
        """Create a storage context using ``dataset_path`` as its persist dir."""
        return StorageContext.from_defaults(persist_dir=self._dataset_path)

    def _persist(self) -> None:
        """Persist the current storage context to ``dataset_path``."""
        self.storage_context.persist(self._dataset_path)