Spaces:

X-Pipe
/

flash

Sleeping

App Files Files Community

NickNYU committed on Jun 25, 2023

Commit

a26db82

1 Parent(s): 0ff6c12

upload github well compiled files

Browse files

Files changed (23) hide show

.gitignore +149 -0
.pre-commit-config.yaml +5 -0
Makefile +14 -0
app.py +69 -0
core/__init__.py +0 -0
core/__pycache__/__init__.cpython-310.pyc +0 -0
core/__pycache__/lifecycle.cpython-310.pyc +0 -0
core/__pycache__/logger_factory.cpython-310.pyc +0 -0
core/__pycache__/test_lifecycle.cpython-310.pyc +0 -0
core/lifecycle.py +185 -0
core/logger_factory.py +19 -0
core/test_lifecycle.py +58 -0
docs/docs.pkl +3 -0
github_retriever.py +63 -0
langchain/__init__.py +0 -0
langchain/manager.py +39 -0
llama/__init__.py +0 -0
llama/context.py +61 -0
llama/data_loader.py +62 -0
llama/index.py +18 -0
llama/vector_storage.py +18 -0
pyproject.toml +19 -0
requirements.txt +7 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,149 @@

+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+bin/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+etc/
+include/
+lib/
+lib64/
+parts/
+sdist/
+share/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+.ruff_cache
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+notebooks/
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pyvenv.cfg
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# Jetbrains
+.idea
+modules/
+*.swp
+# pipenv
+Pipfile
+Pipfile.lock
+# pyright
+pyrightconfig.json

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,5 @@

+repos:
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    rev: v0.0.243
+    hooks:
+      - id: ruff

Makefile ADDED Viewed

	@@ -0,0 +1,14 @@

+# Developer convenience targets: formatting, static checks and tests.
+# Every target here is a command, not a file, so all of them are declared phony
+# (previously `test` was missing and a file or directory named "test" would mask it).
+.PHONY: format lint test
+GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
+# Rewrite all Python sources in place with black.
+format:
+	black .
+# Checks only; nothing is modified (black runs in --check mode).
+lint:
+	mypy .
+	black . --check
+	ruff check .
+# NOTE(review): this commit keeps its tests in core/test_lifecycle.py and adds no
+# tests/ directory - confirm the path pytest is pointed at.
+test:
+	pytest tests

app.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import os
+from llama_index import SimpleDirectoryReader
+from llama_index.node_parser import SimpleNodeParser
+from llama_index.data_structs.node import Node, DocumentRelationship
+from llama_index import VectorStoreIndex
+from llama_index import LLMPredictor, VectorStoreIndex, ServiceContext
+from langchain.llms import AzureOpenAI
+from langchain.embeddings.openai import OpenAIEmbeddings
+from llama_index import LangchainEmbedding, ServiceContext
+from llama_index import StorageContext, load_index_from_storage
+import logging
+import sys
+logging.basicConfig(
+    stream=sys.stdout, level=logging.DEBUG
+)  # logging.DEBUG for more verbose output
+logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
+def main() -> None:
+    documents = SimpleDirectoryReader("./data").load_data()
+    # index = VectorStoreIndex.from_documents(documents)
+    # parser = SimpleNodeParser()
+    # nodes = parser.get_nodes_from_documents(documents)
+    # index = VectorStoreIndex(nodes)
+    # define embedding
+    embedding = LangchainEmbedding(OpenAIEmbeddings(client=None, chunk_size=1))
+    # define LLM
+    llm_predictor = LLMPredictor(
+        llm=AzureOpenAI(
+            client=None,
+            deployment_name="text-davinci-003",
+            model="text-davinci-003",
+        )
+    )
+    # configure service context
+    service_context = ServiceContext.from_defaults(
+        llm_predictor=llm_predictor, embed_model=embedding
+    )
+    # build index
+    index = VectorStoreIndex.from_documents(
+        documents,
+        service_context=service_context,
+    )
+    index.storage_context.persist(persist_dir="./dataset")
+    storage_context = StorageContext.from_defaults(persist_dir="./dataset")
+    index = load_index_from_storage(
+        storage_context=storage_context, service_context=service_context
+    )
+    # index.vector_store.persist("./dataset")
+    # query with embed_model specified
+    query_engine = index.as_query_engine(
+        retriever_mode="embedding", verbose=True, service_context=service_context
+    )
+    response = query_engine.query("请帮忙推荐一杯咖啡给我，我喜欢咖啡因")
+    print(response)
+if __name__ == "__main__":
+    main()

core/__init__.py ADDED Viewed

File without changes

core/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (147 Bytes). View file

core/__pycache__/lifecycle.cpython-310.pyc ADDED Viewed

Binary file (6.82 kB). View file

core/__pycache__/logger_factory.cpython-310.pyc ADDED Viewed

Binary file (778 Bytes). View file

core/__pycache__/test_lifecycle.cpython-310.pyc ADDED Viewed

Binary file (2.54 kB). View file

core/lifecycle.py ADDED Viewed

	@@ -0,0 +1,185 @@

+import enum
+from abc import ABC, abstractmethod
+from typing import TypeVar, Optional
+from core import logger_factory
+class Initializable(ABC):
+    @abstractmethod
+    def initialize(self) -> None:
+        pass
+class Startable(ABC):
+    @abstractmethod
+    def start(self) -> None:
+        pass
+class Stoppable(ABC):
+    @abstractmethod
+    def stop(self) -> None:
+        pass
+class Disposable(ABC):
+    @abstractmethod
+    def dispose(self) -> None:
+        pass
+class LifecycleAware(ABC):
+    def __init__(self, state: "LifecycleState") -> None:
+        """
+        Args:
+            state(LifecycleState): lifecycle state
+        """
+        self.state = state
+    @property
+    def get_lifecycle_state(self) -> "LifecycleState":
+        return self.state
+class Lifecycle(Initializable, Startable, Stoppable, Disposable, LifecycleAware, ABC):
+    def __init__(self) -> None:
+        self.logger = logger_factory.get_logger(self.__class__.__name__)
+        self.lifecycle_state = LifecycleState(lifecycle=self)
+    def initialize(self) -> None:
+        if not self.lifecycle_state.can_initialize(self.lifecycle_state.get_phase()):
+            self.logger.warning("[{}]cannot initialize".format(self.__class__.__name__))
+            return
+        self.lifecycle_state.set_phase(LifecyclePhase.INITIALIZING)
+        self.do_init()
+        self.lifecycle_state.set_phase(LifecyclePhase.INITIALIZED)
+    def start(self) -> None:
+        if not self.lifecycle_state.can_start(self.lifecycle_state.get_phase()):
+            self.logger.warning("[{}]cannot start".format(self.__class__.__name__))
+            return
+        self.lifecycle_state.set_phase(LifecyclePhase.STARTING)
+        self.do_start()
+        self.lifecycle_state.set_phase(LifecyclePhase.STARTED)
+    def stop(self) -> None:
+        if not self.lifecycle_state.can_stop(self.lifecycle_state.get_phase()):
+            self.logger.warning("[{}]cannot stop".format(self.__class__.__name__))
+            return
+        self.lifecycle_state.set_phase(LifecyclePhase.STOPPING)
+        self.do_stop()
+        self.lifecycle_state.set_phase(LifecyclePhase.STOPPED)
+    def dispose(self) -> None:
+        if not self.lifecycle_state.can_dispose(self.lifecycle_state.get_phase()):
+            self.logger.warning("[{}]cannot dispose".format(self.__class__.__name__))
+            return
+        self.lifecycle_state.set_phase(LifecyclePhase.DISPOSING)
+        self.do_dispose()
+        self.lifecycle_state.set_phase(LifecyclePhase.DISPOSED)
+    @abstractmethod
+    def do_init(self) -> None:
+        pass
+    @abstractmethod
+    def do_start(self) -> None:
+        pass
+    @abstractmethod
+    def do_stop(self) -> None:
+        pass
+    @abstractmethod
+    def do_dispose(self) -> None:
+        pass
+class LifecyclePhase(enum.Enum):
+    INITIALIZING = 1
+    INITIALIZED = 2
+    STARTING = 3
+    STARTED = 4
+    STOPPING = 5
+    STOPPED = 6
+    DISPOSING = 7
+    DISPOSED = 8
+class LifecycleController(ABC):
+    def can_initialize(self, phase: Optional[LifecyclePhase]) -> bool:
+        return phase is None or phase == LifecyclePhase.DISPOSED
+    def can_start(self, phase: Optional[LifecyclePhase]) -> bool:
+        return phase is not None and (
+            phase == LifecyclePhase.INITIALIZED or phase == LifecyclePhase.STOPPED
+        )
+    def can_stop(self, phase: Optional[LifecyclePhase]) -> bool:
+        return phase is not None and phase == LifecyclePhase.STARTED
+    def can_dispose(self, phase: Optional[LifecyclePhase]) -> bool:
+        return phase is not None and (
+            phase == LifecyclePhase.INITIALIZED or phase == LifecyclePhase.STOPPED
+        )
+LS = TypeVar("LS", bound=Lifecycle)
+class LifecycleState(LifecycleController, ABC):
+    phase: Optional[LifecyclePhase]
+    def __init__(self, lifecycle: LS) -> None:
+        self.phase = None
+        self.prev_phase = None
+        self.lifecycle = lifecycle
+        self.logger = logger_factory.get_logger(__name__)
+    def is_initializing(self) -> bool:
+        return self.phase == LifecyclePhase.INITIALIZING
+    def is_initialized(self) -> bool:
+        return self.phase == LifecyclePhase.INITIALIZED
+    def is_starting(self) -> bool:
+        return self.phase == LifecyclePhase.STARTING
+    def is_started(self) -> bool:
+        return self.phase == LifecyclePhase.STARTED
+    def is_stopping(self) -> bool:
+        return self.phase == LifecyclePhase.STOPPING
+    def is_stopped(self) -> bool:
+        return self.phase == LifecyclePhase.STOPPED
+    def is_disposing(self) -> bool:
+        return self.phase == LifecyclePhase.DISPOSING
+    def is_disposed(self) -> bool:
+        return self.phase == LifecyclePhase.DISPOSED
+    def get_phase(self) -> Optional[LifecyclePhase]:
+        return self.phase
+    def set_phase(self, phase: Optional[LifecyclePhase]) -> None:
+        prev = "None"
+        if self.phase is not None:
+            prev = self.phase.name
+        current = "None"
+        if phase is not None:
+            current = phase.name
+        self.logger.info(
+            "[setPhaseName][{}]{} --> {}".format(
+                self.lifecycle.__class__.__name__,
+                prev,
+                current,
+            )
+        )
+        self.phase = phase
+    def rollback(self, err: Exception) -> None:
+        self.phase = self.prev_phase
+        self.prev_phase = None

core/logger_factory.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import logging
+from logging import handlers
+from typing import Optional
+def get_logger(name: str, file_name: Optional[str] = None) -> logging.Logger:
+    logger = logging.getLogger(name)
+    if file_name is None:
+        file_name = "app-default.log"
+    handler = handlers.TimedRotatingFileHandler(
+        filename=file_name, when="d", backupCount=21, encoding="UTF-8"
+    )
+    formatter = logging.Formatter("[%(asctime)s][%(levelname)s][%(message)s]")
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    logger.setLevel(logging.INFO)
+    # Configure the logger as desired
+    # e.g., add handlers, set log levels, etc.
+    return logger

core/test_lifecycle.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import logging
+from unittest import TestCase
+from core.lifecycle import Lifecycle
+logging.basicConfig()
+class SubLifecycle(Lifecycle):
+    def __init__(self) -> None:
+        super().__init__()
+        self.init_counter = 0
+    def do_init(self) -> None:
+        self.init_counter += 1
+    def do_start(self) -> None:
+        self.init_counter += 1
+    def do_stop(self) -> None:
+        self.init_counter += 1
+    def do_dispose(self) -> None:
+        self.init_counter += 1
+class TestLifecycle(TestCase):
+    def test_initialize(self) -> None:
+        ls = SubLifecycle()
+        ls.initialize()
+        ls.logger.info(ls.lifecycle_state.get_phase())
+        ls.start()
+        ls.logger.info(ls.lifecycle_state.get_phase())
+        ls.stop()
+        ls.logger.info(ls.lifecycle_state.get_phase())
+        ls.dispose()
+        ls.logger.info(ls.lifecycle_state.get_phase())
+    def test_start(self) -> None:
+        self.fail()
+    def test_stop(self) -> None:
+        self.fail()
+    def test_dispose(self) -> None:
+        self.fail()
+    def test_do_init(self) -> None:
+        self.fail()
+    def test_do_start(self) -> None:
+        self.fail()
+    def test_do_stop(self) -> None:
+        self.fail()
+    def test_do_dispose(self) -> None:
+        self.fail()

docs/docs.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a47dd9aad6afbb3c118696a8534fea9cc9b0be12746c88ae2dd2777c19423a96
+size 30429

github_retriever.py ADDED Viewed

	@@ -0,0 +1,63 @@

+from llama_hub.github_repo import GithubRepositoryReader, GithubClient
+from llama_index import download_loader, GPTVectorStoreIndex
+from llama_index import LLMPredictor, VectorStoreIndex, ServiceContext
+from langchain.llms import AzureOpenAI
+from langchain.embeddings.openai import OpenAIEmbeddings
+from llama_index import LangchainEmbedding, ServiceContext
+from llama_index import StorageContext, load_index_from_storage
+from dotenv import load_dotenv
+import os
+import pickle
+def main() -> None:
+    # define embedding
+    embedding = LangchainEmbedding(OpenAIEmbeddings(chunk_size=1))
+    # define LLM
+    llm_predictor = LLMPredictor(
+        llm=AzureOpenAI(
+            engine="text-davinci-003",
+            model_name="text-davinci-003",
+        )
+    )
+    # configure service context
+    service_context = ServiceContext.from_defaults(
+        llm_predictor=llm_predictor, embed_model=embedding
+    )
+    download_loader("GithubRepositoryReader")
+    docs = None
+    if os.path.exists("docs/docs.pkl"):
+        with open("docs/docs.pkl", "rb") as f:
+            docs = pickle.load(f)
+    if docs is None:
+        github_client = GithubClient(os.getenv("GITHUB_TOKEN"))
+        loader = GithubRepositoryReader(
+            github_client,
+            owner="ctripcorp",
+            repo="x-pipe",
+            filter_directories=(
+                [".", "doc"],
+                GithubRepositoryReader.FilterType.INCLUDE,
+            ),
+            filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
+            verbose=True,
+            concurrent_requests=10,
+        )
+        docs = loader.load_data(branch="master")
+        with open("docs/docs.pkl", "wb") as f:
+            pickle.dump(docs, f)
+    index = GPTVectorStoreIndex.from_documents(docs, service_context=service_context)
+    query_engine = index.as_query_engine(service_context=service_context)
+    response = query_engine.query("如何使用X-Pipe?")
+    print(response)
+if __name__ == "__main__":
+    load_dotenv()
+    main()

langchain/__init__.py ADDED Viewed

File without changes

langchain/manager.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from abc import abstractmethod, ABC
+from langchain.embeddings.base import Embeddings as LCEmbeddings
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.llms import AzureOpenAI
+from langchain.base_language import BaseLanguageModel
+from core.lifecycle import Lifecycle
+class BaseLangChainManager(Lifecycle, ABC):
+    def __init__(self) -> None:
+        super().__init__()
+    @abstractmethod
+    def get_embedding(self) -> LCEmbeddings:
+        pass
+    @abstractmethod
+    def get_llm(self) -> BaseLanguageModel:
+        pass
+class LangChainAzureManager(BaseLangChainManager):
+    def __init__(self) -> None:
+        super().__init__()
+    # Override
+    def get_embedding(self) -> LCEmbeddings:
+        return OpenAIEmbeddings(client=None, chunk_size=1)
+    # Override
+    def get_llm(self) -> BaseLanguageModel:
+        return AzureOpenAI(
+            deployment_name="text-davinci-003",
+            # model_name="text-davinci-003",
+            model="text-davinci-003",
+            client=None,
+        )

llama/__init__.py ADDED Viewed

File without changes

llama/context.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from typing import Optional
+from llama_index import ServiceContext, LLMPredictor, LangchainEmbedding
+from core.lifecycle import Lifecycle
+from langchain.manager import BaseLangChainManager
+class ServiceContextManager(Lifecycle):
+    service_context: Optional[ServiceContext]
+    def __init__(self, manager: BaseLangChainManager) -> None:
+        super().__init__()
+        self.manager = manager
+        self.service_context = None
+    def get_service_context(self) -> ServiceContext:
+        if self.lifecycle_state.is_started():
+            raise KeyError(
+                "incorrect lifecycle state: {}".format(self.lifecycle_state.phase)
+            )
+        if self.service_context is None:
+            raise ValueError(
+                "service context is not ready, check for lifecycle statement"
+            )
+        return self.service_context
+    def do_init(self) -> None:
+        # define embedding
+        embedding = LangchainEmbedding(self.manager.get_embedding())
+        # define LLM
+        llm_predictor = LLMPredictor(llm=self.manager.get_llm())
+        # configure service context
+        self.service_context = ServiceContext.from_defaults(
+            llm_predictor=llm_predictor, embed_model=embedding
+        )
+    def do_start(self) -> None:
+        pass
+    def do_stop(self) -> None:
+        pass
+    def do_dispose(self) -> None:
+        pass
+class StorageContextManager(Lifecycle):
+    def __init__(self, dataset_path: Optional[str] = "./dataset") -> None:
+        super().__init__()
+        self.dataset_path = dataset_path
+    def do_init(self) -> None:
+        pass
+    def do_start(self) -> None:
+        pass
+    def do_stop(self) -> None:
+        pass
+    def do_dispose(self) -> None:
+        pass

llama/data_loader.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import os
+import pickle
+from abc import abstractmethod, ABC
+from typing import Optional, Sequence, List
+from llama_hub.github_repo import GithubRepositoryReader, GithubClient
+from llama_index import download_loader
+from llama_index.readers.schema.base import Document
+from core.lifecycle import Lifecycle
+class WikiLoader(ABC):
+    @abstractmethod
+    def load(self) -> List[Document]:
+        pass
+class GithubLoader(WikiLoader, Lifecycle):
+    def __init__(
+        self,
+        github_owner: Optional[str] = None,
+        repo: Optional[str] = None,
+        dirs: Optional[Sequence[str]] = None,
+    ):
+        super().__init__()
+        self.owner = (
+            github_owner if github_owner is not None else os.environ["GITHUB_OWNER"]
+        )
+        self.repo = repo if repo is not None else os.environ["GITHUB_REPO"]
+        self.dirs = dirs if dirs is not None else [".", "doc"]
+    def load(self) -> List[Document]:
+        download_loader("GithubRepositoryReader")
+        docs = None
+        if os.path.exists("docs/docs.pkl"):
+            with open("docs/docs.pkl", "rb") as f:
+                docs = pickle.load(f)
+        if docs is not None:
+            return docs
+        # otherwise, we download from github and save it locally
+        github_client = GithubClient(os.getenv("GITHUB_TOKEN"))
+        loader = GithubRepositoryReader(
+            github_client,
+            # owner="ctripcorp",
+            owner=self.owner,
+            # repo="x-pipe",
+            repo=self.repo,
+            filter_directories=(self.dirs, GithubRepositoryReader.FilterType.INCLUDE),
+            filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
+            verbose=True,
+            concurrent_requests=10,
+        )
+        docs = loader.load_data(branch="master")
+        with open("docs/docs.pkl", "wb") as f:
+            pickle.dump(docs, f)
+        return docs

llama/index.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from core.lifecycle import Lifecycle
+from llama.context import ServiceContextManager
+from llama_index.indices.vector_store import VectorStoreIndex
+from typing import Optional
+class IndexManager(Lifecycle):
+    index: Optional[VectorStoreIndex]
+    def __init__(self, context_manager: ServiceContextManager) -> None:
+        super().__init__()
+        self.index = None
+        self.context_manager = context_manager
+    def get_index(self) -> Optional[VectorStoreIndex]:
+        if not self.lifecycle_state.is_started():
+            raise Exception("Lifecycle state is not correct")
+        return self.index

llama/vector_storage.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from core.lifecycle import Lifecycle
+class VectorStorageManager(Lifecycle):
+    def __init__(self) -> None:
+        super().__init__()
+    def do_init(self) -> None:
+        pass
+    def do_start(self) -> None:
+        pass
+    def do_stop(self) -> None:
+        pass
+    def do_dispose(self) -> None:
+        pass

pyproject.toml ADDED Viewed

	@@ -0,0 +1,19 @@

+[tool.mypy]
+ignore_missing_imports = "True"
+disallow_untyped_defs = "True"
+exclude = ["notebooks", "build", "examples", "docs", "dataset", "app.py", "github_retriever.py"]
+[tool.ruff]
+exclude = [
+    ".venv",
+    "__pycache__",
+    ".ipynb_checkpoints",
+    ".mypy_cache",
+    ".ruff_cache",
+    "examples",
+    "notebooks",
+    "docs",
+    "dataset",
+    "app.py",
+    "github_retriever.py"
+]

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+llama_index
+llama_hub
+langchain
+python-dotenv
+ruff
+black
+mypy