Spaces:
Runtime error
Runtime error
File size: 2,762 Bytes
6628fd9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
import os
import pickle
from abc import abstractmethod, ABC
from typing import Optional, Sequence, List
from llama_hub.github_repo import GithubRepositoryReader, GithubClient
from llama_index import download_loader
from llama_index.readers.schema.base import Document
class WikiLoader(ABC):
    """Abstract base class for objects that produce a list of documents.

    Subclasses must implement :meth:`load`; the base class cannot be
    instantiated directly.
    """

    @abstractmethod
    def load(self) -> List[Document]:
        """Return the documents provided by this loader."""
class GithubLoader(WikiLoader):
    """Load Markdown files from a GitHub repository, with a local pickle cache.

    On :meth:`load`, a previously written cache file is returned if it
    exists; otherwise the repository is read through
    ``GithubRepositoryReader`` and the result is pickled to the cache file.

    NOTE: the cache is a single file that is not keyed by owner, repository
    or directories, so two loaders sharing the same ``cache_path`` also
    share the same cached documents.

    Args:
        github_owner: Repository owner. Falls back to the ``GITHUB_OWNER``
            environment variable when ``None``.
        repo: Repository name. Falls back to the ``GITHUB_REPO``
            environment variable when ``None``.
        dirs: Directories to include. Defaults to ``[".", "doc"]``.
        branch: Branch to read. Defaults to ``"master"``.
        cache_path: Location of the pickle cache. Defaults to
            ``"docs/docs.pkl"``.
        proxy: Proxy URL exported as ``http_proxy`` / ``https_proxy`` before
            downloading. Pass ``None`` or ``""`` to leave the environment
            untouched. Defaults to :attr:`DEFAULT_PROXY`.

    Raises:
        KeyError: If ``github_owner`` or ``repo`` is ``None`` and the
            corresponding environment variable is not set.
    """

    # Proxy address that used to be hard-coded in ``load``; kept as the
    # default so existing callers behave as before.
    DEFAULT_PROXY = "http://127.0.0.1:7890"

    def __init__(
        self,
        github_owner: Optional[str] = None,
        repo: Optional[str] = None,
        dirs: Optional[Sequence[str]] = None,
        branch: str = "master",
        cache_path: str = "docs/docs.pkl",
        proxy: Optional[str] = DEFAULT_PROXY,
    ):
        super().__init__()
        self.owner = (
            github_owner if github_owner is not None else os.environ["GITHUB_OWNER"]
        )
        self.repo = repo if repo is not None else os.environ["GITHUB_REPO"]
        self.dirs = dirs if dirs is not None else [".", "doc"]
        self.branch = branch
        self.cache_path = cache_path
        self.proxy = proxy

    def _read_cache(self) -> Optional[List[Document]]:
        """Return the unpickled cache contents, or ``None`` if no cache file exists."""
        if not os.path.exists(self.cache_path):
            return None
        # SECURITY: pickle executes arbitrary code while loading. The cache
        # file must only ever come from a trusted source (i.e. a previous
        # run of this loader), never from user-supplied input.
        with open(self.cache_path, "rb") as f:
            return pickle.load(f)

    def _write_cache(self, docs: List[Document]) -> None:
        """Pickle ``docs`` to the cache file, creating its directory if needed."""
        parent = os.path.dirname(self.cache_path)
        if parent:
            # Without this, a missing cache directory makes the write fail
            # only after the whole download has already been performed.
            os.makedirs(parent, exist_ok=True)
        with open(self.cache_path, "wb") as f:
            pickle.dump(docs, f)

    def load(self) -> List[Document]:
        """Return the cached documents, downloading and caching them if absent.

        Returns:
            The documents read from the cache file, or freshly loaded from
            GitHub when the cache is missing (or holds ``None``).
        """
        docs = self._read_cache()
        if docs is not None:
            return docs

        # Cache miss: download from GitHub and save the result locally.
        # NOTE(review): the return value of download_loader is discarded and
        # GithubRepositoryReader is already imported at module level, so this
        # call is presumably redundant - confirm before removing it.
        download_loader("GithubRepositoryReader")
        github_client = GithubClient(os.getenv("GITHUB_TOKEN"))
        loader = GithubRepositoryReader(
            github_client,
            owner=self.owner,
            repo=self.repo,
            filter_directories=(self.dirs, GithubRepositoryReader.FilterType.INCLUDE),
            filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
            verbose=True,
            concurrent_requests=10,
        )
        if self.proxy:
            # NOTE: this changes the process-wide environment and is not
            # undone afterwards, exactly as in the original implementation.
            os.environ["http_proxy"] = self.proxy
            os.environ["https_proxy"] = self.proxy
        docs = loader.load_data(branch=self.branch)
        self._write_cache(docs)
        return docs
class DirectoryLoader(WikiLoader):
    """Load documents from a local directory via ``SimpleDirectoryReader``.

    Args:
        exclude_glob: Glob patterns passed to the reader as ``exclude``.
            ``None`` or an empty list is stored as ``[]``.
        required_exts: File extensions passed to the reader as
            ``required_exts``. ``None`` or an empty list means "do not
            filter by extension".
        dir_path: Directory to read. Defaults to ``"./docs/"``.
        recursive: Forwarded to the reader as ``recursive``. Defaults to
            ``True``.
    """

    def __init__(
        self,
        exclude_glob: Optional[List[str]] = None,
        required_exts: Optional[List[str]] = None,
        dir_path: str = "./docs/",
        recursive: bool = True,
    ):
        super().__init__()
        self._dir_path = dir_path
        self._exclude_glob = exclude_glob if exclude_glob else []
        self._required_exts = required_exts if required_exts else []
        self._recursive = recursive

    def _reader_kwargs(self) -> dict:
        """Build the keyword arguments handed to ``SimpleDirectoryReader``."""
        return {
            "input_dir": self._dir_path,
            "exclude": self._exclude_glob,
            # NOTE(review): SimpleDirectoryReader appears to treat any
            # non-None ``required_exts`` as an active filter, so an empty
            # list would reject every file. Forward "no filter" as None.
            "required_exts": self._required_exts or None,
            "recursive": self._recursive,
        }

    def load(self) -> List[Document]:
        """Read the configured directory and return the loaded documents."""
        # Imported lazily, as in the original, so the class can be defined
        # without importing the reader up front.
        from llama_index import SimpleDirectoryReader

        reader = SimpleDirectoryReader(**self._reader_kwargs())
        return reader.load_data()
|