whitelotus0 commited on
Commit
fff1c68
·
1 Parent(s): 02d7f34

code weaver

Browse files
.env.example ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ OPENAI_API_KEY=your_openai_api_key
2
+ ACTIVELOOP_TOKEN=your_activeloop_api_token
3
+ ACTIVELOOP_USERNAME=your_activeloop_username
.flake8 ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ [flake8]
2
+ ignore = E501, W503
3
+ max-line-length = 88
4
+ exclude = .git,__pycache__,build,dist
.gitignore ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # PyPI configuration file
171
+ .pypirc
172
+ repos
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use an official Python runtime as a parent image
2
+ FROM python:3.11-slim
3
+
4
+ # Set the working directory in the container
5
+ WORKDIR /app
6
+
7
+ # Copy the requirements file into the container
8
+ COPY requirements.txt .
9
+
10
+ # Install any dependencies
11
+ RUN pip install --no-cache-dir -r requirements.txt
12
+
13
+ # Copy the rest of the application code into the container
14
+ COPY . .
15
+
16
+ # Expose the port that the app runs on (default 8501)
17
+ EXPOSE 8501
18
+
19
+ # Command to run the application
20
+ CMD ["streamlit", "run", "app.py"]
README.md CHANGED
@@ -8,4 +8,74 @@ pinned: false
8
  short_description: Interact with your github repo
9
  ---
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
8
  short_description: Interact with your github repo
9
  ---
10
 
11
+ # Code Weaver
12
+
13
+ This application is inspired by the [Chat-with-Github-Repo](https://github.com/peterw/Chat-with-Github-Repo) project by peterw, which demonstrates how to create a chatbot using Streamlit, OpenAI, and Activeloop's Deep Lake. Code Weaver builds upon that foundation, offering a more user-friendly experience and optimized performance.
14
+
15
+ Code Weaver is designed to let you interact with a GitHub repository through a conversational interface. It indexes the contents of a repository, allowing you to ask questions and receive answers based on the repository's documentation and code.
16
+
17
+ ![Code Weaver Screenshot](screenshot.png)
18
+
19
+ ## Key Improvements Over Original Implementation
20
+
21
+ Code Weaver offers several key improvements that enhance the user experience and the app's overall performance:
22
+
23
+ * **Purely Frontend UI:**
24
+ * The original application used a command-line interface (CLI) for configuration. Code Weaver provides a completely UI-based experience. All configurations (API keys, repo URLs, dataset names, etc.) are entered directly into the Streamlit app, making it much more accessible to users.
25
+ * **Optimized Document Parsing:**
26
+ * The document parsing and splitting process has been optimized for speed and handling of large files by using `RecursiveCharacterTextSplitter`.
27
+ * **Improved Loading times:**
28
+ * The app is now faster because the embeddings are not recalculated each time the app is started; this is achieved by using `InMemoryCache`.
29
+ * **Direct OpenAI Embeddings Model:**
30
+ * The app now directly specifies the usage of the `"text-embedding-ada-002"` OpenAI embedding model, resulting in faster embedding calculations.
31
+ * **In Memory Cache:**
32
+ * `InMemoryCache` is used to cache the results from the language model improving loading speeds.
33
+ * **Streamlined Setup:**
34
+ * No more need for `.env` files, as all necessary variables are inserted directly in the app UI.
35
+ * **Easier Deployment**: The app is built to be deployed in Hugging Face Spaces, making the deployment process easier.
36
+
37
+ ## How to Use Code Weaver
38
+
39
+ 1. **Clone the repository:**
40
+ ```bash
41
+ git clone https://github.com/WHITELOTUS0/chat-with-a-repo.git
42
+ ```
43
+ 2. **Navigate to the Project Directory:**
44
+
45
+ ```bash
46
+ cd chat-with-a-repo
47
+ ```
48
+ 3. **Install dependencies:**
49
+
50
+ ```bash
51
+ pip install -r requirements.txt
52
+ ```
53
+ 4. **Run the app:**
54
+ ```bash
55
+ streamlit run app.py
56
+ ```
57
+
58
+ 5. **Enter configurations**: The Streamlit app will open in your web browser. In the sidebar, provide:
59
+ * Your OpenAI API Key
60
+ * Your Activeloop Token
61
+ * Your Activeloop Username
62
+ * The GitHub repository URL you want to explore
63
+ * The file extensions to include (optional)
64
+ * A desired name for the DeepLake dataset
65
+ 6. **Process the repository**: Click on the "Process Repository" Button, and wait for it to finish.
66
+ 7. **Start Chatting:** After processing is done, you can start asking questions about the repository in the provided input text area.
67
+
68
+ ## Contributing
69
+
70
+ Contributions to this project are always welcome. If you find any bugs or would like to suggest new features, feel free to create a pull request or open an issue in this repository.
71
+
72
+ ## Credits
73
+
74
+ * Inspired by the [Chat-with-Github-Repo](https://github.com/peterw/Chat-with-Github-Repo) project by peterw.
75
+ * Built using Python, Streamlit, OpenAI, Activeloop, and Langchain.
76
+
77
+ ## License
78
+
79
+ [MIT License](LICENSE)
80
+
81
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app.py
# Entry point for the Code Weaver Streamlit app (run via `streamlit run app.py`).
import os
import sys
import streamlit as st

# Add the project's root directory to the Python path so that the `src`
# package is importable even when Streamlit executes this file directly
# from an arbitrary working directory.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Import and run the chat app.
# NOTE: this import must come *after* the sys.path tweak above, which is
# why it is not grouped with the imports at the top of the file.
from src.utils.chat import run_chat_app


if __name__ == "__main__":
    run_chat_app()
dev-requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ black
2
+ flake8
pyproject.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [tool.black]
2
+ line-length = 88
3
+ target-version = ['py37', 'py38', 'py39']
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ deeplake
2
+ langchain
3
+ openai
4
+ pathspec
5
+ python-dotenv
6
+ streamlit
7
+ streamlit_chat
8
+ langchain_community
screenshot.png ADDED
src/__init__.py ADDED
File without changes
src/main.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import sys
4
+ from dotenv import load_dotenv
5
+ from streamlit.web import cli as stcli
6
+ from utils.process import process
7
+
8
+ # Load environment variables from a .env file (containing OPENAI_API_KEY)
9
+ load_dotenv()
10
+
11
+
12
def extract_repo_name(repo_url):
    """Extract the repository name from the given repository URL.

    Handles URLs with or without a trailing ``.git`` suffix and tolerates a
    trailing slash (e.g. ``https://host/user/repo/``).

    Args:
        repo_url: A git repository URL or path.

    Returns:
        The final path segment with any ``.git`` suffix removed.
    """
    # Strip a trailing slash first so the last path segment is the repo name.
    last_segment = repo_url.rstrip("/").split("/")[-1]
    # Only drop ".git" when it is a *suffix*; str.replace(".git", "") would
    # also mangle names that merely contain ".git" (e.g. "my.github.repo").
    if last_segment.endswith(".git"):
        last_segment = last_segment[: -len(".git")]
    return last_segment
16
+
17
+
18
def process_repo(args):
    """Clone the git repository, filter its files, and load the contents
    into an Activeloop Deep Lake dataset.

    Args:
        args: Parsed CLI namespace with ``repo_url``,
            ``activeloop_dataset_name``, ``include_file_extensions`` and
            ``repo_destination`` attributes.
    """
    repo_name = extract_repo_name(args.repo_url)
    activeloop_username = os.environ.get("ACTIVELOOP_USERNAME")

    # Fall back to the repository name when no explicit dataset name is given.
    dataset_name = args.activeloop_dataset_name or repo_name
    args.activeloop_dataset_path = f"hub://{activeloop_username}/{dataset_name}"

    process(
        args.repo_url,
        args.include_file_extensions,
        args.activeloop_dataset_path,
        args.repo_destination,
    )
39
+
40
+
41
def chat(args):
    """Start the Streamlit chat application for an existing Activeloop dataset.

    Args:
        args: Parsed CLI namespace with an ``activeloop_dataset_name`` attribute.
    """
    username = os.environ.get("ACTIVELOOP_USERNAME")
    args.activeloop_dataset_path = f"hub://{username}/{args.activeloop_dataset_name}"

    # Re-invoke through Streamlit's CLI; everything after "--" is forwarded
    # to src/utils/chat.py as script arguments.
    dataset_flag = f"--activeloop_dataset_path={args.activeloop_dataset_path}"
    sys.argv = ["streamlit", "run", "src/utils/chat.py", "--", dataset_flag]

    sys.exit(stcli.main())
60
+
61
+
62
def main():
    """Define and parse CLI arguments, then execute the appropriate subcommand.

    Subcommands:
        process: clone and index a git repository into a Deep Lake dataset.
        chat: launch the Streamlit chat UI on an existing dataset.
    """
    parser = argparse.ArgumentParser(description="Chat with a git repository")
    subparsers = parser.add_subparsers(dest="command")

    # Process subcommand
    process_parser = subparsers.add_parser("process", help="Process a git repository")
    process_parser.add_argument(
        "--repo-url", required=True, help="The git repository URL"
    )
    process_parser.add_argument(
        "--include-file-extensions",
        nargs="+",
        default=None,
        help=(
            "Exclude all files not matching these extensions. Example:"
            " --include-file-extensions .py .js .ts .html .css .md .txt"
        ),
    )
    process_parser.add_argument(
        "--activeloop-dataset-name",
        help=(
            "The name for the Activeloop dataset. Defaults to the git repository name."
        ),
    )
    process_parser.add_argument(
        "--repo-destination",
        default="repos",
        help="The destination to clone the repository. Defaults to 'repos'.",
    )

    # Chat subcommand
    chat_parser = subparsers.add_parser("chat", help="Start the chat application")
    chat_parser.add_argument(
        "--activeloop-dataset-name",
        required=True,
        help="The name of one of your existing Activeloop datasets.",
    )

    args = parser.parse_args()

    if args.command == "process":
        process_repo(args)
    elif args.command == "chat":
        chat(args)
    else:
        # Fix: previously the script exited silently when invoked with no
        # subcommand; show usage so the user knows what is expected.
        parser.print_help()
107
+
108
+
109
# Standard script guard: run the CLI only when this file is executed directly.
if __name__ == "__main__":
    main()
src/utils/__init__.py ADDED
File without changes
src/utils/chat.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/utils/chat.py
2
+ import os
3
+ import tempfile
4
+ import streamlit as st
5
+ from langchain_community.vectorstores import DeepLake
6
+ from langchain_community.embeddings import OpenAIEmbeddings
7
+ from langchain_community.chat_models import ChatOpenAI
8
+ from langchain.chains import RetrievalQA
9
+ import openai
10
+ from streamlit_chat import message
11
+ from src.utils.process import process
12
+ from src.utils.load_and_split import load_docs, split_docs
13
+ import shutil
14
+ from langchain.cache import InMemoryCache
15
+ from langchain.globals import set_llm_cache
16
+ set_llm_cache(InMemoryCache())
17
+
18
+
19
def run_chat_app():
    """Run the chat application using the Streamlit framework.

    Renders a sidebar form for credentials and repository configuration,
    processes the repository on demand, and shows the chat exchange once
    the dataset is ready. All state lives in ``st.session_state``.
    """
    st.title("Code Weaver")  # App title

    # Seed the chat history so the UI has an opening exchange to render.
    if "generated" not in st.session_state:
        st.session_state["generated"] = ["I am ready to help you!"]
    if "past" not in st.session_state:
        st.session_state["past"] = ["Hello"]

    # Initialize configuration data and processing status in the session.
    if "data" not in st.session_state:
        st.session_state["data"] = {
            "repo_url": None,
            "include_file_extensions": None,
            "activeloop_dataset_path": None,
            "repo_destination": None,
            "status": "Please Provide Data"
        }
    # Sidebar for API keys and repository configuration.
    with st.sidebar:
        st.header("Configuration")
        # OpenAI key — exported to the environment so the langchain/openai
        # clients used in search_db()/process_repo() pick it up.
        openai_api_key = st.text_input("OpenAI API Key", type="password")
        if openai_api_key:
            os.environ["OPENAI_API_KEY"] = openai_api_key
        # Activeloop token (same environment-variable mechanism).
        activeloop_token = st.text_input("Activeloop Token", type="password")
        if activeloop_token:
            os.environ["ACTIVELOOP_TOKEN"] = activeloop_token
        # Activeloop username — used below to build the hub:// dataset path.
        activeloop_username = st.text_input("Activeloop Username")
        if activeloop_username:
            os.environ["ACTIVELOOP_USERNAME"] = activeloop_username


        st.session_state["data"]["repo_url"] = st.text_input("GitHub Repository URL")
        # Comma-separated extension filter; None means "include every file".
        file_extensions_input = st.text_input("File Extensions (comma-separated, e.g., .py,.js)").strip()
        st.session_state["data"]["include_file_extensions"] = [ext.strip() for ext in file_extensions_input.split(",")] if file_extensions_input else None

        dataset_name = st.text_input("Dataset Name")
        if dataset_name:
            st.session_state["data"]["activeloop_dataset_path"] = f"hub://{os.environ.get('ACTIVELOOP_USERNAME')}/{dataset_name}"
        else:
            st.session_state["data"]["activeloop_dataset_path"] = None

        st.session_state["data"]["repo_destination"] = "repos"

        # Only start processing when every required field/credential is set.
        if st.button("Process Repository"):
            if st.session_state["data"]["repo_url"] and st.session_state["data"]["activeloop_dataset_path"] and os.environ.get("OPENAI_API_KEY") and os.environ.get("ACTIVELOOP_TOKEN") and os.environ.get("ACTIVELOOP_USERNAME") :
                st.session_state["data"]["status"] = "Processing Data"
                with st.spinner("Processing the repository, please wait"):
                    process_repo()
                st.session_state["data"]["status"] = "Ready to Chat!"
            else :
                st.session_state["data"]["status"] = "Missing Data"


    # Chat input and display area — only enabled once processing succeeded.
    st.write(st.session_state["data"]["status"])
    if st.session_state["data"]["status"] == "Ready to Chat!":
        user_input = get_text()
        if user_input:
            output = search_db(user_input)
            st.session_state.past.append(user_input)
            st.session_state.generated.append(output)
        # Replay the whole conversation; past/generated are kept in lockstep
        # (one user message per generated answer, seeded above).
        if st.session_state["generated"]:
            for i in range(len(st.session_state["generated"])):
                message(st.session_state["past"][i], is_user=True, key=str(i) + "_user")
                message(st.session_state["generated"][i], key=str(i))
    # Footer
    st.markdown(
        """
        <br><hr style="border:2px solid gray">
        <p style="text-align:center; font-size: 12px;">
        Made with ❤️ by <a href="https://www.linkedin.com/in/glorry-sibomana/">Glorry Sibomana</a>
        </p>
        """,
        unsafe_allow_html=True,
    )
99
+
100
+
101
+
102
def get_text():
    """Render the query input box and return whatever the user typed."""
    # The visible label is hidden; the prompt text doubles as the label.
    return st.text_input("Enter your query:", key="input", label_visibility="hidden")
106
+
107
+
108
def search_db(query):
    """Answer *query* using the DeepLake vector store as a retrieval backend."""
    # Connect (read-only) to the dataset configured in the sidebar.
    embedding_fn = OpenAIEmbeddings(model="text-embedding-ada-002")
    vector_store = DeepLake(
        dataset_path=st.session_state["data"]["activeloop_dataset_path"],
        read_only=True,
        embedding_function=embedding_fn,
    )

    # Retriever with custom search parameters: cosine distance, a wide
    # candidate pool (fetch_k) narrowed down to the top k chunks.
    retriever = vector_store.as_retriever()
    retriever.search_kwargs.update(
        {"distance_metric": "cos", "fetch_k": 100, "k": 10}
    )

    # Answer the query with a RetrievalQA chain over the chat model.
    llm = ChatOpenAI(model="gpt-3.5-turbo")
    qa_chain = RetrievalQA.from_llm(llm, retriever=retriever)
    return qa_chain.run(query)
130
+
131
+
132
+
133
+
134
def process_repo():
    """Process the repository and save embeddings into a Deep Lake dataset.

    The repository is cloned into a temporary directory that is removed
    automatically once processing finishes.
    """
    cfg = st.session_state["data"]

    with tempfile.TemporaryDirectory() as temp_dir:
        clone_path = os.path.join(temp_dir, "repo_clone")
        process(
            cfg["repo_url"],
            cfg["include_file_extensions"],
            cfg["activeloop_dataset_path"],
            clone_path,
        )
150
+
151
+
152
# Allow running this module directly (e.g. `streamlit run src/utils/chat.py`).
if __name__ == "__main__":
    run_chat_app()
src/utils/load_and_split.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/utils/load_and_split.py
2
+
3
+ import os
4
+ import pathspec
5
+ from langchain_community.document_loaders import TextLoader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+
8
+
9
def load_docs(root_dir, file_extensions=None):
    """
    Load documents from the specified root directory.

    Ignores dotfiles, dot directories, and files matching the repository's
    .gitignore rules; optionally filters by file extension.

    Args:
        root_dir: Directory to walk recursively.
        file_extensions: Optional collection of extensions (e.g. [".py"]);
            when given, only files with one of these extensions are kept.

    Returns:
        A list of LangChain documents produced by TextLoader.
    """
    docs = []

    # Load .gitignore rules; patterns are interpreted relative to root_dir.
    gitignore_path = os.path.join(root_dir, ".gitignore")

    if os.path.isfile(gitignore_path):
        with open(gitignore_path, "r") as gitignore_file:
            gitignore = gitignore_file.read()
        spec = pathspec.PathSpec.from_lines(
            pathspec.patterns.GitWildMatchPattern, gitignore.splitlines()
        )
    else:
        spec = None

    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Prune dot directories in place so os.walk does not descend into them.
        dirnames[:] = [d for d in dirnames if not d.startswith(".")]

        for file in filenames:
            file_path = os.path.join(dirpath, file)

            # Skip dotfiles
            if file.startswith("."):
                continue

            # Fix: .gitignore patterns are relative to the repository root,
            # so match against the path relative to root_dir rather than the
            # full joined path (which previously made root-anchored patterns
            # never match).
            if spec and spec.match_file(os.path.relpath(file_path, root_dir)):
                continue

            if file_extensions and os.path.splitext(file)[1] not in file_extensions:
                continue

            # Best-effort: skip binary/undecodable files instead of aborting
            # the whole walk.
            try:
                loader = TextLoader(file_path, encoding="utf-8")
                docs.extend(loader.load_and_split())
            except Exception:
                pass
    return docs
53
+
54
+
55
def split_docs(docs):
    """Split documents into ~1000-character chunks with 100 characters of overlap."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(docs)
    return chunks
src/utils/process.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/utils/process.py
2
+
3
+ import deeplake
4
+ import openai
5
+ import os
6
+ import subprocess
7
+ from langchain_community.document_loaders import TextLoader
8
+ from langchain_community.embeddings import OpenAIEmbeddings
9
+ from langchain_community.vectorstores import DeepLake
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from src.utils.load_and_split import load_docs, split_docs # Updated import
12
+
13
+
14
def clone_repository(repo_url, local_path):
    """Clone the specified git repository to the given local path.

    Runs git with a list argv (shell=False), so the URL is never
    shell-interpreted. check=True raises CalledProcessError on a non-zero
    exit status; capture_output=True keeps git's chatter off the console
    (the output remains available on the raised error's stdout/stderr).
    """
    subprocess.run(["git", "clone", repo_url, local_path], check=True, capture_output=True)
17
+
18
+
19
def create_deeplake_dataset(activeloop_dataset_path, activeloop_token):
    """Create an empty DeepLake dataset with the specified path and token.

    Any existing dataset at the path is overwritten, and the four tensors
    used by the LangChain DeepLake vector store are created.
    """
    dataset = deeplake.empty(
        activeloop_dataset_path,
        token=activeloop_token,
        overwrite=True,
    )

    # One tensor per field stored for each embedded chunk.
    for tensor_name in ("ids", "metadata", "embedding", "text"):
        dataset.create_tensor(tensor_name)
31
+
32
+
33
def process(
    repo_url, include_file_extensions, activeloop_dataset_path, repo_destination
):
    """
    Process a git repository by cloning it, filtering files, splitting documents,
    creating embeddings, and storing everything in a DeepLake dataset.
    """
    # The Activeloop token is expected in the environment (set by the UI
    # sidebar or a .env file).
    activeloop_token = os.getenv("ACTIVELOOP_TOKEN")
    create_deeplake_dataset(activeloop_dataset_path, activeloop_token)

    clone_repository(repo_url, repo_destination)

    # Load, filter, and chunk the repository contents.
    documents = load_docs(repo_destination, include_file_extensions)
    chunks = split_docs(documents)

    # Embed each chunk and persist it to the dataset.
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    vector_store = DeepLake(
        dataset_path=activeloop_dataset_path, embedding_function=embeddings
    )
    vector_store.add_documents(chunks)