Spaces:
Sleeping
Sleeping
deployment_v1
Browse files- Dockerfile +12 -0
- LICENSE +201 -0
- __pycache__/api.cpython-310.pyc +0 -0
- __pycache__/embed_store.cpython-310.pyc +0 -0
- __pycache__/ingest.cpython-310.pyc +0 -0
- __pycache__/main.cpython-310.pyc +0 -0
- __pycache__/query.cpython-310.pyc +0 -0
- api.py +67 -0
- embed_store.py +42 -0
- ingest.py +199 -0
- learnings.txt +4 -0
- main.py +36 -0
- query.py +273 -0
- requirements.txt +189 -0
Dockerfile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Container image for the FastAPI RAG backend (served by uvicorn on port 7860).
FROM python:3.11-slim

# ingest.py imports GitPython (`from git import Repo`), which shells out to the
# `git` executable; the slim base image does not ship one.
RUN apt-get update \
    && apt-get install -y --no-install-recommends git \
    && rm -rf /var/lib/apt/lists/*

# Unprivileged runtime user. NOTE(review): Hugging Face Docker Spaces are
# documented to run the container as UID 1000 - confirm against the Spaces docs.
RUN useradd --create-home --uid 1000 user

WORKDIR /app

# Install dependencies first so this layer is cached across source-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY --chown=user:user . .

# The app clones repositories into ./cloned_repo at runtime, so the working
# directory itself must be writable by the runtime user.
RUN chown user:user /app

USER user
# Writable HOME for libraries that cache downloads under the home directory.
ENV HOME=/home/user

EXPOSE 7860

CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
|
LICENSE
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright [yyyy] [name of copyright owner]
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
__pycache__/api.cpython-310.pyc
ADDED
|
Binary file (1.72 kB). View file
|
|
|
__pycache__/embed_store.cpython-310.pyc
ADDED
|
Binary file (1.28 kB). View file
|
|
|
__pycache__/ingest.cpython-310.pyc
ADDED
|
Binary file (4.87 kB). View file
|
|
|
__pycache__/main.cpython-310.pyc
ADDED
|
Binary file (1.03 kB). View file
|
|
|
__pycache__/query.cpython-310.pyc
ADDED
|
Binary file (6.93 kB). View file
|
|
|
api.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
|
| 4 |
+
from ingest import ingest_repository
|
| 5 |
+
from query import (
|
| 6 |
+
VECTORSTORE_CACHE,
|
| 7 |
+
MEMORY_CACHE,
|
| 8 |
+
initialize_repo_caches,
|
| 9 |
+
ask_question,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# ASGI application object; the Dockerfile's uvicorn command serves it as "api:app".
app = FastAPI(title="RAG Backend", version="1.0.0")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class LoadRepoRequest(BaseModel):
    """Request body for ``POST /load_repo``."""

    # URL of the repository to ingest; the handler strips surrounding whitespace.
    repo_url: str
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class AskRequest(BaseModel):
    """Request body for ``POST /ask``."""

    # Name of a repository previously loaded through /load_repo.
    repo_name: str
    # Natural-language question to answer about that repository.
    question: str
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@app.post("/load_repo")
def load_repo(payload: LoadRepoRequest):
    """Clone and index a repository, then warm the per-repo vector-store caches.

    Args:
        payload: Request body carrying ``repo_url``.

    Returns:
        ``{"status": "success", "repo": <repo_name>}`` where ``repo_name`` is
        the value returned by ``ingest_repository``.

    Raises:
        HTTPException: 400 when ``repo_url`` is empty or is not a network
            remote URL.
    """
    repo_url = payload.repo_url.strip()
    if not repo_url:
        raise HTTPException(status_code=400, detail="repo_url is required")

    # repo_url comes straight from the request body and is handed to the
    # ingestion pipeline, which clones it. Accept only network remotes so that
    # local filesystem paths, option-like strings ("--upload-pack=...") and git
    # remote-helper syntax ("ext::...") never reach the clone step.
    allowed_prefixes = ("https://", "http://", "ssh://", "git://", "git@")
    if not repo_url.lower().startswith(allowed_prefixes):
        raise HTTPException(
            status_code=400,
            detail="repo_url must be an http(s), ssh or git remote URL",
        )

    repo_name = ingest_repository(repo_url)
    initialize_repo_caches(repo_name)
    print("AFTER LOAD:", VECTORSTORE_CACHE.keys(), MEMORY_CACHE.keys())

    return {
        "status": "success",
        "repo": repo_name,
    }
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
@app.post("/ask")
def ask(payload: AskRequest):
    """Answer a question about an already-loaded repository.

    Args:
        payload: Request body carrying ``repo_name`` and ``question``.

    Returns:
        ``{"answer": ..., "sources": [...]}`` where ``sources`` holds the
        distinct, non-empty ``path`` metadata values of the retrieved
        documents, in first-seen order.

    Raises:
        HTTPException: 400 when a field is blank or the repository has not
            been loaded into both caches.
    """
    repo_name = payload.repo_name.strip()
    question = payload.question.strip()

    # Checked in order, so a blank repo_name is reported before a blank question.
    required = (
        (repo_name, "repo_name is required"),
        (question, "question is required"),
    )
    for value, message in required:
        if not value:
            raise HTTPException(status_code=400, detail=message)

    if not (repo_name in VECTORSTORE_CACHE and repo_name in MEMORY_CACHE):
        raise HTTPException(status_code=400, detail="repo not loaded")

    answer, docs = ask_question(question, repo_name)

    # dict.fromkeys de-duplicates while keeping first-seen order.
    candidate_paths = (doc.metadata.get("path") for doc in docs)
    sources = list(dict.fromkeys(p for p in candidate_paths if p))

    return {"answer": answer, "sources": sources}
|
embed_store.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 2 |
+
from langchain_community.vectorstores import Qdrant
|
| 3 |
+
from qdrant_client import QdrantClient
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def get_embeddings():
    """Return a HuggingFaceEmbeddings instance for the all-MiniLM-L6-v2 model."""
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def store_embeddings(chunks, embeddings):
    """Rebuild the ``repo_docs`` Qdrant collection from *chunks*.

    Args:
        chunks: Iterable of dicts with ``content``, ``path``, ``type`` and
            ``file_name`` keys.
        embeddings: Embedding model handed to the ``Qdrant`` vector store.

    Returns:
        The ``Qdrant`` vector store bound to the freshly recreated collection.
    """
    # Local import: this module has no top-level ``os`` import.
    import os

    # Read the endpoint from the environment, falling back to the previous
    # hard-coded local instance, so this module can target the same
    # QDRANT_URL / QDRANT_API_KEY settings the other modules read.
    client = QdrantClient(
        url=os.getenv("QDRANT_URL", "http://localhost:6333"),
        api_key=os.getenv("QDRANT_API_KEY"),
    )

    collection_name = "repo_docs"

    # Start from an empty collection on every run.
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config={
            "size": 384,  # MiniLM embedding size
            "distance": "Cosine",
        },
    )

    texts = [c["content"] for c in chunks]
    metadatas = [
        {
            "path": c["path"],
            "type": c["type"],
            "file_name": c["file_name"],
        }
        for c in chunks
    ]

    vectorstore = Qdrant(
        client=client,
        collection_name=collection_name,
        embeddings=embeddings,
    )

    # Nothing to upload for an empty repository; skip the call entirely.
    if texts:
        vectorstore.add_texts(texts, metadatas=metadatas)

    return vectorstore
|
ingest.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from git import Repo
|
| 3 |
+
from qdrant_client import QdrantClient
|
| 4 |
+
from qdrant_client.models import Distance, VectorParams
|
| 5 |
+
from langchain_qdrant import QdrantVectorStore
|
| 6 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 7 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def get_repo_name(repo_url):
    """Derive a collection-friendly repository name from *repo_url*.

    Takes the last ``/``-separated segment (ignoring trailing slashes), drops
    one trailing ``.git`` and replaces every ``-`` and ``.`` with ``_``.
    """
    tail = repo_url.rstrip("/").rsplit("/", 1)[-1]
    tail = tail.removesuffix(".git")
    return tail.translate(str.maketrans("-.", "__"))
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def clone_repo(repo_url, local_path="cloned_repo"):
    """Clone *repo_url* into *local_path* unless that path already exists.

    An existing path is reused as-is without being inspected.

    Returns:
        *local_path*.
    """
    if not os.path.exists(local_path):
        Repo.clone_from(repo_url, local_path)
    return local_path
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def load_code_files(repo_path):
    """Collect the text files under *repo_path* that are worth indexing.

    Files are kept when they are a known special file (``Dockerfile``,
    ``Makefile``) or carry a recognised code, documentation or config
    extension. Lock files, minified/bundled JavaScript, files over 300 KiB,
    empty files, short ``__init__.py`` files and non-doc files consisting only
    of comments or whitespace are dropped.

    Directories are excluded by comparing each directory *name* below
    *repo_path* against ``skip_dirs``. The previous substring test on the whole
    path also dropped unrelated directories (``environment/`` via ``env``,
    ``.github/`` via ``.git``) and could skip an entire repository whose
    checkout path merely contained ``build``, ``dist``, ``logs``, and so on.

    Args:
        repo_path: Root directory of the checked-out repository.

    Returns:
        A list of dicts with keys ``content``, ``path``, ``priority``
        (``"code"``, ``"docs"`` or ``"config"``), ``file_name`` and
        ``is_main_code``.
    """
    code_ext = (".py", ".js", ".ts", ".java", ".cpp", ".c", ".go", ".rs")
    doc_ext = (".md", ".rst", ".txt")
    config_ext = (".json", ".yaml", ".yml", ".toml", ".ini", ".env")
    special_files = ("Dockerfile", "Makefile")

    # Compared against lower-cased directory names. "__pycache__" is listed
    # explicitly because names are now matched exactly, not as substrings.
    skip_dirs = frozenset(
        {
            ".git",
            "node_modules",
            "__pycache__",
            "pycache",
            "dist",
            "build",
            "venv",
            ".venv",
            "env",
            ".env",
            "site-packages",
            ".idea",
            ".vscode",
            "coverage",
            ".pytest_cache",
            "logs",
        }
    )

    lock_files = {"package-lock.json", "yarn.lock", "poetry.lock", "pipfile.lock"}
    main_code_hints = ("/src/", "/core/", "/app/", "/service/", "/services/", "/lib/", "/models/")
    max_file_bytes = 300 * 1024

    def _is_comment_or_whitespace_only(content):
        """Return True when every non-blank line starts with a comment marker."""
        comment_prefixes = ("#", "//", "/*", "*", "*/", "--", "<!--", "-->")
        for line in content.splitlines():
            stripped = line.strip()
            if stripped and not stripped.startswith(comment_prefixes):
                return False
        return True

    code_files = []

    for root, dirs, files in os.walk(repo_path):
        # Prune in place so os.walk never descends into skipped directories.
        dirs[:] = [d for d in dirs if d.lower() not in skip_dirs]

        for file in files:
            file_lower = file.lower()
            full_path = os.path.join(root, file)
            normalized_path = full_path.replace("\\", "/").lower()

            if file_lower in lock_files:
                continue

            if file_lower.endswith((".min.js", ".bundle.js")):
                continue

            try:
                if os.path.getsize(full_path) > max_file_bytes:
                    continue
            except OSError:
                # Unreadable entry (for example a broken symlink): skip it.
                continue

            if file in special_files:
                file_type = "config"
            elif file_lower.endswith(code_ext):
                file_type = "code"
            elif file_lower.endswith(doc_ext):
                file_type = "docs"
            elif file_lower.endswith(config_ext):
                file_type = "config"
            else:
                continue

            try:
                # errors="ignore" drops undecodable bytes, so only I/O failures
                # can be raised here.
                with open(full_path, "r", encoding="utf-8", errors="ignore") as f:
                    content = f.read()
            except OSError:
                continue

            if not content.strip():
                continue

            if file_lower == "__init__.py" and len(content) < 200:
                continue

            if file_type != "docs" and _is_comment_or_whitespace_only(content):
                continue

            code_files.append(
                {
                    "content": content,
                    "path": full_path,
                    "priority": file_type,
                    "file_name": os.path.basename(full_path),
                    "is_main_code": any(hint in normalized_path for hint in main_code_hints),
                }
            )

    return code_files
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def chunk_files(code_files):
    """Split each loaded file into overlapping text chunks.

    Args:
        code_files: Dicts with at least ``content``, ``path`` and ``priority``.

    Returns:
        A flat list of dicts with ``content`` (the chunk), ``path``,
        ``file_name`` and ``type`` (copied from the file's ``priority``),
        in file order and then chunk order.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)

    return [
        {
            "content": piece,
            "path": entry["path"],
            "file_name": os.path.basename(entry["path"]),
            "type": entry["priority"],
        }
        for entry in code_files
        for piece in splitter.split_text(entry["content"])
    ]
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def get_embeddings_model():
    """Return a HuggingFaceEmbeddings instance for the all-MiniLM-L6-v2 model."""
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _repo_collection_name(repo_name):
    """Return the Qdrant collection name holding *repo_name*'s document chunks."""
    return "repo_docs_{}".format(repo_name)
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def store_embeddings(chunks, repo_name):
    """Rebuild the repository's Qdrant collection and upload *chunks* into it.

    The collection named by ``_repo_collection_name(repo_name)`` is dropped if
    present and created again, so every ingestion starts from an empty
    collection.

    Args:
        chunks: Dicts with ``content``, ``path``, ``type`` and ``file_name``.
        repo_name: Repository name used to derive the collection name.
    """
    collection_name = _repo_collection_name(repo_name)
    client = QdrantClient(url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))

    # Explicit drop + create instead of the deprecated recreate_collection().
    if client.collection_exists(collection_name):
        client.delete_collection(collection_name)
    client.create_collection(
        collection_name=collection_name,
        # 384 is passed as the vector size for the MiniLM embedding model.
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),
    )

    vectorstore = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=get_embeddings_model(),
    )

    texts = [c["content"] for c in chunks]
    metadatas = [
        {
            "path": c["path"],
            "type": c["type"],
            "file_name": c["file_name"],
        }
        for c in chunks
    ]

    # An empty repository leaves the (empty) collection in place.
    if texts:
        vectorstore.add_texts(texts, metadatas=metadatas)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def ingest_repository(repo_url, base_dir="cloned_repo"):
    """Clone *repo_url* under *base_dir*, index its files and return its name.

    Runs clone -> load -> chunk -> store in order. The checkout lives at
    ``<base_dir>/<repo_name>``.

    Returns:
        The repository name produced by ``get_repo_name``.
    """
    repo_name = get_repo_name(repo_url)
    checkout = clone_repo(repo_url, local_path=os.path.join(base_dir, repo_name))
    loaded = load_code_files(checkout)
    store_embeddings(chunk_files(loaded), repo_name)
    return repo_name
|
learnings.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
os
|
| 2 |
+
json operations
|
| 3 |
+
files
|
| 4 |
+
argparse
|
main.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ingest import clone_repo, load_code_files, chunk_files
|
| 2 |
+
from embed_store import get_embeddings, store_embeddings
|
| 3 |
+
from query import ask_question
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def ingest_repository(repo_url):
    """Run the CLI ingestion pipeline for *repo_url*, printing each stage."""
    print("Cloning repository...")
    checkout_dir = clone_repo(repo_url)

    print("Loading code files...")
    source_files = load_code_files(checkout_dir)

    print("Chunking files...")
    pieces = chunk_files(source_files)

    print("Generating embeddings...")
    embedder = get_embeddings()

    print("Storing embeddings...")
    store_embeddings(pieces, embedder)

    print("Repository ingestion completed.")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
if __name__ == "__main__":
    # Interactive entry point: ingest one repository, then answer questions
    # until the user types "exit".
    repo_url = input("Repository URL: ").strip()
    if not repo_url:
        raise ValueError("Repository URL cannot be empty.")

    ingest_repository(repo_url)

    while (question := input(">> ").strip()).lower() != "exit":
        if question:
            # NOTE(review): api.py calls ask_question(question, repo_name);
            # confirm the one-argument call here is still supported.
            answer, _ = ask_question(question)
            print("\nAnswer:\n", answer)
|
query.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import os
|
| 3 |
+
from qdrant_client import QdrantClient
|
| 4 |
+
from qdrant_client.models import Distance, VectorParams
|
| 5 |
+
from langchain_qdrant import QdrantVectorStore
|
| 6 |
+
from langchain_groq import ChatGroq
|
| 7 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Runs at import time, before the os.getenv lookups in this module; presumably
# loads variables from a local .env file (python-dotenv) - confirm.
load_dotenv()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# In-process cache: repo name -> vector store over that repo's document
# collection. Populated by get_vectorstore.
VECTORSTORE_CACHE = {}
|
| 15 |
+
# In-process cache: repo name -> vector store over that repo's memory
# collection. Populated by get_memory_vectorstore.
MEMORY_CACHE = {}
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _repo_collection_name(repo_name):
    """Return the Qdrant collection name holding *repo_name*'s document chunks."""
    return "repo_docs_{}".format(repo_name)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _memory_collection_name(repo_name):
    """Return the Qdrant collection name used for *repo_name*'s memory store."""
    return "memory_{}".format(repo_name)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def get_embeddings_model():
    """Return a HuggingFaceEmbeddings instance for the all-MiniLM-L6-v2 model."""
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def get_llm():
    """Build the Groq chat model used to answer questions.

    Returns:
        A ``ChatGroq`` client for ``llama-3.1-8b-instant`` at temperature 0.

    Raises:
        ValueError: If the ``GROQ_API_KEY`` environment variable is unset or
            empty.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    if api_key:
        return ChatGroq(
            model="llama-3.1-8b-instant",
            temperature=0,
            api_key=api_key,
        )
    raise ValueError("GROQ_API_KEY is not set")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _invoke_text(llm, prompt):
    """Invoke *llm* with *prompt* and normalise the reply to a plain string.

    A ``str`` result is returned unchanged. Otherwise the result's ``content``
    attribute (default ``""``) is used: a list is flattened by concatenating
    its string items and the truthy ``"text"`` values of its dict items, and
    anything else goes through ``str()``.
    """
    response = llm.invoke(prompt)
    if isinstance(response, str):
        return response

    payload = getattr(response, "content", "")
    if not isinstance(payload, list):
        return str(payload)

    def _fragments():
        # Items that are neither str nor dict contribute nothing.
        for piece in payload:
            if isinstance(piece, str):
                yield piece
            elif isinstance(piece, dict) and (text := piece.get("text")):
                yield text

    return "".join(_fragments())
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _get_client():
    """Create a Qdrant client from the ``QDRANT_URL`` / ``QDRANT_API_KEY`` env vars.

    Neither variable is validated here; unset values are passed through to
    ``QdrantClient`` as ``None``.
    """
    endpoint = os.getenv("QDRANT_URL")
    secret = os.getenv("QDRANT_API_KEY")
    return QdrantClient(url=endpoint, api_key=secret)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _ensure_collection(client, collection_name):
|
| 70 |
+
if not client.collection_exists(collection_name):
|
| 71 |
+
client.create_collection(
|
| 72 |
+
collection_name=collection_name,
|
| 73 |
+
vectors_config=VectorParams(size=384, distance=Distance.COSINE),
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def get_vectorstore(repo_name):
    """Return the vector store for ``repo_name``'s documents, building it once.

    On a cache miss this creates a Qdrant client, makes sure the repository's
    collection exists, wraps it in a ``QdrantVectorStore`` and remembers the
    result in ``VECTORSTORE_CACHE``.
    """
    try:
        return VECTORSTORE_CACHE[repo_name]
    except KeyError:
        pass  # not built yet for this repository

    qdrant = _get_client()
    embedder = get_embeddings_model()
    name = _repo_collection_name(repo_name)
    _ensure_collection(qdrant, name)

    store = QdrantVectorStore(
        client=qdrant,
        collection_name=name,
        embedding=embedder,
    )
    VECTORSTORE_CACHE[repo_name] = store
    return store
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def get_memory_vectorstore(repo_name):
    """Return the conversation-memory vector store for ``repo_name``.

    On a cache miss this creates a Qdrant client, makes sure the memory
    collection exists, wraps it in a ``QdrantVectorStore`` and remembers the
    result in ``MEMORY_CACHE``.
    """
    try:
        return MEMORY_CACHE[repo_name]
    except KeyError:
        pass  # not built yet for this repository

    qdrant = _get_client()
    embedder = get_embeddings_model()
    name = _memory_collection_name(repo_name)
    _ensure_collection(qdrant, name)

    store = QdrantVectorStore(
        client=qdrant,
        collection_name=name,
        embedding=embedder,
    )
    MEMORY_CACHE[repo_name] = store
    return store
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def initialize_repo_caches(repo_name):
    """Warm both per-repository caches (documents first, then memory)."""
    for build_store in (get_vectorstore, get_memory_vectorstore):
        build_store(repo_name)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def store_memory(query, response, repo_name):
    """Persist one question/answer exchange in the repository's memory store.

    Queries of 10 characters or fewer (after trimming whitespace) are not
    stored. Otherwise the exchange is saved as a single text with
    ``type="memory"`` and the current Unix timestamp as metadata.
    """
    trimmed_length = len(query.strip())
    if not trimmed_length > 10:
        return

    exchange = "User: {}\nAssistant: {}".format(query, response)
    store = get_memory_vectorstore(repo_name)

    # Timestamp is taken after the store lookup, right before the write.
    record_meta = {"type": "memory", "timestamp": time.time()}
    store.add_texts([exchange], metadatas=[record_meta])
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def get_retriever(vectorstore):
    """Return the default MMR retriever: 6 results chosen from 24 candidates."""
    mmr_settings = {"k": 6, "fetch_k": 24}
    return vectorstore.as_retriever(search_type="mmr", search_kwargs=mmr_settings)
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _get_overview_retriever(vectorstore):
|
| 148 |
+
return vectorstore.as_retriever(
|
| 149 |
+
search_type="mmr",
|
| 150 |
+
search_kwargs={"k": 10, "fetch_k": 40},
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def _looks_code_intent(query):
|
| 155 |
+
q = query.lower()
|
| 156 |
+
code_signals = [
|
| 157 |
+
"function", "method", "class", "module", "file", "implementation", "logic",
|
| 158 |
+
"algorithm", "predict", "prediction", "how does", "how is", "where is", "call",
|
| 159 |
+
"returns", "parameter", "bug", "error", "traceback", "stack", "refactor"
|
| 160 |
+
]
|
| 161 |
+
return any(signal in q for signal in code_signals)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def _looks_overview_intent(query):
|
| 165 |
+
q = query.lower().strip()
|
| 166 |
+
overview_signals = [
|
| 167 |
+
"what does this repository do",
|
| 168 |
+
"what does this repo do",
|
| 169 |
+
"what is this repository",
|
| 170 |
+
"what is this repo",
|
| 171 |
+
"repository summary",
|
| 172 |
+
"repo summary",
|
| 173 |
+
"overview",
|
| 174 |
+
"high level",
|
| 175 |
+
"purpose of",
|
| 176 |
+
]
|
| 177 |
+
return any(signal in q for signal in overview_signals)
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def _select_diverse_docs(docs, max_docs=8, max_per_path=2):
|
| 181 |
+
selected = []
|
| 182 |
+
per_path = {}
|
| 183 |
+
|
| 184 |
+
for doc in docs:
|
| 185 |
+
path = doc.metadata.get("path", "")
|
| 186 |
+
count = per_path.get(path, 0)
|
| 187 |
+
if count >= max_per_path:
|
| 188 |
+
continue
|
| 189 |
+
selected.append(doc)
|
| 190 |
+
per_path[path] = count + 1
|
| 191 |
+
if len(selected) >= max_docs:
|
| 192 |
+
break
|
| 193 |
+
|
| 194 |
+
return selected or docs[:max_docs]
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def _rewrite_query(question, conversation_chunks, llm):
|
| 198 |
+
if not conversation_chunks:
|
| 199 |
+
return question
|
| 200 |
+
|
| 201 |
+
memory_context = "\n\n".join(conversation_chunks)
|
| 202 |
+
rewrite_prompt = f"""
|
| 203 |
+
Rewrite the user question into a standalone retrieval query.
|
| 204 |
+
Use relevant details from prior conversation only when needed to resolve references.
|
| 205 |
+
Keep technical names, filenames, class names, and function names unchanged.
|
| 206 |
+
Return only the rewritten query.
|
| 207 |
+
|
| 208 |
+
Relevant Past Conversation:
|
| 209 |
+
{memory_context}
|
| 210 |
+
|
| 211 |
+
Original Question:
|
| 212 |
+
{question}
|
| 213 |
+
"""
|
| 214 |
+
|
| 215 |
+
rewritten = _invoke_text(llm, rewrite_prompt).strip()
|
| 216 |
+
if not rewritten:
|
| 217 |
+
return question
|
| 218 |
+
|
| 219 |
+
rewritten = rewritten.replace("\n", " ").strip('"\' ')
|
| 220 |
+
return rewritten or question
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def ask_question(query, repo_name):
    """Answer ``query`` about ``repo_name`` using retrieved code and past chats.

    Steps:
      1. Fetch up to 3 related past exchanges from the memory store.
      2. Rewrite the query into a standalone retrieval query.
      3. Retrieve repository documents (wider retrieval for overview
         questions) and thin them out per path.
      4. For code-oriented, non-overview questions, prefer documents whose
         metadata ``type`` is ``"code"`` when any were retrieved.
      5. Ask the LLM, store the exchange in memory, and return the result.

    Args:
        query: The user's question.
        repo_name: Identifier used to pick the repository's collections.

    Returns:
        A ``(response, repo_docs)`` tuple: the answer text and the documents
        that were placed in the prompt.

    Raises:
        ValueError: propagated from get_llm() when ``GROQ_API_KEY`` is unset.
    """
    vectorstore = get_vectorstore(repo_name)
    llm = get_llm()

    memory_store = get_memory_vectorstore(repo_name)
    memory_retriever = memory_store.as_retriever(search_kwargs={"k": 3})
    memory_docs = memory_retriever.invoke(query)

    conversation_chunks = [d.page_content for d in memory_docs]
    rewritten_query = _rewrite_query(query, conversation_chunks, llm)

    # Intent is checked on both phrasings: the rewrite may surface a signal
    # the original wording lacked, and vice versa.
    is_overview_query = _looks_overview_intent(query) or _looks_overview_intent(rewritten_query)
    retriever = _get_overview_retriever(vectorstore) if is_overview_query else get_retriever(vectorstore)

    repo_docs = retriever.invoke(rewritten_query)
    repo_docs = _select_diverse_docs(repo_docs, max_docs=10 if is_overview_query else 8)

    if (not is_overview_query) and (_looks_code_intent(query) or _looks_code_intent(rewritten_query)):
        code_docs = [d for d in repo_docs if d.metadata.get("type") == "code"]
        # Only narrow to code chunks when there are some; otherwise keep
        # whatever was retrieved rather than answering from nothing.
        if code_docs:
            repo_docs = _select_diverse_docs(code_docs, max_docs=8)

    # Reuse the chunks extracted above instead of re-reading memory_docs, and
    # give both sections the same explicit "None" marker when empty.
    conversation_context = "\n\n".join(conversation_chunks) or "None"
    code_context = "\n\n".join(doc.page_content for doc in repo_docs) or "None"

    context = (
        f"Relevant Past Conversation:\n{conversation_context}\n\n"
        f"Relevant Code Context:\n{code_context}\n\n"
        f"Question:\n{query}"
    )

    prompt = f"""
You are a senior software engineer.

Use:
* Relevant Past Conversation to resolve references like "that function"
* Relevant Code Context for factual answers

If exact answer is missing, infer logically from code and mention it is an inference.

Be concise and technical.

Context:
{context}
"""

    response = _invoke_text(llm, prompt)

    store_memory(query, response, repo_name)

    return response, repo_docs
|
requirements.txt
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
anyio==4.13.0
|
| 2 |
+
apturl==0.5.2
|
| 3 |
+
argon2-cffi==25.1.0
|
| 4 |
+
argon2-cffi-bindings==25.1.0
|
| 5 |
+
arrow==1.4.0
|
| 6 |
+
asttokens==3.0.1
|
| 7 |
+
async-lru==2.3.0
|
| 8 |
+
attrs==26.1.0
|
| 9 |
+
babel==2.18.0
|
| 10 |
+
bcrypt==3.2.0
|
| 11 |
+
beautifulsoup4==4.14.3
|
| 12 |
+
bleach==6.3.0
|
| 13 |
+
blinker==1.4
|
| 14 |
+
Brlapi==0.8.3
|
| 15 |
+
certifi==2026.2.25
|
| 16 |
+
cffi==2.0.0
|
| 17 |
+
chardet==4.0.0
|
| 18 |
+
charset-normalizer==3.4.7
|
| 19 |
+
click==8.0.3
|
| 20 |
+
colorama==0.4.4
|
| 21 |
+
comm==0.2.3
|
| 22 |
+
command-not-found==0.3
|
| 23 |
+
contourpy==1.3.2
|
| 24 |
+
cryptography==3.4.8
|
| 25 |
+
cupshelpers==1.0
|
| 26 |
+
cycler==0.12.1
|
| 27 |
+
dbus-python==1.2.18
|
| 28 |
+
debugpy==1.8.20
|
| 29 |
+
decorator==5.2.1
|
| 30 |
+
defer==1.0.6
|
| 31 |
+
defusedxml==0.7.1
|
| 32 |
+
distro==1.7.0
|
| 33 |
+
distro-info==1.1+ubuntu0.2
|
| 34 |
+
duplicity==0.8.21
|
| 35 |
+
exceptiongroup==1.3.1
|
| 36 |
+
executing==2.2.1
|
| 37 |
+
fasteners==0.14.1
|
| 38 |
+
fastjsonschema==2.21.2
|
| 39 |
+
filelock==3.25.2
|
| 40 |
+
fonttools==4.62.1
|
| 41 |
+
fqdn==1.5.1
|
| 42 |
+
fsspec==2026.2.0
|
| 43 |
+
future==0.18.2
|
| 44 |
+
h11==0.16.0
|
| 45 |
+
httpcore==1.0.9
|
| 46 |
+
httplib2==0.20.2
|
| 47 |
+
httpx==0.28.1
|
| 48 |
+
idna==3.3
|
| 49 |
+
importlib-metadata==4.6.4
|
| 50 |
+
ipykernel==7.2.0
|
| 51 |
+
ipython==8.39.0
|
| 52 |
+
isoduration==20.11.0
|
| 53 |
+
jedi==0.19.2
|
| 54 |
+
jeepney==0.7.1
|
| 55 |
+
Jinja2==3.1.6
|
| 56 |
+
joblib==1.5.3
|
| 57 |
+
json5==0.14.0
|
| 58 |
+
jsonpointer==3.1.1
|
| 59 |
+
jsonschema==4.26.0
|
| 60 |
+
jsonschema-specifications==2025.9.1
|
| 61 |
+
jupyter-events==0.12.0
|
| 62 |
+
jupyter-lsp==2.3.1
|
| 63 |
+
jupyter_client==8.8.0
|
| 64 |
+
jupyter_core==5.9.1
|
| 65 |
+
jupyter_server==2.17.0
|
| 66 |
+
jupyter_server_terminals==0.5.4
|
| 67 |
+
jupyterlab==4.5.6
|
| 68 |
+
jupyterlab_pygments==0.3.0
|
| 69 |
+
jupyterlab_server==2.28.0
|
| 70 |
+
keyring==23.5.0
|
| 71 |
+
kiwisolver==1.5.0
|
| 72 |
+
language-selector==0.1
|
| 73 |
+
lark==1.3.1
|
| 74 |
+
launchpadlib==1.10.16
|
| 75 |
+
lazr.restfulclient==0.14.4
|
| 76 |
+
lazr.uri==1.0.6
|
| 77 |
+
lockfile==0.12.2
|
| 78 |
+
louis==3.20.0
|
| 79 |
+
macaroonbakery==1.3.1
|
| 80 |
+
Mako==1.1.3
|
| 81 |
+
MarkupSafe==2.0.1
|
| 82 |
+
matplotlib==3.10.8
|
| 83 |
+
matplotlib-inline==0.2.1
|
| 84 |
+
mistune==3.2.0
|
| 85 |
+
monotonic==1.6
|
| 86 |
+
more-itertools==8.10.0
|
| 87 |
+
mpmath==1.3.0
|
| 88 |
+
nbclient==0.10.4
|
| 89 |
+
nbconvert==7.17.1
|
| 90 |
+
nbformat==5.10.4
|
| 91 |
+
nest-asyncio==1.6.0
|
| 92 |
+
netifaces==0.11.0
|
| 93 |
+
networkx==3.4.2
|
| 94 |
+
notebook==7.5.5
|
| 95 |
+
notebook_shim==0.2.4
|
| 96 |
+
numpy==2.2.6
|
| 97 |
+
nvidia-cublas-cu12==12.1.3.1
|
| 98 |
+
nvidia-cuda-cupti-cu12==12.1.105
|
| 99 |
+
nvidia-cuda-nvrtc-cu12==12.1.105
|
| 100 |
+
nvidia-cuda-runtime-cu12==12.1.105
|
| 101 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 102 |
+
nvidia-cufft-cu12==11.0.2.54
|
| 103 |
+
nvidia-curand-cu12==10.3.2.106
|
| 104 |
+
nvidia-cusolver-cu12==11.4.5.107
|
| 105 |
+
nvidia-cusparse-cu12==12.1.0.106
|
| 106 |
+
nvidia-nccl-cu12==2.21.5
|
| 107 |
+
nvidia-nvjitlink-cu12==12.9.86
|
| 108 |
+
nvidia-nvtx-cu12==12.1.105
|
| 109 |
+
oauthlib==3.2.0
|
| 110 |
+
olefile==0.46
|
| 111 |
+
overrides==7.7.0
|
| 112 |
+
packaging==26.0
|
| 113 |
+
pandas==2.3.3
|
| 114 |
+
pandocfilters==1.5.1
|
| 115 |
+
paramiko==2.9.3
|
| 116 |
+
parso==0.8.6
|
| 117 |
+
pexpect==4.8.0
|
| 118 |
+
Pillow==9.0.1
|
| 119 |
+
platformdirs==4.9.6
|
| 120 |
+
prometheus_client==0.25.0
|
| 121 |
+
prompt_toolkit==3.0.52
|
| 122 |
+
protobuf==3.12.4
|
| 123 |
+
psutil==7.2.2
|
| 124 |
+
ptyprocess==0.7.0
|
| 125 |
+
pure_eval==0.2.3
|
| 126 |
+
pycairo==1.20.1
|
| 127 |
+
pycparser==3.0
|
| 128 |
+
pycups==2.0.1
|
| 129 |
+
Pygments==2.20.0
|
| 130 |
+
PyGObject==3.42.1
|
| 131 |
+
PyJWT==2.3.0
|
| 132 |
+
pymacaroons==0.13.0
|
| 133 |
+
PyNaCl==1.5.0
|
| 134 |
+
pyparsing==3.3.2
|
| 135 |
+
pyRFC3339==1.1
|
| 136 |
+
python-apt==2.4.0+ubuntu4.1
|
| 137 |
+
python-dateutil==2.9.0.post0
|
| 138 |
+
python-debian==0.1.43+ubuntu1.1
|
| 139 |
+
python-json-logger==4.1.0
|
| 140 |
+
pytz==2022.1
|
| 141 |
+
pyxdg==0.27
|
| 142 |
+
PyYAML==5.4.1
|
| 143 |
+
pyzmq==27.1.0
|
| 144 |
+
referencing==0.37.0
|
| 145 |
+
reportlab==3.6.8
|
| 146 |
+
requests==2.33.1
|
| 147 |
+
rfc3339-validator==0.1.4
|
| 148 |
+
rfc3986-validator==0.1.1
|
| 149 |
+
rfc3987-syntax==1.1.0
|
| 150 |
+
rpds-py==0.30.0
|
| 151 |
+
scikit-learn==1.7.2
|
| 152 |
+
scipy==1.15.3
|
| 153 |
+
screen-resolution-extra==0.0.0
|
| 154 |
+
SecretStorage==3.3.1
|
| 155 |
+
Send2Trash==2.1.0
|
| 156 |
+
six==1.16.0
|
| 157 |
+
soupsieve==2.8.3
|
| 158 |
+
stack-data==0.6.3
|
| 159 |
+
sympy==1.13.1
|
| 160 |
+
systemd-python==234
|
| 161 |
+
terminado==0.18.1
|
| 162 |
+
threadpoolctl==3.6.0
|
| 163 |
+
tinycss2==1.4.0
|
| 164 |
+
tomli==2.4.1
|
| 165 |
+
torch==2.5.1+cu121
|
| 166 |
+
torchaudio==2.5.1+cu121
|
| 167 |
+
torchvision==0.20.1+cu121
|
| 168 |
+
tornado==6.5.5
|
| 169 |
+
tqdm==4.67.3
|
| 170 |
+
traitlets==5.14.3
|
| 171 |
+
triton==3.1.0
|
| 172 |
+
typing_extensions==4.15.0
|
| 173 |
+
tzdata==2026.1
|
| 174 |
+
ubuntu-drivers-common==0.0.0
|
| 175 |
+
ubuntu-pro-client==8001
|
| 176 |
+
ufw==0.36.1
|
| 177 |
+
unattended-upgrades==0.1
|
| 178 |
+
uri-template==1.3.0
|
| 179 |
+
urllib3==1.26.5
|
| 180 |
+
usb-creator==0.3.7
|
| 181 |
+
uv==0.11.3
|
| 182 |
+
wadllib==1.3.6
|
| 183 |
+
wcwidth==0.6.0
|
| 184 |
+
webcolors==25.10.0
|
| 185 |
+
webencodings==0.5.1
|
| 186 |
+
websocket-client==1.9.0
|
| 187 |
+
xdg==5
|
| 188 |
+
xkit==0.0.0
|
| 189 |
+
zipp==1.0.0
|