aman1762 commited on
Commit
3bc9c63
·
verified ·
1 Parent(s): 5df0799

Upload 8 files

Browse files
Files changed (8) hide show
  1. README.md +19 -16
  2. api.py +25 -0
  3. app.py +28 -0
  4. chunker.py +16 -0
  5. ingest.py +24 -0
  6. rag_chain.py +14 -0
  7. requirements.txt +10 -3
  8. vectorstore.py +8 -0
README.md CHANGED
@@ -1,19 +1,22 @@
1
- ---
2
- title: GenAI
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: Streamlit template space
12
- ---
13
 
14
- # Welcome to Streamlit!
15
 
16
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
17
 
18
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
19
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CODEBASE-RAG-ASSISTANT
2
+ A LangChain-based RAG system that indexes a code repository and answers architecture & code-level questions using free LLMs.
 
 
 
 
 
 
 
 
 
 
3
 
 
4
 
5
+ ## Codebase RAG Assistant
6
 
7
+ An AI-powered Retrieval-Augmented Generation system that enables natural language querying of large code repositories.
8
+
9
+ ### Features
10
+ - Code-aware chunking (functions & classes)
11
+ - FAISS-based vector retrieval
12
+ - LLaMA-3 inference via Groq
13
+ - Streamlit UI
14
+ - Fully free tech stack
15
+
16
+ ### Tech Stack
17
+ LangChain, FAISS, HuggingFace Embeddings, FastAPI, Streamlit
18
+
19
+ ### Use Cases
20
+ - Codebase understanding
21
+ - Architecture exploration
22
+ - Developer onboarding
api.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from ingest import load_repo, ingest_repo
3
+ from vectorstore import create_vectorstore
4
+ from rag_chain import build_rag_chain
5
+ import os
6
+
7
+ app = FastAPI()
8
+ qa_chain = None
9
+
10
@app.post("/load")
def load_repository(repo_url: str):
    """Clone (or reuse) the repository, chunk its code, build the FAISS
    index, and wire up the module-level RAG QA chain."""
    global qa_chain

    repo_path = load_repo(repo_url)
    documents = ingest_repo(repo_path)
    index = create_vectorstore(documents)
    qa_chain = build_rag_chain(index, os.getenv("GROQ_API_KEY"))

    return {"status": "Repository indexed"}
18
+
19
@app.get("/ask")
def ask(question: str):
    """Answer a natural-language question about the indexed repository.

    Returns the LLM answer and the source file paths of the retrieved
    chunks. Requires ``/load`` to have been called first.
    """
    if qa_chain is None:
        # Guard: before /load runs, qa_chain is still None and calling it
        # raised TypeError (an HTTP 500). Return an explicit error instead.
        return {"error": "No repository indexed. Call /load first."}
    result = qa_chain(question)
    return {
        "answer": result["result"],
        "sources": [doc.metadata["file"] for doc in result["source_documents"]],
    }
app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import requests

# Backend FastAPI service (see api.py) — assumed to run locally on port 8000.
API_BASE = "http://localhost:8000"

st.title("🧠 Codebase RAG Assistant")

repo_url = st.text_input("GitHub Repository URL")

if st.button("Index Repository"):
    res = requests.post(f"{API_BASE}/load", params={"repo_url": repo_url})
    # The original reported success unconditionally, even when the backend
    # failed; surface the real HTTP status instead.
    if res.ok:
        st.success("Repository indexed!")
    else:
        st.error(f"Indexing failed (HTTP {res.status_code})")

question = st.text_input("Ask a question about the codebase")

if st.button("Ask"):
    res = requests.get(f"{API_BASE}/ask", params={"question": question}).json()

    # Guard against error payloads (e.g. asking before a repo is indexed),
    # which previously raised KeyError on "answer".
    if "answer" in res:
        st.write("### Answer")
        st.write(res["answer"])

        st.write("### Sources")
        for src in res.get("sources", []):
            st.write(src)
    else:
        st.error(res.get("error", "Request failed — is a repository indexed?"))
chunker.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from langchain.schema import Document
3
+
4
def chunk_code(file_path, code, min_chars=50):
    """Split source text into function/class-level chunks.

    Splits *code* at every newline immediately followed by ``def `` or
    ``class `` (i.e. top-level, unindented definitions — nested defs stay
    inside their parent chunk) and wraps each sufficiently large block in
    a ``Document`` tagged with its originating file path.

    Args:
        file_path: Path stored in each chunk's metadata under ``"file"``.
        code: Full source text of the file.
        min_chars: Blocks whose stripped length is not strictly greater
            than this are dropped. Defaults to 50, preserving the
            original hard-coded behavior.

    Returns:
        list[Document]: one Document per retained block.
    """
    blocks = re.split(r'\n(?=def |class )', code)
    return [
        Document(page_content=block, metadata={"file": file_path})
        for block in blocks
        if len(block.strip()) > min_chars
    ]
ingest.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from git import Repo
3
+ from chunker import chunk_code
4
+
5
+ SUPPORTED_EXT = (".py", ".js", ".java", ".cpp")
6
+
7
def load_repo(repo_url, local_dir="repo"):
    """Ensure a local checkout of *repo_url* exists and return its path.

    NOTE(review): an existing *local_dir* is reused even if it was cloned
    from a different URL — delete the directory to force a fresh clone.
    """
    if not os.path.exists(local_dir):
        Repo.clone_from(repo_url, local_dir)
    return local_dir
12
+
13
def ingest_repo(repo_path):
    """Walk *repo_path* and chunk every supported source file.

    Returns a flat list of Documents produced by ``chunk_code`` for each
    file whose extension is in SUPPORTED_EXT. Undecodable bytes are
    ignored when reading.
    """
    documents = []
    for root, _, filenames in os.walk(repo_path):
        for name in filenames:
            if not name.endswith(SUPPORTED_EXT):
                continue
            full_path = os.path.join(root, name)
            with open(full_path, "r", errors="ignore") as handle:
                source = handle.read()
            documents.extend(chunk_code(full_path, source))
    return documents
rag_chain.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.chains import RetrievalQA
2
+ from langchain.llms import Groq
3
+
4
def build_rag_chain(vectorstore, groq_api_key):
    """Create a RetrievalQA chain backed by a Groq-hosted LLaMA-3 model.

    Retrieves the 4 most similar code chunks per query and returns the
    retrieved source documents alongside the generated answer.

    NOTE(review): verify that ``langchain.llms`` actually exports ``Groq``
    in the pinned langchain version — Groq support commonly lives in a
    separate integration package. Confirm against requirements.txt.
    """
    language_model = Groq(
        api_key=groq_api_key,
        model_name="llama3-8b-8192",
    )
    retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

    return RetrievalQA.from_chain_type(
        llm=language_model,
        retriever=retriever,
        return_source_documents=True,
    )
requirements.txt CHANGED
@@ -1,3 +1,10 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain-community
3
+ sentence-transformers
4
+ faiss-cpu
5
+ fastapi
6
+ uvicorn
7
+ streamlit
8
+ gitpython
9
+ groq
10
+ python-dotenv
vectorstore.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from langchain.vectorstores import FAISS
2
+ from langchain.embeddings import HuggingFaceEmbeddings
3
+
4
def create_vectorstore(documents):
    """Embed *documents* with a MiniLM sentence-transformer and index them
    in an in-memory FAISS store."""
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_documents(documents, embedding_model)