File size: 11,528 Bytes
130bc23
 
 
67baef8
 
613f1c3
 
bd40b81
 
613f1c3
130bc23
 
0e82851
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130bc23
 
 
 
 
 
 
 
 
 
 
 
 
 
bd40b81
 
130bc23
 
 
 
 
 
 
 
 
63f5a60
 
bd40b81
63f5a60
130bc23
 
 
 
 
 
 
0e82851
 
 
130bc23
0e82851
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130bc23
 
0e82851
 
 
 
 
 
 
bd40b81
 
0e82851
 
 
 
 
 
 
 
 
bd40b81
 
0e82851
 
 
 
 
130bc23
bd40b81
 
 
 
 
0e82851
130bc23
0e82851
130bc23
 
 
 
 
 
 
bd40b81
67baef8
 
bd40b81
 
 
 
 
130bc23
bd40b81
 
130bc23
 
bd40b81
130bc23
 
 
 
 
 
67baef8
 
 
bd40b81
67baef8
130bc23
 
 
 
 
 
 
 
 
 
 
67baef8
 
 
 
 
0e82851
 
 
67baef8
 
 
 
0e82851
 
 
67baef8
0e82851
67baef8
 
 
 
 
0e82851
67baef8
0e82851
67baef8
 
 
0e82851
 
 
67baef8
 
 
 
 
0e82851
 
 
 
67baef8
 
 
 
 
0e82851
67baef8
0e82851
67baef8
 
130bc23
0e82851
 
 
130bc23
0e82851
5d3f445
 
130bc23
 
0e82851
 
 
 
 
 
 
130bc23
0e82851
 
 
 
 
 
 
 
5d3f445
 
0e82851
 
5d3f445
 
0e82851
130bc23
 
67baef8
130bc23
0e82851
 
 
 
130bc23
 
67baef8
 
0e82851
 
67baef8
63f5a60
130bc23
 
0e82851
130bc23
 
 
0e82851
130bc23
67baef8
 
0e82851
 
 
 
 
 
 
 
 
 
 
 
67baef8
 
 
130bc23
 
0e82851
e13b96a
0e82851
 
 
 
130bc23
 
613f1c3
130bc23
 
63f5a60
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
import gradio as gr
import torch
import os
import shutil
import subprocess

# Turn off the tokenizers library's own parallelism. Set before the
# transformers import below so the value is already in the environment.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Give PyTorch one intra-op thread per CPU the OS reports, falling back to 2
# when os.cpu_count() returns None.
# NOTE(review): inside a container os.cpu_count() may report more CPUs than
# are actually allotted, which could oversubscribe threads - confirm.
torch.set_num_threads(os.cpu_count() or 2)

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
from langchain_core.documents import Document

# Lower-cased file extension -> langchain ``Language`` whose separators are
# used when splitting files of that type. Extensions missing from this table
# are split with the generic character splitter.
EXTENSION_TO_LANGUAGE = {
    # Scripting languages
    ".py": Language.PYTHON,
    ".rb": Language.RUBY,
    # JavaScript family (TypeScript shares the JS splitting rules here)
    ".js": Language.JS,
    ".ts": Language.JS,
    # JVM
    ".java": Language.JAVA,
    # C family (C sources and headers share the C++ splitting rules here)
    ".cpp": Language.CPP,
    ".c": Language.CPP,
    ".h": Language.CPP,
    # Other compiled languages
    ".go": Language.GO,
    ".rs": Language.RUST,
    # Markup
    ".html": Language.HTML,
    ".md": Language.MARKDOWN,
}
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_community.vectorstores import FAISS
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

# 1. HARDWARE OPTIMIZED LLM LOADING

def load_llm():
    """Build the LangChain LLM wrapper around a CPU-hosted Qwen coder model.

    Loads the tokenizer and float32 weights of
    ``Qwen/Qwen2.5-Coder-0.5B-Instruct`` onto the CPU, wraps them in a
    transformers ``text-generation`` pipeline that returns only the newly
    generated text, and exposes that pipeline as a ``HuggingFacePipeline``.

    Returns:
        A ``HuggingFacePipeline`` that generates at most 512 new tokens.
    """
    checkpoint = "Qwen/Qwen2.5-Coder-0.5B-Instruct"

    # NOTE(review): transformers only applies `temperature` when sampling is
    # enabled (do_sample=True) - confirm whether greedy decoding is intended.
    generation_settings = {
        "max_new_tokens": 512,
        "temperature": 0.1,
        "repetition_penalty": 1.1,
    }

    qwen_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    qwen_model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        device_map="cpu",
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
    )

    text_generator = pipeline(
        "text-generation",
        model=qwen_model,
        tokenizer=qwen_tokenizer,
        return_full_text=False,  # keep the prompt out of the returned text
    )

    return HuggingFacePipeline(
        pipeline=text_generator,
        pipeline_kwargs=generation_settings,
    )

# 2. CODE INGESTION & VECTOR DATABASE

def setup_vector_db():
    """Index every readable text file under ``./repo`` into a FAISS store.

    Creates ``./repo`` when it is missing, walks it while skipping Git
    metadata directories, and reads each file as UTF-8. Files whose extension
    is listed in ``EXTENSION_TO_LANGUAGE`` are split with language-aware
    separators; all others (and any language the installed splitter rejects)
    are split with the generic splitter. Chunks are 1000 characters with a
    200-character overlap and are embedded on the CPU with
    ``all-MiniLM-L6-v2`` (embeddings are not normalized).

    Returns:
        ``(db, file_count)`` where ``db`` is the FAISS vector store and
        ``file_count`` is the number of files successfully read, or
        ``(None, 0)`` when no file could be read or no chunk was produced.
    """
    os.makedirs('./repo', exist_ok=True)

    docs_by_language = {}
    generic_docs = []
    file_count = 0

    for root, dirs, files in os.walk('./repo'):
        # Prune Git metadata by exact directory name. A substring test on the
        # path would also drop unrelated directories such as ".github" or
        # "user.github.io", and would not stop os.walk from descending.
        dirs[:] = [d for d in dirs if d != '.git']

        for file in files:
            file_path = os.path.join(root, file)
            ext = os.path.splitext(file)[1].lower()
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except (UnicodeDecodeError, OSError):
                # Binary content, or a file that cannot be opened (permissions,
                # dangling symlink, ...): skip it rather than abort the index.
                continue

            doc = Document(page_content=content, metadata={"source": file_path})
            file_count += 1

            lang = EXTENSION_TO_LANGUAGE.get(ext)
            if lang:
                docs_by_language.setdefault(lang, []).append(doc)
            else:
                generic_docs.append(doc)

    if file_count == 0:
        return None, 0

    all_splits = []

    # Split documents by specific language rules
    for lang, docs in docs_by_language.items():
        try:
            splitter = RecursiveCharacterTextSplitter.from_language(
                language=lang,
                chunk_size=1000,
                chunk_overlap=200
            )
            all_splits.extend(splitter.split_documents(docs))
        except Exception:
            # Deliberate best-effort fallback: if the installed langchain
            # version does not support this language, split it generically.
            generic_docs.extend(docs)

    # Split generic documents
    if generic_docs:
        generic_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        all_splits.extend(generic_splitter.split_documents(generic_docs))

    # Every file may have been empty, leaving nothing to embed.
    if not all_splits:
        return None, 0

    embeddings = HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False}
    )
    db = FAISS.from_documents(all_splits, embeddings)

    return db, file_count

# 3. GLOBAL INITIALIZATION
print("Initializing models...")
# Hardware label shown in the UI status box (emoji restored from mis-decoded
# UTF-8 text).
# NOTE(review): this reflects CUDA availability only; load_llm() pins the
# model to the CPU regardless - confirm the label is what is intended.
device_status = "🟢 GPU Active" if torch.cuda.is_available() else "🟡 CPU Mode"
# Loaded once at import time and reused by every chain built afterwards.
llm = load_llm()
# Index whatever is already present in ./repo; vector_db is None when empty.
vector_db, file_count = setup_vector_db()

# Prompt sent to the LLM. `{context}` receives the retrieved chunks joined by
# format_docs and `{input}` receives the user's question (see build_qa_chain).
# The text is part of the model's input, so edits here change model behavior.
prompt_template = """You are an expert Software Engineer and Codebase Assistant. Your ONLY purpose is to answer questions related to the provided codebase or general programming/coding questions. 
If the user asks a question that is NOT related to coding, programming, or the provided codebase, you must politely refuse to answer and remind them that you are a code-focused assistant.

When answering:
1. Carefully analyze the provided context.
2. Provide a clear, step-by-step explanation.
3. If providing code, use markdown code blocks.
4. If the answer cannot be found in the context, explicitly state that you don't know rather than hallucinating.

Codebase Context:
{context}

Question: {input}
Expert Developer Answer:"""

# Compiled template with the two input variables named above.
prompt = PromptTemplate.from_template(prompt_template)

def format_docs(docs):
    """Join the ``page_content`` of each document, separated by blank lines."""
    chunks = [entry.page_content for entry in docs]
    separator = "\n\n"
    return separator.join(chunks)

def build_qa_chain(db):
    """Assemble the retrieval-augmented QA chain for a vector store.

    Args:
        db: Vector store exposing ``as_retriever``; a falsy value means
            nothing has been indexed yet.

    Returns:
        ``None`` when ``db`` is falsy. Otherwise a runnable that takes the
        question string and yields a dict with ``input`` (the question),
        ``context`` (the 5 retrieved documents) and ``answer`` (the LLM's
        text reply).
    """
    if not db:
        return None

    top_k_retriever = db.as_retriever(search_kwargs={"k": 5})

    def _join_context(inputs):
        # Collapse the retrieved documents into one text block for the prompt.
        return format_docs(inputs["context"])

    # Sub-chain that turns {context docs, input} into the answer string.
    answer_chain = (
        RunnablePassthrough.assign(context=_join_context)
        | prompt
        | llm
        | StrOutputParser()
    )

    # Fan the question out to the retriever and pass it through unchanged.
    retrieval_step = {"context": top_k_retriever, "input": RunnablePassthrough()}

    # Keep the raw documents alongside the answer so callers can cite them.
    return retrieval_step | RunnablePassthrough.assign(answer=answer_chain)

# Chain used by the chat handler; None while nothing has been indexed. It is
# rebuilt by clone_and_index / upload_and_index after each re-index.
qa_chain = build_qa_chain(vector_db)

# 4. INGESTION FUNCTIONS
def clone_and_index(repo_url):
    """Clone a Git repository into ``./repo`` and rebuild the search index.

    Any existing ``./repo`` directory is removed before cloning. When the
    clone succeeds, the module-level ``vector_db``, ``file_count`` and
    ``qa_chain`` are replaced with ones built from the new checkout.

    Args:
        repo_url: Repository URL typed by the user; surrounding whitespace is
            ignored. ``None`` or a blank string is rejected.

    Returns:
        A Markdown status message for the UI: success with the indexed file
        count, a warning when nothing could be indexed, or the clone error.
    """
    global vector_db, file_count, qa_chain
    url = repo_url.strip() if repo_url else ""
    if not url:
        return "⚠️ Please enter a valid GitHub URL."

    if os.path.exists('./repo'):
        shutil.rmtree('./repo')

    try:
        # "--" ends option parsing, so user input that starts with "-" is
        # treated as a repository location and never as a git option.
        subprocess.run(
            ["git", "clone", "--", url, "./repo"],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        return f"❌ Failed to clone repo. Error: {e.stderr}"
    except Exception as e:
        # e.g. the git executable is missing from PATH.
        return f"❌ Failed to clone repo: {str(e)}"

    vector_db, file_count = setup_vector_db()
    qa_chain = build_qa_chain(vector_db)

    if vector_db:
        return f"✅ Success! {file_count} files indexed from `{url}`"
    return f"⚠️ Warning: No valid text files found in `{url}`"

def upload_and_index(files):
    """Copy uploaded files into a fresh ``./repo`` and rebuild the index.

    Any existing ``./repo`` directory is replaced by an empty one, each
    upload is copied into it under its base name, and the module-level
    ``vector_db``, ``file_count`` and ``qa_chain`` are rebuilt.

    Args:
        files: Uploaded items from the Gradio file component; each is either
            an object with a ``name`` attribute holding its path or a plain
            path string. ``None`` or an empty list is rejected.

    Returns:
        A Markdown status message for the UI: success with the indexed file
        count, a warning when nothing could be indexed, or the copy error.
    """
    global vector_db, file_count, qa_chain
    if not files:
        return "⚠️ No files were uploaded."

    if os.path.exists('./repo'):
        shutil.rmtree('./repo')
    os.makedirs('./repo', exist_ok=True)

    for file in files:
        # Handle both Gradio 3 (filepath string) and Gradio 4 (File object)
        file_path = getattr(file, "name", str(file))
        base_name = os.path.basename(file_path)
        # NOTE(review): uploads are flattened to their base name, so two files
        # with the same name overwrite each other - confirm this is acceptable.
        dest_path = os.path.join('./repo', base_name)
        try:
            shutil.copy(file_path, dest_path)
        except OSError as e:
            # Report the failure in the status box, as clone_and_index does,
            # instead of letting the exception escape the UI callback.
            return f"❌ Failed to copy uploaded file `{base_name}`: {str(e)}"

    vector_db, file_count = setup_vector_db()
    qa_chain = build_qa_chain(vector_db)

    if vector_db:
        return f"✅ Success! {file_count} files indexed from local upload"
    return "⚠️ Warning: No valid text files found in the uploaded files"

# 5. CHAT LOGIC
# Markdown fence language for each known source extension; files with any
# other extension are shown in an unlabelled code block.
_SNIPPET_FENCE_LANGUAGES = {
    ".py": "python",
    ".js": "javascript",
    ".ts": "typescript",
    ".java": "java",
    ".cpp": "cpp",
    ".c": "c",
    ".h": "c",
    ".go": "go",
    ".rs": "rust",
    ".rb": "ruby",
    ".html": "html",
    ".md": "markdown",
}


def _longest_backtick_run(text):
    """Return the length of the longest run of consecutive backticks."""
    longest = current = 0
    for char in text:
        current = current + 1 if char == "`" else 0
        if current > longest:
            longest = current
    return longest


def _format_source_snippet(number, doc):
    """Render one retrieved chunk as a labelled Markdown code block."""
    source_file = doc.metadata.get("source", "Unknown File")
    extension = os.path.splitext(source_file)[1].lower()
    language = _SNIPPET_FENCE_LANGUAGES.get(extension, "")
    # The fence must be longer than any backtick run inside the chunk,
    # otherwise content such as a Markdown file would close the block early.
    fence = "`" * max(3, _longest_backtick_run(doc.page_content) + 1)
    return (
        f"**Snippet {number}** from `{source_file}`:\n"
        f"{fence}{language}\n{doc.page_content}\n{fence}\n\n"
    )


def respond(message, chat_history):
    """Answer one chat message and append the exchange to the history.

    Args:
        message: Text typed by the user. ``None`` or blank text is ignored.
        chat_history: List of ``{"role", "content"}`` message dicts shown in
            the chatbot; ``None`` is treated as an empty history.

    Returns:
        ``("", chat_history)``: an empty string to clear the input box and
        the history, extended with the user message and the assistant reply
        (the answer plus a collapsible list of the source snippets used, a
        welcome prompt when nothing is indexed, or an error message).
    """
    if chat_history is None:
        chat_history = []
    if not message or not message.strip():
        return "", chat_history

    if not vector_db:
        bot_message = "👋 Welcome! Please provide a repo link or upload your code files using the panel on the left to start chatting."
    else:
        try:
            # Fetch response from RAG
            response = qa_chain.invoke(message)
            bot_message = response["answer"]
            sources = response["context"]

            if sources:
                snippets = "".join(
                    _format_source_snippet(number, doc)
                    for number, doc in enumerate(sources, start=1)
                )
                bot_message += (
                    "\n\n<details><summary>🔍 View Source Code Referenced</summary>\n\n"
                    + snippets
                    + "</details>"
                )
        except Exception as e:
            # UI boundary: surface any failure in the chat instead of crashing.
            bot_message = f"❌ An error occurred during processing: {str(e)}"

    chat_history.append({"role": "user", "content": message})
    chat_history.append({"role": "assistant", "content": bot_message})
    return "", chat_history

# 6. GRADIO UI
# Stylesheet passed to demo.launch(). `.status-box` and `.instructions` match
# the elem_classes used in the UI layout; the `.dark` rules are the variants
# for those same two classes under a `.dark` ancestor.
custom_css = """
.status-box { padding: 15px; border-radius: 8px; background-color: #f0f0f0; margin-bottom: 20px; border-left: 4px solid #007bff;}
.dark .status-box { background-color: #1e293b; color: #cbd5e1; border-left: 4px solid #3b82f6;}
.instructions { font-size: 0.95em; color: #555; }
.dark .instructions { color: #aaa; }
"""

def get_initial_repo_status():
    """Return the Markdown status line shown when the UI is first built.

    Reads the module-level ``vector_db`` and ``file_count``: reports the
    number of indexed files when an index exists, otherwise asks the user to
    provide a codebase.
    """
    if vector_db:
        return f"✅ **Ready!** {file_count} files indexed and loaded."
    return "❌ **Empty Database.** Provide a codebase below to begin."

# Two-column layout: a narrow control panel (status + ingestion) on the left
# and the chat on the right. Emoji in the labels are restored from
# mis-decoded UTF-8 text.
with gr.Blocks(title="Codebase Assistant") as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("# 🦖 RepoRaptor\n**Your personal AI codebase expert.**")
            gr.Markdown("---")

            with gr.Column(elem_classes=["status-box"]):
                gr.Markdown("### 📊 System Status")
                gr.Markdown(f"**Hardware:** {device_status}")
                # Updated in place by both ingestion callbacks below.
                repo_status = gr.Markdown(get_initial_repo_status())

            gr.Markdown("### 📂 Ingest Codebase")
            gr.Markdown("Choose a method to load your codebase into the Vector Database.", elem_classes=["instructions"])

            with gr.Tabs():
                with gr.Tab("GitHub Repo"):
                    gr.Markdown("Clone a public repository directly:")
                    repo_url = gr.Textbox(placeholder="https://github.com/user/repo", show_label=False)
                    clone_btn = gr.Button("⬇️ Clone & Index", variant="primary")
                with gr.Tab("Local Upload"):
                    gr.Markdown("Upload local codebase files:")
                    local_files = gr.File(file_count="multiple", label="Upload Files")
                    upload_btn = gr.Button("📤 Upload & Index", variant="primary")

            # Each callback returns the new status message for repo_status.
            clone_btn.click(fn=clone_and_index, inputs=[repo_url], outputs=[repo_status])
            upload_btn.click(fn=upload_and_index, inputs=[local_files], outputs=[repo_status])

        with gr.Column(scale=3):
            gr.Markdown("### 💻 Chat Interface\nAsk architecture questions, find bugs, or request code explanations. I will **only** answer questions related to code.")
            chatbot = gr.Chatbot(height=600, show_label=False)

            with gr.Row():
                msg = gr.Textbox(placeholder="E.g., What does the main function do? (Press Enter to send)", show_label=False, scale=4)
                clear = gr.Button("🗑️ Clear Chat", scale=1)

            # respond returns ("", history): clears the box, refreshes the chat.
            msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
            # Reset both the input box and the conversation without queueing.
            clear.click(lambda: ("", []), inputs=None, outputs=[msg, chatbot], queue=False)

if __name__ == "__main__":
    # Listen on every network interface and apply the custom stylesheet and
    # the Soft theme at launch time.
    launch_options = {
        "server_name": "0.0.0.0",
        "css": custom_css,
        "theme": gr.themes.Soft(),
    }
    demo.launch(**launch_options)