File size: 7,658 Bytes
fa27a43
 
 
cbe96e4
 
 
 
 
6f67ae7
 
 
fa27a43
 
cbe96e4
6f67ae7
fa27a43
 
cbe96e4
fa27a43
cbe96e4
 
 
6f67ae7
cbe96e4
 
0a6f593
6f67ae7
 
 
 
 
 
 
 
0a6f593
 
 
 
 
 
 
 
 
 
 
 
6f67ae7
 
 
 
0a6f593
 
 
 
 
 
 
 
6f67ae7
 
 
 
 
 
 
 
 
0a6f593
 
 
 
 
 
 
 
 
6f67ae7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a6f593
 
cbe96e4
 
6f67ae7
cbe96e4
 
 
 
6f67ae7
cbe96e4
 
174d5ef
cbe96e4
 
 
6f67ae7
cbe96e4
6f67ae7
cbe96e4
 
 
 
174d5ef
6f67ae7
cbe96e4
6f67ae7
 
 
 
 
 
 
 
 
fa27a43
cbe96e4
 
6f67ae7
 
 
 
 
fa27a43
 
 
6f67ae7
fa27a43
6f67ae7
fa27a43
174d5ef
cbe96e4
6f67ae7
fa27a43
 
 
 
 
 
 
 
 
 
174d5ef
 
6f67ae7
174d5ef
cbe96e4
174d5ef
 
 
 
cbe96e4
 
174d5ef
cbe96e4
174d5ef
 
cbe96e4
174d5ef
cbe96e4
fa27a43
174d5ef
fa27a43
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import os
import hmac
import hashlib
import time
import jwt
import requests
import tempfile
import shutil
import ast
import faiss
import numpy as np
from fastapi import FastAPI, Request, HTTPException, status
from dotenv import load_dotenv
from git import Repo
from sentence_transformers import SentenceTransformer

# --- Configuration ---
# Load variables from a local .env file (if present) into the environment
# before the settings below are read.
load_dotenv()
# Shared secret used to check the HMAC signature of incoming webhooks.
# os.getenv returns None when a variable is unset.
GITHUB_WEBHOOK_SECRET = os.getenv("GITHUB_WEBHOOK_SECRET")
# GitHub App ID; used as the `iss` claim when the app JWT is created.
GITHUB_APP_ID = os.getenv("GITHUB_APP_ID")
# Private key used to sign the app JWT with RS256.
GITHUB_PRIVATE_KEY = os.getenv("GITHUB_PRIVATE_KEY")

# In-memory storage for our repository data, keyed by repository full name.
# Each entry holds "status", "faiss_index" and "metadata"; because this is a
# plain dict, its contents live only as long as the process does.
repo_data_store = {}

# --- Code Processing and Vectorization Class ---

class CodeProcessor:
    """Extract Python functions from a repository and index them with FAISS.

    The embedding model is loaded once in ``__init__`` and reused for every
    repository passed to ``process_repo``.
    """

    # Directories that hold no indexable source and are pruned from the walk.
    _SKIPPED_DIRS = frozenset({".git"})

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the SentenceTransformer model used to embed function source.

        Args:
            model_name: Name of the SentenceTransformer model to load.
        """
        print("Initializing SentenceTransformer model...")
        self.model = SentenceTransformer(model_name)
        print("Model initialized.")

    def parse_python_file(self, file_path):
        """Parses a Python file to extract functions and their source code using AST.

        Both ``def`` and ``async def`` functions are collected, at any nesting
        level (methods and inner functions included).

        Args:
            file_path: Path of the file to parse.

        Returns:
            A list of dicts with keys ``"name"``, ``"source"`` and
            ``"file_path"``. An empty list is returned when the file cannot
            be read, decoded or parsed, so one bad file never aborts a walk.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                source_code = file.read()
        except (OSError, UnicodeDecodeError) as exc:
            # Repositories can contain broken symlinks or non-UTF-8 files.
            print(f"Could not read {file_path} ({exc}), skipping.")
            return []

        try:
            tree = ast.parse(source_code)
        except SyntaxError:
            print(f"SyntaxError in {file_path}, skipping.")
            return []
        except (ValueError, RecursionError) as exc:
            # ValueError: null bytes in the source on older Python versions.
            # RecursionError: pathologically deep nesting.
            print(f"Could not parse {file_path} ({exc}), skipping.")
            return []

        # Strip the system temp dir prefix so stored paths are shorter.
        stored_path = file_path.replace(tempfile.gettempdir(), "")

        functions = []
        for node in ast.walk(tree):
            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            function_source = ast.get_source_segment(source_code, node)
            if function_source:  # get_source_segment may return None
                functions.append({
                    "name": node.name,
                    "source": function_source,
                    "file_path": stored_path,
                })
        return functions

    def process_repo(self, repo_path):
        """Walks a repository, parses Python files, and creates a FAISS index.

        Args:
            repo_path: Root directory of the checked-out repository.

        Returns:
            A tuple ``(index, all_functions)`` where ``index`` is a
            ``faiss.IndexFlatL2`` holding one vector per function and
            ``all_functions`` is the matching metadata list (same order), with
            each ``"file_path"`` relative to ``repo_path``. Returns
            ``(None, None)`` when no functions were found.
        """
        all_functions = []
        print(f"Walking through Python files in {repo_path}...")
        for root, dirs, files in os.walk(repo_path):
            # Prune in place so os.walk never descends into skipped dirs.
            dirs[:] = [d for d in dirs if d not in self._SKIPPED_DIRS]
            for file in files:
                if not file.endswith('.py'):
                    continue
                full_path = os.path.join(root, file)
                # Path relative to the repo root, stored in the metadata.
                clean_path = os.path.relpath(full_path, repo_path)

                parsed_funcs = self.parse_python_file(full_path)
                for func in parsed_funcs:
                    func['file_path'] = clean_path  # Overwrite with the clean path
                all_functions.extend(parsed_funcs)

        if not all_functions:
            print("No Python functions found in the repository.")
            return None, None

        print(f"Found {len(all_functions)} functions. Generating embeddings...")
        function_sources = [f["source"] for f in all_functions]
        embeddings = self.model.encode(function_sources, show_progress_bar=False)

        embedding_dim = embeddings.shape[1]
        index = faiss.IndexFlatL2(embedding_dim)
        # The vectors are handed to FAISS as a float32 array.
        index.add(np.array(embeddings, dtype=np.float32))

        print(f"FAISS index created successfully with {index.ntotal} vectors.")

        return index, all_functions

# --- Initialize the processor globally ---
# One shared instance is created at import time. Constructing it loads the
# SentenceTransformer model, and process_repository() uses this instance.
code_processor = CodeProcessor()


# --- GitHub App Authentication & Repo Management ---
# Helpers that authenticate as the GitHub App and clone/index repositories.
def create_jwt(app_id, private_key):
    """Build the short-lived RS256 JWT used to authenticate as the GitHub App.

    Args:
        app_id: The GitHub App's ID; sent as the ``iss`` claim.
        private_key: Private key used to sign the token with RS256.

    Returns:
        The encoded JWT as returned by ``jwt.encode``.
    """
    now = int(time.time())
    payload = {
        # Backdated by 60 seconds so that clock drift between this host and
        # GitHub cannot make the token appear to be issued in the future.
        "iat": now - 60,
        # GitHub rejects tokens expiring more than 10 minutes ahead; stay
        # below that limit so drift cannot push us over it.
        "exp": now + (9 * 60),
        # Coerced to str so an integer ID is also accepted by the encoder.
        "iss": str(app_id),
    }
    return jwt.encode(payload, private_key, algorithm="RS256")

def get_installation_access_token(installation_id, app_id, private_key, timeout=10):
    """Exchange an app JWT for an installation access token.

    Args:
        installation_id: ID of the GitHub App installation.
        app_id: The GitHub App's ID, forwarded to ``create_jwt``.
        private_key: Private key forwarded to ``create_jwt``.
        timeout: Seconds to wait for GitHub before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The ``"token"`` field of the JSON response.

    Raises:
        requests.HTTPError: If GitHub answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    app_jwt = create_jwt(app_id, private_key)
    headers = {"Authorization": f"Bearer {app_jwt}", "Accept": "application/vnd.github.v3+json"}
    url = f"https://api.github.com/app/installations/{installation_id}/access_tokens"
    response = requests.post(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()["token"]

def process_repository(repo_url, token, repo_full_name):
    """Clones a repo and hands it off to the CodeProcessor.

    The repository is cloned into a fresh temporary directory, indexed via
    ``code_processor.process_repo`` and, when an index was produced, stored in
    ``repo_data_store`` under ``repo_full_name``. Failures are logged rather
    than raised. The temporary directory is removed afterwards.

    Args:
        repo_url: HTTPS URL of the repository.
        token: Installation access token embedded into the clone URL.
        repo_full_name: Repository full name, used as the key in
            ``repo_data_store`` and in log messages.
    """
    temp_dir = tempfile.mkdtemp()
    print(f"Cloning {repo_full_name} into {temp_dir}...")

    try:
        # Only the scheme prefix is rewritten (count=1).
        clone_url = repo_url.replace("https://", f"https://x-access-token:{token}@", 1)
        Repo.clone_from(clone_url, temp_dir)

        faiss_index, metadata = code_processor.process_repo(temp_dir)

        # process_repo signals "nothing to index" with None; test for that
        # explicitly instead of relying on the index object's truthiness.
        if faiss_index is not None:
            repo_data_store[repo_full_name] = {
                "status": "processed",
                "faiss_index": faiss_index,
                "metadata": metadata
            }
            print(f"Successfully processed and indexed {repo_full_name}")
        else:
            print(f"No processable data found for {repo_full_name}")

    except Exception as e:
        # The clone URL carries the access token and a failing git command may
        # echo it back in its error text, so scrub it before logging.
        error_text = str(e)
        if token:
            error_text = error_text.replace(str(token), "***")
        print(f"Failed to process repository {repo_full_name}: {error_text}")
    finally:
        print(f"Cleaning up temporary directory: {temp_dir}")
        shutil.rmtree(temp_dir)

# --- FastAPI App ---
# Application object; the route decorators below register handlers on it.
app = FastAPI()

async def verify_signature(request: Request):
    """Validate the ``X-Hub-Signature-256`` HMAC of an incoming request.

    The header is expected in the form ``sha256=<hex digest>``, where the
    digest is the HMAC-SHA256 of the raw request body keyed with
    ``GITHUB_WEBHOOK_SECRET``.

    Args:
        request: The incoming request whose headers and body are checked.

    Raises:
        HTTPException: 500 when no webhook secret is configured; 400 when the
            header is missing, malformed, names another algorithm, or does
            not match the body.
    """
    if not GITHUB_WEBHOOK_SECRET:
        raise HTTPException(status_code=500, detail="Webhook secret not configured.")

    signature_header = request.headers.get("X-Hub-Signature-256")
    if not signature_header:
        raise HTTPException(status_code=400, detail="X-Hub-Signature-256 header is missing.")

    # partition() never raises, unlike unpacking split("="), which fails on a
    # header with zero or several "=" characters.
    sha_name, separator, signature = signature_header.partition("=")
    if not separator or sha_name != "sha256":
        raise HTTPException(status_code=400, detail="Malformed X-Hub-Signature-256 header.")

    body = await request.body()
    mac = hmac.new(GITHUB_WEBHOOK_SECRET.encode("utf-8"), msg=body, digestmod=hashlib.sha256)
    # Compare as bytes: compare_digest raises TypeError for str arguments that
    # contain non-ASCII characters, which a hostile header could supply.
    expected = mac.hexdigest().encode("ascii")
    if not hmac.compare_digest(expected, signature.encode("utf-8")):
        raise HTTPException(status_code=400, detail="Invalid signature.")

@app.get("/")
def read_root():
    """Liveness endpoint: reply with a fixed message showing the server is up."""
    alive_response = {}
    alive_response["message"] = "Docu-Pilot server is alive!"
    return alive_response

@app.post("/api/github/webhook")
async def github_webhook(request: Request):
    """Handle GitHub webhook deliveries for the app.

    After the signature is verified, newly installed or newly added
    repositories are cloned and indexed; push events are only logged.

    Args:
        request: The incoming webhook request.

    Returns:
        ``{"status": "ok"}``, with an extra ``"message"`` when the event
        carries no installation ID.

    Raises:
        HTTPException: Propagated from ``verify_signature``.
    """
    # Local import: asyncio is only needed here, to off-load blocking work.
    import asyncio

    await verify_signature(request)
    event_type = request.headers.get("X-GitHub-Event")
    payload = await request.json()
    action = payload.get("action")
    print(f"Received event: {event_type} with action: {action}")

    # "or {}" also covers a key that is present but null.
    installation_id = (payload.get("installation") or {}).get("id")
    if not installation_id:
        return {"status": "ok", "message": "Event does not pertain to an installation."}

    repos_to_process = []
    if event_type == "installation" and action == "created":
        repos_to_process = payload.get("repositories") or []
    elif event_type == "installation_repositories" and action == "added":
        repos_to_process = payload.get("repositories_added") or []

    if repos_to_process:
        # The token request, clone and embedding are blocking calls. Run them
        # in the default executor so the event loop keeps serving requests.
        # NOTE(review): process_repository then writes repo_data_store from a
        # worker thread - confirm that is acceptable for readers of the store.
        loop = asyncio.get_running_loop()
        try:
            token = await loop.run_in_executor(
                None,
                get_installation_access_token,
                installation_id,
                GITHUB_APP_ID,
                GITHUB_PRIVATE_KEY,
            )
            for repo in repos_to_process:
                repo_full_name = repo["full_name"]
                repo_url = f"https://github.com/{repo_full_name}"
                await loop.run_in_executor(
                    None, process_repository, repo_url, token, repo_full_name
                )
        except Exception as e:
            print(f"Error during repository processing: {e}")
    elif event_type == "push":
        repo_name = (payload.get("repository") or {}).get("full_name")
        print(f"Received a push event on repo {repo_name}")
    return {"status": "ok"}