import os
import hmac
import hashlib
import time
import jwt
import requests
import tempfile
import shutil
import ast
import faiss
import numpy as np
from fastapi import FastAPI, Request, HTTPException, status
from dotenv import load_dotenv
from git import Repo
from sentence_transformers import SentenceTransformer
# --- Configuration ---
# load_dotenv() runs first so the environment lookups below happen after it.
load_dotenv()

# GitHub App credentials; each is None when the variable is not set.
GITHUB_WEBHOOK_SECRET = os.environ.get("GITHUB_WEBHOOK_SECRET")
GITHUB_APP_ID = os.environ.get("GITHUB_APP_ID")
GITHUB_PRIVATE_KEY = os.environ.get("GITHUB_PRIVATE_KEY")

# In-memory storage for our repository data, keyed by the repository's
# full name. Entries hold "status", "faiss_index" and "metadata".
repo_data_store = dict()
# --- Code Processing and Vectorization Class ---
class CodeProcessor:
    """Extract Python functions from a repository and index them with FAISS.

    The embedding model is loaded once in ``__init__`` and reused for every
    repository processed through the same instance.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the SentenceTransformer model identified by ``model_name``."""
        print("Initializing SentenceTransformer model...")
        self.model = SentenceTransformer(model_name)
        print("Model initialized.")

    def parse_python_file(self, file_path):
        """Parse one Python file and return every function found in it.

        The whole tree is walked, so methods and nested functions are
        included, and both ``def`` and ``async def`` are collected.

        Args:
            file_path: Path of the file to read (decoded as UTF-8).

        Returns:
            A list of dicts with the keys ``"name"``, ``"source"`` and
            ``"file_path"``. The list is empty when the file cannot be read,
            is not valid UTF-8, or cannot be parsed.
        """
        # A single unreadable file (broken symlink, wrong encoding, ...) must
        # not abort indexing of the whole repository, so it is skipped.
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                source_code = file.read()
        except (OSError, UnicodeDecodeError) as exc:
            print(f"Could not read {file_path} ({exc}), skipping.")
            return []

        try:
            tree = ast.parse(source_code)
        except (SyntaxError, ValueError) as exc:
            # ValueError covers sources ast.parse rejects outright
            # (e.g. embedded null bytes on some Python versions).
            print(f"{type(exc).__name__} in {file_path}, skipping.")
            return []

        # Strip the system temp directory prefix from the stored path.
        stored_path = file_path.replace(tempfile.gettempdir(), "")
        functions = []
        for node in ast.walk(tree):
            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            function_source = ast.get_source_segment(source_code, node)
            if function_source:  # Ensure we got a valid segment
                functions.append({
                    "name": node.name,
                    "source": function_source,
                    "file_path": stored_path,
                })
        return functions

    def process_repo(self, repo_path):
        """Walk a repository, parse its Python files and build a FAISS index.

        Args:
            repo_path: Root directory of the checked-out repository.

        Returns:
            ``(index, all_functions)`` where ``index`` is a
            ``faiss.IndexFlatL2`` holding one vector per function and
            ``all_functions`` is the matching metadata list, in the same
            order as the vectors were added. Returns ``(None, None)`` when
            no functions are found.
        """
        all_functions = []
        print(f"Walking through Python files in {repo_path}...")
        for root, dirs, files in os.walk(repo_path):
            # Git's own bookkeeping directory holds no project source;
            # pruning it in place stops os.walk from descending into it.
            dirs[:] = [d for d in dirs if d != ".git"]
            for file in files:
                if not file.endswith('.py'):
                    continue
                full_path = os.path.join(root, file)
                # Path relative to the repo root, stored in the metadata.
                clean_path = os.path.relpath(full_path, repo_path)
                parsed_funcs = self.parse_python_file(full_path)
                for func in parsed_funcs:
                    func['file_path'] = clean_path
                all_functions.extend(parsed_funcs)

        if not all_functions:
            print("No Python functions found in the repository.")
            return None, None

        print(f"Found {len(all_functions)} functions. Generating embeddings...")
        function_sources = [f["source"] for f in all_functions]
        embeddings = self.model.encode(function_sources, show_progress_bar=False)
        embedding_dim = embeddings.shape[1]
        index = faiss.IndexFlatL2(embedding_dim)
        # The vectors are converted to float32 before being added.
        index.add(np.array(embeddings, dtype=np.float32))
        print(f"FAISS index created successfully with {index.ntotal} vectors.")
        return index, all_functions
# --- Initialize the processor globally ---
# One shared CodeProcessor is created when this module is imported, so the
# embedding model is loaded a single time and reused by process_repository.
code_processor = CodeProcessor()
# --- GitHub App Authentication & Repo Management ---
def create_jwt(app_id, private_key):
    """Build the signed JWT used to authenticate as the GitHub App.

    Args:
        app_id: The GitHub App identifier; used as the ``iss`` claim.
        private_key: PEM private key used for RS256 signing.

    Returns:
        The encoded token as produced by ``jwt.encode``.
    """
    now = int(time.time())
    payload = {
        # Back-date the issue time by a minute to tolerate clock drift
        # between this host and GitHub, as GitHub's documentation recommends.
        "iat": now - 60,
        # GitHub rejects tokens expiring more than 10 minutes ahead of its
        # own clock; 9 minutes leaves a margin for a fast local clock.
        "exp": now + (9 * 60),
        # Newer PyJWT releases require the issuer claim to be a string.
        "iss": str(app_id),
    }
    return jwt.encode(payload, private_key, algorithm="RS256")
def get_installation_access_token(installation_id, app_id, private_key, timeout=10):
    """Exchange an app JWT for an installation access token.

    Args:
        installation_id: ID of the GitHub App installation.
        app_id: The GitHub App identifier, forwarded to ``create_jwt``.
        private_key: PEM private key, forwarded to ``create_jwt``.
        timeout: Seconds to wait for GitHub before giving up. Without a
            timeout ``requests`` would wait indefinitely on a stalled
            connection.

    Returns:
        The ``"token"`` field of GitHub's JSON response.

    Raises:
        requests.HTTPError: If GitHub answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    app_jwt = create_jwt(app_id, private_key)
    headers = {
        "Authorization": f"Bearer {app_jwt}",
        "Accept": "application/vnd.github.v3+json",
    }
    url = f"https://api.github.com/app/installations/{installation_id}/access_tokens"
    response = requests.post(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()["token"]
def process_repository(repo_url, token, repo_full_name):
    """Clone a repo and hand it off to the CodeProcessor.

    On success the resulting index and metadata are stored in
    ``repo_data_store`` under ``repo_full_name``. Failures are logged, not
    raised, and the temporary checkout is always removed.

    Args:
        repo_url: HTTPS URL of the repository.
        token: Installation access token embedded into the clone URL.
        repo_full_name: Repository full name, used as the storage key.
    """
    temp_dir = tempfile.mkdtemp()
    print(f"Cloning {repo_full_name} into {temp_dir}...")
    try:
        # Only the scheme prefix is rewritten, hence the count of 1.
        clone_url = repo_url.replace("https://", f"https://x-access-token:{token}@", 1)
        # Only the current working tree is indexed, so history is not needed.
        Repo.clone_from(clone_url, temp_dir, depth=1)
        faiss_index, metadata = code_processor.process_repo(temp_dir)
        # process_repo signals "nothing found" with None; test for that
        # explicitly instead of relying on the index object's truthiness.
        if faiss_index is not None:
            repo_data_store[repo_full_name] = {
                "status": "processed",
                "faiss_index": faiss_index,
                "metadata": metadata
            }
            print(f"Successfully processed and indexed {repo_full_name}")
        else:
            print(f"No processable data found for {repo_full_name}")
    except Exception as e:
        # Clone errors may echo the command line, which contains the
        # credentialed URL; keep the token out of the logs.
        message = str(e)
        if token:
            message = message.replace(token, "***")
        print(f"Failed to process repository {repo_full_name}: {message}")
    finally:
        print(f"Cleaning up temporary directory: {temp_dir}")
        # Best-effort cleanup: a failure here must not replace the outcome
        # of the processing above with an unrelated exception.
        shutil.rmtree(temp_dir, ignore_errors=True)
# --- FastAPI App ---
# Application instance; the route decorators below register handlers on it.
app = FastAPI()
async def verify_signature(request: Request):
    """Validate the ``X-Hub-Signature-256`` header against the request body.

    The header is expected in the form ``sha256=<hex digest>``, where the
    digest is the HMAC-SHA256 of the raw body keyed with
    ``GITHUB_WEBHOOK_SECRET``.

    Raises:
        HTTPException: 500 when no webhook secret is configured; 400 when
            the header is missing, malformed, or does not match the body.
    """
    if not GITHUB_WEBHOOK_SECRET:
        raise HTTPException(status_code=500, detail="Webhook secret not configured.")

    signature_header = request.headers.get("X-Hub-Signature-256")
    if not signature_header:
        raise HTTPException(status_code=400, detail="X-Hub-Signature-256 header is missing.")

    body = await request.body()
    # partition never raises, unlike unpacking split("="), so a header with
    # no "=" (or several) is rejected below instead of crashing with a 500.
    sha_name, _, signature = signature_header.partition("=")
    mac = hmac.new(GITHUB_WEBHOOK_SECRET.encode("utf-8"), msg=body, digestmod=hashlib.sha256)
    # Compare as bytes: compare_digest raises TypeError on non-ASCII str,
    # and the header value is attacker-controlled.
    expected = mac.hexdigest().encode("ascii")
    provided = signature.encode("utf-8")
    if sha_name != "sha256" or not hmac.compare_digest(expected, provided):
        raise HTTPException(status_code=400, detail="Invalid signature.")
@app.get("/")
def read_root():
    """Liveness endpoint: answer with a fixed greeting payload."""
    greeting = "Docu-Pilot server is alive!"
    return {"message": greeting}
def _process_installation_repos(installation_id, repos):
    """Fetch an installation token and index every repository in ``repos``."""
    try:
        token = get_installation_access_token(installation_id, GITHUB_APP_ID, GITHUB_PRIVATE_KEY)
        for repo in repos:
            repo_full_name = repo["full_name"]
            repo_url = f"https://github.com/{repo_full_name}"
            process_repository(repo_url, token, repo_full_name)
    except Exception as e:
        # Runs outside the request, so there is no caller to report to.
        print(f"Error during repository processing: {e}")


@app.post("/api/github/webhook")
async def github_webhook(request: Request):
    """Receive GitHub webhook deliveries and schedule repository indexing.

    The signature is verified first. ``installation``/``created`` and
    ``installation_repositories``/``added`` events trigger indexing of the
    listed repositories; ``push`` events are only logged.

    Returns:
        ``{"status": "ok"}``, plus a ``"message"`` when the payload carries
        no installation id.
    """
    import asyncio

    await verify_signature(request)
    event_type = request.headers.get("X-GitHub-Event")
    payload = await request.json()
    print(f"Received event: {event_type} with action: {payload.get('action')}")

    # "installation" may be absent or null; both count as "no installation".
    installation_id = (payload.get("installation") or {}).get("id")
    if not installation_id:
        return {"status": "ok", "message": "Event does not pertain to an installation."}

    action = payload.get("action")
    repos_to_process = []
    if event_type == "installation" and action == "created":
        repos_to_process = payload.get("repositories", [])
    elif event_type == "installation_repositories" and action == "added":
        repos_to_process = payload.get("repositories_added", [])

    if repos_to_process:
        # Cloning and embedding are blocking and can take far longer than
        # GitHub waits for a webhook response. Run them on a worker thread
        # and answer immediately, so the event loop stays responsive and
        # the delivery is not marked as timed out.
        loop = asyncio.get_running_loop()
        loop.run_in_executor(
            None, _process_installation_repos, installation_id, repos_to_process
        )
    elif event_type == "push":
        repo_name = payload.get("repository", {}).get("full_name")
        print(f"Received a push event on repo {repo_name}")

    return {"status": "ok"}