"""GenAI_Career_Assistant / rag / github_ingestion.py

Fetch a GitHub user's profile and public repositories, scrub personal
information from the text, chunk it, and index it into a FAISS vector store.
"""
import os
import sys
import re
import requests
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from rag.logger import get_logger # pylint: disable=import-error
logger = get_logger(__name__)
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
vector_store_path = os.path.join(base_dir, 'data', 'vectorstores')
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
def clean_personal_info(text: str) -> str:
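    """Redact emails, phone numbers, URLs, social handles, and street
    addresses, replacing each match with "[REMOVED]".

    Example: "reach me at jane@example.com" -> "reach me at [REMOVED]".
    """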
patterns = [
r"\b[\w\.-]+@[\w\.-]+\.\w+\b",
r"\b\d{10}\b",
r"\b(?:\+?\d{1,3})?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b",
r"(http|https)://\S+",
r"linkedin\.com/\S+",
r"github\.com/\S+",
r"@[A-Za-z0-9_]+",
r"\d{1,4}\s+\w+\s+(Street|St|Road|Rd|Avenue|Ave|Lane|Ln)",
]
cleaned = text
for p in patterns:
cleaned = re.sub(p, "[REMOVED]", cleaned, flags=re.IGNORECASE)
return cleaned
def fetch_github_data(username: str):
"""
Fetch GitHub user profile and all public repos using an optional personal access token.
Handles authentication and pagination.
"""
token = os.getenv("GITHUB_TOKEN")
headers = {"Authorization": f"token {token}"} if token else {}
# --- User profile ---
    user_resp = requests.get(
        f"https://api.github.com/users/{username}", headers=headers, timeout=30
    ).json()
    if isinstance(user_resp, dict) and "message" in user_resp:
        raise RuntimeError(f"GitHub API error (user): {user_resp['message']}")
# --- Repositories with pagination ---
repos = []
page = 1
per_page = 100
while True:
        repos_resp = requests.get(
            f"https://api.github.com/users/{username}/repos",
            headers=headers,
            params={"per_page": per_page, "page": page},
            timeout=30,
        ).json()
        if isinstance(repos_resp, dict) and repos_resp.get("message"):
            raise RuntimeError(f"GitHub API error (repos): {repos_resp['message']}")
if not repos_resp:
break
repos.extend(repos_resp)
if len(repos_resp) < per_page:
break
page += 1
    logger.info('Completed fetching GitHub data')
return user_resp, repos
# -----------------------------
# Format GitHub Data
# -----------------------------
def format_github_data(username: str):
"""
Converts GitHub profile + repo info into structured text for RAG ingestion.
"""
user, repos = fetch_github_data(username)
lines = []
# --- Profile summary ---
lines.append(f"GitHub User: {user.get('name', username)}")
lines.append(f"Bio: {user.get('bio', 'No bio')}")
lines.append(f"Public Repos: {user.get('public_repos', 0)}")
lines.append(f"Followers: {user.get('followers', 0)}, Following: {user.get('following', 0)}")
lines.append("\n--- Repositories ---\n")
# --- Repos ---
token = os.getenv("GITHUB_TOKEN")
headers = {"Authorization": f"token {token}"} if token else {}
for repo in repos:
if not isinstance(repo, dict):
continue
lines.append(f"Repository: {repo.get('name', 'Unknown')}")
lines.append(f"Description: {repo.get('description', 'No description')}")
lines.append(f"Stars: {repo.get('stargazers_count', 0)}, Forks: {repo.get('forks_count', 0)}")
        # Per-repo language breakdown (one extra API call per repository)
        lang_data = {}
        lang_url = repo.get("languages_url")
        if lang_url:
            try:
                lang_data = requests.get(lang_url, headers=headers, timeout=30).json()
            except requests.RequestException:
                lang_data = {}
if isinstance(lang_data, dict) and lang_data:
lang_summary = ", ".join([f"{k} ({v} bytes)" for k, v in lang_data.items()])
else:
lang_summary = "No language data"
lines.append(f"Languages: {lang_summary}")
lines.append("")
    logger.info('Completed formatting GitHub data')
return "\n".join(lines)
def github_to_documents(username: str):
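    """Convert the cleaned GitHub text into LangChain ``Document`` objects,
    one per paragraph, then split them into overlapping chunks for embedding.
    """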
raw_text = format_github_data(username)
cleaned = clean_personal_info(raw_text)
paragraphs = [p.strip() for p in cleaned.split("\n\n") if p.strip()]
    docs = []
    for paragraph_id, para in enumerate(paragraphs):
        docs.append(
            Document(
                page_content=para,
                metadata={
                    "source": "github",
                    "username": username,
                    "paragraph_id": paragraph_id,
                },
            )
        )
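    # Split into ~700-character chunks with 200 characters of overlap so
    # context is preserved across chunk boundaries.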
splitter = RecursiveCharacterTextSplitter(
chunk_size=700,
chunk_overlap=200,
separators=["\n\n", "\n", " ", ""],
)
chunks = splitter.split_documents(docs)
return chunks
def add_github_to_vectorstore(username: str):
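    """Embed the user's GitHub chunks and add them to the on-disk FAISS
    index, creating the index on the first run.
    """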
docs = github_to_documents(username)
index_path = os.path.join(vector_store_path, "index.faiss")
if os.path.exists(index_path):
vectorstore = FAISS.load_local(
vector_store_path,
embeddings,
allow_dangerous_deserialization=True
)
logger.info("Loaded existing FAISS index")
else:
vectorstore = FAISS.from_documents([], embeddings)
logger.info("Created new FAISS index")
vectorstore.add_documents(docs)
logger.info(f"Added {len(docs)} GitHub chunks")
vectorstore.save_local(vector_store_path)
logger.info("Vectorstore updated successfully")
if __name__ == '__main__':
add_github_to_vectorstore(username='Raheel31')
logger.info("Ingestion Run Successful")