"""GenAI_Career_Assistant / rag / github_ingestion.py

Fetch a GitHub user's profile and public repositories, scrub personal
information from the text, chunk it, and index it into a FAISS vector store.
"""
import os
import sys
import re
import requests
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from rag.logger import get_logger # pylint: disable=import-error
logger = get_logger(__name__)
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
vector_store_path = os.path.join(base_dir, 'data', 'vectorstores')
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
def clean_personal_info(text: str) -> str:
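    """Redact emails, phone numbers, URLs, social handles, and street
    addresses, replacing each match with "[REMOVED]".

    Example: "reach me at jane@example.com" -> "reach me at [REMOVED]".
    """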
patterns = [
r"\b[\w\.-]+@[\w\.-]+\.\w+\b",
r"\b\d{10}\b",
r"\b(?:\+?\d{1,3})?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b",
r"(http|https)://\S+",
r"linkedin\.com/\S+",
r"github\.com/\S+",
r"@[A-Za-z0-9_]+",
r"\d{1,4}\s+\w+\s+(Street|St|Road|Rd|Avenue|Ave|Lane|Ln)",
]
cleaned = text
for p in patterns:
cleaned = re.sub(p, "[REMOVED]", cleaned, flags=re.IGNORECASE)
return cleaned
def fetch_github_data(username: str):
"""
Fetch GitHub user profile and all public repos using an optional personal access token.
Handles authentication and pagination.
"""
token = os.getenv("GITHUB_TOKEN")
headers = {"Authorization": f"token {token}"} if token else {}
# --- User profile ---
    user_resp = requests.get(
        f"https://api.github.com/users/{username}", headers=headers, timeout=30
    ).json()
    if isinstance(user_resp, dict) and "message" in user_resp:
        raise RuntimeError(f"GitHub API error (user): {user_resp['message']}")
# --- Repositories with pagination ---
repos = []
page = 1
per_page = 100
while True:
        repos_resp = requests.get(
            f"https://api.github.com/users/{username}/repos",
            headers=headers,
            params={"per_page": per_page, "page": page},
            timeout=30,
        ).json()
        if isinstance(repos_resp, dict) and repos_resp.get("message"):
            raise RuntimeError(f"GitHub API error (repos): {repos_resp['message']}")
if not repos_resp:
break
repos.extend(repos_resp)
if len(repos_resp) < per_page:
break
page += 1
    logger.info('Completed fetching GitHub data')
return user_resp, repos
# -----------------------------
# Format GitHub Data
# -----------------------------
def format_github_data(username: str):
"""
Converts GitHub profile + repo info into structured text for RAG ingestion.
"""
user, repos = fetch_github_data(username)
lines = []
# --- Profile summary ---
lines.append(f"GitHub User: {user.get('name', username)}")
lines.append(f"Bio: {user.get('bio', 'No bio')}")
lines.append(f"Public Repos: {user.get('public_repos', 0)}")
lines.append(f"Followers: {user.get('followers', 0)}, Following: {user.get('following', 0)}")
lines.append("\n--- Repositories ---\n")
# --- Repos ---
token = os.getenv("GITHUB_TOKEN")
headers = {"Authorization": f"token {token}"} if token else {}
for repo in repos:
if not isinstance(repo, dict):
continue
lines.append(f"Repository: {repo.get('name', 'Unknown')}")
lines.append(f"Description: {repo.get('description', 'No description')}")
lines.append(f"Stars: {repo.get('stargazers_count', 0)}, Forks: {repo.get('forks_count', 0)}")
        # Per-repo language breakdown (one extra API call per repository)
        lang_data = {}
        lang_url = repo.get("languages_url")
        if lang_url:
            try:
                lang_data = requests.get(lang_url, headers=headers, timeout=30).json()
            except requests.RequestException:
                lang_data = {}
if isinstance(lang_data, dict) and lang_data:
lang_summary = ", ".join([f"{k} ({v} bytes)" for k, v in lang_data.items()])
else:
lang_summary = "No language data"
lines.append(f"Languages: {lang_summary}")
lines.append("")
    logger.info('Completed formatting GitHub data')
return "\n".join(lines)
def github_to_documents(username: str):
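    """Convert the cleaned GitHub text into LangChain ``Document`` objects,
    one per paragraph, then split them into overlapping chunks for embedding.
    """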
raw_text = format_github_data(username)
cleaned = clean_personal_info(raw_text)
paragraphs = [p.strip() for p in cleaned.split("\n\n") if p.strip()]
    docs = []
    for paragraph_id, para in enumerate(paragraphs):
        docs.append(
            Document(
                page_content=para,
                metadata={
                    "source": "github",
                    "username": username,
                    "paragraph_id": paragraph_id,
                },
            )
        )
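    # Split into ~700-character chunks with 200 characters of overlap so
    # context is preserved across chunk boundaries.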
splitter = RecursiveCharacterTextSplitter(
chunk_size=700,
chunk_overlap=200,
separators=["\n\n", "\n", " ", ""],
)
chunks = splitter.split_documents(docs)
return chunks
def add_github_to_vectorstore(username: str):
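    """Embed the user's GitHub chunks and add them to the on-disk FAISS
    index, creating the index on the first run.
    """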
docs = github_to_documents(username)
index_path = os.path.join(vector_store_path, "index.faiss")
if os.path.exists(index_path):
vectorstore = FAISS.load_local(
vector_store_path,
embeddings,
allow_dangerous_deserialization=True
)
logger.info("Loaded existing FAISS index")
else:
vectorstore = FAISS.from_documents([], embeddings)
logger.info("Created new FAISS index")
vectorstore.add_documents(docs)
logger.info(f"Added {len(docs)} GitHub chunks")
vectorstore.save_local(vector_store_path)
logger.info("Vectorstore updated successfully")
if __name__ == '__main__':
add_github_to_vectorstore(username='Raheel31')
logger.info("Ingestion Run Successful")