import os import sys import re import requests from langchain_community.vectorstores import FAISS from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_core.documents import Document from langchain_text_splitters import RecursiveCharacterTextSplitter sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) from rag.logger import get_logger # pylint: disable=import-error logger = get_logger(__name__) base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) vector_store_path = os.path.join(base_dir, 'data', 'vectorstores') embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") def clean_personal_info(text: str) -> str: patterns = [ r"\b[\w\.-]+@[\w\.-]+\.\w+\b", r"\b\d{10}\b", r"\b(?:\+?\d{1,3})?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b", r"(http|https)://\S+", r"linkedin\.com/\S+", r"github\.com/\S+", r"@[A-Za-z0-9_]+", r"\d{1,4}\s+\w+\s+(Street|St|Road|Rd|Avenue|Ave|Lane|Ln)", ] cleaned = text for p in patterns: cleaned = re.sub(p, "[REMOVED]", cleaned, flags=re.IGNORECASE) return cleaned def fetch_github_data(username: str): """ Fetch GitHub user profile and all public repos using an optional personal access token. Handles authentication and pagination. """ token = os.getenv("GITHUB_TOKEN") headers = {"Authorization": f"token {token}"} if token else {} # --- User profile --- user_resp = requests.get(f"https://api.github.com/users/{username}", headers=headers).json() if isinstance(user_resp, dict) and "message" in user_resp: raise Exception(f"GitHub API error (user): {user_resp['message']}") # --- Repositories with pagination --- repos = [] page = 1 per_page = 100 while True: repos_resp = requests.get( f"https://api.github.com/users/{username}/repos", headers=headers, params={"per_page": per_page, "page": page} ).json() if isinstance(repos_resp, dict) and repos_resp.get("message"): raise Exception(f"GitHub API error (repos): {repos_resp['message']}") if not repos_resp: break repos.extend(repos_resp) if len(repos_resp) < per_page: break page += 1 logger.info('Completed fetching github data') return user_resp, repos # ----------------------------- # Format GitHub Data # ----------------------------- def format_github_data(username: str): """ Converts GitHub profile + repo info into structured text for RAG ingestion. """ user, repos = fetch_github_data(username) lines = [] # --- Profile summary --- lines.append(f"GitHub User: {user.get('name', username)}") lines.append(f"Bio: {user.get('bio', 'No bio')}") lines.append(f"Public Repos: {user.get('public_repos', 0)}") lines.append(f"Followers: {user.get('followers', 0)}, Following: {user.get('following', 0)}") lines.append("\n--- Repositories ---\n") # --- Repos --- token = os.getenv("GITHUB_TOKEN") headers = {"Authorization": f"token {token}"} if token else {} for repo in repos: if not isinstance(repo, dict): continue lines.append(f"Repository: {repo.get('name', 'Unknown')}") lines.append(f"Description: {repo.get('description', 'No description')}") lines.append(f"Stars: {repo.get('stargazers_count', 0)}, Forks: {repo.get('forks_count', 0)}") # Languages lang_data = {} lang_url = repo.get("languages_url") if lang_url: try: lang_data = requests.get(lang_url, headers=headers).json() except Exception: lang_data = {} if isinstance(lang_data, dict) and lang_data: lang_summary = ", ".join([f"{k} ({v} bytes)" for k, v in lang_data.items()]) else: lang_summary = "No language data" lines.append(f"Languages: {lang_summary}") lines.append("") logger.info('Completed formatting Github data') return "\n".join(lines) def github_to_documents(username: str): raw_text = format_github_data(username) cleaned = clean_personal_info(raw_text) paragraphs = [p.strip() for p in cleaned.split("\n\n") if p.strip()] docs = [] paragraph_id = 0 for para in paragraphs: docs.append( Document( page_content=para, metadata={ "source": "github", "username": username, "paragraph_id": paragraph_id, } ) ) paragraph_id += 1 splitter = RecursiveCharacterTextSplitter( chunk_size=700, chunk_overlap=200, separators=["\n\n", "\n", " ", ""], ) chunks = splitter.split_documents(docs) return chunks def add_github_to_vectorstore(username: str): docs = github_to_documents(username) embeddings = HuggingFaceEmbeddings( model_name="sentence-transformers/all-MiniLM-L6-v2" ) index_path = os.path.join(vector_store_path, "index.faiss") if os.path.exists(index_path): vectorstore = FAISS.load_local( vector_store_path, embeddings, allow_dangerous_deserialization=True ) logger.info("Loaded existing FAISS index") else: vectorstore = FAISS.from_documents([], embeddings) logger.info("Created new FAISS index") vectorstore.add_documents(docs) logger.info(f"Added {len(docs)} GitHub chunks") vectorstore.save_local(vector_store_path) logger.info("Vectorstore updated successfully") if __name__ == '__main__': add_github_to_vectorstore(username='Raheel31') logger.info("Ingestion Run Successful")