import os
import sys
import re

import requests
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from rag.logger import get_logger  # pylint: disable=import-error

logger = get_logger(__name__)

base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
vector_store_path = os.path.join(base_dir, 'data', 'vectorstores')

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
def clean_personal_info(text: str) -> str:
    """Mask common personal identifiers (emails, phone numbers, URLs, handles, addresses)."""
    patterns = [
        r"\b[\w\.-]+@[\w\.-]+\.\w+\b",                                      # email addresses
        r"\b\d{10}\b",                                                      # bare 10-digit phone numbers
        r"\b(?:\+?\d{1,3})?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b",   # formatted phone numbers
        r"(http|https)://\S+",                                              # URLs
        r"linkedin\.com/\S+",                                               # LinkedIn links
        r"github\.com/\S+",                                                 # GitHub links
        r"@[A-Za-z0-9_]+",                                                  # social handles
        r"\d{1,4}\s+\w+\s+(Street|St|Road|Rd|Avenue|Ave|Lane|Ln)",          # street addresses
    ]
    cleaned = text
    for p in patterns:
        cleaned = re.sub(p, "[REMOVED]", cleaned, flags=re.IGNORECASE)
    return cleaned
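
# Illustrative example of the masking above (the values are made up):
#   clean_personal_info("Email: jane.doe@example.com, phone 4155550100")
#   -> "Email: [REMOVED], phone [REMOVED]"
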
def fetch_github_data(username: str):
    """
    Fetch GitHub user profile and all public repos using an optional personal access token.
    Handles authentication and pagination.
    """
    token = os.getenv("GITHUB_TOKEN")
    headers = {"Authorization": f"token {token}"} if token else {}

    # --- User profile ---
    user_resp = requests.get(
        f"https://api.github.com/users/{username}", headers=headers, timeout=30
    ).json()
    if isinstance(user_resp, dict) and "message" in user_resp:
        raise Exception(f"GitHub API error (user): {user_resp['message']}")

    # --- Repositories with pagination ---
    repos = []
    page = 1
    per_page = 100
    while True:
        repos_resp = requests.get(
            f"https://api.github.com/users/{username}/repos",
            headers=headers,
            params={"per_page": per_page, "page": page},
            timeout=30,
        ).json()
        if isinstance(repos_resp, dict) and repos_resp.get("message"):
            raise Exception(f"GitHub API error (repos): {repos_resp['message']}")
        if not repos_resp:
            break
        repos.extend(repos_resp)
        if len(repos_resp) < per_page:
            break
        page += 1

    logger.info("Completed fetching GitHub data")
    return user_resp, repos
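
# Note: without GITHUB_TOKEN the GitHub REST API allows only about 60 unauthenticated
# requests per hour, which the per-repository language lookups below can exhaust quickly;
# exporting a personal access token raises that limit considerably.
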
# -----------------------------
# Format GitHub Data
# -----------------------------
def format_github_data(username: str):
    """
    Converts GitHub profile + repo info into structured text for RAG ingestion.
    """
    user, repos = fetch_github_data(username)
    lines = []

    # --- Profile summary ---
    # `or` fallbacks cover fields the API returns as null rather than omitting.
    lines.append(f"GitHub User: {user.get('name') or username}")
    lines.append(f"Bio: {user.get('bio') or 'No bio'}")
    lines.append(f"Public Repos: {user.get('public_repos', 0)}")
    lines.append(f"Followers: {user.get('followers', 0)}, Following: {user.get('following', 0)}")
    lines.append("\n--- Repositories ---\n")

    # --- Repos ---
    token = os.getenv("GITHUB_TOKEN")
    headers = {"Authorization": f"token {token}"} if token else {}
    for repo in repos:
        if not isinstance(repo, dict):
            continue
        lines.append(f"Repository: {repo.get('name', 'Unknown')}")
        lines.append(f"Description: {repo.get('description') or 'No description'}")
        lines.append(f"Stars: {repo.get('stargazers_count', 0)}, Forks: {repo.get('forks_count', 0)}")

        # Languages
        lang_data = {}
        lang_url = repo.get("languages_url")
        if lang_url:
            try:
                lang_data = requests.get(lang_url, headers=headers, timeout=30).json()
            except Exception:
                lang_data = {}
        if isinstance(lang_data, dict) and lang_data:
            lang_summary = ", ".join(f"{k} ({v} bytes)" for k, v in lang_data.items())
        else:
            lang_summary = "No language data"
        lines.append(f"Languages: {lang_summary}")
        lines.append("")

    logger.info("Completed formatting GitHub data")
    return "\n".join(lines)
def github_to_documents(username: str):
    raw_text = format_github_data(username)
    cleaned = clean_personal_info(raw_text)

    paragraphs = [p.strip() for p in cleaned.split("\n\n") if p.strip()]
    docs = []
    for paragraph_id, para in enumerate(paragraphs):
        docs.append(
            Document(
                page_content=para,
                metadata={
                    "source": "github",
                    "username": username,
                    "paragraph_id": paragraph_id,
                },
            )
        )

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=700,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""],
    )
    chunks = splitter.split_documents(docs)
    return chunks
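
# Note on the splitter settings above: chunk_size and chunk_overlap are measured in
# characters, so each chunk stays under ~700 characters and repeats up to 200
# characters of its predecessor; every chunk also inherits the parent Document's
# metadata (source, username, paragraph_id).
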
def add_github_to_vectorstore(username: str):
    docs = github_to_documents(username)

    index_path = os.path.join(vector_store_path, "index.faiss")
    if os.path.exists(index_path):
        # Extend the existing index, reusing the module-level embeddings model.
        vectorstore = FAISS.load_local(
            vector_store_path,
            embeddings,
            allow_dangerous_deserialization=True
        )
        logger.info("Loaded existing FAISS index")
        vectorstore.add_documents(docs)
    else:
        # FAISS.from_documents cannot build an index from an empty list,
        # so create the new index directly from the freshly fetched chunks.
        vectorstore = FAISS.from_documents(docs, embeddings)
        logger.info("Created new FAISS index")

    logger.info(f"Added {len(docs)} GitHub chunks")
    vectorstore.save_local(vector_store_path)
    logger.info("Vectorstore updated successfully")
if __name__ == '__main__':
    add_github_to_vectorstore(username='Raheel31')
    logger.info("Ingestion Run Successful")