import os
import re
import sys

import requests
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Make the project root importable so `rag.logger` resolves when this file is
# run directly as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from rag.logger import get_logger

logger = get_logger(__name__)

base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
vector_store_path = os.path.join(base_dir, 'data', 'vectorstores')

# Shared embedding model; ingestion and later retrieval must use the same
# model, or stored vectors and query vectors will not live in the same space.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def clean_personal_info(text: str) -> str:
    """Redact emails, phone numbers, URLs, social handles, and street addresses."""
    patterns = [
        r"\b[\w\.-]+@[\w\.-]+\.\w+\b",                                     # email addresses
        r"\b\d{10}\b",                                                     # bare 10-digit numbers
        r"\b(?:\+?\d{1,3})?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b",  # formatted phone numbers
        r"(http|https)://\S+",                                             # URLs
        r"linkedin\.com/\S+",                                              # LinkedIn profile links
        r"github\.com/\S+",                                                # GitHub profile links
        r"@[A-Za-z0-9_]+",                                                 # social handles
        r"\d{1,4}\s+\w+\s+(Street|St|Road|Rd|Avenue|Ave|Lane|Ln)",         # street addresses
    ]

    cleaned = text
    for p in patterns:
        cleaned = re.sub(p, "[REMOVED]", cleaned, flags=re.IGNORECASE)
    return cleaned
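
# Illustrative behaviour (the contact details below are made up):
#
#   clean_personal_info("Mail jane.doe@example.com or call 555-123-4567")
#   -> "Mail [REMOVED] or call [REMOVED]"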


def fetch_github_data(username: str):
    """
    Fetch a GitHub user's profile and all public repos, using an optional
    personal access token (GITHUB_TOKEN) for higher rate limits. Handles
    authentication and pagination.
    """
    token = os.getenv("GITHUB_TOKEN")
    headers = {"Authorization": f"token {token}"} if token else {}

    # `timeout` keeps a stalled connection from hanging the whole ingestion run.
    user_resp = requests.get(
        f"https://api.github.com/users/{username}", headers=headers, timeout=30
    ).json()
    if isinstance(user_resp, dict) and "message" in user_resp:
        raise Exception(f"GitHub API error (user): {user_resp['message']}")

    # The repos endpoint is paginated; walk pages of 100 until a short or
    # empty page signals the end.
    repos = []
    page = 1
    per_page = 100
    while True:
        repos_resp = requests.get(
            f"https://api.github.com/users/{username}/repos",
            headers=headers,
            params={"per_page": per_page, "page": page},
            timeout=30,
        ).json()

        if isinstance(repos_resp, dict) and repos_resp.get("message"):
            raise Exception(f"GitHub API error (repos): {repos_resp['message']}")
        if not repos_resp:
            break

        repos.extend(repos_resp)
        if len(repos_resp) < per_page:
            break
        page += 1

    logger.info('Completed fetching GitHub data')
    return user_resp, repos
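
# Illustrative usage (the username is hypothetical):
#
#   user, repos = fetch_github_data("octocat")
#   user["public_repos"]  # -> int
#   len(repos)            # -> every public repo, gathered 100 per page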


def format_github_data(username: str):
    """
    Convert a GitHub profile plus repo info into structured text for RAG ingestion.
    """
    user, repos = fetch_github_data(username)

    lines = []

    # dict.get's default only applies when the key is missing; the GitHub API
    # returns explicit nulls, so fall back with `or` instead.
    lines.append(f"GitHub User: {user.get('name') or username}")
    lines.append(f"Bio: {user.get('bio') or 'No bio'}")
    lines.append(f"Public Repos: {user.get('public_repos', 0)}")
    lines.append(f"Followers: {user.get('followers', 0)}, Following: {user.get('following', 0)}")
    lines.append("\n--- Repositories ---\n")

    token = os.getenv("GITHUB_TOKEN")
    headers = {"Authorization": f"token {token}"} if token else {}

    for repo in repos:
        if not isinstance(repo, dict):
            continue

        lines.append(f"Repository: {repo.get('name', 'Unknown')}")
        lines.append(f"Description: {repo.get('description') or 'No description'}")
        lines.append(f"Stars: {repo.get('stargazers_count', 0)}, Forks: {repo.get('forks_count', 0)}")

        # One extra request per repo; on large accounts this is the main
        # driver of rate-limit usage.
        lang_data = {}
        lang_url = repo.get("languages_url")
        if lang_url:
            try:
                lang_data = requests.get(lang_url, headers=headers, timeout=30).json()
            except Exception:
                lang_data = {}

        if isinstance(lang_data, dict) and lang_data:
            lang_summary = ", ".join(f"{k} ({v} bytes)" for k, v in lang_data.items())
        else:
            lang_summary = "No language data"

        lines.append(f"Languages: {lang_summary}")
        lines.append("")

    logger.info('Completed formatting GitHub data')
    return "\n".join(lines)


def github_to_documents(username: str):
    """Fetch, scrub, and chunk a user's GitHub data into LangChain Documents."""
    raw_text = format_github_data(username)
    cleaned = clean_personal_info(raw_text)

    # One Document per blank-line-separated paragraph, with enough metadata to
    # trace any retrieved chunk back to its source.
    paragraphs = [p.strip() for p in cleaned.split("\n\n") if p.strip()]

    docs = []
    for paragraph_id, para in enumerate(paragraphs):
        docs.append(
            Document(
                page_content=para,
                metadata={
                    "source": "github",
                    "username": username,
                    "paragraph_id": paragraph_id,
                },
            )
        )

    # Overlapping chunks keep related lines (a repo name and its description,
    # say) together across split boundaries.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=700,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""],
    )
    chunks = splitter.split_documents(docs)
    return chunks
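
# Quick sanity check (the username is hypothetical):
#
#   chunks = github_to_documents("octocat")
#   chunks[0].metadata
#   -> {'source': 'github', 'username': 'octocat', 'paragraph_id': 0}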


def add_github_to_vectorstore(username: str):
    docs = github_to_documents(username)

    index_path = os.path.join(vector_store_path, "index.faiss")

    if os.path.exists(index_path):
        # Reuse the module-level embeddings so every write shares one model.
        vectorstore = FAISS.load_local(
            vector_store_path,
            embeddings,
            allow_dangerous_deserialization=True
        )
        logger.info("Loaded existing FAISS index")
        vectorstore.add_documents(docs)
    else:
        # FAISS.from_documents cannot build an index from an empty list, so
        # seed a brand-new index directly with the fetched documents.
        vectorstore = FAISS.from_documents(docs, embeddings)
        logger.info("Created new FAISS index")

    logger.info(f"Added {len(docs)} GitHub chunks")
    vectorstore.save_local(vector_store_path)
    logger.info("Vectorstore updated successfully")


if __name__ == '__main__':
    add_github_to_vectorstore(username='Raheel31')
    logger.info("Ingestion Run Successful")