import os
import sys
import re
import requests
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from rag.logger import get_logger  # pylint: disable=import-error
logger = get_logger(__name__)

base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
vector_store_path = os.path.join(base_dir, 'data', 'vectorstores')

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


def clean_personal_info(text: str) -> str:
    patterns = [
        r"\b[\w\.-]+@[\w\.-]+\.\w+\b",    
        r"\b\d{10}\b",                               
        r"\b(?:\+?\d{1,3})?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b",  
        r"(http|https)://\S+",                       
        r"linkedin\.com/\S+",
        r"github\.com/\S+",
        r"@[A-Za-z0-9_]+",                        
        r"\d{1,4}\s+\w+\s+(Street|St|Road|Rd|Avenue|Ave|Lane|Ln)", 
    ]

    cleaned = text
    for p in patterns:
        cleaned = re.sub(p, "[REMOVED]", cleaned, flags=re.IGNORECASE)

    return cleaned
    

def fetch_github_data(username: str):
    """
    Fetch GitHub user profile and all public repos using an optional personal access token.
    Handles authentication and pagination.
    """
    token = os.getenv("GITHUB_TOKEN")
    headers = {"Authorization": f"token {token}"} if token else {}

    # --- User profile ---
    user_resp = requests.get(f"https://api.github.com/users/{username}", headers=headers).json()
    if isinstance(user_resp, dict) and "message" in user_resp:
        raise Exception(f"GitHub API error (user): {user_resp['message']}")

    # --- Repositories with pagination ---
    repos = []
    page = 1
    per_page = 100
    while True:
        repos_resp = requests.get(
            f"https://api.github.com/users/{username}/repos",
            headers=headers,
            params={"per_page": per_page, "page": page}
        ).json()

        if isinstance(repos_resp, dict) and repos_resp.get("message"):
            raise Exception(f"GitHub API error (repos): {repos_resp['message']}")
        if not repos_resp:
            break

        repos.extend(repos_resp)
        if len(repos_resp) < per_page:
            break
        page += 1
    logger.info('Completed fetching github data')
    return user_resp, repos


# -----------------------------
# Format GitHub Data
# -----------------------------
def format_github_data(username: str):
    """
    Converts GitHub profile + repo info into structured text for RAG ingestion.
    """
    user, repos = fetch_github_data(username)

    lines = []

    # --- Profile summary ---
    lines.append(f"GitHub User: {user.get('name', username)}")
    lines.append(f"Bio: {user.get('bio', 'No bio')}")
    lines.append(f"Public Repos: {user.get('public_repos', 0)}")
    lines.append(f"Followers: {user.get('followers', 0)}, Following: {user.get('following', 0)}")
    lines.append("\n--- Repositories ---\n")

    # --- Repos ---
    token = os.getenv("GITHUB_TOKEN")
    headers = {"Authorization": f"token {token}"} if token else {}

    for repo in repos:
        if not isinstance(repo, dict):
            continue

        lines.append(f"Repository: {repo.get('name', 'Unknown')}")
        lines.append(f"Description: {repo.get('description', 'No description')}")
        lines.append(f"Stars: {repo.get('stargazers_count', 0)}, Forks: {repo.get('forks_count', 0)}")

        # Languages
        lang_data = {}
        lang_url = repo.get("languages_url")
        if lang_url:
            try:
                lang_data = requests.get(lang_url, headers=headers).json()
            except Exception:
                lang_data = {}

        if isinstance(lang_data, dict) and lang_data:
            lang_summary = ", ".join([f"{k} ({v} bytes)" for k, v in lang_data.items()])
        else:
            lang_summary = "No language data"

        lines.append(f"Languages: {lang_summary}")
        lines.append("")
    logger.info('Completed formatting Github data')
    return "\n".join(lines)


def github_to_documents(username: str):
    raw_text = format_github_data(username)

    cleaned = clean_personal_info(raw_text)

    paragraphs = [p.strip() for p in cleaned.split("\n\n") if p.strip()]

    docs = []
    paragraph_id = 0

    for para in paragraphs:
        docs.append(
            Document(
                page_content=para,
                metadata={
                    "source": "github",
                    "username": username,
                    "paragraph_id": paragraph_id,
                }
            )
        )
        paragraph_id += 1

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=700,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""],
    )

    chunks = splitter.split_documents(docs)
    return chunks

def add_github_to_vectorstore(username: str):
    docs = github_to_documents(username)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    index_path = os.path.join(vector_store_path, "index.faiss")

    if os.path.exists(index_path):
        vectorstore = FAISS.load_local(
            vector_store_path,
            embeddings,
            allow_dangerous_deserialization=True
        )
        logger.info("Loaded existing FAISS index")
    else:
        vectorstore = FAISS.from_documents([], embeddings)
        logger.info("Created new FAISS index")

    vectorstore.add_documents(docs)
    logger.info(f"Added {len(docs)} GitHub chunks")

    vectorstore.save_local(vector_store_path)
    logger.info("Vectorstore updated successfully")


if __name__ == '__main__':
    add_github_to_vectorstore(username='Raheel31')
    logger.info("Ingestion Run Successful")