{ "cells": [ { "cell_type": "markdown", "id": "5870516f", "metadata": {}, "source": [ "# Project Code Loader\n", "This notebook loads and displays the full contents of the main project source files from the repository." ] }, { "cell_type": "markdown", "id": "035d6dcb", "metadata": {}, "source": [ "## Import Required Libraries\n", "\n", "Import built-in libraries for file handling and display." ] }, { "cell_type": "markdown", "id": "7d43652c", "metadata": {}, "source": [ "1. fetch sitemap.xml\n", "2. load xml\n", "3. get loc\n", "4. seperate /\n", "5. seperate not relvent text\n", "5. convert text to embedding\n", "6. store embedding to chromadb\n", "7. when user query convert to embeddings\n", "8. \n", "9. " ] }, { "cell_type": "code", "execution_count": 48, "id": "ae26c1f4", "metadata": {}, "outputs": [], "source": [ "import re\n", "from typing import List\n", "import requests as rq\n", "from bs4 import BeautifulSoup\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import urllib.parse\n", "import xml.etree.ElementTree as ET\n", "\n", "# Sitemap slug-vectorization quick utility (Method 2: fast, in-house routing)\n", "# Usage: call rank_sitemap_urls_by_query(sitemap_url, query, top_k=3)\n", "\n", "def fetch_sitemap_urls(sitemap_url: str, max_urls: int = 2000, verify: bool = False) -> List[str]:\n", " \"\"\"Fetch sitemap (supports sitemapindex) and return a flat list of URLs (up to max_urls).\"\"\"\n", " resp = rq.get(sitemap_url, timeout=15, verify=verify)\n", " resp.raise_for_status()\n", " content = resp.content\n", " urls: List[str] = []\n", "\n", " try:\n", " root = ET.fromstring(content)\n", " except ET.ParseError:\n", " soup = BeautifulSoup(content, \"xml\")\n", " locs = soup.find_all(\"loc\")\n", " for loc in locs:\n", " if loc.string:\n", " urls.append(loc.string.strip())\n", " if len(urls) >= max_urls:\n", " break\n", " return urls\n", "\n", " ns = {}\n", " if root.tag.startswith(\"{\"):\n", " ns_uri = root.tag.split(\"}\")[0].strip(\"{\")\n", " ns = {\"ns\": ns_uri}\n", "\n", " if root.tag.lower().endswith(\"sitemapindex\"):\n", " locs = root.findall(\".//ns:loc\", ns) if ns else root.findall(\".//loc\")\n", " for loc in locs:\n", " child = loc.text.strip()\n", " try:\n", " urls.extend(fetch_sitemap_urls(child, max_urls=max_urls - len(urls), verify=verify))\n", " except Exception:\n", " continue\n", " if len(urls) >= max_urls:\n", " break\n", " else:\n", " locs = root.findall(\".//ns:loc\", ns) if ns else root.findall(\".//loc\")\n", " for loc in locs:\n", " urls.append(loc.text.strip())\n", " if len(urls) >= max_urls:\n", " break\n", "\n", " return urls\n", "\n", "\n", "def slug_to_text(url: str) -> str:\n", " \"\"\"Turn a URL path into readable tokens suitable for fast semantic matching.\"\"\"\n", " p = urllib.parse.urlparse(url)\n", " path = p.path or \"\"\n", " path = re.sub(r\"\\.\\w{1,6}$\", \"\", path)\n", " tokens = re.split(r\"[\\/\\-\\_\\.\\?\\=\\&\\#]+\", path)\n", " tokens = [t for t in tokens if t and not re.fullmatch(r\"\\d+\", t)]\n", " if not tokens:\n", " tokens = [p.netloc]\n", " return \" \".join(tokens)\n", "\n", "\n", "def rank_sitemap_urls_by_query(sitemap_url: str, query: str, top_k: int = 3, max_urls: int = 2000, verify: bool = False) -> List[str]:\n", " \"\"\"Return top_k sitemap URLs ranked by TF-IDF similarity between query and URL slugs.\"\"\"\n", " urls = fetch_sitemap_urls(sitemap_url, max_urls=max_urls, verify=verify)\n", " if not urls:\n", " return []\n", " slugs = [slug_to_text(u) for u in urls]\n", " vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)\n", " X = vectorizer.fit_transform(slugs + [query])\n", " sims = cosine_similarity(X[-1], X[:-1]).reshape(-1)\n", " top_idx = sims.argsort()[::-1][:top_k]\n", " return [urls[i] for i in top_idx]\n", "\n", "\n", "# Example usage:\n", "# sitemap = \"https://docs.python.org/sitemap.xml\"\n", "# print(rank_sitemap_urls_by_query(sitemap, \"how to install pip\", top_k=3))\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "c111390b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Warning: No lexical matches found for 'installation guide'.\n", "['https://docs.langchain.com/api-reference/agent-connections-v2/create-connection', 'https://docs.langchain.com/api-reference/agent-connections-v2/list-connections', 'https://docs.langchain.com/api-reference/agent-connections-v2/remove-connection', 'https://docs.langchain.com/api-reference/auth-service-v2/authenticate', 'https://docs.langchain.com/api-reference/auth-service-v2/check-oauth-token-exists']\n", "Top results for query: 'langgraph authentication'\n" ] } ], "source": [ "import re\n", "import urllib3\n", "import requests\n", "import urllib.parse\n", "import xml.etree.ElementTree as ET\n", "from typing import List\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", "urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)\n", "requests.packages.urllib3.disable_warnings()\n", "\n", "# Redefine helpers here so this cell can run independently.\n", "def fetch_sitemap_urls(sitemap_url: str, max_urls: int = 2000, verify: bool = False) -> List[str]:\n", " resp = requests.get(sitemap_url, timeout=15, verify=verify)\n", " resp.raise_for_status()\n", " root = ET.fromstring(resp.content)\n", " urls = []\n", " ns = {}\n", " if root.tag.startswith(\"{\"):\n", " ns_uri = root.tag.split(\"}\")[0].strip(\"{\")\n", " ns = {\"ns\": ns_uri}\n", " if root.tag.lower().endswith(\"sitemapindex\"):\n", " locs = root.findall(\".//ns:loc\", ns) if ns else root.findall(\".//loc\")\n", " for loc in locs:\n", " child = loc.text.strip()\n", " try:\n", " urls.extend(fetch_sitemap_urls(child, max_urls=max_urls - len(urls), verify=verify))\n", " except Exception:\n", " continue\n", " if len(urls) >= max_urls:\n", " break\n", " else:\n", " locs = root.findall(\".//ns:loc\", ns) if ns else root.findall(\".//loc\")\n", " for loc in locs:\n", " urls.append(loc.text.strip())\n", " if len(urls) >= max_urls:\n", " break\n", " return urls\n", "\n", "\n", "def slug_to_text(url: str) -> str:\n", " p = urllib.parse.urlparse(url)\n", " path = p.path or \"\"\n", " path = re.sub(r\"\\.\\w{1,6}$\", \"\", path)\n", " tokens = re.split(r\"[\\/\\-\\_\\.\\?\\=\\&\\#]+\", path)\n", " tokens = [t for t in tokens if t and not re.fullmatch(r\"\\d+\", t)]\n", " if not tokens:\n", " tokens = [p.netloc]\n", " return \" \".join(tokens)\n", "\n", "\n", "def rank_sitemap_urls_by_query(sitemap_url: str, query: str, top_k: int = 3, max_urls: int = 2000, verify: bool = False) -> List[str]:\n", " # In a production app, you would want to cache the result of this fetch!\n", " urls = fetch_sitemap_urls(sitemap_url, max_urls=max_urls, verify=verify)\n", " if not urls:\n", " return []\n", " \n", " slugs = [slug_to_text(u) for u in urls]\n", " \n", " # Lowercase everything to help TF-IDF match words more easily\n", " vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000, lowercase=True)\n", " X = vectorizer.fit_transform(slugs + [query])\n", " \n", " sims = cosine_similarity(X[-1], X[:-1]).reshape(-1)\n", " \n", " # Optimization: Filter out URLs that have absolutely zero keyword overlap\n", " valid_indices = [i for i in sims.argsort()[::-1] if sims[i] > 0.0]\n", " \n", " top_idx = valid_indices[:top_k]\n", " \n", " if not top_idx:\n", " print(f\"Warning: No lexical matches found for '{query}'.\")\n", " return []\n", " \n", " return [urls[i] for i in top_idx]\n", "\n", "\n", "b = fetch_sitemap_urls(\"https://docs.langchain.com/sitemap.xml\", verify=False)\n", "c = slug_to_text(\"https://docs.langchain.com/sitemap.xml\")\n", "d = rank_sitemap_urls_by_query(\"https://docs.langchain.com/sitemap.xml\", \"installation guide\", top_k=3, verify=False)\n", "print(b[:5])\n", "user_query = \"langgraph authentication\"\n", "results = rank_sitemap_urls_by_query(\"https://docs.langchain.com/sitemap.xml\", user_query, top_k=3, verify=False)\n", "print(f\"Top results for query: '{user_query}'\")" ] }, { "cell_type": "code", "execution_count": 34, "id": "78ba4032", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading weights: 100%|██████████| 103/103 [00:00<00:00, 4993.74it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "--- STARTING JOB ---\n", "Fetching sitemap: https://docs.langchain.com/sitemap.xml\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\HP\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\bs4\\builder\\_lxml.py:321: RuntimeWarning: coroutine 'run_smart_rag' was never awaited\n", " for inverted_nsmap in reversed(self.nsmaps):\n", "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Sitemap Quality: 0.0% poorly named URLs.\n", "Strategy: VECTOR SEARCH. Embedding URLs...\n", "Vector Database chose: https://docs.langchain.com/oss/python/contributing/implement-langchain\n", "\n", "Scraping target URL: https://docs.langchain.com/oss/python/contributing/implement-langchain...\n", "\n", "Generating final answer...\n", "\n", "--- FINAL ANSWER ---\n", "Based on the provided webpage text, an example case with LangChain involves implementing a chat model or an embedding model, which are subclasses of the respective base classes in `langchain-core`. For instance, you could implement a subclass of `BaseChatModel` to generate chat completions or handle message formatting. Another example would be implementing an embedding model subclass of the `Embeddings` class to generate embeddings for text.\n", "\n", "Here's a simplified example of implementing a chat model:\n", "\n", "```python\n", "from langchain.chains.base import Chain\n", "from langchain.base_language import BaseLanguageModel\n", "\n", "class CustomChatModel(BaseChatModel):\n", " def __init__(self, llm: BaseLanguageModel):\n", " self.llm = llm\n", "\n", " def generate_chat_completion(self, messages):\n", " # Custom logic to generate chat completion\n", " response = self.llm.generate_response(messages)\n", " return response\n", "\n", " def format_messages(self, messages):\n", " # Custom logic to format messages\n", " formatted_messages = [f\"{msg['role']}: {msg['content']}\" for msg in messages]\n", " return formatted_messages\n", "\n", " def manage_model_parameters(self, parameters):\n", " # Custom logic to manage model parameters\n", " adjusted_params = {**parameters, \"temperature\": 0.7}\n", "\n" ] } ], "source": [ "import os\n", "import re\n", "import requests\n", "from bs4 import BeautifulSoup\n", "from urllib.parse import urlparse\n", "from langchain_community.vectorstores import Chroma\n", "from langchain_huggingface import HuggingFaceEmbeddings, ChatHuggingFace, HuggingFaceEndpoint\n", "from langchain_core.documents import Document\n", "from langchain_core.prompts import ChatPromptTemplate\n", "\n", "# --- 1. LLM & EMBEDDING SETUP ---\n", "from dotenv import load_dotenv\n", "from pathlib import Path\n", "\n", "dotenv_path = Path.cwd() / \".env\"\n", "if not dotenv_path.exists():\n", " dotenv_path = Path.cwd().parent / \".env\"\n", "if not dotenv_path.exists():\n", " raise FileNotFoundError(\".env not found in current working directory or its parent.\")\n", "load_dotenv(dotenv_path)\n", "\n", "HF_TOKEN = os.getenv(\"HF_TOKEN\")\n", "if not HF_TOKEN:\n", " raise ValueError(\"HF_TOKEN not found. Create a .env with HF_TOKEN=... and restart the kernel.\")\n", "os.environ.setdefault(\"HUGGINGFACEHUB_API_TOKEN\", HF_TOKEN)\n", "os.environ.setdefault(\"HF_TOKEN\", HF_TOKEN)\n", "HF_MODEL = \"Qwen/Qwen2.5-7B-Instruct\"\n", "\n", "endpoint = HuggingFaceEndpoint(\n", " repo_id=HF_MODEL, task=\"text-generation\", max_new_tokens=256, temperature=0.1\n", ")\n", "llm = ChatHuggingFace(llm=endpoint, model_id=HF_MODEL)\n", "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", "\n", "\n", "# --- 2. THE GRADER LOGIC ---\n", "def is_poorly_named(url: str) -> bool:\n", " \"\"\"Returns True if the URL slug is unreadable to an AI.\"\"\"\n", " path = urlparse(url).path.strip('/')\n", " if not path: return False\n", " slug = path.split('/')[-1]\n", " \n", " if re.search(r'[0-9a-fA-F]{8}-', slug): return True\n", " if slug.isdigit(): return True\n", " \n", " letters = sum(c.isalpha() for c in slug)\n", " numbers = sum(c.isdigit() for c in slug)\n", " if numbers > letters: return True\n", " \n", " if len(slug) > 15 and '-' not in slug and '_' not in slug: return True\n", " return False\n", "\n", "def extract_urls_from_sitemap(sitemap_url: str) -> list:\n", " \"\"\"Downloads a sitemap.xml and returns a list of URLs.\"\"\"\n", " print(f\"Fetching sitemap: {sitemap_url}\")\n", " response = requests.get(sitemap_url)\n", " soup = BeautifulSoup(response.content, \"xml\")\n", " return [loc.text for loc in soup.find_all(\"loc\")]\n", "\n", "\n", "# --- 3. METHOD 2: VECTOR SEARCH ---\n", "def route_via_vector_search(urls: list, user_query: str) -> str:\n", " \"\"\"Embeds the URL slugs into Chroma and returns the most relevant URL.\"\"\"\n", " print(\"Strategy: VECTOR SEARCH. Embedding URLs...\")\n", " docs = []\n", " for url in urls:\n", " path = urlparse(url).path.strip('/')\n", " slug = path.split('/')[-1].replace('-', ' ').replace('_', ' ')\n", " # Store the readable slug as the text, and the actual URL in metadata\n", " docs.append(Document(page_content=slug, metadata={\"source\": url}))\n", " \n", " # Create a temporary local vector database in memory\n", " vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)\n", " \n", " # Retrieve the closest match to the user's question\n", " results = vectorstore.similarity_search(user_query, k=1)\n", " best_url = results[0].metadata[\"source\"]\n", " print(f\"Vector Database chose: {best_url}\")\n", " return best_url\n", "\n", "\n", "# --- 4. METHOD 3: LLM TITLE ROUTER ---\n", "def fetch_title(url: str) -> str:\n", " \"\"\"Quickly fetches the