{ "cells": [ { "cell_type": "markdown", "id": "5870516f", "metadata": {}, "source": [ "# Project Code Loader\n", "This notebook loads and displays the full contents of the main project source files from the repository." ] }, { "cell_type": "markdown", "id": "035d6dcb", "metadata": {}, "source": [ "## Import Required Libraries\n", "\n", "Import built-in libraries for file handling and display." ] }, { "cell_type": "markdown", "id": "7d43652c", "metadata": {}, "source": [ "1. fetch sitemap.xml\n", "2. load xml\n", "3. get loc\n", "4. seperate /\n", "5. seperate not relvent text\n", "5. convert text to embedding\n", "6. store embedding to chromadb\n", "7. when user query convert to embeddings\n", "8. \n", "9. " ] }, { "cell_type": "code", "execution_count": 48, "id": "ae26c1f4", "metadata": {}, "outputs": [], "source": [ "import re\n", "from typing import List\n", "import requests as rq\n", "from bs4 import BeautifulSoup\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import urllib.parse\n", "import xml.etree.ElementTree as ET\n", "\n", "# Sitemap slug-vectorization quick utility (Method 2: fast, in-house routing)\n", "# Usage: call rank_sitemap_urls_by_query(sitemap_url, query, top_k=3)\n", "\n", "def fetch_sitemap_urls(sitemap_url: str, max_urls: int = 2000, verify: bool = False) -> List[str]:\n", " \"\"\"Fetch sitemap (supports sitemapindex) and return a flat list of URLs (up to max_urls).\"\"\"\n", " resp = rq.get(sitemap_url, timeout=15, verify=verify)\n", " resp.raise_for_status()\n", " content = resp.content\n", " urls: List[str] = []\n", "\n", " try:\n", " root = ET.fromstring(content)\n", " except ET.ParseError:\n", " soup = BeautifulSoup(content, \"xml\")\n", " locs = soup.find_all(\"loc\")\n", " for loc in locs:\n", " if loc.string:\n", " urls.append(loc.string.strip())\n", " if len(urls) >= max_urls:\n", " break\n", " return urls\n", "\n", " ns = {}\n", " if root.tag.startswith(\"{\"):\n", " ns_uri = root.tag.split(\"}\")[0].strip(\"{\")\n", " ns = {\"ns\": ns_uri}\n", "\n", " if root.tag.lower().endswith(\"sitemapindex\"):\n", " locs = root.findall(\".//ns:loc\", ns) if ns else root.findall(\".//loc\")\n", " for loc in locs:\n", " child = loc.text.strip()\n", " try:\n", " urls.extend(fetch_sitemap_urls(child, max_urls=max_urls - len(urls), verify=verify))\n", " except Exception:\n", " continue\n", " if len(urls) >= max_urls:\n", " break\n", " else:\n", " locs = root.findall(\".//ns:loc\", ns) if ns else root.findall(\".//loc\")\n", " for loc in locs:\n", " urls.append(loc.text.strip())\n", " if len(urls) >= max_urls:\n", " break\n", "\n", " return urls\n", "\n", "\n", "def slug_to_text(url: str) -> str:\n", " \"\"\"Turn a URL path into readable tokens suitable for fast semantic matching.\"\"\"\n", " p = urllib.parse.urlparse(url)\n", " path = p.path or \"\"\n", " path = re.sub(r\"\\.\\w{1,6}$\", \"\", path)\n", " tokens = re.split(r\"[\\/\\-\\_\\.\\?\\=\\&\\#]+\", path)\n", " tokens = [t for t in tokens if t and not re.fullmatch(r\"\\d+\", t)]\n", " if not tokens:\n", " tokens = [p.netloc]\n", " return \" \".join(tokens)\n", "\n", "\n", "def rank_sitemap_urls_by_query(sitemap_url: str, query: str, top_k: int = 3, max_urls: int = 2000, verify: bool = False) -> List[str]:\n", " \"\"\"Return top_k sitemap URLs ranked by TF-IDF similarity between query and URL slugs.\"\"\"\n", " urls = fetch_sitemap_urls(sitemap_url, max_urls=max_urls, verify=verify)\n", " if not urls:\n", " return []\n", " slugs = [slug_to_text(u) for u in urls]\n", " vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)\n", " X = vectorizer.fit_transform(slugs + [query])\n", " sims = cosine_similarity(X[-1], X[:-1]).reshape(-1)\n", " top_idx = sims.argsort()[::-1][:top_k]\n", " return [urls[i] for i in top_idx]\n", "\n", "\n", "# Example usage:\n", "# sitemap = \"https://docs.python.org/sitemap.xml\"\n", "# print(rank_sitemap_urls_by_query(sitemap, \"how to install pip\", top_k=3))\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "c111390b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Warning: No lexical matches found for 'installation guide'.\n", "['https://docs.langchain.com/api-reference/agent-connections-v2/create-connection', 'https://docs.langchain.com/api-reference/agent-connections-v2/list-connections', 'https://docs.langchain.com/api-reference/agent-connections-v2/remove-connection', 'https://docs.langchain.com/api-reference/auth-service-v2/authenticate', 'https://docs.langchain.com/api-reference/auth-service-v2/check-oauth-token-exists']\n", "Top results for query: 'langgraph authentication'\n" ] } ], "source": [ "import re\n", "import urllib3\n", "import requests\n", "import urllib.parse\n", "import xml.etree.ElementTree as ET\n", "from typing import List\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", "urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)\n", "requests.packages.urllib3.disable_warnings()\n", "\n", "# Redefine helpers here so this cell can run independently.\n", "def fetch_sitemap_urls(sitemap_url: str, max_urls: int = 2000, verify: bool = False) -> List[str]:\n", " resp = requests.get(sitemap_url, timeout=15, verify=verify)\n", " resp.raise_for_status()\n", " root = ET.fromstring(resp.content)\n", " urls = []\n", " ns = {}\n", " if root.tag.startswith(\"{\"):\n", " ns_uri = root.tag.split(\"}\")[0].strip(\"{\")\n", " ns = {\"ns\": ns_uri}\n", " if root.tag.lower().endswith(\"sitemapindex\"):\n", " locs = root.findall(\".//ns:loc\", ns) if ns else root.findall(\".//loc\")\n", " for loc in locs:\n", " child = loc.text.strip()\n", " try:\n", " urls.extend(fetch_sitemap_urls(child, max_urls=max_urls - len(urls), verify=verify))\n", " except Exception:\n", " continue\n", " if len(urls) >= max_urls:\n", " break\n", " else:\n", " locs = root.findall(\".//ns:loc\", ns) if ns else root.findall(\".//loc\")\n", " for loc in locs:\n", " urls.append(loc.text.strip())\n", " if len(urls) >= max_urls:\n", " break\n", " return urls\n", "\n", "\n", "def slug_to_text(url: str) -> str:\n", " p = urllib.parse.urlparse(url)\n", " path = p.path or \"\"\n", " path = re.sub(r\"\\.\\w{1,6}$\", \"\", path)\n", " tokens = re.split(r\"[\\/\\-\\_\\.\\?\\=\\&\\#]+\", path)\n", " tokens = [t for t in tokens if t and not re.fullmatch(r\"\\d+\", t)]\n", " if not tokens:\n", " tokens = [p.netloc]\n", " return \" \".join(tokens)\n", "\n", "\n", "def rank_sitemap_urls_by_query(sitemap_url: str, query: str, top_k: int = 3, max_urls: int = 2000, verify: bool = False) -> List[str]:\n", " # In a production app, you would want to cache the result of this fetch!\n", " urls = fetch_sitemap_urls(sitemap_url, max_urls=max_urls, verify=verify)\n", " if not urls:\n", " return []\n", " \n", " slugs = [slug_to_text(u) for u in urls]\n", " \n", " # Lowercase everything to help TF-IDF match words more easily\n", " vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000, lowercase=True)\n", " X = vectorizer.fit_transform(slugs + [query])\n", " \n", " sims = cosine_similarity(X[-1], X[:-1]).reshape(-1)\n", " \n", " # Optimization: Filter out URLs that have absolutely zero keyword overlap\n", " valid_indices = [i for i in sims.argsort()[::-1] if sims[i] > 0.0]\n", " \n", " top_idx = valid_indices[:top_k]\n", " \n", " if not top_idx:\n", " print(f\"Warning: No lexical matches found for '{query}'.\")\n", " return []\n", " \n", " return [urls[i] for i in top_idx]\n", "\n", "\n", "b = fetch_sitemap_urls(\"https://docs.langchain.com/sitemap.xml\", verify=False)\n", "c = slug_to_text(\"https://docs.langchain.com/sitemap.xml\")\n", "d = rank_sitemap_urls_by_query(\"https://docs.langchain.com/sitemap.xml\", \"installation guide\", top_k=3, verify=False)\n", "print(b[:5])\n", "user_query = \"langgraph authentication\"\n", "results = rank_sitemap_urls_by_query(\"https://docs.langchain.com/sitemap.xml\", user_query, top_k=3, verify=False)\n", "print(f\"Top results for query: '{user_query}'\")" ] }, { "cell_type": "code", "execution_count": 34, "id": "78ba4032", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading weights: 100%|██████████| 103/103 [00:00<00:00, 4993.74it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "--- STARTING JOB ---\n", "Fetching sitemap: https://docs.langchain.com/sitemap.xml\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\HP\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\bs4\\builder\\_lxml.py:321: RuntimeWarning: coroutine 'run_smart_rag' was never awaited\n", " for inverted_nsmap in reversed(self.nsmaps):\n", "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Sitemap Quality: 0.0% poorly named URLs.\n", "Strategy: VECTOR SEARCH. Embedding URLs...\n", "Vector Database chose: https://docs.langchain.com/oss/python/contributing/implement-langchain\n", "\n", "Scraping target URL: https://docs.langchain.com/oss/python/contributing/implement-langchain...\n", "\n", "Generating final answer...\n", "\n", "--- FINAL ANSWER ---\n", "Based on the provided webpage text, an example case with LangChain involves implementing a chat model or an embedding model, which are subclasses of the respective base classes in `langchain-core`. For instance, you could implement a subclass of `BaseChatModel` to generate chat completions or handle message formatting. Another example would be implementing an embedding model subclass of the `Embeddings` class to generate embeddings for text.\n", "\n", "Here's a simplified example of implementing a chat model:\n", "\n", "```python\n", "from langchain.chains.base import Chain\n", "from langchain.base_language import BaseLanguageModel\n", "\n", "class CustomChatModel(BaseChatModel):\n", " def __init__(self, llm: BaseLanguageModel):\n", " self.llm = llm\n", "\n", " def generate_chat_completion(self, messages):\n", " # Custom logic to generate chat completion\n", " response = self.llm.generate_response(messages)\n", " return response\n", "\n", " def format_messages(self, messages):\n", " # Custom logic to format messages\n", " formatted_messages = [f\"{msg['role']}: {msg['content']}\" for msg in messages]\n", " return formatted_messages\n", "\n", " def manage_model_parameters(self, parameters):\n", " # Custom logic to manage model parameters\n", " adjusted_params = {**parameters, \"temperature\": 0.7}\n", "\n" ] } ], "source": [ "import os\n", "import re\n", "import requests\n", "from bs4 import BeautifulSoup\n", "from urllib.parse import urlparse\n", "from langchain_community.vectorstores import Chroma\n", "from langchain_huggingface import HuggingFaceEmbeddings, ChatHuggingFace, HuggingFaceEndpoint\n", "from langchain_core.documents import Document\n", "from langchain_core.prompts import ChatPromptTemplate\n", "\n", "# --- 1. LLM & EMBEDDING SETUP ---\n", "from dotenv import load_dotenv\n", "from pathlib import Path\n", "\n", "dotenv_path = Path.cwd() / \".env\"\n", "if not dotenv_path.exists():\n", " dotenv_path = Path.cwd().parent / \".env\"\n", "if not dotenv_path.exists():\n", " raise FileNotFoundError(\".env not found in current working directory or its parent.\")\n", "load_dotenv(dotenv_path)\n", "\n", "HF_TOKEN = os.getenv(\"HF_TOKEN\")\n", "if not HF_TOKEN:\n", " raise ValueError(\"HF_TOKEN not found. Create a .env with HF_TOKEN=... and restart the kernel.\")\n", "os.environ.setdefault(\"HUGGINGFACEHUB_API_TOKEN\", HF_TOKEN)\n", "os.environ.setdefault(\"HF_TOKEN\", HF_TOKEN)\n", "HF_MODEL = \"Qwen/Qwen2.5-7B-Instruct\"\n", "\n", "endpoint = HuggingFaceEndpoint(\n", " repo_id=HF_MODEL, task=\"text-generation\", max_new_tokens=256, temperature=0.1\n", ")\n", "llm = ChatHuggingFace(llm=endpoint, model_id=HF_MODEL)\n", "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", "\n", "\n", "# --- 2. THE GRADER LOGIC ---\n", "def is_poorly_named(url: str) -> bool:\n", " \"\"\"Returns True if the URL slug is unreadable to an AI.\"\"\"\n", " path = urlparse(url).path.strip('/')\n", " if not path: return False\n", " slug = path.split('/')[-1]\n", " \n", " if re.search(r'[0-9a-fA-F]{8}-', slug): return True\n", " if slug.isdigit(): return True\n", " \n", " letters = sum(c.isalpha() for c in slug)\n", " numbers = sum(c.isdigit() for c in slug)\n", " if numbers > letters: return True\n", " \n", " if len(slug) > 15 and '-' not in slug and '_' not in slug: return True\n", " return False\n", "\n", "def extract_urls_from_sitemap(sitemap_url: str) -> list:\n", " \"\"\"Downloads a sitemap.xml and returns a list of URLs.\"\"\"\n", " print(f\"Fetching sitemap: {sitemap_url}\")\n", " response = requests.get(sitemap_url)\n", " soup = BeautifulSoup(response.content, \"xml\")\n", " return [loc.text for loc in soup.find_all(\"loc\")]\n", "\n", "\n", "# --- 3. METHOD 2: VECTOR SEARCH ---\n", "def route_via_vector_search(urls: list, user_query: str) -> str:\n", " \"\"\"Embeds the URL slugs into Chroma and returns the most relevant URL.\"\"\"\n", " print(\"Strategy: VECTOR SEARCH. Embedding URLs...\")\n", " docs = []\n", " for url in urls:\n", " path = urlparse(url).path.strip('/')\n", " slug = path.split('/')[-1].replace('-', ' ').replace('_', ' ')\n", " # Store the readable slug as the text, and the actual URL in metadata\n", " docs.append(Document(page_content=slug, metadata={\"source\": url}))\n", " \n", " # Create a temporary local vector database in memory\n", " vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)\n", " \n", " # Retrieve the closest match to the user's question\n", " results = vectorstore.similarity_search(user_query, k=1)\n", " best_url = results[0].metadata[\"source\"]\n", " print(f\"Vector Database chose: {best_url}\")\n", " return best_url\n", "\n", "\n", "# --- 4. METHOD 3: LLM TITLE ROUTER ---\n", "def fetch_title(url: str) -> str:\n", " \"\"\"Quickly fetches the tag of a webpage.\"\"\"\n", " try:\n", " html = requests.get(url, timeout=3).text\n", " soup = BeautifulSoup(html, 'html.parser')\n", " return soup.title.string if soup.title else \"Unknown Title\"\n", " except:\n", " return \"Failed to load\"\n", "\n", "def route_via_llm(urls: list, user_query: str) -> str:\n", " \"\"\"Fetches titles and asks the LLM to pick the best URL.\"\"\"\n", " print(\"Strategy: LLM ROUTER. Fetching titles for the first 15 URLs...\")\n", " # Cap at 15 to save time/tokens. In production, you'd batch this.\n", " sample_urls = urls[:15] \n", " \n", " url_catalog = \"\"\n", " for i, url in enumerate(sample_urls):\n", " title = fetch_title(url)\n", " url_catalog += f\"[{i}] URL: {url} | Title: {title}\\n\"\n", " \n", " prompt = f\"\"\"\n", " You are an intelligent web router. The user asked: \"{user_query}\"\n", " \n", " Here is a list of URLs and their titles:\n", " {url_catalog}\n", " \n", " Which URL is most likely to contain the answer to the user's question?\n", " Respond ONLY with the exact URL string. Do not explain your reasoning.\n", " \"\"\"\n", " \n", " print(\"Asking LLM to route...\")\n", " best_url = llm.invoke(prompt).content.strip()\n", " print(f\"LLM chose: {best_url}\")\n", " return best_url\n", "\n", "\n", "# --- 5. THE MASTER CONTROLLER ---\n", "def answer_user_question(sitemap_url: str, user_query: str):\n", " print(\"\\n--- STARTING JOB ---\")\n", " urls = extract_urls_from_sitemap(sitemap_url)\n", " \n", " # Grade the URLs\n", " bad_count = sum(is_poorly_named(u) for u in urls)\n", " bad_percentage = (bad_count / len(urls)) * 100\n", " print(f\"Sitemap Quality: {bad_percentage:.1f}% poorly named URLs.\")\n", " \n", " # Decide Strategy\n", " if bad_percentage > 30:\n", " target_url = route_via_llm(urls, user_query)\n", " else:\n", " target_url = route_via_vector_search(urls, user_query)\n", " \n", " # --- FINAL GENERATION PHASE ---\n", " print(f\"\\nScraping target URL: {target_url}...\")\n", " html = requests.get(target_url).text\n", " soup = BeautifulSoup(html, 'html.parser')\n", " page_text = soup.get_text(separator=' ', strip=True)[:3000] # Grab first 3k chars\n", " \n", " final_prompt = f\"\"\"\n", " Based on the following scraped webpage text, answer the user's question.\n", " If the answer is not in the text, say you don't know.\n", " \n", " Webpage Text: {page_text}\n", " Question: {user_query}\n", " \"\"\"\n", " \n", " print(\"\\nGenerating final answer...\")\n", " answer = llm.invoke(final_prompt).content\n", " print(\"\\n--- FINAL ANSWER ---\")\n", " print(answer)\n", "\n", "# --- RUN THE APP ---\n", "\n", "if __name__ == \"__main__\":\n", " # Example: A user drops a sitemap and asks a specific question\n", " test_sitemap = \"https://docs.langchain.com/sitemap.xml\"\n", " test_query = \"give a example case with langchain\"\n", " try:\n", " answer_user_question(test_sitemap, test_query)\n", " except ValueError as e:\n", " if \"auto-router\" in str(e).lower():\n", " urls = extract_urls_from_sitemap(test_sitemap)\n", " best_url = route_via_vector_search(urls, test_query)\n", " print(f\"\\nFallback URL: {best_url}\")\n", " else:\n", " raise" ] }, { "cell_type": "code", "execution_count": 54, "id": "db1ea38c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Initializing LLM...\n", "\n", "--- 🚀 STARTING JOB: 'Can you give an illustrative case using LangChain?' ---\n", "Fetching sitemap: https://docs.langchain.com/sitemap.xml\n", "Discovered 300 URLs. Extracting metadata asynchronously...\n", "🔄 Expanded Query: Can you give an illustrative case using LangChain? LangChain example LangChain case study LangChain demonstration LangChain illustration LangChain scenario\n", "Routing query using TF-IDF Matrix...\n", "✅ Best Match Found: https://docs.langchain.com/langsmith/add-human-in-the-loop\n", "Scraping full page content...\n", "Generating Final Answer via LLM...\n", "\n", "================ FINAL ANSWER ================\n", "\n", "An illustrative case using LangChain for a human-in-the-loop scenario involves reviewing and editing text generated by an AI agent. Here’s how it works:\n", "\n", "1. **Initial Invocation**: The agent is invoked with some initial state. For example, the agent might be asked to generate a response to a user query.\n", "\n", "2. **Interrupt for Human Review**: During the execution, the agent hits a specific point where it needs human input. This could be because the generated text needs to be reviewed for accuracy, appropriateness, or any other reason. The agent returns an interrupt object with the payload and metadata.\n", "\n", " ```python\n", " result = await client.runs.wait(thread_id, assistant_id, input={\"some_text\": \"original text\"})\n", " print(result['__interrupt__'])\n", " # Output:\n", " # [\n", " # {\n", " # 'value': {'text_to_revise': 'original text'},\n", " # 'resumable': True,\n", " # 'ns': ['human_node:fc722478-2f21-0578-c572-d9fc4dd07c3b'],\n", " # 'when': 'during'\n", " # }\n", " # ]\n", " ```\n", "\n", "3. **Human Input and Resumption**: The human reviews the generated text and provides feedback. This feedback is then used to resume the agent's execution with the updated input.\n", "\n", " ```python\n", " print(await client.runs.wait(thread_id, assistant_id, command=Command(resume=\"Edited text\")))\n", " # Output:\n", " # {'some_text': 'Edited text'}\n", " ```\n", "\n", "In this case, the agent pauses its execution at a specific point, allowing a human to review and edit the generated text. Once the human provides the edited text, the agent resumes its execution with the updated input.\n", "\n", "==============================================\n" ] } ], "source": [ "import os\n", "import re\n", "import asyncio\n", "import aiohttp\n", "import urllib3\n", "import requests\n", "import urllib.parse\n", "import xml.etree.ElementTree as ET\n", "from typing import List, Dict\n", "from bs4 import BeautifulSoup\n", "from dotenv import load_dotenv\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint\n", "\n", "# Suppress SSL warnings\n", "urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)\n", "requests.packages.urllib3.disable_warnings()\n", "\n", "# ==========================================================\n", "# 1. ENVIRONMENT & LLM SETUP\n", "# ==========================================================\n", "# Dynamically load the .env file from the current or parent directory\n", "current_dir = os.getcwd()\n", "possible_env_paths = [\n", " os.path.join(current_dir, '.env'),\n", " os.path.join(os.path.dirname(current_dir), '.env'),\n", "]\n", "for path in possible_env_paths:\n", " if os.path.exists(path):\n", " load_dotenv(dotenv_path=path)\n", " break\n", "else:\n", " print(\"Warning: .env not found. Ensure HF_TOKEN is set in your environment.\")\n", "\n", "# Ensure LangChain finds the Hugging Face token\n", "hf_token = os.getenv('HF_TOKEN') or os.getenv('HUGGINGFACEHUB_API_TOKEN')\n", "if not hf_token:\n", " raise ValueError(\"Missing Hugging Face token! Please set HF_TOKEN in your .env file.\")\n", "os.environ.setdefault('HUGGINGFACEHUB_API_TOKEN', hf_token)\n", "os.environ.setdefault('HF_TOKEN', hf_token)\n", "\n", "HF_MODEL = \"Qwen/Qwen2.5-7B-Instruct\"\n", "\n", "print(\"Initializing LLM...\")\n", "endpoint = HuggingFaceEndpoint(\n", " repo_id=HF_MODEL,\n", " task=\"text-generation\",\n", " max_new_tokens=512,\n", " temperature=0.1,\n", " top_p=0.95\n", ")\n", "llm = ChatHuggingFace(llm=endpoint, model_id=HF_MODEL)\n", "\n", "\n", "# ==========================================================\n", "# 2. QUERY EXPANSION AGENT\n", "# ==========================================================\n", "def expand_query_with_synonyms(user_query: str) -> str:\n", " \"\"\"Expands the user query with synonyms so TF-IDF has a higher chance of matching.\"\"\"\n", " prompt = f\"\"\"\n", " You are a search query expansion assistant. Look at the user's search query and return a space-separated list of highly relevant synonyms, alternative terms, and root keywords that might appear in documentation titles.\n", " \n", " User Query: {user_query}\n", " \n", " Output format: Return ONLY the raw keywords separated by spaces. Do not include any sentences or explanations.\n", " \"\"\"\n", " try:\n", " expanded_keywords = llm.invoke(prompt).content.strip()\n", " combined_query = f\"{user_query} {expanded_keywords}\"\n", " print(f\"🔄 Expanded Query: {combined_query}\")\n", " return combined_query\n", " except Exception:\n", " return user_query\n", "\n", "\n", "# ==========================================================\n", "# 3. SITEMAP PARSER\n", "# ==========================================================\n", "def fetch_sitemap_urls(sitemap_url: str, max_urls: int = 500, verify: bool = False) -> List[str]:\n", " \"\"\"Recursively downloads and extracts URLs from a sitemap.\"\"\"\n", " resp = requests.get(sitemap_url, timeout=15, verify=verify)\n", " resp.raise_for_status()\n", " root = ET.fromstring(resp.content)\n", " urls = []\n", " ns = {}\n", " \n", " if root.tag.startswith(\"{\"):\n", " ns_uri = root.tag.split(\"}\")[0].strip(\"{\")\n", " ns = {\"ns\": ns_uri}\n", " \n", " if root.tag.lower().endswith(\"sitemapindex\"):\n", " locs = root.findall(\".//ns:loc\", ns) if ns else root.findall(\".//loc\")\n", " for loc in locs:\n", " child = loc.text.strip()\n", " try:\n", " urls.extend(fetch_sitemap_urls(child, max_urls=max_urls - len(urls), verify=verify))\n", " except Exception:\n", " continue\n", " if len(urls) >= max_urls:\n", " break\n", " else:\n", " locs = root.findall(\".//ns:loc\", ns) if ns else root.findall(\".//loc\")\n", " for loc in locs:\n", " urls.append(loc.text.strip())\n", " if len(urls) >= max_urls:\n", " break\n", " return urls\n", "\n", "\n", "# ==========================================================\n", "# 4. ASYNC METADATA SCRAPER\n", "# ==========================================================\n", "async def fetch_meta_data(session: aiohttp.ClientSession, url: str) -> Dict[str, str]:\n", " \"\"\"Fetches just the first 50KB of a page to extract Title and Description.\"\"\"\n", " try:\n", " async with session.get(url, timeout=4, ssl=False) as response:\n", " if response.status != 200:\n", " return {\"url\": url, \"text\": \"\"}\n", " \n", " html_chunk = await response.content.read(50000) \n", " soup = BeautifulSoup(html_chunk, 'html.parser')\n", " \n", " title = soup.title.string if soup.title else \"\"\n", " desc_tag = soup.find(\"meta\", attrs={\"name\": \"description\"})\n", " description = desc_tag[\"content\"] if desc_tag and \"content\" in desc_tag.attrs else \"\"\n", " \n", " # Combine them for the vectorizer\n", " rich_text = f\"{title} {description}\".strip()\n", " \n", " # Fallback: if no meta tags exist, fallback to the URL slug\n", " if len(rich_text) < 5:\n", " p = urllib.parse.urlparse(url).path.strip('/')\n", " rich_text = p.replace('-', ' ').replace('_', ' ').replace('/', ' ')\n", " \n", " return {\"url\": url, \"text\": rich_text}\n", " except Exception:\n", " return {\"url\": url, \"text\": \"\"}\n", "\n", "async def build_rich_sitemap(urls: List[str]) -> List[Dict[str, str]]:\n", " \"\"\"Concurrently fetches metadata for all URLs to eliminate network latency.\"\"\"\n", " connector = aiohttp.TCPConnector(limit=50) # Prevent overwhelming the target server\n", " async with aiohttp.ClientSession(connector=connector) as session:\n", " tasks = [fetch_meta_data(session, url) for url in urls]\n", " results = await asyncio.gather(*tasks)\n", " return results\n", "\n", "\n", "# ==========================================================\n", "# 5. TF-IDF VECTOR ROUTER\n", "# ==========================================================\n", "def rank_urls_by_metadata(rich_data: List[Dict[str, str]], query: str, top_k: int = 1) -> List[str]:\n", " \"\"\"Uses TF-IDF to route the query based on the scraped Titles and Descriptions.\"\"\"\n", " valid_data = [item for item in rich_data if item[\"text\"]]\n", " if not valid_data: return []\n", " \n", " documents = [item[\"text\"] for item in valid_data]\n", " urls = [item[\"url\"] for item in valid_data]\n", " \n", " vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000, stop_words='english', lowercase=True)\n", " X = vectorizer.fit_transform(documents + [query])\n", " \n", " sims = cosine_similarity(X[-1], X[:-1]).reshape(-1)\n", " valid_indices = [i for i in sims.argsort()[::-1] if sims[i] > 0.0]\n", " top_idx = valid_indices[:top_k]\n", " \n", " return [urls[i] for i in top_idx]\n", "\n", "\n", "# ==========================================================\n", "# 6. MASTER EXECUTION FUNCTION\n", "# ==========================================================\n", "async def run_smart_rag(sitemap_url: str, user_query: str):\n", " print(f\"\\n--- 🚀 STARTING JOB: '{user_query}' ---\")\n", " \n", " # Step 1: Fetch Sitemap\n", " print(f\"Fetching sitemap: {sitemap_url}\")\n", " urls = fetch_sitemap_urls(sitemap_url, max_urls=300, verify=False)\n", " print(f\"Discovered {len(urls)} URLs. Extracting metadata asynchronously...\")\n", " \n", " # Step 2: Build Rich Sitemap (Async)\n", " rich_sitemap = await build_rich_sitemap(urls)\n", " \n", " # Step 3: Query expansion + TF-IDF Routing\n", " expanded_query = expand_query_with_synonyms(user_query)\n", " print(\"Routing query using TF-IDF Matrix...\")\n", " best_urls = rank_urls_by_metadata(rich_sitemap, expanded_query, top_k=1)\n", " \n", " if not best_urls:\n", " print(f\"❌ Could not find any URLs matching the keywords in: '{expanded_query}'\")\n", " return\n", " \n", " target_url = best_urls[0]\n", " print(f\"✅ Best Match Found: {target_url}\")\n", " \n", " # Step 4: Deep Scrape the Chosen Page\n", " print(\"Scraping full page content...\")\n", " try:\n", " html = requests.get(target_url, timeout=10, verify=False).text\n", " soup = BeautifulSoup(html, 'html.parser')\n", " \n", " # Strip boilerplate\n", " for tag in soup([\"nav\", \"footer\", \"script\", \"style\", \"aside\", \"header\"]):\n", " tag.decompose()\n", " \n", " page_text = soup.get_text(separator=' ', strip=True)[:4000] \n", " except Exception as e:\n", " print(f\"Failed to scrape the webpage: {e}\")\n", " return\n", "\n", " # Step 5: LLM Generation\n", " print(\"Generating Final Answer via LLM...\")\n", " \n", " # Notice we pass the ORIGINAL user_query here, not the expanded one\n", " final_prompt = f\"\"\"\n", " You are an expert developer assistant. Answer the user's question using ONLY the provided documentation text below. \n", " If the answer is not contained in the text, explicitly state \"I don't know based on this documentation.\"\n", " \n", " User Question: {user_query}\n", " \n", " Documentation Text from {target_url}:\n", " {page_text}\n", " \"\"\"\n", " \n", " try:\n", " answer = llm.invoke(final_prompt).content\n", " print(\"\\n================ FINAL ANSWER ================\\n\")\n", " print(answer.strip())\n", " print(\"\\n==============================================\")\n", " except Exception as e:\n", " print(f\"LLM Generation Failed: {e}\")\n", "\n", "\n", "# ==========================================================\n", "# TRIGGER THE ASYNC LOOP\n", "# ==========================================================\n", "\n", "# Test Data\n", "test_sitemap = \"https://docs.langchain.com/sitemap.xml\"\n", "test_query = \"Can you give an illustrative case using LangChain?\"\n", "\n", "# Run the Async App\n", "await run_smart_rag(test_sitemap, test_query)" ] }, { "cell_type": "code", "execution_count": 60, "id": "c45d5fad", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Initializing LLM...\n", "\n", "--- 🚀 STARTING JOB: 'give me a sample case with langchain' ---\n", "Generating Multi-Query JSON...\n", "🧠 Hunting with 6 variations:\n", " 1. langchain example case\n", " 2. sample langchain use case\n", " 3. langchain case study\n", " 4. langchain practical example\n", " 5. example of langchain\n", " 6. give me a sample case with langchain\n", "\n", "Fetching sitemap: https://docs.langchain.com/sitemap.xml (Limit: 2500)\n", "Discovered 836 URLs. Extracting metadata asynchronously...\n", "\n", "Routing queries through TF-IDF Matrix and stacking results...\n", "🏆 Winning URL selected by ensemble vote: https://docs.langchain.com/langsmith/administration-overview\n", " (Received 5 out of 6 possible votes)\n", "\n", "Scraping full page content...\n", "Generating Final Answer via LLM...\n", "\n", "================ FINAL ANSWER ================\n", "\n", "I don't know based on this documentation.\n", "\n", "The provided text does not contain any sample cases or code examples related to LangChain or LangSmith. It primarily discusses the organizational structure and features of LangSmith, including organizations, workspaces, and applications.\n", "\n", "==============================================\n" ] } ], "source": [ "import os\n", "import re\n", "import json\n", "import asyncio\n", "import aiohttp\n", "import urllib3\n", "import requests\n", "import urllib.parse\n", "import xml.etree.ElementTree as ET\n", "from typing import List, Dict\n", "from bs4 import BeautifulSoup\n", "from collections import Counter\n", "from dotenv import load_dotenv\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint\n", "\n", "# Suppress SSL warnings\n", "urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)\n", "requests.packages.urllib3.disable_warnings()\n", "\n", "# ==========================================================\n", "# 1. ENVIRONMENT & LLM SETUP\n", "# ==========================================================\n", "current_dir = os.getcwd()\n", "possible_env_paths = [\n", " os.path.join(current_dir, '.env'),\n", " os.path.join(os.path.dirname(current_dir), '.env'),\n", "]\n", "for path in possible_env_paths:\n", " if os.path.exists(path):\n", " load_dotenv(dotenv_path=path)\n", " break\n", "\n", "hf_token = os.getenv('HF_TOKEN') or os.getenv('HUGGINGFACEHUB_API_TOKEN')\n", "if not hf_token:\n", " raise ValueError(\"Missing Hugging Face token! Please set HF_TOKEN in your .env file.\")\n", "os.environ.setdefault('HUGGINGFACEHUB_API_TOKEN', hf_token)\n", "\n", "HF_MODEL = \"Qwen/Qwen2.5-7B-Instruct\"\n", "\n", "print(\"Initializing LLM...\")\n", "endpoint = HuggingFaceEndpoint(\n", " repo_id=HF_MODEL,\n", " task=\"text-generation\",\n", " max_new_tokens=512,\n", " temperature=0.1,\n", " top_p=0.95\n", ")\n", "llm = ChatHuggingFace(llm=endpoint, model_id=HF_MODEL)\n", "\n", "\n", "# ==========================================================\n", "# 2. MULTI-QUERY GENERATOR (JSON)\n", "# ==========================================================\n", "def generate_multi_queries(user_query: str) -> List[str]:\n", " \"\"\"Uses the LLM to generate 5 query variations and strictly returns a JSON array.\"\"\"\n", " prompt = f\"\"\"\n", " You are an AI search assistant. Your job is to generate 5 different rephrasings of the user's query to help a search engine find the best documentation page. you can use synonyms, can be short also\n", " \n", " User Query: \"{user_query}\"\n", " \n", " CRITICAL INSTRUCTION: Return ONLY a valid JSON array of 5 strings. Do not include markdown formatting, code blocks, or explanations.\n", " \n", " Example output format:\n", " [\"how to use langchain\", \"langchain tutorial python\", \"langchain basic example code\", \"getting started with langchain\", \"langchain implementation guide\"]\n", " \"\"\"\n", " \n", " try:\n", " response = llm.invoke(prompt).content.strip()\n", " \n", " # Clean up Markdown if the LLM disobeys the instruction\n", " if response.startswith(\"```json\"):\n", " response = response[7:]\n", " if response.startswith(\"```\"):\n", " response = response[3:]\n", " if response.endswith(\"```\"): response = response[:-3]\n", " \n", " queries = json.loads(response.strip())\n", " \n", " # Ensure the original query is always included in the hunt\n", " if user_query not in queries:\n", " queries.append(user_query)\n", " \n", " return queries\n", " except Exception as e:\n", " print(f\"⚠️ JSON Parse Error (falling back to original query): {e}\\nRaw Output: {response}\")\n", " return [user_query]\n", "\n", "\n", "# ==========================================================\n", "# 3. SITEMAP & METADATA LOGIC\n", "# ==========================================================\n", "def fetch_sitemap_urls(sitemap_url: str, max_urls: int = 2500, verify: bool = False) -> List[str]:\n", " resp = requests.get(sitemap_url, timeout=15, verify=verify)\n", " resp.raise_for_status()\n", " \n", " try:\n", " root = ET.fromstring(resp.content)\n", " except ET.ParseError:\n", " print(f\"⚠️ XML Parse Error: The URL {sitemap_url} returned HTML, not a valid XML sitemap.\")\n", " return []\n", " \n", " urls = []\n", " ns = {}\n", " if root.tag.startswith(\"{\"):\n", " ns = {\"ns\": root.tag.split(\"}\")[0].strip(\"{\")}\n", " \n", " if root.tag.lower().endswith(\"sitemapindex\"):\n", " locs = root.findall(\".//ns:loc\", ns) if ns else root.findall(\".//loc\")\n", " for loc in locs:\n", " try:\n", " urls.extend(fetch_sitemap_urls(loc.text.strip(), max_urls=max_urls - len(urls), verify=verify))\n", " except Exception:\n", " continue\n", " if len(urls) >= max_urls: break\n", " else:\n", " locs = root.findall(\".//ns:loc\", ns) if ns else root.findall(\".//loc\")\n", " for loc in locs:\n", " urls.append(loc.text.strip())\n", " if len(urls) >= max_urls: break\n", " return urls\n", "\n", "async def fetch_meta_data(session: aiohttp.ClientSession, url: str) -> Dict[str, str]:\n", " try:\n", " async with session.get(url, timeout=4, ssl=False) as response:\n", " if response.status != 200: return {\"url\": url, \"text\": \"\"}\n", " html_chunk = await response.content.read(50000) \n", " soup = BeautifulSoup(html_chunk, 'html.parser')\n", " \n", " title = soup.title.string if soup.title else \"\"\n", " desc_tag = soup.find(\"meta\", attrs={\"name\": \"description\"})\n", " desc = desc_tag[\"content\"] if desc_tag and \"content\" in desc_tag.attrs else \"\"\n", " \n", " rich_text = f\"{title} {desc}\".strip()\n", " if len(rich_text) < 5:\n", " rich_text = urllib.parse.urlparse(url).path.replace('-', ' ').replace('/', ' ')\n", " return {\"url\": url, \"text\": rich_text}\n", " except Exception:\n", " return {\"url\": url, \"text\": \"\"}\n", "\n", "async def build_rich_sitemap(urls: List[str]) -> List[Dict[str, str]]:\n", " connector = aiohttp.TCPConnector(limit=50) \n", " async with aiohttp.ClientSession(connector=connector) as session:\n", " tasks = [fetch_meta_data(session, url) for url in urls]\n", " return await asyncio.gather(*tasks)\n", "\n", "\n", "# ==========================================================\n", "# 4. TF-IDF VECTOR ROUTER\n", "# ==========================================================\n", "def rank_urls_by_metadata(rich_data: List[Dict[str, str]], query: str, top_k: int = 1) -> List[str]:\n", " valid_data = [item for item in rich_data if item[\"text\"]]\n", " if not valid_data: return []\n", " \n", " documents = [item[\"text\"] for item in valid_data]\n", " urls = [item[\"url\"] for item in valid_data]\n", " \n", " vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000, stop_words='english', lowercase=True)\n", " X = vectorizer.fit_transform(documents + [query])\n", " \n", " sims = cosine_similarity(X[-1], X[:-1]).reshape(-1)\n", " valid_indices = [i for i in sims.argsort()[::-1] if sims[i] > 0.0]\n", " return [urls[i] for i in valid_indices[:top_k]]\n", "\n", "\n", "# ==========================================================\n", "# 5. MASTER EXECUTION FUNCTION\n", "# ==========================================================\n", "async def run_smart_rag(sitemap_url: str, user_query: str):\n", " print(f\"\\n--- 🚀 STARTING JOB: '{user_query}' ---\")\n", " \n", " # 1. Generate 5 variations\n", " print(\"Generating Multi-Query JSON...\")\n", " query_variations = generate_multi_queries(user_query)\n", " print(f\"🧠 Hunting with {len(query_variations)} variations:\")\n", " for i, q in enumerate(query_variations, 1):\n", " print(f\" {i}. {q}\")\n", " \n", " # 2. Fetch Sitemap & Metadata\n", " print(f\"\\nFetching sitemap: {sitemap_url} (Limit: 2500)\")\n", " urls = fetch_sitemap_urls(sitemap_url, max_urls=2500, verify=False)\n", " if not urls: return\n", " print(f\"Discovered {len(urls)} URLs. Extracting metadata asynchronously...\")\n", " rich_sitemap = await build_rich_sitemap(urls)\n", " \n", " # 3. Stack the Output (Ensemble Voting)\n", " print(\"\\nRouting queries through TF-IDF Matrix and stacking results...\")\n", " candidate_urls = []\n", " for q in query_variations:\n", " best_match = rank_urls_by_metadata(rich_sitemap, q, top_k=1)\n", " if best_match:\n", " candidate_urls.extend(best_match)\n", " \n", " if not candidate_urls:\n", " print(f\"❌ Could not find any URLs matching any of the variations.\")\n", " return\n", " \n", " # Count the votes to find the absolute best URL\n", " url_votes = Counter(candidate_urls)\n", " target_url = url_votes.most_common(1)[0][0]\n", " \n", " print(f\"🏆 Winning URL selected by ensemble vote: {target_url}\")\n", " print(f\" (Received {url_votes[target_url]} out of {len(query_variations)} possible votes)\")\n", " \n", " # 4. Scrape the Target Page\n", " print(\"\\nScraping full page content...\")\n", " try:\n", " html = requests.get(target_url, timeout=10, verify=False).text\n", " soup = BeautifulSoup(html, 'html.parser')\n", " for tag in soup([\"nav\", \"footer\", \"script\", \"style\", \"aside\", \"header\"]):\n", " tag.decompose()\n", " page_text = soup.get_text(separator=' ', strip=True)[:4000] \n", " except Exception as e:\n", " print(f\"Failed to scrape the webpage: {e}\")\n", " return\n", "\n", " # 5. LLM Generation\n", " print(\"Generating Final Answer via LLM...\")\n", " final_prompt = f\"\"\"\n", " You are an expert developer assistant. Answer the user's question using ONLY the provided documentation text below. \n", " If the answer is not contained in the text, explicitly state \"I don't know based on this documentation.\"\n", " Provide clear code examples if they exist in the text.\n", " \n", " User Question: {user_query}\n", " \n", " Documentation Text from {target_url}:\n", " {page_text}\n", " \"\"\"\n", " \n", " try:\n", " answer = llm.invoke(final_prompt).content\n", " print(\"\\n================ FINAL ANSWER ================\\n\")\n", " print(answer.strip())\n", " print(\"\\n==============================================\")\n", " except Exception as e:\n", " print(f\"LLM Generation Failed: {e}\")\n", "\n", "# ==========================================================\n", "# TRIGGER THE JUPYTER ASYNC LOOP\n", "# ==========================================================\n", "# Point it to the pure XML API reference that actually contains code\n", "test_sitemap = \"https://docs.langchain.com/sitemap.xml\"\n", "test_query = \"give me a sample case with langchain\"\n", "\n", "# Use await to avoid the asyncio RuntimeWarning!\n", "await run_smart_rag(test_sitemap, test_query)" ] }, { "cell_type": "code", "execution_count": 62, "id": "d3f678e8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Title: 12 real-world LangChain usecases - Educative\n", "URL: www.educative.io/blog/langchain-usecases\n", "Snippet: LangChain enables developers to build powerful LLM applications across real-world scenarios, from chatbots with memory and private document Q&A to research bots, legal summarization, and workflow automation. This guide explores 12 LangChain usecases that demonstrate how developers can move beyond basic prompts into production-ready systems.\n", "--------------------------------------------------\n", "Title: GitHub - alphasecio/langchain-examples: A collection of apps powered by ...\n", "URL: github.com/alphasecio/langchain-examples\n", "Snippet: This repository contains a collection of apps powered by LangChain. LangChain is an open-source framework created to aid the development of applications leveraging the power of large language models (LLMs). It can be used for chatbots, text summarisation, data generation, code understanding, question answering, evaluation, and more.\n", "--------------------------------------------------\n", "Title: Case studies - Docs by LangChain\n", "URL: docs.langchain.com/oss/python/langgraph/case-studies\n", "Snippet: Case studies Copy page https://docs.langchain.com/llms.txt This list of companies using LangGraph and their success stories is compiled from public sources. If your company uses LangGraph, we'd love for you to share your story and add it to the list.\n", "--------------------------------------------------\n", "Title: 7+ LangChain Use Cases and Real-World Example - Designveloper\n", "URL: www.designveloper.com/blog/langchain-use-cases/\n", "Snippet: Discover the top LangChain use cases. Explore real-world examples of LangChain enterprise use cases, from AI chatbots and RAG.\n", "--------------------------------------------------\n", "Title: LangChain Use Cases: 15+ Real-World Applications in 2026\n", "URL: www.spaceo.ai/blog/langchain-use-cases/\n", "Snippet: Explore 15+ LangChain use cases with real-world examples across industries, and see how it powers production-grade AI applications in this guide.\n", "--------------------------------------------------\n", "Title: Langchain Use Cases - Medium\n", "URL: medium.com/@amit25173/langchain-use-cases-d4477ae6e077\n", "Snippet: We'll explore various use cases, backed by practical applications and code examples, to give you a clear understanding of how you can leverage Langchain in your projects.\n", "--------------------------------------------------\n", "Title: Real-world LangChain use cases: A developer's journey into production ...\n", "URL: learningdaily.dev/real-world-langchain-use-cases-a-developers-journey-into-production-grade-llm-apps-d45858e0a8ff\n", "Snippet: LangChain quickly became the framework that turned toy projects into real applications. Now, after working on several LLM-powered tools across content, research, and automation use cases, I've come to see LangChain not just as a tool, but as a mindset shift. It lets you treat language models as dynamic agents that can reason, remember, and act.\n", "--------------------------------------------------\n", "Title: LangChain Tutorial - GeeksforGeeks\n", "URL: www.geeksforgeeks.org/data-science/langchain-tutorial/\n", "Snippet: LangChain is a framework that makes it easier to build applications using large language models (LLMs) by connecting them with data, tools and APIs. It helps developers move beyond simple text generation and create intelligent workflows.\n", "--------------------------------------------------\n", "Title: 8 Use Cases of LangChain - Airbyte\n", "URL: airbyte.com/data-engineering-resources/langchain-use-cases\n", "Snippet: Learn about LangChain use cases and how this AI technology is revolutionizing the process of building LLM applications.\n", "--------------------------------------------------\n", "Title: Langchain Tools and Agents use cases with examples\n", "URL: telestreak.com/tech/langchain-tools-agents-with-examples/\n", "Snippet: In conclusion, LangChain's tools and agents represent a significant leap forward in the development of AI applications. By combining robust building blocks with intelligent orchestrators, LangChain empowers developers to create dynamic, context-aware, and scalable solutions that can transform industries and enhance user experiences.\n", "--------------------------------------------------\n" ] } ], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "\n", "def duckduckgo_search(query):\n", " url = \"https://html.duckduckgo.com/html/\"\n", " \n", " # A standard browser User-Agent prevents DuckDuckGo from blocking the script\n", " headers = {\n", " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36\"\n", " }\n", " \n", " params = {\n", " \"q\": query\n", " }\n", " \n", " response = requests.get(url, headers=headers, params=params)\n", " \n", " if response.status_code != 200:\n", " print(f\"Failed to retrieve results. Status code: {response.status_code}\")\n", " return\n", "\n", " soup = BeautifulSoup(response.text, 'html.parser')\n", " \n", " # Find all individual search result containers\n", " results = soup.find_all('div', class_='result')\n", " \n", " for result in results:\n", " title_tag = result.find('a', class_='result__a')\n", " snippet_tag = result.find('a', class_='result__snippet')\n", " url_tag = result.find('a', class_='result__url')\n", " \n", " if title_tag and url_tag:\n", " title = title_tag.text.strip()\n", " # Extract the display URL directly from the text (avoids DDG redirect links)\n", " clean_url = url_tag.text.strip() \n", " snippet = snippet_tag.text.strip() if snippet_tag else \"No snippet available\"\n", " \n", " print(f\"Title: {title}\")\n", " print(f\"URL: {clean_url}\")\n", " print(f\"Snippet: {snippet}\")\n", " print(\"-\" * 50)\n", "\n", "# Example usage\n", "if __name__ == \"__main__\":\n", " duckduckgo_search(\"give me a sample case with langchain\")" ] }, { "cell_type": "code", "execution_count": 73, "id": "926a44cc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Crawling: https://github.com/langchain-ai/deepagents ...\n", "\n", "Generating answer...\n", "DeepAgent, specifically referred to as Deep Agents in the provided text, is an open-source agent harness designed to facilitate the development and deployment of AI agents. It is described as an \"opinionated agent that runs out of the box,\" meaning it comes with default configurations that are tuned for handling long-horizon, multi-step tasks. However, it is also highly extensible, allowing users to override or replace any part of the agent without needing to fork the project.\n", "\n", "Key features of Deep Agents include:\n", "\n", "- **Model-agnostic**: It works with any language model (LLM) that supports tool calling, such as frontier, open-weight, or local models.\n", "- **Production-ready**: Built on top of LangGraph, which provides streaming, persistence, checkpointing, and first-class tracing, evaluation, and deployment via LangSmith.\n", "- **Extensibility**: Users can extend, override, or replace any piece of the agent.\n", "- **Sub-agents**: Allows delegation of tasks to agents with isolated context windows.\n", "- **Filesystem**: Supports reading, writing, editing, or searching over pluggable local, sandboxed, or remote backends.\n", "- **Context Management**: Summarizes long threads and offloads tool outputs to disk.\n", "- **Shell Access**: Runs commands in a sandbox of choice.\n", "- **Persistent Memory**: Uses pluggable state and store backends for cross-session recall.\n", "- **Human-in-the-loop**: Allows approval, editing, or rejection of tool calls before execution.\n", "- **Skills and Tools**: Reusable behaviors and functions that the agent can load on demand.\n", "\n", "Here is a sample quickstart code snippet provided in the text:\n", "\n", "```python\n", "from deepagents import create_deep_agent\n", "\n", "agent = create_deep_agent(\n", " model=\"openai:gpt-5.5\",\n", " tools=[my_custom_tool],\n", " system_prompt=\"You are a research assistant.\"\n", ")\n", "\n", "result = agent.invoke({\n", " \"messages\": \"Research LangGraph and write a summary\"\n", "})\n", "\n", "# The agent can plan, read/write files, and manage its own context.\n", "# Add your own tools, swap models, customize prompts, configure sub-agents, and more.\n", "```\n", "\n", "This code demonstrates how to create and use a DeepAgent instance to perform a task, such as researching a topic and writing a summary.\n" ] } ], "source": [ "import os\n", "import requests\n", "from bs4 import BeautifulSoup\n", "from dotenv import load_dotenv\n", "from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint\n", "\n", "# Setup LLM (same as before)\n", "load_dotenv()\n", "hf_token = os.getenv('HF_TOKEN') or os.getenv('HUGGINGFACEHUB_API_TOKEN')\n", "os.environ.setdefault('HUGGINGFACEHUB_API_TOKEN', hf_token)\n", "\n", "endpoint = HuggingFaceEndpoint(\n", " repo_id=\"Qwen/Qwen2.5-7B-Instruct\",\n", " task=\"text-generation\",\n", " max_new_tokens=1000,\n", " temperature=0.1\n", ")\n", "llm = ChatHuggingFace(llm=endpoint, model_id=\"Qwen/Qwen2.5-7B-Instruct\")\n", "\n", "def get_first_result_url(query: str) -> str:\n", " \"\"\"Gets the URL of the top result from DuckDuckGo HTML version.\"\"\"\n", " url = \"https://html.duckduckgo.com/html/\"\n", " headers = {\"User-Agent\": \"Mozilla/5.0\"}\n", " data = {\"q\": query}\n", " \n", " response = requests.post(url, headers=headers, data=data)\n", " soup = BeautifulSoup(response.text, \"html.parser\")\n", " \n", " # Grab the first link\n", " first_result = soup.find('a', class_='result__a')\n", " if first_result:\n", " return first_result['href']\n", " return None\n", "\n", "def crawl_website(url: str) -> str:\n", " \"\"\"Scrapes the main text content from a given URL.\"\"\"\n", " print(f\"Crawling: {url} ...\")\n", " try:\n", " response = requests.get(url, timeout=10, headers={\"User-Agent\": \"Mozilla/5.0\"})\n", " soup = BeautifulSoup(response.text, \"html.parser\")\n", " \n", " # Remove junk\n", " for tag in soup([\"nav\", \"footer\", \"script\", \"style\", \"aside\", \"header\"]):\n", " tag.decompose()\n", " \n", " return soup.get_text(separator=' ', strip=True)[:5000]\n", " except Exception as e:\n", " return f\"Error crawling {url}: {e}\"\n", "\n", "def search_and_crawl(query: str):\n", " # 1. Search\n", " target_url = get_first_result_url(query)\n", " if not target_url:\n", " print(\"No results found.\")\n", " return\n", "\n", " # 2. Crawl\n", " content = crawl_website(target_url)\n", " \n", " # 3. Generate Answer\n", " prompt = f\"Answer the user query based on the following text.\\n\\nQuery: {query}\\n\\nText: {content[:3000]}\"\n", " print(\"\\nGenerating answer...\")\n", " print(llm.invoke(prompt).content)\n", "\n", "# Run it\n", "search_and_crawl(\"what is deepagent? with sample text\")" ] }, { "cell_type": "markdown", "id": "c5f668d1", "metadata": {}, "source": [ "## Run Python Compile Tests\n", "\n", "Compile the project Python files to verify syntax and detect parse-time errors." ] }, { "cell_type": "markdown", "id": "61c8f1cb", "metadata": {}, "source": [ "## Self-contained Run and Test Cell\n", "\n", "This cell defines the project files and performs compilation in one block, so it can run independently." ] }, { "cell_type": "code", "execution_count": null, "id": "0b6516c3", "metadata": {}, "outputs": [ { "ename": "ImportError", "evalue": "cannot import name 'Chroma' from 'chromadb' (c:\\Users\\HP\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\chromadb\\__init__.py)", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mImportError\u001b[39m Traceback (most recent call last)", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[34;01mchromadb\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Chroma\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[34;01mlangchain_core\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseLanguageModel\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[34;01mlangchain_core\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mprompts\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m PromptTemplate\n", "\u001b[31mImportError\u001b[39m: cannot import name 'Chroma' from 'chromadb' (c:\\Users\\HP\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\chromadb\\__init__.py)" ] } ], "source": [ "from chromadb import Chroma\n", "from langchain_core import BaseLanguageModel\n", "from langchain_core.prompts import PromptTemplate\n", "import chromadb\n", "\n", "# Example 1: create an in-memory Chroma client and collection\n", "\n", "client = chromadb.Client()\n", "collection = client.get_or_create_collection(name=\"demo_examples\")\n", "\n", "# Example 2: add documents\n", "collection.add(\n", " ids=[\"1\", \"2\", \"3\"],\n", " documents=[\n", " \"Chroma is a vector database.\",\n", " \"Paris is the capital of France.\",\n", " \"World War II ended in 1945.\",\n", " ],\n", " metadatas=[\n", " {\"topic\": \"db\"},\n", " {\"topic\": \"geo\"},\n", " {\"topic\": \"history\"},\n", " ],\n", ")\n", "\n", "# Example 3: inspect collection size\n", "print(\"Count:\", collection.count())\n", "\n", "# Example 4: similarity query\n", "result = collection.query(\n", " query_texts=[\"What is Chroma used for?\"],\n", " n_results=2\n", ")\n", "print(\"Query result:\", result)\n", "\n", "# Example 5: fetch by id\n", "print(\"Get id=2:\", collection.get(ids=[\"2\"]))\n", "\n", "# Example 6: update a document\n", "collection.update(\n", " ids=[\"3\"],\n", " documents=[\"World War II ended in 1945 after Germany surrendered.\"]\n", ")\n", "\n", "# Example 7: delete a document\n", "collection.delete(ids=[\"1\"])\n", "print(\"Count after delete:\", collection.count())" ] }, { "cell_type": "markdown", "id": "1f5fcbaf", "metadata": {}, "source": [ "## Load Hugging Face LLM and Respond to a Question\n", "\n", "This cell uses the Hugging Face Inference endpoint and `ChatHuggingFace` wrapper to answer a sample question without relying on the document RAG chain." ] }, { "cell_type": "code", "execution_count": null, "id": "cc9ed29c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'role': 'assistant', 'reasoning': 'The user wants a one-sentence description of the image.\\n\\n1. **Identify the main subject:** The Statue of Liberty is the most prominent foreground element, standing on its pedestal on an island (Liberty Island).\\n2. **Identify the background:** The New York City skyline is visible across the water, featuring skyscrapers like the Empire State Building.\\n3. **Identify the setting:** It\\'s a body of water (New York Harbor) on a clear day with blue skies.\\n4. **Drafting the sentence:**\\n * *Draft 1:* The Statue of Liberty stands on an island in the water with the New York City skyline in the background. (A bit simple)\\n * *Draft 2:* A view of the Statue of Liberty standing tall on its pedestal, overlooking the blue water of New York Harbor with the city\\'s skyscrapers rising in the distance. (Better, more descriptive)\\n * *Draft 3:* The Statue of Liberty stands prominently on its island base in the foreground, set against a backdrop of the New York City skyline and blue water under a clear sky. (Good balance of detail)\\n\\n5. **Refining for conciseness and impact:** \"The Statue of Liberty stands prominently on its island pedestal in the foreground, overlooking the blue waters of New York Harbor with the city\\'s iconic skyline rising in the background.\"\\n\\n6. **Final Polish:** Let\\'s make it punchy. \"The Statue of Liberty stands majestically on its island base in the foreground, overlooking the blue waters of the harbor with the New York City skyline visible in the distance.\"\\n\\nLet\\'s stick to a clear, descriptive sentence.\\n\\n\"The Statue of Liberty stands prominently on its stone pedestal in the foreground, overlooking the blue waters of New York Harbor with the city\\'s dense skyline rising in the background.\"\\n', 'content': \"\\n\\nThe Statue of Liberty stands prominently on its stone pedestal in the foreground, overlooking the blue waters of New York Harbor with the city's dense skyline rising in the background.\"}\n" ] } ], "source": [ "import os\n", "import requests\n", "\n", "API_URL = \"https://router.huggingface.co/v1/chat/completions\"\n", "headers = {\n", " \"Authorization\": f\"Bearer {os.environ['HF_TOKEN']}\",\n", "}\n", "\n", "def query(payload):\n", " response = requests.post(API_URL, headers=headers, json=payload)\n", " return response.json()\n", "\n", "response = query({\n", " \"messages\": [\n", " {\n", " \"role\": \"user\",\n", " \"content\": [\n", " {\n", " \"type\": \"text\",\n", " \"text\": \"Describe this image in one sentence.\"\n", " },\n", " {\n", " \"type\": \"image_url\",\n", " \"image_url\": {\n", " \"url\": \"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg\"\n", " }\n", " }\n", " ]\n", " }\n", " ],\n", " \"model\": \"Qwen/Qwen3.6-27B:featherless-ai\"\n", "})\n", "\n", "print(response[\"choices\"][0][\"message\"])" ] }, { "cell_type": "code", "execution_count": null, "id": "757cfd5c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'role': 'assistant', 'content': 'Yes, France participated in World War II. Here’s a breakdown of the key aspects you asked about:\\n\\n### Did France participate?\\nYes, France was a major participant from the very first day of the war until its end.\\n\\n### Why?\\nFrance declared war on Germany in direct response to Germany\\'s invasion of Poland.\\n\\n- **The Treaty Guarantee:** France (along with the United Kingdom) had signed a military alliance with Poland, guaranteeing its independence. The most immediate reason was the German invasion of Poland on September 1, 1939.\\n- **The Failure of Appeasement:** In the late 1930s, France and Britain followed a policy of appeasement, allowing Hitler to remilitarize the Rhineland and annex Austria and the Sudetenland (part of Czechoslovakia) in the hope of avoiding another large-scale war. When Hitler broke his promise and invaded the rest of Czechoslovakia in March 1939, it became clear that his expansionist ambitions could not be contained peacefully. The guarantee to Poland was a line in the sand. When Germany crossed it, France had little choice but to honor its commitment and declare war.\\n\\n### When?\\n- **Declaration of War:** September 3, 1939, two days after Germany invaded Poland.\\n- **The \"Phony War\" (Drôle de guerre):** From September 1939 to May 1940, there was little actual fighting on the Franco-German border.\\n- **Battle of France:** May 10 – June 25, 1940. Germany invaded France and the Low Countries, leading to France\\'s defeat in six weeks.\\n- **Armistice & Occupation:** On June 22, 1940, France signed an armistice with Germany. The country was divided, with the north and west under direct German occupation, and a collaborationist French government established in Vichy, led by Marshal Philippe Pétain.\\n- **Liberation:** France was gradually liberated following the D-Day landings in Normandy on June 6, 1944, and the Allied invasion of southern France (Operation Dragoon) in August. Paris was liberated on August 25, 1944. The war in Europe ended on May 8, 1945, with France among the victorious Allied powers.\\n\\n### How?\\nFrench participation took two starkly contrasting forms:\\n\\n**1. The Defeat and Collaborationist State (Vichy France)**\\n- **Military Collapse:** The French military strategy, built around the static Maginot Line, was completely outflanked by the rapid, mobile German \"Blitzkrieg\" through the Ardennes Forest. The French army, considered one of the strongest in the world, was decisively defeated.\\n- **Vichy Government:** Marshal Pétain\\'s regime replaced the French Republic with the authoritarian \"French State,\" which actively collaborated with Nazi Germany, including in the persecution and deportation of Jews.\\n\\n**2. The Fighters (Free France and the Resistance)**\\n- **Free France (La France Libre):** Led by General Charles de Gaulle from exile in London, this government-in-exile refused to accept the armistice. They rallied French colonies to the Allied cause.\\n- **The Resistance (La Résistance):** Inside France, a disparate network of underground groups formed. They conducted espionage for the Allies, published clandestine newspapers, sabotaged German infrastructure, and set the stage for an uprising during the liberation.\\n- **The French Army Reintegrated:** Free French forces fought in North Africa, Italy, and were instrumental in the liberation of their own country. By the end of the war, France had over 1.3 million men under arms, making it a major Allied power again.\\n\\n### Died?\\nFrance\\'s human losses were severe, estimated at **between 550,000 and 600,000 in total**. This can be broken down into:\\n\\n- **Military Deaths:** Approximately 210,000–220,000 soldiers killed. This includes the 92,000 who died in the six-week Battle of France in 1940 and those who died later fighting for the Axis, Allies, or as prisoners of war.\\n- **Civilian Deaths:** Approximately 390,000. This is a tragic mix of:\\n - **Deportation and Genocide:** Roughly **75,000 Jews**, including 11,000 children, were deported from France as part of the Holocaust. Only about 2,500 survived.\\n - **Allied Bombing:** Between 50,000 and 70,000 civilians were killed in Allied air raids targeting German positions and infrastructure in occupied France.\\n - **Massacres and Reprisals:** German forces carried out horrific massacres, such as at Oradour-sur-Glane (642 inhabitants murdered) and the killing of Maquis (Resistance) fighters.\\n - **Prisoners of War:** Roughly 40,000 of the 1.8 million French soldiers taken as POWs died in German camps.\\n - **Starvation and Hardship:** As part of the occupation\\'s economic exploitation, the German authorities requisitioned vast quantities of food, leading to malnutrition and a rise in mortality, particularly among the urban poor.', 'reasoning_content': 'Hmm, the user is asking about France\\'s participation in World War II, specifically four aspects: if they participated, why, when, how, and how many died. This is a straightforward historical question but requires a structured answer covering multiple dimensions. I need to break it down clearly. The user\\'s phrasing is a bit fragmented, so I\\'ll interpret it as a request for a comprehensive yet organized explanation.\\n\\nI can structure the response by directly addressing each part of the query in order. Starting with a clear \"yes\" for participation, then explaining the \"why\" (declaration after invasion of Poland), the \"when\" (1939-1945 timeline), the \"how\" in two phases (defeat and occupation/resistance), and finally the \"died\" part with a human cost breakdown. I\\'ll keep the language factual and avoid deep elaboration unless necessary, as the query seems to ask for concise facts. The response should flow naturally from one point to the next.'}\n" ] } ], "source": [ "import os\n", "import requests\n", "\n", "API_URL = \"https://router.huggingface.co/v1/chat/completions\"\n", "headers = {\n", " \"Authorization\": f\"Bearer {os.environ['HF_TOKEN']}\",\n", "}\n", "\n", "def query(payload):\n", " response = requests.post(API_URL, headers=headers, json=payload)\n", " return response.json()\n", "\n", "response = query({\n", " \"messages\": [\n", " {\n", " \"role\": \"user\",\n", " \"content\": \"did france paticipated in world war 2? why?when?how ?died\"\n", " }\n", " ],\n", " \"model\": \"deepseek-ai/DeepSeek-V4-Pro:novita\"\n", "})\n", "\n", "print(response[\"choices\"][0][\"message\"])" ] }, { "cell_type": "code", "execution_count": 53, "id": "2b9ea909", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Initializing HF endpoint model=deepseek-ai/DeepSeek-V4-Pro:novita task=conversational\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Direct model output:\n", "content='[\\n \"Provide an example use case with LangChain.\",\\n \"Can you give an illustrative case using LangChain?\",\\n \"Show me a sample scenario involving LangChain.\",\\n \"Demonstrate an example case with LangChain.\",\\n \"Please provide an example case that utilizes LangChain.\"\\n]' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 523, 'prompt_tokens': 25, 'total_tokens': 548}, 'model_name': 'deepseek-ai/DeepSeek-V4-Pro:novita', 'system_fingerprint': '', 'finish_reason': 'stop', 'logprobs': None} id='lc_run--019e4a82-f0e4-7c13-a70e-644ec6995e9d-0' tool_calls=[] invalid_tool_calls=[] usage_metadata={'input_tokens': 25, 'output_tokens': 523, 'total_tokens': 548}\n" ] } ], "source": [ "import sys\n", "import os\n", "import importlib\n", "\n", "sys.path.append(os.path.abspath(os.path.join(os.getcwd(), \"..\")))\n", "from src import app_hf\n", "\n", "importlib.reload(app_hf)\n", "llm = app_hf.get_llm()\n", "direct_response = llm.invoke(\"return json only, with 5 rephaseing prompt :'give a example case with langchain'\")\n", "print(\"Direct model output:\")\n", "print(direct_response)" ] }, { "cell_type": "code", "execution_count": 2, "id": "7d3a8056", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\HP\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "USER_AGENT environment variable not set, consider setting it to identify your requests.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "add_urls: stored ['https://en.wikipedia.org/wiki/Paris', 'https://www.britannica.com/place/Paris'] (count=2)\n", "answer_question: doc_dir=./my_docs urls=['https://en.wikipedia.org/wiki/Paris', 'https://www.britannica.com/place/Paris'] _URLS=['https://en.wikipedia.org/wiki/Paris', 'https://www.britannica.com/place/Paris']\n", "Loading content from 2 URL(s)...\n", " ✓ Loaded: https://en.wikipedia.org/wiki/Paris (via WebBaseLoader)\n", " ✓ Loaded: https://www.britannica.com/place/Paris (via WebBaseLoader)\n", "Processing 256 text chunks...\n", "Initializing Gemini embeddings...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\HP\\OneDrive\\Desktop\\fd\\src\\app_hf.py:445: LangChainDeprecationWarning: The class `Chroma` was deprecated in LangChain 0.2.9 and will be removed in 1.0. An updated version of the class exists in the `langchain-chroma package and should be used instead. To use it run `pip install -U `langchain-chroma` and import as `from `langchain_chroma import Chroma``.\n", " _VECTORSTORE = Chroma(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Answer: ❌ Error: 404 Client Error: Not Found for url: https://generativelanguage.googleapis.com/v1beta2/models/textembedding-gecko-001:embed?key=AIzaSyBxRTAUDyWrrrQL7JyLU0L8lbxlVqRZ5yA\n" ] } ], "source": [ "import sys\n", "import os\n", "import importlib\n", "\n", "sys.path.append(os.path.abspath(os.path.join(os.getcwd(), \"..\")))\n", "from src import app_hf\n", "\n", "importlib.reload(app_hf)\n", "\n", "urls = [\n", " \"https://en.wikipedia.org/wiki/Paris\",\n", " \"https://www.britannica.com/place/Paris\"\n", "]\n", "\n", "app_hf.add_urls(urls)\n", "question = \"did france paticipated in world war 2? why?when?how ?died\"\n", "answer = app_hf.answer_question(question)\n", "print(f'Answer: {answer}')" ] }, { "cell_type": "code", "execution_count": 3, "id": "e026c742", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "add_urls: stored ['https://en.wikipedia.org/wiki/Paris', 'https://www.britannica.com/place/Paris'] (count=2)\n", "URL state: {'urls': ['https://en.wikipedia.org/wiki/Paris', 'https://www.britannica.com/place/Paris'], 'rag_chain_set': False}\n", "Loading content from 2 URL(s)...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " ✓ Loaded: https://en.wikipedia.org/wiki/Paris (via WebBaseLoader)\n", " ✓ Loaded: https://www.britannica.com/place/Paris (via WebBaseLoader)\n", "Loaded docs: 2\n", "0 https://en.wikipedia.org/wiki/Paris 174715\n", "1 https://www.britannica.com/place/Paris 57\n" ] } ], "source": [ "import sys\n", "import os\n", "import importlib\n", "\n", "sys.path.append(os.path.abspath(os.path.join(os.getcwd(), \"..\")))\n", "from src import app_hf\n", "\n", "importlib.reload(app_hf)\n", "\n", "urls = [\n", " \"https://en.wikipedia.org/wiki/Paris\",\n", " \"https://www.britannica.com/place/Paris\"\n", "]\n", "\n", "app_hf.add_urls(urls)\n", "print(\"URL state:\", app_hf.get_url_state())\n", "\n", "docs = app_hf.load_documents_from_sources(\"./my_docs\", app_hf.get_url_state()[\"urls\"])\n", "print(\"Loaded docs:\", len(docs))\n", "for i, d in enumerate(docs[:3]):\n", " print(i, d.metadata.get(\"source\"), len(d.page_content))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 5 }