AkshayStark commited on
Commit
95f7828
·
1 Parent(s): fe6bb80

initial setup for scraper

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.env.example ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Environment variables (e.g., API keys)
2
+ OPENAI_API_KEY=
3
+ GOOGLE_API_KEY=
4
+ LANGCHAIN_TRACING_V2=true
5
+ LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
6
+ LANGCHAIN_API_KEY=
7
+ LANGCHAIN_PROJECT=
8
+ LANGCHAIN_RETRY_LIMIT=3
9
+ PLAYWRIGHT_DEFAULT_TIMEOUT=
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
name: Run Python script

on:
  push:
    branches:
      - develop

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        # v2 runs on the removed Node 12 runtime; v4 is the supported release.
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install Poetry
        run: pip install poetry

      - name: Install Dependencies
        run: poetry install --no-root

      - name: Log in to Hugging Face
        run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'

      - name: Deploy to Spaces
        run: poetry run gradio deploy
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ .env
4
+ .venv
Dockerfile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ FROM python:3.12
3
+
4
+
5
+ WORKDIR /app
6
+
7
+
8
+ RUN apt-get update && apt-get install -y \
9
+ libnss3 \
10
+ libnspr4 \
11
+ libdbus-1-3 \
12
+ libatk1.0-0 \
13
+ libatk-bridge2.0-0 \
14
+ libatspi2.0-0 \
15
+ libxcomposite1 \
16
+ libxdamage1 \
17
+ libxfixes3 \
18
+ libxrandr2 \
19
+ libgbm1 \
20
+ libxkbcommon0 \
21
+ libasound2 \
22
+ && rm -rf /var/lib/apt/lists/*
23
+
24
+
25
+ RUN pip install --no-cache-dir poetry
26
+
27
+
28
+ COPY pyproject.toml poetry.lock ./
29
+ COPY . .
30
+
31
+
32
+ RUN poetry install --no-root
33
+
34
+
35
+ RUN poetry run playwright install chromium
36
+ RUN poetry run playwright install-deps
37
+
38
+ EXPOSE 8000
39
+
40
+ CMD ["poetry", "run", "python", "server.py"]
api/researcher.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Handles API logic for calling scraper & LLM
2
+ from fastapi import APIRouter, HTTPException
3
+ from pydantic import BaseModel, HttpUrl
4
+ from llm_module.message import chain
5
+ from scraper import multiple_links_scraping
6
+ from typing import Any
7
+ import logging
8
+
9
+ logger = logging.getLogger('fastapi_cli')
10
+ api_router = APIRouter(prefix='/researcher', tags=['Researcher'])
11
+
12
class InputField(BaseModel):
    """Request body for POST /researcher/invoke."""

    url: str  # company website to scrape
    generic_message: str  # generic outreach message to personalize
15
+
16
class OutputField(BaseModel):
    """Response body: scraped company summary plus the tailored message."""

    company_summary: str
    customised_message: str
19
+
20
+
21
@api_router.post('/invoke', response_model=OutputField)
async def invoke_web_researcher(body: InputField):
    """
    Scrape the company page at ``body.url`` and run the LLM chain to produce
    a company summary and a personalized version of ``body.generic_message``.

    Raises:
        HTTPException: 500 when scraping or the LLM chain fails.
    """
    try:
        scraped_data = await multiple_links_scraping([body.url])
    except Exception as e:
        logger.error(f"Failed to scrape: {e}")
        raise HTTPException(status_code=500, detail="Internal Server Error")

    # multiple_links_scraping gathers with return_exceptions=True, so a failed
    # URL may come back as None or as the exception object itself — never feed
    # those into the LLM chain as if they were scraped content.
    scraped = scraped_data[0] if scraped_data else None
    if scraped is None or isinstance(scraped, BaseException):
        logger.error(f"Failed to scrape: {scraped!r}")
        raise HTTPException(status_code=500, detail="Internal Server Error")

    try:
        output = await chain.ainvoke({
            "generic_message": body.generic_message,
            "scraped_message": scraped
        })
        if output is None:
            raise ValueError("LLM chain returned None values")
        return OutputField(
            company_summary=output['summary_output'],
            customised_message=output['generated_message']
        )
    except Exception as e:
        logger.error(f"Failed to invoke the chain: {e}")
        raise HTTPException(status_code=500, detail="Internal Server Error")
docs/TODO.md ADDED
File without changes
docs/api_docs.md ADDED
File without changes
docs/experiment_results.md ADDED
File without changes
docs/scraping_notes.md ADDED
File without changes
llm_module/base.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Message customization logic
2
+ from dataclasses import dataclass
3
+ from typing import Any, TypeVar
4
+
5
+ from langchain.output_parsers import OutputFixingParser
6
+ from langchain.output_parsers.prompts import NAIVE_FIX_PROMPT
7
+ from langchain_core.output_parsers import StrOutputParser
8
+ from langchain_core.runnables import RunnableSerializable
9
+
10
+ from .config import settings
11
+ from .model import gpt4o
12
+
13
# Type variable kept for generic parser helpers.
# NOTE(review): T is currently unused in this module — confirm before removing.
T = TypeVar("T")


# Prompt messages for the extraction step: given scraped company data, produce
# a detailed company profile in Japanese. Consumed via
# ChatPromptTemplate.from_messages in llm_module.message; the
# {scraped_message} placeholder is filled with the scraper output.
extraction_messages = [
    (
        "system",
        """あなたは企業情報の分析に優れた専門家です。
        与えられた情報を元に、可能な限り詳細で有益な企業概要を**日本語**で作成してください。

        ### **概要に含めるべき内容:**
        1. **会社名とアイデンティティ**
        - 会社名、設立背景、歴史、特徴など。
        - もし背景情報が不足している場合、**可能な推測(例: 業界や関連事業の一般的な傾向)** を交えながら説明する。

        2. **コアビジネスとミッション**
        - 事業内容、主要サービス、企業の使命や目標。
        - もしミッションが明確でない場合、事業内容から推測できる可能性のある方向性を述べる。

        3. **主要プロジェクトやイニシアティブ**
        - 企業が取り組んでいる重要なプロジェクトや技術。
        - **具体的な情報がない場合でも、企業の業界における一般的な取り組みを補足し、可能な活動内容を示唆する。**

        4. **業界と市場での地位**
        - 企業が属する業界、市場での競争環境、主な競合企業。
        - **業界情報を活用して、可能な市場ポジションを考察する。**

        ### **重要なルール**
        - **与えられた情報を最大限活用し、不完全な場合もできる限り推測を交えて説明する。**
        - **決して情報を捏造せず、信頼できる情報源が不足している場合は、その旨を明示する。**
        - **単なる「情報が不足しています」という回答ではなく、読者が次に取るべき調査ステップを提案する。**
        """
    ),
    (
        "human",
        """以下の企業情報を分析し、可能な限り詳細な企業概要を **日本語** で作成してください。

        **提供された企業情報:**
        {scraped_message}

        **要求事項:**
        - 企業の業界、事業内容、競争環境を考慮した **具体的な説明** を記述すること。
        - 情報が不足している場合でも、 **業界の一般的な知識を活用し、可能な考察を加えること**。
        - もし詳細情報が不足している場合は、 **読者が追加情報を得るための推奨アクション(例: 公式サイトの確認、ニュースリリースのチェック)** を明示すること。
        """
    )
]



# Prompt messages for the generation step: combine the extracted summary
# ({summary_output}) with the caller's {generic_message} to produce a
# personalized Japanese sales message.
generation_messages = [
    (
        "system",
        """You are an expert in writing **personalized, compelling, and persuasive sales messages** in Japanese.

        Your task is to craft a customized message based on the company's business summary and the provided generic message.

        ### **Guidelines:**
        1. **Personalize the message** using the company's industry, goals, and key projects.
        2. **Make a strong connection** between the company's mission and the value proposition in the generic message.
        3. **Keep the core intent** of the generic message intact while making it highly relevant.
        4. **Ensure the message flows naturally** and does not feel robotic or forced.
        5. **If the company summary lacks details**, return the generic message **without forced personalization**.
        """
    ),
    (
        "human",
        """Generate a well-structured and persuasive sales message in **Japanese** based on the following details:

        **Company Summary:**
        {summary_output}

        **Generic Message:**
        {generic_message}

        ### **Instructions:**
        - Begin with: **"貴社は [business field/service] に従事しており、"**
        - Clearly highlight how the company's **projects and goals** align with the proposed solution.
        - Use a **smooth, engaging, and persuasive tone** without making it sound artificial.
        - If the summary lacks sufficient details, simply **return the generic message as it is** without attempting to force customization.
        """
    )
]
95
+
96
+
97
+
98
def build_output_fixing_parser(
    *, parser: Any, llm: Any = gpt4o
) -> OutputFixingParser[Any]:
    """Wrap *parser* so malformed LLM output is re-prompted for repair.

    Args:
        parser: The base output parser to wrap.
        llm: Model used to repair bad output; defaults to ``gpt4o``.

    Returns:
        An ``OutputFixingParser`` retrying up to
        ``settings.LANGCHAIN_RETRY_LIMIT`` times.
    """
    # The fix-up chain pipes the naive fixing prompt through the LLM and
    # reads the repaired text back as a plain string.
    fixer: RunnableSerializable[dict[str, Any], str] = (
        NAIVE_FIX_PROMPT | llm | StrOutputParser()
    )
    return OutputFixingParser(
        parser=parser,
        retry_chain=fixer,
        max_retries=settings.LANGCHAIN_RETRY_LIMIT,
    )
llm_module/config.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings
2
+ from dotenv import load_dotenv
3
+ import os
4
+
5
+ # Load environment variables from .env file
6
+ load_dotenv()
7
+
8
+ class Settings(BaseSettings):
9
+ OPENAI_API_KEY: str = os.getenv("OPENAI_API_KEY")
10
+ GOOGLE_API_KEY: str = os.getenv("GOOGLE_API_KEY")
11
+ LANGCHAIN_TRACING_V2: bool = os.getenv("LANGCHAIN_TRACING_V2", "false").lower() == "true"
12
+ LANGCHAIN_ENDPOINT: str = os.getenv("LANGCHAIN_ENDPOINT")
13
+ LANGCHAIN_API_KEY: str = os.getenv("LANGCHAIN_API_KEY")
14
+ LANGCHAIN_PROJECT: str = os.getenv("LANGCHAIN_PROJECT")
15
+ LANGCHAIN_RETRY_LIMIT: int = os.getenv("LANGCHAIN_RETRY_LIMIT")
16
+
17
+ # Initialize settings instance
18
+ settings = Settings()
llm_module/message.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
from typing import Any

from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

from .model import gpt4o
from .base import build_output_fixing_parser, extraction_messages, generation_messages

# NOTE(review): RunnableLambda and Any are imported but unused — confirm before removing.

logger = logging.getLogger("fastapi_cli")

# Self-fixing string parser: on malformed output it re-prompts gpt4o to repair it.
parser = build_output_fixing_parser(parser=StrOutputParser(), llm=gpt4o)

# Extraction chain: scraped company info -> Japanese company summary.
extraction_prompt = ChatPromptTemplate.from_messages(extraction_messages)
extraction_chain = (
    extraction_prompt
    | gpt4o
    | parser
)

# Generation chain: summary + generic message -> personalized sales message.
generation_prompt = ChatPromptTemplate.from_messages(generation_messages)
generation_chain = (
    generation_prompt
    | gpt4o
    | parser
)

# Define the LCEL-based pipeline. The input dict must provide
# 'scraped_message' and 'generic_message'; the output dict additionally
# carries 'summary_output' and 'generated_message' (consumed by
# api/researcher.py).
chain = (
    RunnablePassthrough.assign(summary_output=extraction_chain)  # Runs extraction_chain and adds its output
    .assign(generated_message=generation_chain)  # Runs generation_chain using the extracted summary
)
llm_module/model.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Load and interact with LLM
from langchain_openai import ChatOpenAI
from pydantic import SecretStr

from .config import settings

# Primary model used by the extraction/generation chains (llm_module.message).
gpt4o = ChatOpenAI(
    model="gpt-4o",
    api_key=SecretStr(settings.OPENAI_API_KEY),
    temperature=0,  # deterministic output for reproducible summaries
    max_tokens=8192,
    timeout=60,  # seconds
)

# Cheaper variant; not referenced by the chains yet.
gpt4o_mini = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=SecretStr(settings.OPENAI_API_KEY),
    temperature=0,
    max_tokens=8192,
    timeout=60,
)

# Reasoning model; not referenced by the chains yet.
# NOTE(review): temperature=1 here, unlike the others — presumably because
# o1-family models only accept the default temperature; confirm.
o1_mini = ChatOpenAI(
    model="o1-mini",
    api_key=SecretStr(settings.OPENAI_API_KEY),
    temperature=1,
    timeout=60,
)
log/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .logger import logger
2
+
3
+ __all__ = ['logger']
log/logger.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
import logging

# Configure root logging once for the whole application; the format carries
# timestamp, logger name, and severity for every record.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Shared module logger, re-exported via log/__init__.py.
logger = logging.getLogger(__name__)
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "web-researcher"
3
+ version = "0.1.0"
4
+ description = "Beta version for testing the feature of scraping company info and generating personalized messages."
5
+ authors = ["UjjTiw"]
6
+ license = "2WINS Inc."
7
+ readme = "README.md"
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "^3.12"
11
+ langchain = "^0.3.18"
12
+ langchain-openai = "^0.3.4"
13
+ playwright = "^1.50.0"
14
+ uvicorn = "^0.34.0"
15
+ fastapi = "^0.115.8"
16
+ pydantic = "^2.10.6"
17
+ pydantic-settings = "^2.7.1"
18
+ gradio = "^5.15.0"
19
+ langchain-gemini = "^0.1.1"
20
+ huggingface-hub = "^0.28.1"
21
+ httpx = "^0.28.1"
22
+
23
+
24
+ [build-system]
25
+ requires = ["poetry-core"]
26
+ build-backend = "poetry.core.masonry.api"
scraper/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .automate import multiple_links_scraping
2
+
3
+ __all__ = ['multiple_links_scraping']
scraper/automate.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import asyncio
3
+ from playwright.async_api import async_playwright, Error, BrowserContext
4
+ from .navigator import navigate_and_scrape
5
+
6
+ logger = logging.getLogger("fastapi_cli")
7
+
8
async def link_scraping(context: BrowserContext, url: str):
    """Scrape a single URL in a fresh page.

    Args:
        context: Browser context used to open a new page.
        url: Absolute URL to visit.

    Returns:
        The data from navigate_and_scrape, or None on a Playwright error.
    """
    page = await context.new_page()
    try:
        await page.goto(url, wait_until="domcontentloaded")
        return await navigate_and_scrape(page)
    except Error as e:
        logger.warning(f"Failed to scrape {url}: {e}")
        return None
    finally:
        # Close the page even when navigation/scraping raises — the original
        # only closed it on the success path, leaking pages on errors.
        await page.close()
21
+
22
async def multiple_links_scraping(target_urls: list[str]):
    """Scrape multiple links concurrently.

    Args:
        target_urls: Absolute URLs to scrape.

    Returns:
        One entry per input URL, in order: the scraped data dict, or None
        when that URL failed (instead of leaking raw exception objects to
        callers, as gather(return_exceptions=True) otherwise would).
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,  # headless required for production/containers
            args=["--no-sandbox", "--disable-setuid-sandbox"]
        )
        try:
            context = await browser.new_context()
            tasks = [link_scraping(context, url) for url in target_urls]
            results = await asyncio.gather(*tasks, return_exceptions=True)
        finally:
            # Always release the browser, even if gather itself raises.
            await browser.close()

    # Normalize unexpected exceptions to None so callers get plain data.
    normalized = []
    for url, result in zip(target_urls, results):
        if isinstance(result, BaseException):
            logger.warning(f"Unexpected error while scraping {url}: {result}")
            normalized.append(None)
        else:
            normalized.append(result)
    return normalized
scraper/config.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings
2
+ from dotenv import load_dotenv
3
+ import os
4
+
5
+ load_dotenv()
6
+
7
class Settings(BaseSettings):
    """Scraper settings, populated from the environment / .env file."""

    # Per-action Playwright timeout in milliseconds. Declared as a plain
    # typed field so pydantic-settings reads and coerces the env value —
    # the previous `os.getenv(...)` default left a raw, unvalidated string
    # in place whenever the variable was set.
    PLAYWRIGHT_DEFAULT_TIMEOUT: int = 7_000
    PLAYWRIGHT_WAIT_SECONDS_BEFORE_CLOSE: int = 3_600  # 1 hour


settings = Settings()  # type: ignore
scraper/locator.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import asyncio
3
+ from playwright.async_api import Locator, Page
4
+
5
+ logger = logging.getLogger('fastapi_cli')
6
+
7
+ # XPaths for navigation links
8
+ INFO_PAGE_XPATHS = [
9
+ "//a[contains(@href, 'about') or contains(@href, 'company') or contains(@href, 'info') or contains(translate(text(), 'ABOUT', 'about'), 'about')]",
10
+ "//a[contains(text(), '会社概要') or contains(text(), '企業情報')]"
11
+ ]
12
+
13
+ HOME_PAGE_XPATHS = [
14
+ "//a[contains(@href, 'home') or contains(@href, 'index') or @href='/']",
15
+ "//a[contains(text(), 'ホーム') or contains(text(), 'HOME')]"
16
+ ]
17
+
18
def normalize_url(url: str, base_url: str) -> str:
    """Convert a possibly-relative URL into an absolute one under *base_url*.

    Empty input yields ""; absolute URLs are returned with any trailing
    slash stripped; relative paths are joined onto the base.
    """
    if not url:
        return ""

    # Already absolute — just drop the trailing slash.
    if url.startswith("http"):
        return url.rstrip("/")

    root = base_url.rstrip("/")
    # Root-relative ("/about") vs document-relative ("about/") paths.
    return root + url if url.startswith("/") else f"{root}/{url}"
30
+
31
async def locate_info_page_links(page: Page) -> list[Locator]:
    """Collect candidate info/about links, filtered to navigable hrefs."""
    # Run every XPath query concurrently, then flatten the per-XPath batches.
    batches = await asyncio.gather(*(page.locator(xp).all() for xp in INFO_PAGE_XPATHS))
    candidates = [locator for batch in batches for locator in batch]

    navigable = []
    for candidate in candidates:
        href = await candidate.get_attribute("href")
        # Skip anchors, JS handlers, and mail/phone links — none lead to a page.
        if href and not href.startswith(("#", "javascript:", "mailto:", "tel:")):
            navigable.append(candidate)

    return navigable
44
+
45
async def locate_home_page_link(page: Page) -> Locator | None:
    """Return the first navigable home-page link, or None if none exists."""
    # Run every XPath query concurrently; scan batches in order.
    batches = await asyncio.gather(*(page.locator(xp).all() for xp in HOME_PAGE_XPATHS))
    for batch in batches:
        for candidate in batch:
            href = await candidate.get_attribute("href")
            # First candidate with a real, navigable href wins.
            if href and not href.startswith(("#", "javascript:", "mailto:", "tel:")):
                return candidate
    return None
57
+
58
async def scrape_page_content(page: Page) -> dict[str, str]:
    """Scrape relevant textual content from the page in parallel.

    Returns a dict whose keys ('url', 'title', 'main_content',
    'company_info', 'meta_description', 'meta_keywords') are always present;
    values default to '' when a section cannot be scraped.
    """
    # Result skeleton: every key exists up-front so callers can index safely.
    content = {
        'url': page.url,
        'title': await page.title(),
        'main_content': '',
        'company_info': '',
        'meta_description': '',
        'meta_keywords': ''
    }

    async def get_meta_data():
        """Fetch meta description and keywords concurrently."""
        try:
            # Start both attribute reads together and await them as one gather.
            meta_desc_task = page.locator('meta[name="description"]').get_attribute('content')
            meta_keywords_task = page.locator('meta[name="keywords"]').get_attribute('content')
            meta_desc, meta_keywords = await asyncio.gather(meta_desc_task, meta_keywords_task)
            content['meta_description'] = meta_desc or ''
            content['meta_keywords'] = meta_keywords or ''
        except Exception as e:
            logger.warning(f'Failed to get meta description and keywords: {e}')

    async def get_main_content():
        """Fetch the main page content concurrently."""
        # Selectors ordered most- to least-specific; "body" is the last resort.
        main_selectors = [
            "main", "article", ".main-content", "#main-content",
            ".content", "#content", "body"  # fallback
        ]
        for selector in main_selectors:
            try:
                elements = await page.locator(selector).all()
                # Only read text from visible elements.
                tasks = [element.inner_text() for element in elements if await element.is_visible()]
                texts = await asyncio.gather(*tasks)
                # Require > 100 chars to skip nav bars / stubs; keep first hit.
                valid_texts = [text for text in texts if text and len(text) > 100]
                if valid_texts:
                    content['main_content'] = valid_texts[0]
                    break
            except Exception as e:
                logger.warning(f'Failed to scrape the main page content: {e}')

    async def get_company_info():
        """Fetch company information concurrently."""
        # CSS ids/classes plus Playwright :has-text() selectors targeting
        # Japanese "company overview" (会社概要/企業情報) sections.
        company_selectors = [
            ".company-info", "#company-info",
            "section:has-text('会社概要')", "div:has-text('企業情報')",
            "table:has-text('会社概要')", "table:has-text('企業情報')"
        ]
        for selector in company_selectors:
            try:
                elements = await page.locator(selector).all()
                tasks = [element.inner_text() for element in elements if await element.is_visible()]
                texts = await asyncio.gather(*tasks)
                valid_texts = [text for text in texts if text]
                if valid_texts:
                    content['company_info'] = valid_texts[0]
                    break
            except Exception as e:
                logger.warning(f'Failed to scrape company info: {e}')

    # Run all scraping tasks concurrently; each helper mutates `content` in place.
    await asyncio.gather(get_meta_data(), get_main_content(), get_company_info())

    return content
scraper/navigator.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core web scraping logic
2
+ import asyncio
3
+ import logging
4
+ from playwright.async_api import Locator, Page
5
+ from urllib.parse import urlparse
6
+ from .locator import locate_home_page_link, locate_info_page_links, normalize_url, scrape_page_content
7
+ from .utils import navigate_to_link
8
+ from .config import settings
9
+
10
+ logger = logging.getLogger('fastapi_cli')
11
async def navigate_and_scrape(
    page: Page,
    *,
    verbose: bool = False,
) -> dict[str, dict[str, str]]:
    r"""
    Navigate to info and home pages and scrape their content.

    Args:
        page (Page): The page to navigate from.
        verbose (bool, optional): Whether to print verbose logs. Defaults to False.

    Returns:
        dict[str, dict[str, str]]: Dictionary containing scraped data from info and home pages
        (keys 'info_page' and/or 'home_page'; either may be absent on failure).
    """
    original_url = page.url
    parsed_url = urlparse(original_url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    scraped_data = {}
    # Track normalized URLs already visited so the same target is not scraped twice.
    seen_urls = set()

    try:
        # First try info/about page
        info_links = await locate_info_page_links(page)
        if verbose:
            logger.info(f"Current URL: {original_url}")
            logger.info(
                f"Located the following info page links: {await asyncio.gather(*(link.get_attribute('href') for link in info_links))}"
            )

        for i, link in enumerate(info_links):
            try:
                href = await link.get_attribute("href", timeout=30000)
                absolute_url = normalize_url(href, base_url)

                if absolute_url in seen_urls:
                    continue

                seen_urls.add(absolute_url)
                if not await navigate_to_link(page, link):
                    logger.warning("Failed to navigate to the info page link.")
                    continue

                # First successful scrape wins; stop trying further links.
                scraped_data['info_page'] = await scrape_page_content(page)
                break

            except Exception as e:
                logger.warning(f"Failed to navigate to info page: {e}")
                continue

        # Reset the page if scraping failed and there are more links to try
        # NOTE(review): `i` is unbound here when info_links is empty (NameError),
        # and this also fires after a *successful* scrape of a non-final link —
        # confirm the intended condition (e.g. 'info_page' not in scraped_data).
        if i < len(info_links) - 1:
            logger.warning("Failed to scrape info page. Returning to the original page.")
            await page.goto(original_url, wait_until="domcontentloaded")

        # Return to original page before trying home page
        await page.goto(original_url, wait_until="domcontentloaded")

        # Try home page
        home_link = await locate_home_page_link(page)
        if home_link:
            try:
                href = await home_link.get_attribute("href", timeout=30000)
                absolute_url = normalize_url(href, base_url)

                if absolute_url not in seen_urls:
                    seen_urls.add(absolute_url)
                    if not await navigate_to_link(page, home_link):
                        logger.warning("Failed to navigate to the home page link.")
                    else:
                        scraped_data['home_page'] = await scrape_page_content(page)

            except Exception as e:
                logger.warning(f"Failed to navigate to home page: {e}")

    finally:
        # Always return to original page
        await page.goto(original_url, wait_until="domcontentloaded")

    if verbose:
        logger.info(f"Scraped data: {scraped_data}")

    return scraped_data
scraper/utils.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from urllib.parse import urlparse
3
+
4
+ from playwright.async_api import Locator, Page
5
+
6
+ from .config import settings
7
+
8
+ logger = logging.getLogger('fastapi_cli')
9
+
10
async def navigate_to_link(page: Page, link: Locator) -> bool:
    r"""
    Navigate the page to *link*, by clicking or by a direct goto.

    Args:
        page (Page): The page to navigate.
        link (Locator): The link to navigate to.

    Returns:
        bool: True if navigated, False otherwise.
    """
    # Preferred path: click the anchor, unless it would open a new tab.
    try:
        target = await link.get_attribute("target", timeout=settings.PLAYWRIGHT_DEFAULT_TIMEOUT)
        if target != "_blank":
            await link.click(timeout=settings.PLAYWRIGHT_DEFAULT_TIMEOUT)
            await page.wait_for_load_state("domcontentloaded")
            return True
    except Exception as e:
        logger.warning(f"Failed to click the link: {e}")

    # Fallback path (new-tab links or failed clicks): resolve the href into an
    # absolute URL ourselves and navigate the current page there.
    try:
        href = await link.get_attribute("href", timeout=settings.PLAYWRIGHT_DEFAULT_TIMEOUT)
        if not href:
            logger.warning("No href attribute found while trying to navigate to the link.")
            return False

        parsed = urlparse(page.url)
        root = f"{parsed.scheme}://{parsed.netloc}"
        root = root[:-1] if root.endswith("/") else root

        if href.startswith("/"):
            # NOTE: e.g. /contact -> https://example.com/contact
            href = root + href
        elif not href.startswith("http"):
            # NOTE: e.g. otoiawase/ -> https://example.com/otoiawase/
            href = root + "/" + href

        # NOTE: e.g. https://example.com/contact
        await page.goto(href, wait_until="domcontentloaded")
        return True

    except Exception as e:
        logger.warning(f"Failed to navigate to the link: {e}")
        return False
scripts/analyze_pages.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Analyze scraped pages
scripts/fetch_sample.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Script to run scraper on sample links
scripts/test_llm.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Test LLM interactions
server.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from api import researcher  # Ensure researcher.py has `api_router`


# FastAPI entry point; interactive docs served at /docs.
app = FastAPI(docs_url='/docs')

# Enable CORS for frontend communication
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers under the CORS spec — list concrete origins if
# cookies/auth headers must be sent; confirm intended clients.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include API routes from researcher module
app.include_router(researcher.api_router)

if __name__ == "__main__":
    # Bind on all interfaces for container use (Dockerfile exposes 8000).
    uvicorn.run(app, host="0.0.0.0", port=8000)
tests/test_customizer.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Unit tests for message customization
tests/test_routes.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # API route tests
tests/test_scraper.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Unit tests for scraper
tests/test_summarizer.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Unit tests for summarizer