AkshayStark commited on
Commit
95f7828
·
1 Parent(s): fe6bb80

initial setup for scraper

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.env.example ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Environment variables (e.g., API keys)
2
+ OPENAI_API_KEY=
3
+ GOOGLE_API_KEY=
4
+ LANGCHAIN_TRACING_V2=true
5
+ LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
6
+ LANGCHAIN_API_KEY=
7
+ LANGCHAIN_PROJECT=
8
+ LANGCHAIN_RETRY_LIMIT=3
9
+ PLAYWRIGHT_DEFAULT_TIMEOUT=
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
name: Run Python script

on:
  push:
    branches:
      - develop

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        # v2 runs on the removed Node 12 runtime; v4 is the supported release.
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install Poetry
        run: pip install poetry

      - name: Install Dependencies
        run: poetry install --no-root

      - name: Log in to Hugging Face
        run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'

      - name: Deploy to Spaces
        run: poetry run gradio deploy
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ .env
4
+ .venv
Dockerfile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ FROM python:3.12
3
+
4
+
5
+ WORKDIR /app
6
+
7
+
8
+ RUN apt-get update && apt-get install -y \
9
+ libnss3 \
10
+ libnspr4 \
11
+ libdbus-1-3 \
12
+ libatk1.0-0 \
13
+ libatk-bridge2.0-0 \
14
+ libatspi2.0-0 \
15
+ libxcomposite1 \
16
+ libxdamage1 \
17
+ libxfixes3 \
18
+ libxrandr2 \
19
+ libgbm1 \
20
+ libxkbcommon0 \
21
+ libasound2 \
22
+ && rm -rf /var/lib/apt/lists/*
23
+
24
+
25
+ RUN pip install --no-cache-dir poetry
26
+
27
+
28
+ COPY pyproject.toml poetry.lock ./
29
+ COPY . .
30
+
31
+
32
+ RUN poetry install --no-root
33
+
34
+
35
+ RUN poetry run playwright install chromium
36
+ RUN poetry run playwright install-deps
37
+
38
+ EXPOSE 8000
39
+
40
+ CMD ["poetry", "run", "python", "server.py"]
api/researcher.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Handles API logic for calling scraper & LLM
2
+ from fastapi import APIRouter, HTTPException
3
+ from pydantic import BaseModel, HttpUrl
4
+ from llm_module.message import chain
5
+ from scraper import multiple_links_scraping
6
+ from typing import Any
7
+ import logging
8
+
9
+ logger = logging.getLogger('fastapi_cli')
10
+ api_router = APIRouter(prefix='/researcher', tags=['Researcher'])
11
+
12
class InputField(BaseModel):
    """Request body for POST /researcher/invoke."""

    url: str  # company website to scrape
    generic_message: str  # generic outreach message to personalize
15
+
16
class OutputField(BaseModel):
    """Response body: scraped company summary plus the tailored message."""

    company_summary: str
    customised_message: str
19
+
20
+
21
@api_router.post('/invoke', response_model=OutputField)
async def invoke_web_researcher(body: InputField):
    """
    Scrape the company page at ``body.url`` and run the LLM chain to produce
    a company summary and a personalized version of ``body.generic_message``.

    Raises:
        HTTPException: 500 when scraping or the LLM chain fails.
    """
    try:
        scraped_data = await multiple_links_scraping([body.url])
    except Exception as e:
        logger.error(f"Failed to scrape: {e}")
        raise HTTPException(status_code=500, detail="Internal Server Error")

    # multiple_links_scraping gathers with return_exceptions=True, so a failed
    # URL may come back as None or as the exception object itself — never feed
    # those into the LLM chain as if they were scraped content.
    scraped = scraped_data[0] if scraped_data else None
    if scraped is None or isinstance(scraped, BaseException):
        logger.error(f"Failed to scrape: {scraped!r}")
        raise HTTPException(status_code=500, detail="Internal Server Error")

    try:
        output = await chain.ainvoke({
            "generic_message": body.generic_message,
            "scraped_message": scraped
        })
        if output is None:
            raise ValueError("LLM chain returned None values")
        return OutputField(
            company_summary=output['summary_output'],
            customised_message=output['generated_message']
        )
    except Exception as e:
        logger.error(f"Failed to invoke the chain: {e}")
        raise HTTPException(status_code=500, detail="Internal Server Error")
docs/TODO.md ADDED
File without changes
docs/api_docs.md ADDED
File without changes
docs/experiment_results.md ADDED
File without changes
docs/scraping_notes.md ADDED
File without changes
llm_module/base.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Message customization logic
2
+ from dataclasses import dataclass
3
+ from typing import Any, TypeVar
4
+
5
+ from langchain.output_parsers import OutputFixingParser
6
+ from langchain.output_parsers.prompts import NAIVE_FIX_PROMPT
7
+ from langchain_core.output_parsers import StrOutputParser
8
+ from langchain_core.runnables import RunnableSerializable
9
+
10
+ from .config import settings
11
+ from .model import gpt4o
12
+
13
# Type variable kept for generic parser helpers.
# NOTE(review): T is currently unused in this module — confirm before removing.
T = TypeVar("T")


# Prompt messages for the extraction step: given scraped company data, produce
# a detailed company profile in Japanese. Consumed via
# ChatPromptTemplate.from_messages in llm_module.message; the
# {scraped_message} placeholder is filled with the scraper output.
extraction_messages = [
    (
        "system",
        """あなたは企業情報の分析に優れた専門家です。
        与えられた情報を元に、可能な限り詳細で有益な企業概要を**日本語**で作成してください。

        ### **概要に含めるべき内容:**
        1. **会社名とアイデンティティ**
        - 会社名、設立背景、歴史、特徴など。
        - もし背景情報が不足している場合、**可能な推測(例: 業界や関連事業の一般的な傾向)** を交えながら説明する。

        2. **コアビジネスとミッション**
        - 事業内容、主要サービス、企業の使命や目標。
        - もしミッションが明確でない場合、事業内容から推測できる可能性のある方向性を述べる。

        3. **主要プロジェクトやイニシアティブ**
        - 企業が取り組んでいる重要なプロジェクトや技術。
        - **具体的な情報がない場合でも、企業の業界における一般的な取り組みを補足し、可能な活動内容を示唆する。**

        4. **業界と市場での地位**
        - 企業が属する業界、市場での競争環境、主な競合企業。
        - **業界情報を活用して、可能な市場ポジションを考察する。**

        ### **重要なルール**
        - **与えられた情報を最大限活用し、不完全な場合もできる限り推測を交えて説明する。**
        - **決して情報を捏造せず、信頼できる情報源が不足している場合は、その旨を明示する。**
        - **単なる「情報が不足しています」という回答ではなく、読者が次に取るべき調査ステップを提案する。**
        """
    ),
    (
        "human",
        """以下の企業情報を分析し、可能な限り詳細な企業概要を **日本語** で作成してください。

        **提供された企業情報:**
        {scraped_message}

        **要求事項:**
        - 企業の業界、事業内容、競争環境を考慮した **具体的な説明** を記述すること。
        - 情報が不足している場合でも、 **業界の一般的な知識を活用し、可能な考察を加えること**。
        - もし詳細情報が不足している場合は、 **読者が追加情報を得るための推奨アクション(例: 公式サイトの確認、ニュースリリースのチェック)** を明示すること。
        """
    )
]



# Prompt messages for the generation step: combine the extracted summary
# ({summary_output}) with the caller's {generic_message} to produce a
# personalized Japanese sales message.
generation_messages = [
    (
        "system",
        """You are an expert in writing **personalized, compelling, and persuasive sales messages** in Japanese.

        Your task is to craft a customized message based on the company's business summary and the provided generic message.

        ### **Guidelines:**
        1. **Personalize the message** using the company's industry, goals, and key projects.
        2. **Make a strong connection** between the company's mission and the value proposition in the generic message.
        3. **Keep the core intent** of the generic message intact while making it highly relevant.
        4. **Ensure the message flows naturally** and does not feel robotic or forced.
        5. **If the company summary lacks details**, return the generic message **without forced personalization**.
        """
    ),
    (
        "human",
        """Generate a well-structured and persuasive sales message in **Japanese** based on the following details:

        **Company Summary:**
        {summary_output}

        **Generic Message:**
        {generic_message}

        ### **Instructions:**
        - Begin with: **"貴社は [business field/service] に従事しており、"**
        - Clearly highlight how the company's **projects and goals** align with the proposed solution.
        - Use a **smooth, engaging, and persuasive tone** without making it sound artificial.
        - If the summary lacks sufficient details, simply **return the generic message as it is** without attempting to force customization.
        """
    )
]
95
+
96
+
97
+
98
def build_output_fixing_parser(
    *, parser: Any, llm: Any = gpt4o
) -> OutputFixingParser[Any]:
    """Wrap *parser* so malformed LLM output is re-prompted for repair.

    Args:
        parser: The base output parser to wrap.
        llm: Model used to repair bad output; defaults to ``gpt4o``.

    Returns:
        An ``OutputFixingParser`` retrying up to
        ``settings.LANGCHAIN_RETRY_LIMIT`` times.
    """
    # The fix-up chain pipes the naive fixing prompt through the LLM and
    # reads the repaired text back as a plain string.
    fixer: RunnableSerializable[dict[str, Any], str] = (
        NAIVE_FIX_PROMPT | llm | StrOutputParser()
    )
    return OutputFixingParser(
        parser=parser,
        retry_chain=fixer,
        max_retries=settings.LANGCHAIN_RETRY_LIMIT,
    )
llm_module/config.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings
2
+ from dotenv import load_dotenv
3
+ import os
4
+
5
+ # Load environment variables from .env file
6
+ load_dotenv()
7
+
8
+ class Settings(BaseSettings):
9
+ OPENAI_API_KEY: str = os.getenv("OPENAI_API_KEY")
10
+ GOOGLE_API_KEY: str = os.getenv("GOOGLE_API_KEY")
11
+ LANGCHAIN_TRACING_V2: bool = os.getenv("LANGCHAIN_TRACING_V2", "false").lower() == "true"
12
+ LANGCHAIN_ENDPOINT: str = os.getenv("LANGCHAIN_ENDPOINT")
13
+ LANGCHAIN_API_KEY: str = os.getenv("LANGCHAIN_API_KEY")
14
+ LANGCHAIN_PROJECT: str = os.getenv("LANGCHAIN_PROJECT")
15
+ LANGCHAIN_RETRY_LIMIT: int = os.getenv("LANGCHAIN_RETRY_LIMIT")
16
+
17
+ # Initialize settings instance
18
+ settings = Settings()
llm_module/message.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
from typing import Any

from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

from .model import gpt4o
from .base import build_output_fixing_parser, extraction_messages, generation_messages

# NOTE(review): RunnableLambda and Any are imported but unused — confirm before removing.

logger = logging.getLogger("fastapi_cli")

# Self-fixing string parser: on malformed output it re-prompts gpt4o to repair it.
parser = build_output_fixing_parser(parser=StrOutputParser(), llm=gpt4o)

# Extraction chain: scraped company info -> Japanese company summary.
extraction_prompt = ChatPromptTemplate.from_messages(extraction_messages)
extraction_chain = (
    extraction_prompt
    | gpt4o
    | parser
)

# Generation chain: summary + generic message -> personalized sales message.
generation_prompt = ChatPromptTemplate.from_messages(generation_messages)
generation_chain = (
    generation_prompt
    | gpt4o
    | parser
)

# Define the LCEL-based pipeline. The input dict must provide
# 'scraped_message' and 'generic_message'; the output dict additionally
# carries 'summary_output' and 'generated_message' (consumed by
# api/researcher.py).
chain = (
    RunnablePassthrough.assign(summary_output=extraction_chain)  # Runs extraction_chain and adds its output
    .assign(generated_message=generation_chain)  # Runs generation_chain using the extracted summary
)
llm_module/model.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Load and interact with LLM
from langchain_openai import ChatOpenAI
from pydantic import SecretStr

from .config import settings

# Primary model used by the extraction/generation chains (llm_module.message).
gpt4o = ChatOpenAI(
    model="gpt-4o",
    api_key=SecretStr(settings.OPENAI_API_KEY),
    temperature=0,  # deterministic output for reproducible summaries
    max_tokens=8192,
    timeout=60,  # seconds
)

# Cheaper variant; not referenced by the chains yet.
gpt4o_mini = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=SecretStr(settings.OPENAI_API_KEY),
    temperature=0,
    max_tokens=8192,
    timeout=60,
)

# Reasoning model; not referenced by the chains yet.
# NOTE(review): temperature=1 here, unlike the others — presumably because
# o1-family models only accept the default temperature; confirm.
o1_mini = ChatOpenAI(
    model="o1-mini",
    api_key=SecretStr(settings.OPENAI_API_KEY),
    temperature=1,
    timeout=60,
)
log/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .logger import logger
2
+
3
+ __all__ = ['logger']
log/logger.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
import logging

# Configure root logging once for the whole application; the format carries
# timestamp, logger name, and severity for every record.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Shared module logger, re-exported via log/__init__.py.
logger = logging.getLogger(__name__)
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "web-researcher"
3
+ version = "0.1.0"
4
+ description = "Beta version for testing the feature of scraping company info and generating personalized messages."
5
+ authors = ["UjjTiw"]
6
+ license = "2WINS Inc."
7
+ readme = "README.md"
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "^3.12"
11
+ langchain = "^0.3.18"
12
+ langchain-openai = "^0.3.4"
13
+ playwright = "^1.50.0"
14
+ uvicorn = "^0.34.0"
15
+ fastapi = "^0.115.8"
16
+ pydantic = "^2.10.6"
17
+ pydantic-settings = "^2.7.1"
18
+ gradio = "^5.15.0"
19
+ langchain-gemini = "^0.1.1"
20
+ huggingface-hub = "^0.28.1"
21
+ httpx = "^0.28.1"
22
+
23
+
24
+ [build-system]
25
+ requires = ["poetry-core"]
26
+ build-backend = "poetry.core.masonry.api"
scraper/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .automate import multiple_links_scraping
2
+
3
+ __all__ = ['multiple_links_scraping']
scraper/automate.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import asyncio
3
+ from playwright.async_api import async_playwright, Error, BrowserContext
4
+ from .navigator import navigate_and_scrape
5
+
6
+ logger = logging.getLogger("fastapi_cli")
7
+
8
async def link_scraping(context: BrowserContext, url: str):
    """Scrape a single URL in a fresh page.

    Args:
        context: Browser context used to open a new page.
        url: Absolute URL to visit.

    Returns:
        The data from navigate_and_scrape, or None on a Playwright error.
    """
    page = await context.new_page()
    try:
        await page.goto(url, wait_until="domcontentloaded")
        return await navigate_and_scrape(page)
    except Error as e:
        logger.warning(f"Failed to scrape {url}: {e}")
        return None
    finally:
        # Close the page even when navigation/scraping raises — the original
        # only closed it on the success path, leaking pages on errors.
        await page.close()
21
+
22
async def multiple_links_scraping(target_urls: list[str]):
    """Scrape multiple links concurrently.

    Args:
        target_urls: Absolute URLs to scrape.

    Returns:
        One entry per input URL, in order: the scraped data dict, or None
        when that URL failed (instead of leaking raw exception objects to
        callers, as gather(return_exceptions=True) otherwise would).
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,  # headless required for production/containers
            args=["--no-sandbox", "--disable-setuid-sandbox"]
        )
        try:
            context = await browser.new_context()
            tasks = [link_scraping(context, url) for url in target_urls]
            results = await asyncio.gather(*tasks, return_exceptions=True)
        finally:
            # Always release the browser, even if gather itself raises.
            await browser.close()

    # Normalize unexpected exceptions to None so callers get plain data.
    normalized = []
    for url, result in zip(target_urls, results):
        if isinstance(result, BaseException):
            logger.warning(f"Unexpected error while scraping {url}: {result}")
            normalized.append(None)
        else:
            normalized.append(result)
    return normalized
scraper/config.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings
2
+ from dotenv import load_dotenv
3
+ import os
4
+
5
+ load_dotenv()
6
+
7
class Settings(BaseSettings):
    """Scraper settings, populated from the environment / .env file."""

    # Per-action Playwright timeout in milliseconds. Declared as a plain
    # typed field so pydantic-settings reads and coerces the env value —
    # the previous `os.getenv(...)` default left a raw, unvalidated string
    # in place whenever the variable was set.
    PLAYWRIGHT_DEFAULT_TIMEOUT: int = 7_000
    PLAYWRIGHT_WAIT_SECONDS_BEFORE_CLOSE: int = 3_600  # 1 hour


settings = Settings()  # type: ignore
scraper/locator.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import asyncio
3
+ from playwright.async_api import Locator, Page
4
+
5
+ logger = logging.getLogger('fastapi_cli')
6
+
7
+ # XPaths for navigation links
8
+ INFO_PAGE_XPATHS = [
9
+ "//a[contains(@href, 'about') or contains(@href, 'company') or contains(@href, 'info') or contains(translate(text(), 'ABOUT', 'about'), 'about')]",
10
+ "//a[contains(text(), '会社概要') or contains(text(), '企業情報')]"
11
+ ]
12
+
13
+ HOME_PAGE_XPATHS = [
14
+ "//a[contains(@href, 'home') or contains(@href, 'index') or @href='/']",
15
+ "//a[contains(text(), 'ホーム') or contains(text(), 'HOME')]"
16
+ ]
17
+
18
def normalize_url(url: str, base_url: str) -> str:
    """Convert a possibly-relative URL into an absolute one under *base_url*.

    Empty input yields ""; absolute URLs are returned with any trailing
    slash stripped; relative paths are joined onto the base.
    """
    if not url:
        return ""

    # Already absolute — just drop the trailing slash.
    if url.startswith("http"):
        return url.rstrip("/")

    root = base_url.rstrip("/")
    # Root-relative ("/about") vs document-relative ("about/") paths.
    return root + url if url.startswith("/") else f"{root}/{url}"
30
+
31
async def locate_info_page_links(page: Page) -> list[Locator]:
    """Collect candidate info/about links, filtered to navigable hrefs."""
    # Run every XPath query concurrently, then flatten the per-XPath batches.
    batches = await asyncio.gather(*(page.locator(xp).all() for xp in INFO_PAGE_XPATHS))
    candidates = [locator for batch in batches for locator in batch]

    navigable = []
    for candidate in candidates:
        href = await candidate.get_attribute("href")
        # Skip anchors, JS handlers, and mail/phone links — none lead to a page.
        if href and not href.startswith(("#", "javascript:", "mailto:", "tel:")):
            navigable.append(candidate)

    return navigable
44
+
45
async def locate_home_page_link(page: Page) -> Locator | None:
    """Return the first navigable home-page link, or None if none exists."""
    # Run every XPath query concurrently; scan batches in order.
    batches = await asyncio.gather(*(page.locator(xp).all() for xp in HOME_PAGE_XPATHS))
    for batch in batches:
        for candidate in batch:
            href = await candidate.get_attribute("href")
            # First candidate with a real, navigable href wins.
            if href and not href.startswith(("#", "javascript:", "mailto:", "tel:")):
                return candidate
    return None
57
+
58
async def scrape_page_content(page: Page) -> dict[str, str]:
    """Scrape relevant textual content from the page in parallel.

    Returns a dict whose keys ('url', 'title', 'main_content',
    'company_info', 'meta_description', 'meta_keywords') are always present;
    values default to '' when a section cannot be scraped.
    """
    # Result skeleton: every key exists up-front so callers can index safely.
    content = {
        'url': page.url,
        'title': await page.title(),
        'main_content': '',
        'company_info': '',
        'meta_description': '',
        'meta_keywords': ''
    }

    async def get_meta_data():
        """Fetch meta description and keywords concurrently."""
        try:
            # Start both attribute reads together and await them as one gather.
            meta_desc_task = page.locator('meta[name="description"]').get_attribute('content')
            meta_keywords_task = page.locator('meta[name="keywords"]').get_attribute('content')
            meta_desc, meta_keywords = await asyncio.gather(meta_desc_task, meta_keywords_task)
            content['meta_description'] = meta_desc or ''
            content['meta_keywords'] = meta_keywords or ''
        except Exception as e:
            logger.warning(f'Failed to get meta description and keywords: {e}')

    async def get_main_content():
        """Fetch the main page content concurrently."""
        # Selectors ordered most- to least-specific; "body" is the last resort.
        main_selectors = [
            "main", "article", ".main-content", "#main-content",
            ".content", "#content", "body"  # fallback
        ]
        for selector in main_selectors:
            try:
                elements = await page.locator(selector).all()
                # Only read text from visible elements.
                tasks = [element.inner_text() for element in elements if await element.is_visible()]
                texts = await asyncio.gather(*tasks)
                # Require > 100 chars to skip nav bars / stubs; keep first hit.
                valid_texts = [text for text in texts if text and len(text) > 100]
                if valid_texts:
                    content['main_content'] = valid_texts[0]
                    break
            except Exception as e:
                logger.warning(f'Failed to scrape the main page content: {e}')

    async def get_company_info():
        """Fetch company information concurrently."""
        # CSS ids/classes plus Playwright :has-text() selectors targeting
        # Japanese "company overview" (会社概要/企業情報) sections.
        company_selectors = [
            ".company-info", "#company-info",
            "section:has-text('会社概要')", "div:has-text('企業情報')",
            "table:has-text('会社概要')", "table:has-text('企業情報')"
        ]
        for selector in company_selectors:
            try:
                elements = await page.locator(selector).all()
                tasks = [element.inner_text() for element in elements if await element.is_visible()]
                texts = await asyncio.gather(*tasks)
                valid_texts = [text for text in texts if text]
                if valid_texts:
                    content['company_info'] = valid_texts[0]
                    break
            except Exception as e:
                logger.warning(f'Failed to scrape company info: {e}')

    # Run all scraping tasks concurrently; each helper mutates `content` in place.
    await asyncio.gather(get_meta_data(), get_main_content(), get_company_info())

    return content
scraper/navigator.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core web scraping logic
2
+ import asyncio
3
+ import logging
4
+ from playwright.async_api import Locator, Page
5
+ from urllib.parse import urlparse
6
+ from .locator import locate_home_page_link, locate_info_page_links, normalize_url, scrape_page_content
7
+ from .utils import navigate_to_link
8
+ from .config import settings
9
+
10
+ logger = logging.getLogger('fastapi_cli')
11
async def navigate_and_scrape(
    page: Page,
    *,
    verbose: bool = False,
) -> dict[str, dict[str, str]]:
    r"""
    Navigate to info and home pages and scrape their content.

    Args:
        page (Page): The page to navigate from.
        verbose (bool, optional): Whether to print verbose logs. Defaults to False.

    Returns:
        dict[str, dict[str, str]]: Dictionary containing scraped data from info and home pages
        (keys 'info_page' and/or 'home_page'; either may be absent on failure).
    """
    original_url = page.url
    parsed_url = urlparse(original_url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    scraped_data = {}
    # Track normalized URLs already visited so the same target is not scraped twice.
    seen_urls = set()

    try:
        # First try info/about page
        info_links = await locate_info_page_links(page)
        if verbose:
            logger.info(f"Current URL: {original_url}")
            logger.info(
                f"Located the following info page links: {await asyncio.gather(*(link.get_attribute('href') for link in info_links))}"
            )

        for i, link in enumerate(info_links):
            try:
                href = await link.get_attribute("href", timeout=30000)
                absolute_url = normalize_url(href, base_url)

                if absolute_url in seen_urls:
                    continue

                seen_urls.add(absolute_url)
                if not await navigate_to_link(page, link):
                    logger.warning("Failed to navigate to the info page link.")
                    continue

                # First successful scrape wins; stop trying further links.
                scraped_data['info_page'] = await scrape_page_content(page)
                break

            except Exception as e:
                logger.warning(f"Failed to navigate to info page: {e}")
                continue

        # Reset the page if scraping failed and there are more links to try
        # NOTE(review): `i` is unbound here when info_links is empty (NameError),
        # and this also fires after a *successful* scrape of a non-final link —
        # confirm the intended condition (e.g. 'info_page' not in scraped_data).
        if i < len(info_links) - 1:
            logger.warning("Failed to scrape info page. Returning to the original page.")
            await page.goto(original_url, wait_until="domcontentloaded")

        # Return to original page before trying home page
        await page.goto(original_url, wait_until="domcontentloaded")

        # Try home page
        home_link = await locate_home_page_link(page)
        if home_link:
            try:
                href = await home_link.get_attribute("href", timeout=30000)
                absolute_url = normalize_url(href, base_url)

                if absolute_url not in seen_urls:
                    seen_urls.add(absolute_url)
                    if not await navigate_to_link(page, home_link):
                        logger.warning("Failed to navigate to the home page link.")
                    else:
                        scraped_data['home_page'] = await scrape_page_content(page)

            except Exception as e:
                logger.warning(f"Failed to navigate to home page: {e}")

    finally:
        # Always return to original page
        await page.goto(original_url, wait_until="domcontentloaded")

    if verbose:
        logger.info(f"Scraped data: {scraped_data}")

    return scraped_data
scraper/utils.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from urllib.parse import urlparse
3
+
4
+ from playwright.async_api import Locator, Page
5
+
6
+ from .config import settings
7
+
8
+ logger = logging.getLogger('fastapi_cli')
9
+
10
async def navigate_to_link(page: Page, link: Locator) -> bool:
    r"""
    Navigate the page to *link*, by clicking or by a direct goto.

    Args:
        page (Page): The page to navigate.
        link (Locator): The link to navigate to.

    Returns:
        bool: True if navigated, False otherwise.
    """
    # Preferred path: click the anchor, unless it would open a new tab.
    try:
        target = await link.get_attribute("target", timeout=settings.PLAYWRIGHT_DEFAULT_TIMEOUT)
        if target != "_blank":
            await link.click(timeout=settings.PLAYWRIGHT_DEFAULT_TIMEOUT)
            await page.wait_for_load_state("domcontentloaded")
            return True
    except Exception as e:
        logger.warning(f"Failed to click the link: {e}")

    # Fallback path (new-tab links or failed clicks): resolve the href into an
    # absolute URL ourselves and navigate the current page there.
    try:
        href = await link.get_attribute("href", timeout=settings.PLAYWRIGHT_DEFAULT_TIMEOUT)
        if not href:
            logger.warning("No href attribute found while trying to navigate to the link.")
            return False

        parsed = urlparse(page.url)
        root = f"{parsed.scheme}://{parsed.netloc}"
        root = root[:-1] if root.endswith("/") else root

        if href.startswith("/"):
            # NOTE: e.g. /contact -> https://example.com/contact
            href = root + href
        elif not href.startswith("http"):
            # NOTE: e.g. otoiawase/ -> https://example.com/otoiawase/
            href = root + "/" + href

        # NOTE: e.g. https://example.com/contact
        await page.goto(href, wait_until="domcontentloaded")
        return True

    except Exception as e:
        logger.warning(f"Failed to navigate to the link: {e}")
        return False
scripts/analyze_pages.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Analyze scraped pages
scripts/fetch_sample.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Script to run scraper on sample links
scripts/test_llm.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Test LLM interactions
server.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from api import researcher  # Ensure researcher.py has `api_router`


# FastAPI entry point; interactive docs served at /docs.
app = FastAPI(docs_url='/docs')

# Enable CORS for frontend communication
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers under the CORS spec — list concrete origins if
# cookies/auth headers must be sent; confirm intended clients.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include API routes from researcher module
app.include_router(researcher.api_router)

if __name__ == "__main__":
    # Bind on all interfaces for container use (Dockerfile exposes 8000).
    uvicorn.run(app, host="0.0.0.0", port=8000)
tests/test_customizer.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Unit tests for message customization
tests/test_routes.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # API route tests
tests/test_scraper.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Unit tests for scraper
tests/test_summarizer.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Unit tests for summarizer