Spaces:
Sleeping
Sleeping
Commit ·
95f7828
1
Parent(s): fe6bb80
initial setup for scraper
Browse files- .DS_Store +0 -0
- .env.example +9 -0
- .github/workflows/update_space.yml +31 -0
- .gitignore +4 -0
- Dockerfile +40 -0
- api/researcher.py +43 -0
- docs/TODO.md +0 -0
- docs/api_docs.md +0 -0
- docs/experiment_results.md +0 -0
- docs/scraping_notes.md +0 -0
- llm_module/base.py +107 -0
- llm_module/config.py +18 -0
- llm_module/message.py +36 -0
- llm_module/model.py +28 -0
- log/__init__.py +3 -0
- log/logger.py +6 -0
- poetry.lock +0 -0
- pyproject.toml +26 -0
- scraper/__init__.py +3 -0
- scraper/automate.py +36 -0
- scraper/config.py +13 -0
- scraper/locator.py +120 -0
- scraper/navigator.py +93 -0
- scraper/utils.py +64 -0
- scripts/analyze_pages.py +1 -0
- scripts/fetch_sample.py +1 -0
- scripts/test_llm.py +1 -0
- server.py +22 -0
- tests/test_customizer.py +1 -0
- tests/test_routes.py +1 -0
- tests/test_scraper.py +1 -0
- tests/test_summarizer.py +1 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
.env.example
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment variables (e.g., API keys)
|
| 2 |
+
OPENAI_API_KEY=
|
| 3 |
+
GOOGLE_API_KEY=
|
| 4 |
+
LANGCHAIN_TRACING_V2=true
|
| 5 |
+
LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
|
| 6 |
+
LANGCHAIN_API_KEY=
|
| 7 |
+
LANGCHAIN_PROJECT=
|
| 8 |
+
LANGCHAIN_RETRY_LIMIT=3
|
| 9 |
+
PLAYWRIGHT_DEFAULT_TIMEOUT=
|
.github/workflows/update_space.yml
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Run Python script
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches:
|
| 6 |
+
- develop
|
| 7 |
+
|
| 8 |
+
jobs:
|
| 9 |
+
build:
|
| 10 |
+
runs-on: ubuntu-latest
|
| 11 |
+
|
| 12 |
+
steps:
|
| 13 |
+
- name: Checkout
|
| 14 |
+
uses: actions/checkout@v2
|
| 15 |
+
|
| 16 |
+
- name: Set up Python
|
| 17 |
+
uses: actions/setup-python@v2
|
| 18 |
+
with:
|
| 19 |
+
python-version: '3.12'
|
| 20 |
+
|
| 21 |
+
- name: Install Poetry
|
| 22 |
+
run: pip install poetry # Fixed indentation
|
| 23 |
+
|
| 24 |
+
- name: Install Dependencies
|
| 25 |
+
run: poetry install --no-root
|
| 26 |
+
|
| 27 |
+
- name: Log in to Hugging Face
|
| 28 |
+
run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
|
| 29 |
+
|
| 30 |
+
- name: Deploy to Spaces
|
| 31 |
+
run: poetry run gradio deploy # Fixed command
|
.gitignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
.env
|
| 4 |
+
.venv
|
Dockerfile
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
FROM python:3.12
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
WORKDIR /app
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
RUN apt-get update && apt-get install -y \
|
| 9 |
+
libnss3 \
|
| 10 |
+
libnspr4 \
|
| 11 |
+
libdbus-1-3 \
|
| 12 |
+
libatk1.0-0 \
|
| 13 |
+
libatk-bridge2.0-0 \
|
| 14 |
+
libatspi2.0-0 \
|
| 15 |
+
libxcomposite1 \
|
| 16 |
+
libxdamage1 \
|
| 17 |
+
libxfixes3 \
|
| 18 |
+
libxrandr2 \
|
| 19 |
+
libgbm1 \
|
| 20 |
+
libxkbcommon0 \
|
| 21 |
+
libasound2 \
|
| 22 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
RUN pip install --no-cache-dir poetry
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
COPY pyproject.toml poetry.lock ./
|
| 29 |
+
COPY . .
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
RUN poetry install --no-root
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
RUN poetry run playwright install chromium
|
| 36 |
+
RUN poetry run playwright install-deps
|
| 37 |
+
|
| 38 |
+
EXPOSE 8000
|
| 39 |
+
|
| 40 |
+
CMD ["poetry", "run", "python", "server.py"]
|
api/researcher.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Handles API logic for calling scraper & LLM
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel, HttpUrl
from llm_module.message import chain
from scraper import multiple_links_scraping
from typing import Any
import logging

logger = logging.getLogger('fastapi_cli')
api_router = APIRouter(prefix='/researcher', tags=['Researcher'])


class InputField(BaseModel):
    """Request payload: target company URL plus the generic sales message."""
    url: str
    generic_message: str


class OutputField(BaseModel):
    """Response payload: company summary and the customised sales message."""
    company_summary: str
    customised_message: str


@api_router.post('/invoke', response_model=OutputField)
async def invoke_web_researcher(body: InputField):
    """Scrape the target URL, then run the LLM chain to produce a customised message.

    Args:
        body: URL to scrape and the generic message to personalise.

    Returns:
        OutputField with the company summary and the customised message.

    Raises:
        HTTPException(500): if scraping fails, returns no usable data, or the
            LLM chain fails.
    """
    try:
        scraped_data = await multiple_links_scraping([body.url])
    except Exception as e:
        logger.error(f"Failed to scrape: {e}")
        raise HTTPException(status_code=500, detail="Internal Server Error")

    # multiple_links_scraping gathers with return_exceptions=True, so a failed
    # scrape can surface here as None or as an Exception instance rather than
    # raising — guard before handing the result to the LLM chain.
    if not scraped_data or scraped_data[0] is None or isinstance(scraped_data[0], Exception):
        logger.error(f"Scraper returned no usable data for {body.url}")
        raise HTTPException(status_code=500, detail="Internal Server Error")

    try:
        output = await chain.ainvoke({
            "generic_message": body.generic_message,
            "scraped_message": scraped_data[0]
        })

        if output is None:
            raise ValueError("LLM chain returned None values")
        return OutputField(
            company_summary=output['summary_output'],
            customised_message=output['generated_message']
        )
    except Exception as e:
        logger.error(f"Failed to invoke the chain: {e}")
        raise HTTPException(status_code=500, detail="Internal Server Error")
|
docs/TODO.md
ADDED
|
File without changes
|
docs/api_docs.md
ADDED
|
File without changes
|
docs/experiment_results.md
ADDED
|
File without changes
|
docs/scraping_notes.md
ADDED
|
File without changes
|
llm_module/base.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Message customization logic
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
from typing import Any, TypeVar
|
| 4 |
+
|
| 5 |
+
from langchain.output_parsers import OutputFixingParser
|
| 6 |
+
from langchain.output_parsers.prompts import NAIVE_FIX_PROMPT
|
| 7 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 8 |
+
from langchain_core.runnables import RunnableSerializable
|
| 9 |
+
|
| 10 |
+
from .config import settings
|
| 11 |
+
from .model import gpt4o
|
| 12 |
+
|
| 13 |
+
T = TypeVar("T")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
extraction_messages = [
|
| 17 |
+
(
|
| 18 |
+
"system",
|
| 19 |
+
"""あなたは企業情報の分析に優れた専門家です。
|
| 20 |
+
与えられた情報を元に、可能な限り詳細で有益な企業概要を**日本語**で作成してください。
|
| 21 |
+
|
| 22 |
+
### **概要に含めるべき内容:**
|
| 23 |
+
1. **会社名とアイデンティティ**
|
| 24 |
+
- 会社名、設立背景、歴史、特徴など。
|
| 25 |
+
- もし背景情報が不足している場合、**可能な推測(例: 業界や関連事業の一般的な傾向)** を交えながら説明する。
|
| 26 |
+
|
| 27 |
+
2. **コアビジネスとミッション**
|
| 28 |
+
- 事業内容、主要サービス、企業の使命や目標。
|
| 29 |
+
- もしミッションが明確でない場合、事業内容から推測できる可能性のある方向性を述べる。
|
| 30 |
+
|
| 31 |
+
3. **主要プロジェクトやイニシアティブ**
|
| 32 |
+
- 企業が取り組んでいる重要なプロジェクトや技術。
|
| 33 |
+
- **具体的な情報がない場合でも、企業の業界における一般的な取り組みを補足し、可能な活動内容を示唆する。**
|
| 34 |
+
|
| 35 |
+
4. **業界と市場での地位**
|
| 36 |
+
- 企業が属する業界、市場での競争環境、主な競合企業。
|
| 37 |
+
- **業界情報を活用して、可能な市場ポジションを考察する。**
|
| 38 |
+
|
| 39 |
+
### **重要なルール**
|
| 40 |
+
- **与えられた情報を最大限活用し、不完全な場合もできる限り推測を交えて説明する。**
|
| 41 |
+
- **決して情報を捏造せず、信頼できる情報源が不足している場合は、その旨を明示する。**
|
| 42 |
+
- **単なる「情報が不足しています」という回答ではなく、読者が次に取るべき調査ステップを提案する。**
|
| 43 |
+
"""
|
| 44 |
+
),
|
| 45 |
+
(
|
| 46 |
+
"human",
|
| 47 |
+
"""以下の企業情報を分析し、可能な限り詳細な企業概要を **日本語** で作成してください。
|
| 48 |
+
|
| 49 |
+
**提供された企業情報:**
|
| 50 |
+
{scraped_message}
|
| 51 |
+
|
| 52 |
+
**要求事項:**
|
| 53 |
+
- 企業の業界、事業内容、競争環境を考慮した **具体的な説明** を記述すること。
|
| 54 |
+
- 情報が不足している場合でも、 **業界の一般的な知識を活用し、可能な考察を加えること**。
|
| 55 |
+
- もし詳細情報が不足している場合は、 **読者が追加情報を得るための推奨アクション(例: 公式サイトの確認、ニュースリリースのチェック)** を明示すること。
|
| 56 |
+
"""
|
| 57 |
+
)
|
| 58 |
+
]
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
generation_messages = [
|
| 63 |
+
(
|
| 64 |
+
"system",
|
| 65 |
+
"""You are an expert in writing **personalized, compelling, and persuasive sales messages** in Japanese.
|
| 66 |
+
|
| 67 |
+
Your task is to craft a customized message based on the company's business summary and the provided generic message.
|
| 68 |
+
|
| 69 |
+
### **Guidelines:**
|
| 70 |
+
1. **Personalize the message** using the company's industry, goals, and key projects.
|
| 71 |
+
2. **Make a strong connection** between the company's mission and the value proposition in the generic message.
|
| 72 |
+
3. **Keep the core intent** of the generic message intact while making it highly relevant.
|
| 73 |
+
4. **Ensure the message flows naturally** and does not feel robotic or forced.
|
| 74 |
+
5. **If the company summary lacks details**, return the generic message **without forced personalization**.
|
| 75 |
+
"""
|
| 76 |
+
),
|
| 77 |
+
(
|
| 78 |
+
"human",
|
| 79 |
+
"""Generate a well-structured and persuasive sales message in **Japanese** based on the following details:
|
| 80 |
+
|
| 81 |
+
**Company Summary:**
|
| 82 |
+
{summary_output}
|
| 83 |
+
|
| 84 |
+
**Generic Message:**
|
| 85 |
+
{generic_message}
|
| 86 |
+
|
| 87 |
+
### **Instructions:**
|
| 88 |
+
- Begin with: **"貴社は [business field/service] に従事しており、"**
|
| 89 |
+
- Clearly highlight how the company's **projects and goals** align with the proposed solution.
|
| 90 |
+
- Use a **smooth, engaging, and persuasive tone** without making it sound artificial.
|
| 91 |
+
- If the summary lacks sufficient details, simply **return the generic message as it is** without attempting to force customization.
|
| 92 |
+
"""
|
| 93 |
+
)
|
| 94 |
+
]
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def build_output_fixing_parser(
    *, parser: Any, llm: Any = gpt4o
) -> OutputFixingParser[Any]:
    """Build output fixing parser with retry capability.

    Wraps *parser* so that, when parsing fails, the malformed output is sent
    back through *llm* via LangChain's naive-fix prompt, up to
    ``settings.LANGCHAIN_RETRY_LIMIT`` times.

    Args:
        parser: The base output parser to wrap.
        llm: Chat model used to repair malformed output. Defaults to gpt4o.

    Returns:
        OutputFixingParser[Any]: Parser that retries by asking the LLM to fix
        its own output.
    """
    # NAIVE_FIX_PROMPT -> llm -> plain string: the repair chain invoked on parse failure.
    retry_chain: RunnableSerializable[dict[str, Any], str] = NAIVE_FIX_PROMPT | llm | StrOutputParser()
    return OutputFixingParser(
        parser=parser,
        retry_chain=retry_chain,
        max_retries=settings.LANGCHAIN_RETRY_LIMIT,
    )
|
llm_module/config.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic_settings import BaseSettings
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()


class Settings(BaseSettings):
    """LLM-module settings sourced from the environment (.env).

    Defaults below make every field well-typed even when a variable is unset:
    os.getenv returns None for missing keys, which previously fed None into
    str/int fields (and crashed int coercion for LANGCHAIN_RETRY_LIMIT).
    """

    OPENAI_API_KEY: str = os.getenv("OPENAI_API_KEY", "")
    GOOGLE_API_KEY: str = os.getenv("GOOGLE_API_KEY", "")
    LANGCHAIN_TRACING_V2: bool = os.getenv("LANGCHAIN_TRACING_V2", "false").lower() == "true"
    LANGCHAIN_ENDPOINT: str = os.getenv("LANGCHAIN_ENDPOINT", "")
    LANGCHAIN_API_KEY: str = os.getenv("LANGCHAIN_API_KEY", "")
    LANGCHAIN_PROJECT: str = os.getenv("LANGCHAIN_PROJECT", "")
    # `or 3`: treats both an unset variable (None) and an empty string
    # (as in .env.example) as "use the default of 3 retries".
    LANGCHAIN_RETRY_LIMIT: int = int(os.getenv("LANGCHAIN_RETRY_LIMIT") or 3)


# Initialize settings instance
settings = Settings()
|
llm_module/message.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
from typing import Any

from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

from .model import gpt4o
from .base import build_output_fixing_parser, extraction_messages, generation_messages

logger = logging.getLogger("fastapi_cli")

# Build parser: a StrOutputParser wrapped so malformed output is re-asked via gpt4o.
parser = build_output_fixing_parser(parser=StrOutputParser(), llm=gpt4o)

# Extraction chain: company-analysis prompt -> gpt4o -> repaired string.
# Consumes a {scraped_message} input and produces the company summary.
extraction_prompt = ChatPromptTemplate.from_messages(extraction_messages)
extraction_chain = (
    extraction_prompt
    | gpt4o
    | parser
)

# Generation chain: sales-message prompt -> gpt4o -> repaired string.
# Consumes {summary_output} and {generic_message} inputs.
generation_prompt = ChatPromptTemplate.from_messages(generation_messages)
generation_chain = (
    generation_prompt
    | gpt4o
    | parser
)

# Define the LCEL-based pipeline.
# Input dict flows through unchanged; each .assign adds one key, so the final
# output contains the original input plus 'summary_output' and 'generated_message'.
chain = (
    RunnablePassthrough.assign(summary_output=extraction_chain)  # Runs extraction_chain and adds its output
    .assign(generated_message=generation_chain)  # Runs generation_chain using the extracted summary
)
|
llm_module/model.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Load and interact with LLM
from langchain_openai import ChatOpenAI
from pydantic import SecretStr

from .config import settings

# Primary model: deterministic (temperature=0) GPT-4o, 60 s request timeout.
gpt4o = ChatOpenAI(
    model="gpt-4o",
    api_key=SecretStr(settings.OPENAI_API_KEY),
    temperature=0,
    max_tokens=8192,
    timeout=60,
)

# Cheaper variant configured identically to gpt4o.
gpt4o_mini = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=SecretStr(settings.OPENAI_API_KEY),
    temperature=0,
    max_tokens=8192,
    timeout=60,
)

# Reasoning model; configured with temperature=1 and no max_tokens override.
# NOTE(review): presumably o1-mini requires its default temperature — confirm
# against the OpenAI model docs before changing.
o1_mini = ChatOpenAI(
    model="o1-mini",
    api_key=SecretStr(settings.OPENAI_API_KEY),
    temperature=1,
    timeout=60,
)
|
log/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .logger import logger
|
| 2 |
+
|
| 3 |
+
__all__ = ['logger']
|
log/logger.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging

# Configure the root logger once at import time: INFO level with a
# timestamp / logger-name / level / message layout.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
poetry.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tool.poetry]
|
| 2 |
+
name = "web-researcher"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Beta version for testing the feature of scraping company info and generating personalized messages."
|
| 5 |
+
authors = ["UjjTiw"]
|
| 6 |
+
license = "2WINS Inc."
|
| 7 |
+
readme = "README.md"
|
| 8 |
+
|
| 9 |
+
[tool.poetry.dependencies]
|
| 10 |
+
python = "^3.12"
|
| 11 |
+
langchain = "^0.3.18"
|
| 12 |
+
langchain-openai = "^0.3.4"
|
| 13 |
+
playwright = "^1.50.0"
|
| 14 |
+
uvicorn = "^0.34.0"
|
| 15 |
+
fastapi = "^0.115.8"
|
| 16 |
+
pydantic = "^2.10.6"
|
| 17 |
+
pydantic-settings = "^2.7.1"
|
| 18 |
+
gradio = "^5.15.0"
|
| 19 |
+
langchain-gemini = "^0.1.1"
|
| 20 |
+
huggingface-hub = "^0.28.1"
|
| 21 |
+
httpx = "^0.28.1"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
[build-system]
|
| 25 |
+
requires = ["poetry-core"]
|
| 26 |
+
build-backend = "poetry.core.masonry.api"
|
scraper/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .automate import multiple_links_scraping
|
| 2 |
+
|
| 3 |
+
__all__ = ['multiple_links_scraping']
|
scraper/automate.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
import asyncio
from playwright.async_api import async_playwright, Error, BrowserContext
from .navigator import navigate_and_scrape

logger = logging.getLogger("fastapi_cli")


async def link_scraping(context: BrowserContext, url: str):
    """Scrape a single URL in a fresh page of *context*.

    Args:
        context: Shared Playwright browser context.
        url: Absolute URL to load and scrape.

    Returns:
        The scraped data from navigate_and_scrape, or None on a Playwright error.
    """
    page = None
    try:
        page = await context.new_page()
        await page.goto(url, wait_until="domcontentloaded")
        return await navigate_and_scrape(page)
    except Error as e:
        logger.warning(f"Failed to scrape {url}: {e}")
        return None
    finally:
        # Close the page even when navigation/scraping raised, so failed URLs
        # do not leak open pages in the shared context.
        if page is not None:
            await page.close()


async def multiple_links_scraping(target_urls: list[str]):
    """Scrape multiple links concurrently.

    Results are positional (one per input URL): the scraped data, None on a
    handled Playwright error, or an Exception instance for unexpected failures
    (gather runs with return_exceptions=True).
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,  # Set headless=True for production
            args=["--no-sandbox", "--disable-setuid-sandbox"]
        )
        try:
            context = await browser.new_context()
            tasks = [link_scraping(context, url) for url in target_urls]
            results = await asyncio.gather(*tasks, return_exceptions=True)
        finally:
            await browser.close()  # Ensure browser closes even if setup fails
        return results  # Return scraped data for all links
|
scraper/config.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic_settings import BaseSettings
from dotenv import load_dotenv
import os

load_dotenv()


class Settings(BaseSettings):
    """Scraper settings sourced from the environment (.env)."""

    # Default timeout (ms) for Playwright actions. `or 7_000` treats both an
    # unset variable (None) and the empty string shipped in .env.example as
    # "use the default" instead of crashing int coercion on "".
    PLAYWRIGHT_DEFAULT_TIMEOUT: int = int(os.getenv("PLAYWRIGHT_DEFAULT_TIMEOUT") or 7_000)
    PLAYWRIGHT_WAIT_SECONDS_BEFORE_CLOSE: int = 3_600  # 1 hour


settings = Settings()  # type: ignore
|
scraper/locator.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import asyncio
|
| 3 |
+
from playwright.async_api import Locator, Page
|
| 4 |
+
|
| 5 |
+
logger = logging.getLogger('fastapi_cli')
|
| 6 |
+
|
| 7 |
+
# XPaths for navigation links
|
| 8 |
+
INFO_PAGE_XPATHS = [
|
| 9 |
+
"//a[contains(@href, 'about') or contains(@href, 'company') or contains(@href, 'info') or contains(translate(text(), 'ABOUT', 'about'), 'about')]",
|
| 10 |
+
"//a[contains(text(), '会社概要') or contains(text(), '企業情報')]"
|
| 11 |
+
]
|
| 12 |
+
|
| 13 |
+
HOME_PAGE_XPATHS = [
|
| 14 |
+
"//a[contains(@href, 'home') or contains(@href, 'index') or @href='/']",
|
| 15 |
+
"//a[contains(text(), 'ホーム') or contains(text(), 'HOME')]"
|
| 16 |
+
]
|
| 17 |
+
|
| 18 |
+
def normalize_url(url: str, base_url: str) -> str:
    """Normalize URL to absolute form.

    Empty inputs normalize to "". Absolute URLs keep their host but lose any
    trailing slash; relative paths are joined onto *base_url*.
    """
    if not url:
        return ""

    if url.startswith("http"):
        # Already absolute: just drop trailing slashes for stable comparisons.
        return url.rstrip("/")

    # Relative: join onto the base with exactly one slash between the parts.
    base = base_url.rstrip("/")
    return base + url if url.startswith("/") else f"{base}/{url}"
|
| 30 |
+
|
| 31 |
+
async def locate_info_page_links(page: Page) -> list[Locator]:
    """Locate all possible info/about page links in parallel.

    Runs every INFO_PAGE_XPATHS query concurrently, flattens the matches, and
    keeps only links whose href points at a real page (dropping fragment,
    javascript:, mailto: and tel: pseudo-links).
    """
    batches = await asyncio.gather(
        *(page.locator(xpath).all() for xpath in INFO_PAGE_XPATHS)
    )

    candidates: list[Locator] = []
    for batch in batches:
        candidates.extend(batch)

    usable: list[Locator] = []
    for candidate in candidates:
        href = await candidate.get_attribute("href")
        if href and not href.startswith(("#", "javascript:", "mailto:", "tel:")):
            usable.append(candidate)

    return usable
|
| 44 |
+
|
| 45 |
+
async def locate_home_page_link(page: Page) -> Locator | None:
    """Locate the home page link in parallel.

    Queries all HOME_PAGE_XPATHS concurrently and returns the first match
    whose href is a real page link, or None when nothing qualifies.
    """
    batches = await asyncio.gather(
        *(page.locator(xpath).all() for xpath in HOME_PAGE_XPATHS)
    )

    for batch in batches:
        for candidate in batch:
            href = await candidate.get_attribute("href")
            if href and not href.startswith(("#", "javascript:", "mailto:", "tel:")):
                return candidate

    return None
|
| 57 |
+
|
| 58 |
+
async def scrape_page_content(page: Page) -> dict[str, str]:
    """Scrape relevant textual content from the page in parallel.

    Collects URL, title, meta description/keywords, the first sizeable block
    of main content, and any company-info section; the three extraction
    helpers run concurrently and each mutates the shared `content` dict.

    Args:
        page (Page): The page to scrape (already navigated).

    Returns:
        dict[str, str]: Keys 'url', 'title', 'main_content', 'company_info',
        'meta_description', 'meta_keywords'; fields not found stay ''.
    """
    content = {
        'url': page.url,
        'title': await page.title(),
        'main_content': '',
        'company_info': '',
        'meta_description': '',
        'meta_keywords': ''
    }

    async def get_meta_data():
        """Fetch meta description and keywords concurrently."""
        try:
            meta_desc_task = page.locator('meta[name="description"]').get_attribute('content')
            meta_keywords_task = page.locator('meta[name="keywords"]').get_attribute('content')
            meta_desc, meta_keywords = await asyncio.gather(meta_desc_task, meta_keywords_task)
            # Missing meta tags yield None; normalize to empty strings.
            content['meta_description'] = meta_desc or ''
            content['meta_keywords'] = meta_keywords or ''
        except Exception as e:
            logger.warning(f'Failed to get meta description and keywords: {e}')

    async def get_main_content():
        """Fetch the main page content concurrently."""
        # Ordered from most to least specific; 'body' is the last-resort fallback.
        main_selectors = [
            "main", "article", ".main-content", "#main-content",
            ".content", "#content", "body"  # fallback
        ]
        for selector in main_selectors:
            try:
                elements = await page.locator(selector).all()
                tasks = [element.inner_text() for element in elements if await element.is_visible()]
                texts = await asyncio.gather(*tasks)
                # Require >100 chars so tiny boilerplate fragments are skipped.
                valid_texts = [text for text in texts if text and len(text) > 100]
                if valid_texts:
                    content['main_content'] = valid_texts[0]
                    break
            except Exception as e:
                logger.warning(f'Failed to scrape the main page content: {e}')

    async def get_company_info():
        """Fetch company information concurrently."""
        # CSS selectors plus Japanese headings (会社概要 / 企業情報).
        company_selectors = [
            ".company-info", "#company-info",
            "section:has-text('会社概要')", "div:has-text('企業情報')",
            "table:has-text('会社概要')", "table:has-text('企業情報')"
        ]
        for selector in company_selectors:
            try:
                elements = await page.locator(selector).all()
                tasks = [element.inner_text() for element in elements if await element.is_visible()]
                texts = await asyncio.gather(*tasks)
                valid_texts = [text for text in texts if text]
                if valid_texts:
                    content['company_info'] = valid_texts[0]
                    break
            except Exception as e:
                logger.warning(f'Failed to scrape company info: {e}')

    # Run all scraping tasks concurrently
    await asyncio.gather(get_meta_data(), get_main_content(), get_company_info())

    return content
|
scraper/navigator.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core web scraping logic
|
| 2 |
+
import asyncio
|
| 3 |
+
import logging
|
| 4 |
+
from playwright.async_api import Locator, Page
|
| 5 |
+
from urllib.parse import urlparse
|
| 6 |
+
from .locator import locate_home_page_link, locate_info_page_links, normalize_url, scrape_page_content
|
| 7 |
+
from .utils import navigate_to_link
|
| 8 |
+
from .config import settings
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger('fastapi_cli')
|
| 11 |
+
async def navigate_and_scrape(
    page: Page,
    *,
    verbose: bool = False,
) -> dict[str, dict[str, str]]:
    r"""
    Navigate to info and home pages and scrape their content.

    Args:
        page (Page): The page to navigate from.
        verbose (bool, optional): Whether to print verbose logs. Defaults to False.

    Returns:
        dict[str, dict[str, str]]: Scraped data under the keys 'info_page' and
        'home_page'; either key may be absent when that navigation failed.
    """
    original_url = page.url
    parsed_url = urlparse(original_url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    scraped_data = {}
    seen_urls = set()

    try:
        # First try info/about page
        info_links = await locate_info_page_links(page)
        if verbose:
            logger.info(f"Current URL: {original_url}")
            logger.info(
                f"Located the following info page links: {await asyncio.gather(*(link.get_attribute('href') for link in info_links))}"
            )

        for i, link in enumerate(info_links):
            try:
                href = await link.get_attribute("href", timeout=30000)
                absolute_url = normalize_url(href, base_url)

                if absolute_url in seen_urls:
                    continue

                seen_urls.add(absolute_url)
                if not await navigate_to_link(page, link):
                    logger.warning("Failed to navigate to the info page link.")
                    continue

                scraped_data['info_page'] = await scrape_page_content(page)
                break

            except Exception as e:
                logger.warning(f"Failed to navigate to info page: {e}")
                # BUG FIX: this reset previously sat after `continue` and was
                # unreachable. Return to the original page before trying the
                # next link so its locator still resolves against the page we
                # started from.
                if i < len(info_links) - 1:
                    logger.warning("Failed to scrape info page. Returning to the original page.")
                    await page.goto(original_url, wait_until="domcontentloaded")
                continue

        # Return to original page before trying home page
        await page.goto(original_url, wait_until="domcontentloaded")

        # Try home page
        home_link = await locate_home_page_link(page)
        if home_link:
            try:
                href = await home_link.get_attribute("href", timeout=30000)
                absolute_url = normalize_url(href, base_url)

                if absolute_url not in seen_urls:
                    seen_urls.add(absolute_url)
                    if not await navigate_to_link(page, home_link):
                        logger.warning("Failed to navigate to the home page link.")
                    else:
                        scraped_data['home_page'] = await scrape_page_content(page)

            except Exception as e:
                logger.warning(f"Failed to navigate to home page: {e}")

    finally:
        # Always return to original page
        await page.goto(original_url, wait_until="domcontentloaded")

        if verbose:
            logger.info(f"Scraped data: {scraped_data}")

    return scraped_data
|
scraper/utils.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from urllib.parse import urlparse
|
| 3 |
+
|
| 4 |
+
from playwright.async_api import Locator, Page
|
| 5 |
+
|
| 6 |
+
from .config import settings
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger('fastapi_cli')
|
| 9 |
+
|
| 10 |
+
async def navigate_to_link(page: Page, link: Locator) -> bool:
    r"""
    Navigate to the link.

    Tries a plain click first (for same-tab links); if the link targets a new
    tab or the click fails, falls back to resolving the href to an absolute
    URL and loading it in the current page.

    Args:
        page (Page): The page to navigate.
        link (Locator): The link to navigate to.

    Returns:
        bool: True if navigated, False otherwise.
    """

    navigated = False

    # If the link does not open in a new tab, click it.
    try:
        target = await link.get_attribute("target", timeout=settings.PLAYWRIGHT_DEFAULT_TIMEOUT)

        if target != "_blank":
            await link.click(timeout=settings.PLAYWRIGHT_DEFAULT_TIMEOUT)
            await page.wait_for_load_state("domcontentloaded")
            navigated = True

    except Exception as e:
        logger.warning(f"Failed to click the link: {e}")

    # If the link opens in a new tab, replace the current url with the link's url.
    # (Also acts as the fallback path when the click above failed.)
    if not navigated:
        try:
            parsed_url = urlparse(page.url)
            base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
            base_url = base_url[:-1] if base_url.endswith("/") else base_url

            href = await link.get_attribute("href", timeout=settings.PLAYWRIGHT_DEFAULT_TIMEOUT)
            if not href:
                logger.warning("No href attribute found while trying to navigate to the link.")
                return navigated

            # NOTE: e.g. /contact -> https://example.com/contact
            if href.startswith("/"):
                href = base_url + href

            # NOTE: e.g. otoiawase/ -> https://example.com/otoiawase/
            elif not href.startswith("http"):
                href = base_url + "/" + href

            # NOTE: e.g. https://example.com/contact
            await page.goto(href, wait_until="domcontentloaded")

            navigated = True

        except Exception as e:
            logger.warning(f"Failed to navigate to the link: {e}")

    return navigated
|
scripts/analyze_pages.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Analyze scraped pages
|
scripts/fetch_sample.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Script to run scraper on sample links
|
scripts/test_llm.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Test LLM interactions
|
server.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from api import researcher  # Ensure researcher.py has `api_router`


# FastAPI application entry point; interactive docs served at /docs.
app = FastAPI(docs_url='/docs')

# Enable CORS for frontend communication
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# maximally permissive — restrict origins before exposing this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include API routes from researcher module
app.include_router(researcher.api_router)

# Run with uvicorn when executed directly (matches the Dockerfile CMD).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
tests/test_customizer.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Unit tests for message customization
|
tests/test_routes.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# API route tests
|
tests/test_scraper.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Unit tests for scraper
|
tests/test_summarizer.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Unit tests for summarizer
|