# course-creator / searcher.py
import os
def run_web_search(query, num_results=5, domain_filter=""):
"""
Run a web search using Tavily API.
Args:
query (str): Search query.
num_results (int): Number of results to retrieve.
domain_filter (str): Optional domain filter (comma-separated domains).
Returns:
list[dict] | dict: Tavily response. It may return a list directly or a dict with a "results" key.
    Raises:
        ImportError: If the tavily-python package is missing.
        ValueError: If the TAVILY_API_KEY env var is not set.
"""
try:
from tavily import TavilyClient
except ImportError:
raise ImportError("Please install tavily-python")
api_key = os.getenv("TAVILY_API_KEY")
if not api_key:
raise ValueError("TAVILY_API_KEY environment variable is required")
client = TavilyClient(api_key=api_key)
    params = {"max_results": num_results}
    if domain_filter:
        # Tavily supports domain restriction natively via include_domains
        params["include_domains"] = [d.strip() for d in domain_filter.split(",") if d.strip()]
    results = client.search(query, **params)
return results
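
# Example (sketch): a filtered search through run_web_search. Assumes TAVILY_API_KEY
# is set in the environment; the query and domains below are illustrative only.
#
#     raw = run_web_search("spaced repetition research", num_results=3,
#                          domain_filter="nature.com,arxiv.org")
#     hits = raw.get("results", []) if isinstance(raw, dict) else raw or []
#     for hit in hits:
#         print(hit.get("title"), hit.get("url"))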
# ---------------------------------------------------------------------------
# Extended helper functions for credible research and extraction.
# ---------------------------------------------------------------------------
import io  # in-memory streams for PDF extraction
from typing import Dict, List, Optional
from urllib.parse import parse_qs, urlparse
try:
from PyPDF2 import PdfReader # type: ignore
except ImportError:
    # PyPDF2 is listed in requirements; if it is missing, PDF extraction is disabled
PdfReader = None
# Import DB helpers from sibling module. Note: db.py resides in the same package directory.
from db import get_resource, upsert_resource
def web_search(query: str, max_results: int = 5, allowed_domains: Optional[List[str]] = None) -> List[Dict]:
"""
Perform a web search and return a list of result dictionaries, filtering by allowed domains.
Args:
query: Search string.
max_results: Maximum number of results to return.
        allowed_domains: Optional list of domains to permit. If provided, only results whose
            URL host ends with one of these domains are included.
Returns:
A list of search results (dicts with at least 'url' and 'title' keys).
"""
raw_results = run_web_search(query, num_results=max_results)
# Tavily can return either a list or a dict with 'results'
results_list = raw_results.get("results", []) if isinstance(raw_results, dict) else raw_results or []
# Filter out results that do not meet allowed domains, if specified
filtered: List[Dict] = []
for item in results_list:
if not isinstance(item, dict):
continue
url = item.get("url", "")
# Basic domain filtering: allow if allowed_domains is None or URL's domain ends with allowed domain
        if allowed_domains:
            try:
                domain = urlparse(url).netloc.lower()
                if not any(domain.endswith(ad.lower()) for ad in allowed_domains):
                    continue
            except Exception:
                continue
filtered.append(item)
if len(filtered) >= max_results:
break
return filtered
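
# Example (sketch): restrict results to a few credible hosts. The query and domains
# are placeholders; web_search keeps only results whose host ends with one of them.
#
#     results = web_search("bloom taxonomy lesson design", max_results=5,
#                          allowed_domains=["wikipedia.org", "edutopia.org"])
#     urls = [r["url"] for r in results]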
def fetch_and_extract(url: str, timeout: int = 15) -> Optional[Dict]:
"""
Fetch a web page and extract its main textual content. Caches results in the database.
Args:
url: The URL to fetch.
timeout: HTTP timeout in seconds.
Returns:
A dictionary with keys: url, title, source, excerpt, meta, or None on failure.
"""
# Return cached record if present
cached = get_resource(url)
if cached:
return cached
# Attempt to fetch page
try:
import requests
from bs4 import BeautifulSoup
except ImportError:
raise ImportError("Please install requests and beautifulsoup4")
try:
resp = requests.get(url, timeout=timeout, headers={"User-Agent": "CourseCreatorBot/1.0"})
resp.raise_for_status()
except Exception:
return None
    # If the response is a PDF (by content type or URL), extract text with PyPDF2
    content_type = resp.headers.get("Content-Type", "").lower()
    is_pdf = content_type.startswith("application/pdf") or url.lower().endswith(".pdf")
    if is_pdf and PdfReader is None:
        # Without PyPDF2 there is no sensible fallback: parsing PDF bytes as HTML yields garbage
        return None
    if is_pdf:
try:
# Read PDF content
pdf_stream = io.BytesIO(resp.content)
reader = PdfReader(pdf_stream)
all_text = ""
for page in reader.pages:
try:
text = page.extract_text() or ""
except Exception:
text = ""
all_text += text + "\n"
if not all_text.strip():
return None
excerpt = all_text[:2000]
# Use the URL as the title for PDFs
title = url
            # Determine the domain for the source field
            domain = urlparse(url).netloc
upsert_resource(url, title, domain, excerpt, meta={"length": len(all_text), "pdf": True})
return get_resource(url)
        except Exception:
            # PDF extraction failed; bail out rather than feeding raw PDF bytes
            # to the HTML parser below
            return None
# Parse HTML
soup = BeautifulSoup(resp.text, "html.parser")
# Title: fall back to URL if missing
title = (soup.title.string.strip() if soup.title and soup.title.string else url)[:200]
# Extract paragraphs
paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p") if p.get_text(strip=True)]
content_text = "\n".join(paragraphs)
excerpt = content_text[:2000]
# Domain as source
try:
from urllib.parse import urlparse
domain = urlparse(url).netloc
except Exception:
domain = ""
# Store in DB
upsert_resource(url, title, domain, excerpt, meta={"length": len(content_text)})
return get_resource(url)
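
# Example (sketch): fetch a page once, then hit the cache on repeat calls. The URL is
# illustrative; the returned dict mirrors what upsert_resource stored (url, title,
# source, excerpt, meta), or None if the fetch or extraction failed.
#
#     record = fetch_and_extract("https://example.com/article")
#     if record:
#         print(record["title"], record["excerpt"][:100])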
# New function to extract content from a given URL using Tavily Extract API.
def extract_web_content(url):
"""Extract the main content of a web page via Tavily Extract.
Args:
url (str): The URL of the page to extract.
Returns:
dict: The Tavily extract response containing page content and metadata.
Raises:
ImportError: If the tavily-python package is missing.
ValueError: If the TAVILY_API_KEY environment variable is not set.
"""
try:
from tavily import TavilyClient
except ImportError:
raise ImportError("Please install tavily-python")
api_key = os.getenv("TAVILY_API_KEY")
if not api_key:
raise ValueError("TAVILY_API_KEY environment variable is required")
client = TavilyClient(api_key=api_key)
# Call the extract endpoint to retrieve structured content from the URL
response = client.extract(url)
return response
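
# Example (sketch): inspecting a Tavily Extract response. In current tavily-python the
# extracted pages sit under a "results" key, each with "url" and "raw_content"; treat
# that shape as an assumption and verify it against the response you actually get.
#
#     data = extract_web_content("https://example.com/article")
#     for page in data.get("results", []):
#         print(page.get("url"), len(page.get("raw_content") or ""))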
# New function to get a YouTube video transcript given its URL
def get_youtube_transcript(video_url):
"""Fetch the transcript of a YouTube video using youtube-transcript-api.
Args:
video_url (str): The full URL to a YouTube video.
Returns:
str: The concatenated transcript text, or an empty string if none found.
Raises:
ImportError: If youtube-transcript-api is not installed.
"""
    try:
        from youtube_transcript_api import YouTubeTranscriptApi
    except ImportError:
        raise ImportError("Please install youtube-transcript-api for YouTube transcript extraction")
    # Parse the video ID from the URL
parsed = urlparse(video_url)
video_id = None
if "youtube.com" in parsed.netloc:
# Extract v parameter
query = parse_qs(parsed.query)
video_id = query.get("v", [None])[0]
elif "youtu.be" in parsed.netloc:
# Shortened link; path contains the ID
video_id = parsed.path.strip("/")
if not video_id:
return ""
try:
transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
except Exception:
return ""
# Concatenate all transcript segments into a single string
transcript_text = " ".join(seg.get("text", "") for seg in transcript_list)
return transcript_text
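
if __name__ == "__main__":
    # Minimal smoke test (sketch): the URL below is a placeholder; an empty string
    # means the ID could not be parsed or no transcript was available.
    sample = get_youtube_transcript("https://www.youtube.com/watch?v=VIDEO_ID")
    print(f"transcript length: {len(sample)} characters")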