Implement grounded web search and summarization with caching; integrate DB helpers

This commit updates app.py to perform real web searches using Tavily, fetch and extract article content, summarise each article via OpenAI, and cache results. It integrates db.py for resource caching, adds an improved search pipeline in the chat function, and refactors the conversation to avoid hallucinated resources. Also uploads the updated searcher.py and the new db.py.

- app.py +112 -74
- db.py +158 -0
- searcher.py +104 -1
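For review context, the new pieces compose roughly like this (a minimal sketch, not part of the commit; it assumes TAVILY_API_KEY and an OpenAI key are configured and that searcher.py and db.py are importable):

# sketch.py - illustrative only
from searcher import web_search, fetch_and_extract

results = web_search("vibe coding", max_results=3)
for item in results:
    record = fetch_and_extract(item["url"])  # fetched once, then served from the SQLite cache
    if record:
        print(record["title"], "->", record["excerpt"][:80])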
app.py
CHANGED

@@ -4,7 +4,10 @@ import openai
 
 from planner import plan_course
 from generators import generate_course_zip
-from searcher import …
+from searcher import web_search, fetch_and_extract, get_youtube_transcript
+
+# Bring in DB helpers to persist resources if needed later
+from db import get_resource, upsert_resource, list_resources, new_chat, append_message, load_chat, soft_delete_message
 
 # System prompt guiding the assistant's behaviour during brainstorming
 SYSTEM_PROMPT = (

@@ -29,40 +32,29 @@ def chat(user_message, chat_history, chat_pairs, sources, plan, resource_cache):
     messages = [{"role": "system", "content": SYSTEM_PROMPT}] + chat_history
     # Check if the user message contains a URL to open and read.
     url = None
-    # Simple heuristic: look for http/https links in the message
     for part in user_message.split():
         if part.startswith("http://") or part.startswith("https://"):
            url = part
            break
    if url:
-        # …
+        # If the message contains a URL, attempt to fetch and summarise it using our extraction helpers.
        try:
-            # Special handling for YouTube links: attempt to fetch transcript
+            # Detect YouTube links and fetch transcript
            if "youtube.com" in url or "youtu.be" in url:
                try:
                    transcript_text = get_youtube_transcript(url)
                except Exception:
                    transcript_text = ""
                page_content = transcript_text or ""
-            if …
-            …
-                page_content = extract_response.get("article", "")
-            elif extract_response.get("results"):
-                results_list = extract_response.get("results", [])
-                if isinstance(results_list, list):
-                    page_content = "\n".join([
-                        item.get("content", item.get("title", ""))
-                        for item in results_list
-                        if isinstance(item, dict)
-                    ])
+                page_title = url
+            else:
+                record = fetch_and_extract(url)
+                if record:
+                    page_content = record.get("excerpt", "")
+                    page_title = record.get("title", url)
+                else:
+                    page_content = ""
+                    page_title = url
            if not page_content:
                assistant_reply = "I couldn't extract content from that page."
            else:

@@ -75,7 +67,6 @@ def chat(user_message, chat_history, chat_pairs, sources, plan, resource_cache):
        if not api_key:
            raise ValueError("OPENAI_API_KEY or COURSECREATOR_API_KEY is not set")
        summary_system = "You are a helpful assistant. Summarize the given content in a concise and clear way."
-        # Truncate content to avoid exceeding token limits
        truncated_content = page_content[:8000]
        summary_messages = [
            {"role": "system", "content": summary_system},

@@ -130,64 +121,111 @@ def chat(user_message, chat_history, chat_pairs, sources, plan, resource_cache):
    # of calling the language model. This allows the assistant to fetch resources when
    # the user asks the agent to "search" or "search the internet".
    search_triggers = ["search", "internet search", "web search"]
-    lower_msg = user_message.lower()
+    lower_msg = user_message.lower().strip()
+    # Determine if a search should be performed
+    do_search = any(lower_msg.startswith(trig) for trig in search_triggers)
+    if do_search:
+        # Extract query after trigger word if present (e.g. "search vibe coding" -> "vibe coding")
+        # Otherwise use the full message minus the trigger
+        query = user_message
+        for trig in search_triggers:
+            if lower_msg.startswith(trig):
+                # Remove the trigger from the start of the query string
+                query = user_message[len(trig):].strip() or user_message
+                break
        try:
-            # …
-            query_key = …
+            # Use cached search results if available for this query key (case-insensitive)
+            query_key = query.lower()
            if query_key in resource_cache:
-                …
+                search_results = resource_cache[query_key]
            else:
-                resource_cache[query_key] = …
-            # …
-            # the list of search results. If so, extract that list. If it's a
-            # list already, use it directly. Otherwise, default to an empty list.
-            if isinstance(results, dict):
-                normalized_results = results.get("results", [])
-            elif isinstance(results, list):
-                normalized_results = results
-            else:
-                normalized_results = []
-            # Ensure the sources list is initialised
+                # Use our wrapped web_search for better domain filtering and consistent return type
+                search_results = web_search(query, max_results=5)
+                resource_cache[query_key] = search_results
+            # Iterate over search results, fetch their content, cache resources and summarise
+            summaries = []
            if sources is None:
                sources = []
-            for …
-                if isinstance(…
-            …
+            existing_urls = {src.get("url") for src in sources if isinstance(src, dict) and src.get("url")}
+            # For each result (should be a dict with 'url' and 'title')
+            for item in search_results:
+                if not isinstance(item, dict):
+                    continue
+                url = item.get("url")
+                title = item.get("title", url)
+                if not url or url in existing_urls:
+                    continue
+                # Fetch and cache resource content
+                record = fetch_and_extract(url)
+                if not record:
+                    # Skip if unable to fetch
+                    continue
+                # Add to sources for plan generation (avoid duplicates)
+                sources.append({"title": record.get("title", title), "url": record.get("url", url)})
+                existing_urls.add(url)
+                # Summarise the resource's excerpt using OpenAI
+                try:
+                    model = os.getenv("OPENAI_MODEL", "gpt-3.5-turbo")
+                    temperature = float(os.getenv("TEMPERATURE", "0.7"))
+                    max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "256"))
+                    api_key = os.getenv("OPENAI_API_KEY") or os.getenv("COURSECREATOR_API_KEY")
+                    if not api_key:
+                        raise ValueError("OPENAI_API_KEY or COURSECREATOR_API_KEY is not set")
+                    summary_system = "You are a helpful assistant. Summarize the following article excerpt in one paragraph."
+                    excerpt = record.get("excerpt", "")[:3000]
+                    summary_messages = [
+                        {"role": "system", "content": summary_system},
+                        {"role": "user", "content": excerpt},
+                    ]
+                    if hasattr(openai, "OpenAI"):
+                        client = openai.OpenAI(api_key=api_key)
+                        try:
+                            resp = client.chat.completions.create(
+                                model=model,
+                                messages=summary_messages,
+                                temperature=temperature,
+                                max_tokens=max_tokens,
+                            )
+                        except Exception:
+                            resp = client.chat.completions.create(
+                                model=model,
+                                messages=summary_messages,
+                                temperature=temperature,
+                                max_completion_tokens=max_tokens,
+                            )
+                        summary_text = resp.choices[0].message.content
+                    else:
+                        openai.api_key = api_key
+                        try:
+                            resp = openai.ChatCompletion.create(
+                                model=model,
+                                messages=summary_messages,
+                                temperature=temperature,
+                                max_tokens=max_tokens,
+                            )
+                        except Exception:
+                            resp = openai.ChatCompletion.create(
+                                model=model,
+                                messages=summary_messages,
+                                temperature=temperature,
+                                max_completion_tokens=max_tokens,
+                            )
+                        summary_text = resp["choices"][0]["message"]["content"]
+                except Exception as se:
+                    # If summarization fails, just include the title and URL without a summary
+                    summary_text = ""
+                # Compose summary line with link and summary
+                line = f"**{title}** ({url})"
+                if summary_text:
+                    line += f"\n{summary_text.strip()}"
+                summaries.append(line)
+            if summaries:
+                assistant_reply = "Here are some articles I found and summarised:\n\n" + "\n\n".join(summaries)
            else:
-                …
-                assistant_reply = "I've already shared the relevant resources from this search."
-            else:
-                assistant_reply = "I couldn't find any results for that query."
+                assistant_reply = "I couldn't fetch or summarise any credible articles for that query."
        except Exception as e:
            assistant_reply = (
-                "An error occurred during web search. Please ensure your …
+                "An error occurred during web search and summarisation. Please ensure your API keys are configured.\n"
                f"(Error: {e})"
            )
    else:
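The summarisation block above is written to tolerate two axes of API drift: it prefers the v1 client (openai.OpenAI) and falls back to the legacy openai.ChatCompletion module API, and within each it retries with max_completion_tokens when a model rejects max_tokens. Extracted as a standalone helper, the pattern looks roughly like this (a hypothetical refactor sketch, not part of the commit; assumes openai>=1.0):

import openai

def _chat_completion(client, **kwargs):
    # Try the older `max_tokens` argument first; some newer models only
    # accept `max_completion_tokens`, so retry with the renamed parameter.
    try:
        return client.chat.completions.create(**kwargs)
    except Exception:
        kwargs["max_completion_tokens"] = kwargs.pop("max_tokens")
        return client.chat.completions.create(**kwargs)

def summarise_excerpt(excerpt, api_key, model="gpt-3.5-turbo", max_tokens=256):
    client = openai.OpenAI(api_key=api_key)
    resp = _chat_completion(
        client,
        model=model,
        messages=[
            {"role": "system", "content": "Summarize the following article excerpt in one paragraph."},
            {"role": "user", "content": excerpt},
        ],
        max_tokens=max_tokens,
    )
    return resp.choices[0].message.content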
db.py
ADDED

@@ -0,0 +1,158 @@
+import os
+import sqlite3
+import json
+import time
+
+"""
+Simple SQLite helper for persisting resources and conversation messages.
+
+This module centralises all database access used by the Course Creator agent.
+It defines three tables:
+    resources (id, url, title, source, published_at, retrieved_at, content_excerpt, meta_json)
+    chats     (id, chat_key, title, created_at)
+    messages  (id, chat_key, role, content, status, created_at)
+
+Resources are de-duplicated by URL. Chats are keyed by a unique string
+(UUID-like) generated externally. Messages are stored in the order received
+and may be soft-deleted by updating their status column.
+"""
+
+# Determine database path. Use environment override or default to a local data dir.
+DB_PATH = os.getenv("COURSECREATOR_DB", os.path.join(os.path.dirname(__file__), "data", "course_creator.db"))
+
+def _ensure_db():
+    """Initialise the SQLite database with the required tables if they don't exist."""
+    os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
+    conn = sqlite3.connect(DB_PATH)
+    conn.execute("PRAGMA journal_mode=WAL;")
+    conn.executescript(
+        """
+        CREATE TABLE IF NOT EXISTS resources (
+            id INTEGER PRIMARY KEY,
+            url TEXT UNIQUE,
+            title TEXT,
+            source TEXT,
+            published_at TEXT,
+            retrieved_at INTEGER,
+            content_excerpt TEXT,
+            meta_json TEXT
+        );
+        CREATE TABLE IF NOT EXISTS chats (
+            id INTEGER PRIMARY KEY,
+            chat_key TEXT UNIQUE,
+            title TEXT,
+            created_at INTEGER
+        );
+        CREATE TABLE IF NOT EXISTS messages (
+            id INTEGER PRIMARY KEY,
+            chat_key TEXT,
+            role TEXT,
+            content TEXT,
+            status TEXT DEFAULT 'normal',
+            created_at INTEGER,
+            FOREIGN KEY(chat_key) REFERENCES chats(chat_key)
+        );
+        """
+    )
+    conn.commit()
+    conn.close()
+
+def get_conn():
+    """Return a connection with WAL mode enabled and ensure tables exist."""
+    _ensure_db()
+    conn = sqlite3.connect(DB_PATH)
+    conn.execute("PRAGMA journal_mode=WAL;")
+    return conn
+
+def upsert_resource(url: str, title: str, source: str, content_excerpt: str, meta: dict | None = None) -> None:
+    """Insert or update a resource record based on its URL.
+
+    Args:
+        url: The canonical URL of the resource.
+        title: Title or headline.
+        source: Domain or source label.
+        content_excerpt: A short excerpt of the page content.
+        meta: Optional dictionary of additional metadata.
+    """
+    now = int(time.time())
+    meta_json = json.dumps(meta or {})
+    with get_conn() as conn:
+        conn.execute(
+            """
+            INSERT INTO resources (url, title, source, retrieved_at, content_excerpt, meta_json)
+            VALUES (?, ?, ?, ?, ?, ?)
+            ON CONFLICT(url) DO UPDATE SET
+                title = excluded.title,
+                source = excluded.source,
+                retrieved_at = excluded.retrieved_at,
+                content_excerpt = excluded.content_excerpt,
+                meta_json = excluded.meta_json
+            """,
+            (url, title, source, now, content_excerpt, meta_json),
+        )
+
+def get_resource(url: str) -> dict | None:
+    """Retrieve a resource by URL, returning a dictionary or None."""
+    with get_conn() as conn:
+        row = conn.execute(
+            "SELECT url, title, source, published_at, retrieved_at, content_excerpt, meta_json FROM resources WHERE url=?",
+            (url,),
+        ).fetchone()
+        if not row:
+            return None
+        url, title, source, published_at, retrieved_at, content_excerpt, meta_json = row
+        meta = json.loads(meta_json or "{}")
+        return {
+            "url": url,
+            "title": title,
+            "source": source,
+            "published_at": published_at,
+            "retrieved_at": retrieved_at,
+            "excerpt": content_excerpt,
+            "meta": meta,
+        }
+
+def list_resources(limit: int = 200) -> list[dict]:
+    """List recently retrieved resources."""
+    with get_conn() as conn:
+        rows = conn.execute(
+            "SELECT url, title, source, retrieved_at FROM resources ORDER BY retrieved_at DESC LIMIT ?",
+            (limit,),
+        ).fetchall()
+        return [{"url": url, "title": title, "source": source, "retrieved_at": retrieved_at} for url, title, source, retrieved_at in rows]
+
+def new_chat(title: str = "Untitled") -> str:
+    """Create a new chat and return its key."""
+    import uuid
+    chat_key = str(uuid.uuid4())
+    now = int(time.time())
+    with get_conn() as conn:
+        conn.execute("INSERT INTO chats (chat_key, title, created_at) VALUES (?, ?, ?)", (chat_key, title, now))
+    return chat_key
+
+def append_message(chat_key: str, role: str, content: str, status: str = "normal") -> None:
+    """Append a message to a chat."""
+    now = int(time.time())
+    with get_conn() as conn:
+        conn.execute(
+            "INSERT INTO messages (chat_key, role, content, status, created_at) VALUES (?, ?, ?, ?, ?)",
+            (chat_key, role, content, status, now),
+        )
+
+def load_chat(chat_key: str) -> list[dict]:
+    """Load all non-deleted messages for a chat key."""
+    with get_conn() as conn:
+        rows = conn.execute(
+            "SELECT rowid, role, content, status FROM messages WHERE chat_key=? ORDER BY id ASC",
+            (chat_key,),
+        ).fetchall()
+    messages = []
+    for rowid, role, content, status in rows:
+        if status != "deleted":
+            messages.append({"id": rowid, "role": role, "content": content})
+    return messages
+
+def soft_delete_message(message_id: int) -> None:
+    """Mark a message as deleted without removing it."""
+    with get_conn() as conn:
+        conn.execute("UPDATE messages SET status='deleted' WHERE id=?", (message_id,))
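The helpers above are self-initialising: the first call creates the data/ directory and the three tables. A quick usage sketch (illustrative values, not part of the commit):

from db import new_chat, append_message, load_chat, upsert_resource, get_resource

chat_key = new_chat("Vibe coding course")
append_message(chat_key, "user", "search vibe coding")
append_message(chat_key, "assistant", "Here are some articles I found...")
print(load_chat(chat_key))  # [{'id': 1, 'role': 'user', ...}, {'id': 2, ...}]

upsert_resource("https://example.com/post", "A post", "example.com", "first 2000 chars of text...")
print(get_resource("https://example.com/post")["excerpt"])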
searcher.py
CHANGED

@@ -2,7 +2,20 @@ import os
 
 
 def run_web_search(query, num_results=5, domain_filter=""):
-    """…
+    """
+    Run a web search using Tavily API.
+
+    Args:
+        query (str): Search query.
+        num_results (int): Number of results to retrieve.
+        domain_filter (str): Optional domain filter (comma-separated domains).
+
+    Returns:
+        list[dict] | dict: Tavily response. It may return a list directly or a dict with a "results" key.
+
+    Raises:
+        ValueError: If the TAVILY_API_KEY env var is not set.
+    """
    try:
        from tavily import TavilyClient
    except ImportError:

@@ -18,6 +31,96 @@ def run_web_search(query, num_results=5, domain_filter=""):
    results = client.search(query, **params)
    return results
 
+# ---------------------------------------------------------------------------
+# Extended helper functions for credible research and extraction.
+# ---------------------------------------------------------------------------
+
+import re
+from typing import List, Dict, Optional
+
+# Import DB helpers from sibling module. Note: db.py resides in the same package directory.
+from db import get_resource, upsert_resource
+
+def web_search(query: str, max_results: int = 5, allowed_domains: Optional[List[str]] = None) -> List[Dict]:
+    """
+    Perform a web search and return a list of result dictionaries, filtering by allowed domains.
+
+    Args:
+        query: Search string.
+        max_results: Maximum number of results to return.
+        allowed_domains: Optional list of domains to permit. If provided, only results with URLs
+            containing one of these domains will be included.
+
+    Returns:
+        A list of search results (dicts with at least 'url' and 'title' keys).
+    """
+    raw_results = run_web_search(query, num_results=max_results)
+    # Tavily can return either a list or a dict with 'results'
+    results_list = raw_results.get("results", []) if isinstance(raw_results, dict) else raw_results or []
+    # Filter out results that do not meet allowed domains, if specified
+    filtered: List[Dict] = []
+    for item in results_list:
+        if not isinstance(item, dict):
+            continue
+        url = item.get("url", "")
+        # Basic domain filtering: allow if allowed_domains is None or URL's domain ends with allowed domain
+        if allowed_domains:
+            try:
+                from urllib.parse import urlparse
+                domain = urlparse(url).netloc.lower()
+                if not any(domain.endswith(ad.lower()) for ad in allowed_domains):
+                    continue
+            except Exception:
+                continue
+        filtered.append(item)
+        if len(filtered) >= max_results:
+            break
+    return filtered
+
+def fetch_and_extract(url: str, timeout: int = 15) -> Optional[Dict]:
+    """
+    Fetch a web page and extract its main textual content. Caches results in the database.
+
+    Args:
+        url: The URL to fetch.
+        timeout: HTTP timeout in seconds.
+
+    Returns:
+        A dictionary with keys: url, title, source, excerpt, meta, or None on failure.
+    """
+    # Return cached record if present
+    cached = get_resource(url)
+    if cached:
+        return cached
+    # Attempt to fetch page
+    try:
+        import requests
+        from bs4 import BeautifulSoup
+    except ImportError:
+        raise ImportError("Please install requests and beautifulsoup4")
+    try:
+        resp = requests.get(url, timeout=timeout, headers={"User-Agent": "CourseCreatorBot/1.0"})
+        resp.raise_for_status()
+    except Exception:
+        return None
+    # Parse HTML
+    soup = BeautifulSoup(resp.text, "html.parser")
+    # Title: fall back to URL if missing
+    title = (soup.title.string.strip() if soup.title and soup.title.string else url)[:200]
+    # Extract paragraphs
+    paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p") if p.get_text(strip=True)]
+    content_text = "\n".join(paragraphs)
+    excerpt = content_text[:2000]
+    # Domain as source
+    try:
+        from urllib.parse import urlparse
+        domain = urlparse(url).netloc
+    except Exception:
+        domain = ""
+    # Store in DB
+    upsert_resource(url, title, domain, excerpt, meta={"length": len(content_text)})
+    return get_resource(url)
+
 # New function to extract content from a given URL using Tavily Extract API.
 def extract_web_content(url):
     """Extract the main content of a web page via Tavily Extract.
|