Prof-Reza committed on
Commit
813f3c0
·
verified ·
1 Parent(s): a24abb9

Implement grounded web search and summarization with caching; integrate DB helpers

Browse files

This commit updates app.py to perform real web searches using Tavily, fetch and extract article content, summarise each article via OpenAI, and cache results. It integrates db.py for resource caching, adds improved search pipeline in the chat function, and refactors the conversation to avoid hallucinated resources. Also uploads updated searcher.py and db.py.

Files changed (3) hide show
  1. app.py +112 -74
  2. db.py +158 -0
  3. searcher.py +104 -1
app.py CHANGED
@@ -4,7 +4,10 @@ import openai
4
 
5
  from planner import plan_course
6
  from generators import generate_course_zip
7
- from searcher import run_web_search, extract_web_content, get_youtube_transcript
 
 
 
8
 
9
  # System prompt guiding the assistant's behaviour during brainstorming
10
  SYSTEM_PROMPT = (
@@ -29,40 +32,29 @@ def chat(user_message, chat_history, chat_pairs, sources, plan, resource_cache):
29
  messages = [{"role": "system", "content": SYSTEM_PROMPT}] + chat_history
30
  # Check if the user message contains a URL to open and read.
31
  url = None
32
- # Simple heuristic: look for http/https links in the message
33
  for part in user_message.split():
34
  if part.startswith("http://") or part.startswith("https://"):
35
  url = part
36
  break
37
  if url:
38
- # User is asking to open/read a specific page or YouTube video
39
  try:
40
- page_content = ""
41
- # Special handling for YouTube links: attempt to fetch transcript
42
  if "youtube.com" in url or "youtu.be" in url:
43
  try:
44
  transcript_text = get_youtube_transcript(url)
45
  except Exception:
46
  transcript_text = ""
47
  page_content = transcript_text or ""
48
- # For non-YouTube links or fallback if transcript empty, use Tavily extract
49
- if not page_content:
50
- extract_response = extract_web_content(url)
51
- if isinstance(extract_response, dict):
52
- if extract_response.get("content"):
53
- page_content = extract_response.get("content", "")
54
- elif extract_response.get("text"):
55
- page_content = extract_response.get("text", "")
56
- elif extract_response.get("article"):
57
- page_content = extract_response.get("article", "")
58
- elif extract_response.get("results"):
59
- results_list = extract_response.get("results", [])
60
- if isinstance(results_list, list):
61
- page_content = "\n".join([
62
- item.get("content", item.get("title", ""))
63
- for item in results_list
64
- if isinstance(item, dict)
65
- ])
66
  if not page_content:
67
  assistant_reply = "I couldn't extract content from that page."
68
  else:
@@ -75,7 +67,6 @@ def chat(user_message, chat_history, chat_pairs, sources, plan, resource_cache):
75
  if not api_key:
76
  raise ValueError("OPENAI_API_KEY or COURSECREATOR_API_KEY is not set")
77
  summary_system = "You are a helpful assistant. Summarize the given content in a concise and clear way."
78
- # Truncate content to avoid exceeding token limits
79
  truncated_content = page_content[:8000]
80
  summary_messages = [
81
  {"role": "system", "content": summary_system},
@@ -130,64 +121,111 @@ def chat(user_message, chat_history, chat_pairs, sources, plan, resource_cache):
130
  # of calling the language model. This allows the assistant to fetch resources when
131
  # the user asks the agent to "search" or "search the internet".
132
  search_triggers = ["search", "internet search", "web search"]
133
- lower_msg = user_message.lower()
134
- if any(trig in lower_msg for trig in search_triggers):
 
 
 
 
 
 
 
 
 
 
135
  try:
136
- # Perform web search using the entire user message as the query. Use cached results if available
137
- query_key = user_message.strip().lower()
138
  if query_key in resource_cache:
139
- results = resource_cache[query_key]
140
  else:
141
- results = run_web_search(user_message, num_results=5, domain_filter="")
142
- # store results in cache for future queries
143
- resource_cache[query_key] = results
144
- # Normalize results:
145
- # Tavily may return a dictionary with a "results" key containing
146
- # the list of search results. If so, extract that list. If it's a
147
- # list already, use it directly. Otherwise, default to an empty list.
148
- if isinstance(results, dict):
149
- normalized_results = results.get("results", [])
150
- elif isinstance(results, list):
151
- normalized_results = results
152
- else:
153
- normalized_results = []
154
- # Ensure the sources list is initialised
155
  if sources is None:
156
  sources = []
157
- # Filter out duplicate URLs already in sources
158
- existing_urls = set()
159
- for src in sources:
160
- if isinstance(src, dict):
161
- url = src.get("url")
162
- if url:
163
- existing_urls.add(url)
164
- new_results = []
165
- for r in normalized_results:
166
- if isinstance(r, dict):
167
- url = r.get("url")
168
- if url and url not in existing_urls:
169
- new_results.append(r)
170
- existing_urls.add(url)
171
- sources.extend(new_results)
172
- # Summarise results into a simple string with title and URL
173
- summary_lines = []
174
- for r in new_results:
175
- # Defensive: ensure r is a dict
176
- if isinstance(r, dict):
177
- title = r.get("title", "")
178
- url = r.get("url", "")
179
- if title or url:
180
- summary_lines.append(f"{title} - {url}")
181
- if summary_lines:
182
- assistant_reply = "Here are some resources I found:\n" + "\n".join(summary_lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  else:
184
- if normalized_results:
185
- assistant_reply = "I've already shared the relevant resources from this search."
186
- else:
187
- assistant_reply = "I couldn't find any results for that query."
188
  except Exception as e:
189
  assistant_reply = (
190
- "An error occurred during web search. Please ensure your search API key is configured.\n"
191
  f"(Error: {e})"
192
  )
193
  else:
 
4
 
5
  from planner import plan_course
6
  from generators import generate_course_zip
7
+ from searcher import web_search, fetch_and_extract, get_youtube_transcript
8
+
9
+ # Bring in DB helpers to persist resources if needed later
10
+ from db import get_resource, upsert_resource, list_resources, new_chat, append_message, load_chat, soft_delete_message
11
 
12
  # System prompt guiding the assistant's behaviour during brainstorming
13
  SYSTEM_PROMPT = (
 
32
  messages = [{"role": "system", "content": SYSTEM_PROMPT}] + chat_history
33
  # Check if the user message contains a URL to open and read.
34
  url = None
 
35
  for part in user_message.split():
36
  if part.startswith("http://") or part.startswith("https://"):
37
  url = part
38
  break
39
  if url:
40
+ # If the message contains a URL, attempt to fetch and summarise it using our extraction helpers.
41
  try:
42
+ # Detect YouTube links and fetch transcript
 
43
  if "youtube.com" in url or "youtu.be" in url:
44
  try:
45
  transcript_text = get_youtube_transcript(url)
46
  except Exception:
47
  transcript_text = ""
48
  page_content = transcript_text or ""
49
+ page_title = url
50
+ else:
51
+ record = fetch_and_extract(url)
52
+ if record:
53
+ page_content = record.get("excerpt", "")
54
+ page_title = record.get("title", url)
55
+ else:
56
+ page_content = ""
57
+ page_title = url
 
 
 
 
 
 
 
 
 
58
  if not page_content:
59
  assistant_reply = "I couldn't extract content from that page."
60
  else:
 
67
  if not api_key:
68
  raise ValueError("OPENAI_API_KEY or COURSECREATOR_API_KEY is not set")
69
  summary_system = "You are a helpful assistant. Summarize the given content in a concise and clear way."
 
70
  truncated_content = page_content[:8000]
71
  summary_messages = [
72
  {"role": "system", "content": summary_system},
 
121
  # of calling the language model. This allows the assistant to fetch resources when
122
  # the user asks the agent to "search" or "search the internet".
123
  search_triggers = ["search", "internet search", "web search"]
124
+ lower_msg = user_message.lower().strip()
125
+ # Determine if a search should be performed
126
+ do_search = any(lower_msg.startswith(trig) for trig in search_triggers)
127
+ if do_search:
128
+ # Extract query after trigger word if present (e.g. "search vibe coding" -> "vibe coding")
129
+ # Otherwise use the full message minus the trigger
130
+ query = user_message
131
+ for trig in search_triggers:
132
+ if lower_msg.startswith(trig):
133
+ # Remove the trigger from the start of the query string
134
+ query = user_message[len(trig):].strip() or user_message
135
+ break
136
  try:
137
+ # Use cached search results if available for this query key (case-insensitive)
138
+ query_key = query.lower()
139
  if query_key in resource_cache:
140
+ search_results = resource_cache[query_key]
141
  else:
142
+ # Use our wrapped web_search for better domain filtering and consistent return type
143
+ search_results = web_search(query, max_results=5)
144
+ resource_cache[query_key] = search_results
145
+ # Iterate over search results, fetch their content, cache resources and summarise
146
+ summaries = []
 
 
 
 
 
 
 
 
 
147
  if sources is None:
148
  sources = []
149
+ existing_urls = {src.get("url") for src in sources if isinstance(src, dict) and src.get("url")}
150
+ # For each result (should be a dict with 'url' and 'title')
151
+ for item in search_results:
152
+ if not isinstance(item, dict):
153
+ continue
154
+ url = item.get("url")
155
+ title = item.get("title", url)
156
+ if not url or url in existing_urls:
157
+ continue
158
+ # Fetch and cache resource content
159
+ record = fetch_and_extract(url)
160
+ if not record:
161
+ # Skip if unable to fetch
162
+ continue
163
+ # Add to sources for plan generation (avoid duplicates)
164
+ sources.append({"title": record.get("title", title), "url": record.get("url", url)})
165
+ existing_urls.add(url)
166
+ # Summarise the resource's excerpt using OpenAI
167
+ try:
168
+ model = os.getenv("OPENAI_MODEL", "gpt-3.5-turbo")
169
+ temperature = float(os.getenv("TEMPERATURE", "0.7"))
170
+ max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "256"))
171
+ api_key = os.getenv("OPENAI_API_KEY") or os.getenv("COURSECREATOR_API_KEY")
172
+ if not api_key:
173
+ raise ValueError("OPENAI_API_KEY or COURSECREATOR_API_KEY is not set")
174
+ summary_system = "You are a helpful assistant. Summarize the following article excerpt in one paragraph."
175
+ excerpt = record.get("excerpt", "")[:3000]
176
+ summary_messages = [
177
+ {"role": "system", "content": summary_system},
178
+ {"role": "user", "content": excerpt},
179
+ ]
180
+ if hasattr(openai, "OpenAI"):
181
+ client = openai.OpenAI(api_key=api_key)
182
+ try:
183
+ resp = client.chat.completions.create(
184
+ model=model,
185
+ messages=summary_messages,
186
+ temperature=temperature,
187
+ max_tokens=max_tokens,
188
+ )
189
+ except Exception:
190
+ resp = client.chat.completions.create(
191
+ model=model,
192
+ messages=summary_messages,
193
+ temperature=temperature,
194
+ max_completion_tokens=max_tokens,
195
+ )
196
+ summary_text = resp.choices[0].message.content
197
+ else:
198
+ openai.api_key = api_key
199
+ try:
200
+ resp = openai.ChatCompletion.create(
201
+ model=model,
202
+ messages=summary_messages,
203
+ temperature=temperature,
204
+ max_tokens=max_tokens,
205
+ )
206
+ except Exception:
207
+ resp = openai.ChatCompletion.create(
208
+ model=model,
209
+ messages=summary_messages,
210
+ temperature=temperature,
211
+ max_completion_tokens=max_tokens,
212
+ )
213
+ summary_text = resp["choices"][0]["message"]["content"]
214
+ except Exception as se:
215
+ # If summarization fails, just include the title and URL without a summary
216
+ summary_text = ""
217
+ # Compose summary line with link and summary
218
+ line = f"**{title}** ({url})"
219
+ if summary_text:
220
+ line += f"\n{summary_text.strip()}"
221
+ summaries.append(line)
222
+ if summaries:
223
+ assistant_reply = "Here are some articles I found and summarised:\n\n" + "\n\n".join(summaries)
224
  else:
225
+ assistant_reply = "I couldn't fetch or summarise any credible articles for that query."
 
 
 
226
  except Exception as e:
227
  assistant_reply = (
228
+ "An error occurred during web search and summarisation. Please ensure your API keys are configured.\n"
229
  f"(Error: {e})"
230
  )
231
  else:
db.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sqlite3
3
+ import json
4
+ import time
5
+
6
+ """
7
+ Simple SQLite helper for persisting resources and conversation messages.
8
+
9
+ This module centralises all database access used by the Course Creator agent.
10
+ It defines three tables:
11
+ resources (id, url, title, source, published_at, retrieved_at, content_excerpt, meta_json)
12
+ chats (id, chat_key, title, created_at)
13
+ messages (id, chat_key, role, content, status, created_at)
14
+
15
+ Resources are de-duplicated by URL. Chats are keyed by a unique string
16
+ (UUID-like) generated externally. Messages are stored in the order received
17
+ and may be soft-deleted by updating their status column.
18
+ """
19
+
20
+ # Determine database path. Use environment override or default to a local data dir.
21
+ DB_PATH = os.getenv("COURSECREATOR_DB", os.path.join(os.path.dirname(__file__), "data", "course_creator.db"))
22
+
23
def _ensure_db():
    """Initialise the SQLite database with the required tables if they don't exist.

    Creates the parent directory of ``DB_PATH`` on demand, switches the
    database to WAL journal mode, and creates the ``resources``, ``chats``
    and ``messages`` tables. The connection is always closed, even when the
    schema script fails (the original leaked it on error).
    """
    os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute("PRAGMA journal_mode=WAL;")
        conn.executescript(
            """
            CREATE TABLE IF NOT EXISTS resources (
                id INTEGER PRIMARY KEY,
                url TEXT UNIQUE,
                title TEXT,
                source TEXT,
                published_at TEXT,
                retrieved_at INTEGER,
                content_excerpt TEXT,
                meta_json TEXT
            );
            CREATE TABLE IF NOT EXISTS chats (
                id INTEGER PRIMARY KEY,
                chat_key TEXT UNIQUE,
                title TEXT,
                created_at INTEGER
            );
            CREATE TABLE IF NOT EXISTS messages (
                id INTEGER PRIMARY KEY,
                chat_key TEXT,
                role TEXT,
                content TEXT,
                status TEXT DEFAULT 'normal',
                created_at INTEGER,
                FOREIGN KEY(chat_key) REFERENCES chats(chat_key)
            );
            """
        )
        conn.commit()
    finally:
        conn.close()
59
+
60
def get_conn():
    """Open a connection to the app database, creating the schema on demand.

    WAL journal mode is enabled on every connection. The caller owns the
    returned connection and is responsible for closing it.
    """
    _ensure_db()
    connection = sqlite3.connect(DB_PATH)
    connection.execute("PRAGMA journal_mode=WAL;")
    return connection
66
+
67
def upsert_resource(url: str, title: str, source: str, content_excerpt: str, meta: dict | None = None) -> None:
    """Insert or update a resource record based on its URL.

    Args:
        url: The canonical URL of the resource (unique key).
        title: Title or headline.
        source: Domain or source label.
        content_excerpt: A short excerpt of the page content.
        meta: Optional dictionary of additional metadata (stored as JSON).
    """
    now = int(time.time())
    meta_json = json.dumps(meta or {})
    conn = get_conn()
    try:
        # `with conn` runs the statement in a transaction and commits on
        # success; the sqlite3 context manager does NOT close the
        # connection, so close it explicitly to avoid leaking handles.
        with conn:
            conn.execute(
                """
                INSERT INTO resources (url, title, source, retrieved_at, content_excerpt, meta_json)
                VALUES (?, ?, ?, ?, ?, ?)
                ON CONFLICT(url) DO UPDATE SET
                    title = excluded.title,
                    source = excluded.source,
                    retrieved_at = excluded.retrieved_at,
                    content_excerpt = excluded.content_excerpt,
                    meta_json = excluded.meta_json
                """,
                (url, title, source, now, content_excerpt, meta_json),
            )
    finally:
        conn.close()
93
+
94
def get_resource(url: str) -> dict | None:
    """Retrieve a resource by URL.

    Returns:
        A dict with keys url, title, source, published_at, retrieved_at,
        excerpt and meta, or ``None`` when no record exists for *url*.
    """
    conn = get_conn()
    try:
        row = conn.execute(
            "SELECT url, title, source, published_at, retrieved_at, content_excerpt, meta_json FROM resources WHERE url=?",
            (url,),
        ).fetchone()
    finally:
        # Close explicitly: the original left the connection open.
        conn.close()
    if not row:
        return None
    url, title, source, published_at, retrieved_at, content_excerpt, meta_json = row
    return {
        "url": url,
        "title": title,
        "source": source,
        "published_at": published_at,
        "retrieved_at": retrieved_at,
        "excerpt": content_excerpt,
        "meta": json.loads(meta_json or "{}"),
    }
114
+
115
def list_resources(limit: int = 200) -> list[dict]:
    """List recently retrieved resources, newest first.

    Args:
        limit: Maximum number of rows to return.
    """
    conn = get_conn()
    try:
        rows = conn.execute(
            "SELECT url, title, source, retrieved_at FROM resources ORDER BY retrieved_at DESC LIMIT ?",
            (limit,),
        ).fetchall()
    finally:
        # Close explicitly: the original left the connection open.
        conn.close()
    return [
        {"url": url, "title": title, "source": source, "retrieved_at": retrieved_at}
        for url, title, source, retrieved_at in rows
    ]
123
+
124
def new_chat(title: str = "Untitled") -> str:
    """Create a new chat row and return its generated key (a UUID4 string)."""
    import uuid  # local import kept from the original; only needed here

    chat_key = str(uuid.uuid4())
    now = int(time.time())
    conn = get_conn()
    try:
        # Transactional insert; close afterwards to avoid leaking the handle.
        with conn:
            conn.execute(
                "INSERT INTO chats (chat_key, title, created_at) VALUES (?, ?, ?)",
                (chat_key, title, now),
            )
    finally:
        conn.close()
    return chat_key
132
+
133
def append_message(chat_key: str, role: str, content: str, status: str = "normal") -> None:
    """Append a message to a chat.

    Args:
        chat_key: Key of the chat the message belongs to.
        role: Message role (e.g. "user" or "assistant").
        content: Message text.
        status: Row status; defaults to "normal" (soft-delete sets "deleted").
    """
    now = int(time.time())
    conn = get_conn()
    try:
        # Transactional insert; close afterwards to avoid leaking the handle.
        with conn:
            conn.execute(
                "INSERT INTO messages (chat_key, role, content, status, created_at) VALUES (?, ?, ?, ?, ?)",
                (chat_key, role, content, status, now),
            )
    finally:
        conn.close()
141
+
142
def load_chat(chat_key: str) -> list[dict]:
    """Load all non-deleted messages for a chat key, oldest first.

    Returns:
        A list of ``{"id", "role", "content"}`` dicts. Soft-deleted rows
        (status == "deleted") are skipped.
    """
    conn = get_conn()
    try:
        rows = conn.execute(
            "SELECT rowid, role, content, status FROM messages WHERE chat_key=? ORDER BY id ASC",
            (chat_key,),
        ).fetchall()
    finally:
        # Close explicitly: the original left the connection open.
        conn.close()
    # Filter in Python (as the original did) so rows with a NULL status
    # are still returned; a SQL `status != 'deleted'` would drop them.
    return [
        {"id": rowid, "role": role, "content": content}
        for rowid, role, content, status in rows
        if status != "deleted"
    ]
154
+
155
def soft_delete_message(message_id: int) -> None:
    """Mark a message as deleted without physically removing the row."""
    conn = get_conn()
    try:
        # Transactional update; close afterwards to avoid leaking the handle.
        with conn:
            conn.execute("UPDATE messages SET status='deleted' WHERE id=?", (message_id,))
    finally:
        conn.close()
searcher.py CHANGED
@@ -2,7 +2,20 @@ import os
2
 
3
 
4
  def run_web_search(query, num_results=5, domain_filter=""):
5
- """Run a web search using Tavily API."""
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  try:
7
  from tavily import TavilyClient
8
  except ImportError:
@@ -18,6 +31,96 @@ def run_web_search(query, num_results=5, domain_filter=""):
18
  results = client.search(query, **params)
19
  return results
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # New function to extract content from a given URL using Tavily Extract API.
22
  def extract_web_content(url):
23
  """Extract the main content of a web page via Tavily Extract.
 
2
 
3
 
4
  def run_web_search(query, num_results=5, domain_filter=""):
5
+ """
6
+ Run a web search using Tavily API.
7
+
8
+ Args:
9
+ query (str): Search query.
10
+ num_results (int): Number of results to retrieve.
11
+ domain_filter (str): Optional domain filter (comma-separated domains).
12
+
13
+ Returns:
14
+ list[dict] | dict: Tavily response. It may return a list directly or a dict with a "results" key.
15
+
16
+ Raises:
17
+ ValueError: If the TAVILY_API_KEY env var is not set.
18
+ """
19
  try:
20
  from tavily import TavilyClient
21
  except ImportError:
 
31
  results = client.search(query, **params)
32
  return results
33
 
34
+ # ---------------------------------------------------------------------------
35
+ # Extended helper functions for credible research and extraction.
36
+ # ---------------------------------------------------------------------------
37
+
38
+ import re
39
+ from typing import List, Dict, Optional
40
+
41
+ # Import DB helpers from sibling module. Note: db.py resides in the same package directory.
42
+ from db import get_resource, upsert_resource
43
+
44
def web_search(query: str, max_results: int = 5, allowed_domains: Optional[List[str]] = None) -> List[Dict]:
    """Run a Tavily search and return at most *max_results* result dicts.

    Args:
        query: Search string.
        max_results: Maximum number of results to return.
        allowed_domains: Optional whitelist of domains; when given, only
            results whose URL host ends with one of these domains are kept.

    Returns:
        A list of search-result dicts (each with at least 'url' and 'title').
    """
    raw = run_web_search(query, num_results=max_results)
    # Tavily may hand back either a bare list or a dict wrapping one.
    if isinstance(raw, dict):
        candidates = raw.get("results", [])
    else:
        candidates = raw or []
    selected: List[Dict] = []
    for entry in candidates:
        if len(selected) >= max_results:
            break
        if not isinstance(entry, dict):
            continue
        if allowed_domains:
            # Drop the entry on any parse/compare failure rather than crash.
            try:
                from urllib.parse import urlparse
                host = urlparse(entry.get("url", "")).netloc.lower()
                if not any(host.endswith(dom.lower()) for dom in allowed_domains):
                    continue
            except Exception:
                continue
        selected.append(entry)
    return selected
79
+
80
def fetch_and_extract(url: str, timeout: int = 15) -> Optional[Dict]:
    """Fetch a page, extract its main text, cache it in the DB, and return the record.

    Args:
        url: The URL to fetch.
        timeout: HTTP timeout in seconds.

    Returns:
        A dict with keys url, title, source, excerpt, meta — or ``None``
        when the page cannot be fetched.
    """
    # Serve from the resource cache when this URL was fetched before.
    hit = get_resource(url)
    if hit:
        return hit
    try:
        import requests
        from bs4 import BeautifulSoup
    except ImportError:
        raise ImportError("Please install requests and beautifulsoup4")
    # Best-effort fetch: network failures and non-2xx replies yield None.
    try:
        response = requests.get(
            url,
            timeout=timeout,
            headers={"User-Agent": "CourseCreatorBot/1.0"},
        )
        response.raise_for_status()
    except Exception:
        return None
    parsed = BeautifulSoup(response.text, "html.parser")
    # Page title, capped at 200 chars; fall back to the URL when absent.
    if parsed.title and parsed.title.string:
        page_title = parsed.title.string.strip()[:200]
    else:
        page_title = url[:200]
    # Join the non-empty <p> elements into the article body.
    body_parts = [
        p.get_text(" ", strip=True)
        for p in parsed.find_all("p")
        if p.get_text(strip=True)
    ]
    full_text = "\n".join(body_parts)
    # Use the URL's host as the source label; blank on parse failure.
    try:
        from urllib.parse import urlparse
        site = urlparse(url).netloc
    except Exception:
        site = ""
    upsert_resource(url, page_title, site, full_text[:2000], meta={"length": len(full_text)})
    return get_resource(url)
123
+
124
  # New function to extract content from a given URL using Tavily Extract API.
125
  def extract_web_content(url):
126
  """Extract the main content of a web page via Tavily Extract.