Implement grounded web search and summarization with caching; integrate DB helpers

This commit updates app.py to perform real web searches using Tavily, fetch and extract article content, summarise each article via OpenAI, and cache results. It integrates db.py for resource caching, adds an improved search pipeline in the chat function, and refactors the conversation to avoid hallucinated resources. Also uploads the updated searcher.py and the new db.py.

- app.py +112 -74
- db.py +158 -0
- searcher.py +104 -1
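For review context, the new pieces compose roughly like this (a minimal sketch, not part of the commit; it assumes TAVILY_API_KEY and an OpenAI key are configured and that searcher.py and db.py are importable):

# sketch.py - illustrative only
from searcher import web_search, fetch_and_extract

results = web_search("vibe coding", max_results=3)
for item in results:
    record = fetch_and_extract(item["url"])  # fetched once, then served from the SQLite cache
    if record:
        print(record["title"], "->", record["excerpt"][:80])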
app.py
CHANGED

@@ -4,7 +4,10 @@ import openai
 
 from planner import plan_course
 from generators import generate_course_zip
-from searcher import …
+from searcher import web_search, fetch_and_extract, get_youtube_transcript
+
+# Bring in DB helpers to persist resources if needed later
+from db import get_resource, upsert_resource, list_resources, new_chat, append_message, load_chat, soft_delete_message
 
 # System prompt guiding the assistant's behaviour during brainstorming
 SYSTEM_PROMPT = (

@@ -29,40 +32,29 @@ def chat(user_message, chat_history, chat_pairs, sources, plan, resource_cache):
     messages = [{"role": "system", "content": SYSTEM_PROMPT}] + chat_history
     # Check if the user message contains a URL to open and read.
     url = None
-    # Simple heuristic: look for http/https links in the message
     for part in user_message.split():
         if part.startswith("http://") or part.startswith("https://"):
            url = part
            break
    if url:
-        # …
+        # If the message contains a URL, attempt to fetch and summarise it using our extraction helpers.
        try:
-            # Special handling for YouTube links: attempt to fetch transcript
+            # Detect YouTube links and fetch transcript
            if "youtube.com" in url or "youtu.be" in url:
                try:
                    transcript_text = get_youtube_transcript(url)
                except Exception:
                    transcript_text = ""
                page_content = transcript_text or ""
-            if …
-            …
-                page_content = extract_response.get("article", "")
-            elif extract_response.get("results"):
-                results_list = extract_response.get("results", [])
-                if isinstance(results_list, list):
-                    page_content = "\n".join([
-                        item.get("content", item.get("title", ""))
-                        for item in results_list
-                        if isinstance(item, dict)
-                    ])
+                page_title = url
+            else:
+                record = fetch_and_extract(url)
+                if record:
+                    page_content = record.get("excerpt", "")
+                    page_title = record.get("title", url)
+                else:
+                    page_content = ""
+                    page_title = url
            if not page_content:
                assistant_reply = "I couldn't extract content from that page."
            else:

@@ -75,7 +67,6 @@ def chat(user_message, chat_history, chat_pairs, sources, plan, resource_cache):
        if not api_key:
            raise ValueError("OPENAI_API_KEY or COURSECREATOR_API_KEY is not set")
        summary_system = "You are a helpful assistant. Summarize the given content in a concise and clear way."
-        # Truncate content to avoid exceeding token limits
        truncated_content = page_content[:8000]
        summary_messages = [
            {"role": "system", "content": summary_system},

@@ -130,64 +121,111 @@ def chat(user_message, chat_history, chat_pairs, sources, plan, resource_cache):
    # of calling the language model. This allows the assistant to fetch resources when
    # the user asks the agent to "search" or "search the internet".
    search_triggers = ["search", "internet search", "web search"]
-    lower_msg = user_message.lower()
+    lower_msg = user_message.lower().strip()
+    # Determine if a search should be performed
+    do_search = any(lower_msg.startswith(trig) for trig in search_triggers)
+    if do_search:
+        # Extract query after trigger word if present (e.g. "search vibe coding" -> "vibe coding")
+        # Otherwise use the full message minus the trigger
+        query = user_message
+        for trig in search_triggers:
+            if lower_msg.startswith(trig):
+                # Remove the trigger from the start of the query string
+                query = user_message[len(trig):].strip() or user_message
+                break
        try:
-            # …
-            query_key = …
+            # Use cached search results if available for this query key (case-insensitive)
+            query_key = query.lower()
            if query_key in resource_cache:
-                …
+                search_results = resource_cache[query_key]
            else:
-                resource_cache[query_key] = …
-            # …
-            # the list of search results. If so, extract that list. If it's a
-            # list already, use it directly. Otherwise, default to an empty list.
-            if isinstance(results, dict):
-                normalized_results = results.get("results", [])
-            elif isinstance(results, list):
-                normalized_results = results
-            else:
-                normalized_results = []
-            # Ensure the sources list is initialised
+                # Use our wrapped web_search for better domain filtering and consistent return type
+                search_results = web_search(query, max_results=5)
+                resource_cache[query_key] = search_results
+            # Iterate over search results, fetch their content, cache resources and summarise
+            summaries = []
            if sources is None:
                sources = []
-            for …
-                if isinstance(…
-            …
+            existing_urls = {src.get("url") for src in sources if isinstance(src, dict) and src.get("url")}
+            # For each result (should be a dict with 'url' and 'title')
+            for item in search_results:
+                if not isinstance(item, dict):
+                    continue
+                url = item.get("url")
+                title = item.get("title", url)
+                if not url or url in existing_urls:
+                    continue
+                # Fetch and cache resource content
+                record = fetch_and_extract(url)
+                if not record:
+                    # Skip if unable to fetch
+                    continue
+                # Add to sources for plan generation (avoid duplicates)
+                sources.append({"title": record.get("title", title), "url": record.get("url", url)})
+                existing_urls.add(url)
+                # Summarise the resource's excerpt using OpenAI
+                try:
+                    model = os.getenv("OPENAI_MODEL", "gpt-3.5-turbo")
+                    temperature = float(os.getenv("TEMPERATURE", "0.7"))
+                    max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "256"))
+                    api_key = os.getenv("OPENAI_API_KEY") or os.getenv("COURSECREATOR_API_KEY")
+                    if not api_key:
+                        raise ValueError("OPENAI_API_KEY or COURSECREATOR_API_KEY is not set")
+                    summary_system = "You are a helpful assistant. Summarize the following article excerpt in one paragraph."
+                    excerpt = record.get("excerpt", "")[:3000]
+                    summary_messages = [
+                        {"role": "system", "content": summary_system},
+                        {"role": "user", "content": excerpt},
+                    ]
+                    if hasattr(openai, "OpenAI"):
+                        client = openai.OpenAI(api_key=api_key)
+                        try:
+                            resp = client.chat.completions.create(
+                                model=model,
+                                messages=summary_messages,
+                                temperature=temperature,
+                                max_tokens=max_tokens,
+                            )
+                        except Exception:
+                            resp = client.chat.completions.create(
+                                model=model,
+                                messages=summary_messages,
+                                temperature=temperature,
+                                max_completion_tokens=max_tokens,
+                            )
+                        summary_text = resp.choices[0].message.content
+                    else:
+                        openai.api_key = api_key
+                        try:
+                            resp = openai.ChatCompletion.create(
+                                model=model,
+                                messages=summary_messages,
+                                temperature=temperature,
+                                max_tokens=max_tokens,
+                            )
+                        except Exception:
+                            resp = openai.ChatCompletion.create(
+                                model=model,
+                                messages=summary_messages,
+                                temperature=temperature,
+                                max_completion_tokens=max_tokens,
+                            )
+                        summary_text = resp["choices"][0]["message"]["content"]
+                except Exception as se:
+                    # If summarization fails, just include the title and URL without a summary
+                    summary_text = ""
+                # Compose summary line with link and summary
+                line = f"**{title}** ({url})"
+                if summary_text:
+                    line += f"\n{summary_text.strip()}"
+                summaries.append(line)
+            if summaries:
+                assistant_reply = "Here are some articles I found and summarised:\n\n" + "\n\n".join(summaries)
            else:
-                …
-                assistant_reply = "I've already shared the relevant resources from this search."
-            else:
-                assistant_reply = "I couldn't find any results for that query."
+                assistant_reply = "I couldn't fetch or summarise any credible articles for that query."
        except Exception as e:
            assistant_reply = (
-                "An error occurred during web search. Please ensure your …
+                "An error occurred during web search and summarisation. Please ensure your API keys are configured.\n"
                f"(Error: {e})"
            )
    else:
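The summarisation block above is written to tolerate two axes of API drift: it prefers the v1 client (openai.OpenAI) and falls back to the legacy openai.ChatCompletion module API, and within each it retries with max_completion_tokens when a model rejects max_tokens. Extracted as a standalone helper, the pattern looks roughly like this (a hypothetical refactor sketch, not part of the commit; assumes openai>=1.0):

import openai

def _chat_completion(client, **kwargs):
    # Try the older `max_tokens` argument first; some newer models only
    # accept `max_completion_tokens`, so retry with the renamed parameter.
    try:
        return client.chat.completions.create(**kwargs)
    except Exception:
        kwargs["max_completion_tokens"] = kwargs.pop("max_tokens")
        return client.chat.completions.create(**kwargs)

def summarise_excerpt(excerpt, api_key, model="gpt-3.5-turbo", max_tokens=256):
    client = openai.OpenAI(api_key=api_key)
    resp = _chat_completion(
        client,
        model=model,
        messages=[
            {"role": "system", "content": "Summarize the following article excerpt in one paragraph."},
            {"role": "user", "content": excerpt},
        ],
        max_tokens=max_tokens,
    )
    return resp.choices[0].message.content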
db.py
ADDED

@@ -0,0 +1,158 @@
+import os
+import sqlite3
+import json
+import time
+
+"""
+Simple SQLite helper for persisting resources and conversation messages.
+
+This module centralises all database access used by the Course Creator agent.
+It defines three tables:
+    resources (id, url, title, source, published_at, retrieved_at, content_excerpt, meta_json)
+    chats     (id, chat_key, title, created_at)
+    messages  (id, chat_key, role, content, status, created_at)
+
+Resources are de-duplicated by URL. Chats are keyed by a unique string
+(UUID-like) generated externally. Messages are stored in the order received
+and may be soft-deleted by updating their status column.
+"""
+
+# Determine database path. Use environment override or default to a local data dir.
+DB_PATH = os.getenv("COURSECREATOR_DB", os.path.join(os.path.dirname(__file__), "data", "course_creator.db"))
+
+def _ensure_db():
+    """Initialise the SQLite database with the required tables if they don't exist."""
+    os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
+    conn = sqlite3.connect(DB_PATH)
+    conn.execute("PRAGMA journal_mode=WAL;")
+    conn.executescript(
+        """
+        CREATE TABLE IF NOT EXISTS resources (
+            id INTEGER PRIMARY KEY,
+            url TEXT UNIQUE,
+            title TEXT,
+            source TEXT,
+            published_at TEXT,
+            retrieved_at INTEGER,
+            content_excerpt TEXT,
+            meta_json TEXT
+        );
+        CREATE TABLE IF NOT EXISTS chats (
+            id INTEGER PRIMARY KEY,
+            chat_key TEXT UNIQUE,
+            title TEXT,
+            created_at INTEGER
+        );
+        CREATE TABLE IF NOT EXISTS messages (
+            id INTEGER PRIMARY KEY,
+            chat_key TEXT,
+            role TEXT,
+            content TEXT,
+            status TEXT DEFAULT 'normal',
+            created_at INTEGER,
+            FOREIGN KEY(chat_key) REFERENCES chats(chat_key)
+        );
+        """
+    )
+    conn.commit()
+    conn.close()
+
+def get_conn():
+    """Return a connection with WAL mode enabled and ensure tables exist."""
+    _ensure_db()
+    conn = sqlite3.connect(DB_PATH)
+    conn.execute("PRAGMA journal_mode=WAL;")
+    return conn
+
+def upsert_resource(url: str, title: str, source: str, content_excerpt: str, meta: dict | None = None) -> None:
+    """Insert or update a resource record based on its URL.
+
+    Args:
+        url: The canonical URL of the resource.
+        title: Title or headline.
+        source: Domain or source label.
+        content_excerpt: A short excerpt of the page content.
+        meta: Optional dictionary of additional metadata.
+    """
+    now = int(time.time())
+    meta_json = json.dumps(meta or {})
+    with get_conn() as conn:
+        conn.execute(
+            """
+            INSERT INTO resources (url, title, source, retrieved_at, content_excerpt, meta_json)
+            VALUES (?, ?, ?, ?, ?, ?)
+            ON CONFLICT(url) DO UPDATE SET
+                title = excluded.title,
+                source = excluded.source,
+                retrieved_at = excluded.retrieved_at,
+                content_excerpt = excluded.content_excerpt,
+                meta_json = excluded.meta_json
+            """,
+            (url, title, source, now, content_excerpt, meta_json),
+        )
+
+def get_resource(url: str) -> dict | None:
+    """Retrieve a resource by URL, returning a dictionary or None."""
+    with get_conn() as conn:
+        row = conn.execute(
+            "SELECT url, title, source, published_at, retrieved_at, content_excerpt, meta_json FROM resources WHERE url=?",
+            (url,),
+        ).fetchone()
+        if not row:
+            return None
+        url, title, source, published_at, retrieved_at, content_excerpt, meta_json = row
+        meta = json.loads(meta_json or "{}")
+        return {
+            "url": url,
+            "title": title,
+            "source": source,
+            "published_at": published_at,
+            "retrieved_at": retrieved_at,
+            "excerpt": content_excerpt,
+            "meta": meta,
+        }
+
+def list_resources(limit: int = 200) -> list[dict]:
+    """List recently retrieved resources."""
+    with get_conn() as conn:
+        rows = conn.execute(
+            "SELECT url, title, source, retrieved_at FROM resources ORDER BY retrieved_at DESC LIMIT ?",
+            (limit,),
+        ).fetchall()
+        return [{"url": url, "title": title, "source": source, "retrieved_at": retrieved_at} for url, title, source, retrieved_at in rows]
+
+def new_chat(title: str = "Untitled") -> str:
+    """Create a new chat and return its key."""
+    import uuid
+    chat_key = str(uuid.uuid4())
+    now = int(time.time())
+    with get_conn() as conn:
+        conn.execute("INSERT INTO chats (chat_key, title, created_at) VALUES (?, ?, ?)", (chat_key, title, now))
+    return chat_key
+
+def append_message(chat_key: str, role: str, content: str, status: str = "normal") -> None:
+    """Append a message to a chat."""
+    now = int(time.time())
+    with get_conn() as conn:
+        conn.execute(
+            "INSERT INTO messages (chat_key, role, content, status, created_at) VALUES (?, ?, ?, ?, ?)",
+            (chat_key, role, content, status, now),
+        )
+
+def load_chat(chat_key: str) -> list[dict]:
+    """Load all non-deleted messages for a chat key."""
+    with get_conn() as conn:
+        rows = conn.execute(
+            "SELECT rowid, role, content, status FROM messages WHERE chat_key=? ORDER BY id ASC",
+            (chat_key,),
+        ).fetchall()
+    messages = []
+    for rowid, role, content, status in rows:
+        if status != "deleted":
+            messages.append({"id": rowid, "role": role, "content": content})
+    return messages
+
+def soft_delete_message(message_id: int) -> None:
+    """Mark a message as deleted without removing it."""
+    with get_conn() as conn:
+        conn.execute("UPDATE messages SET status='deleted' WHERE id=?", (message_id,))
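The helpers above are self-initialising: the first call creates the data/ directory and the three tables. A quick usage sketch (illustrative values, not part of the commit):

from db import new_chat, append_message, load_chat, upsert_resource, get_resource

chat_key = new_chat("Vibe coding course")
append_message(chat_key, "user", "search vibe coding")
append_message(chat_key, "assistant", "Here are some articles I found...")
print(load_chat(chat_key))  # [{'id': 1, 'role': 'user', ...}, {'id': 2, ...}]

upsert_resource("https://example.com/post", "A post", "example.com", "first 2000 chars of text...")
print(get_resource("https://example.com/post")["excerpt"])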
searcher.py
CHANGED

@@ -2,7 +2,20 @@ import os
 
 
 def run_web_search(query, num_results=5, domain_filter=""):
-    """…
+    """
+    Run a web search using Tavily API.
+
+    Args:
+        query (str): Search query.
+        num_results (int): Number of results to retrieve.
+        domain_filter (str): Optional domain filter (comma-separated domains).
+
+    Returns:
+        list[dict] | dict: Tavily response. It may return a list directly or a dict with a "results" key.
+
+    Raises:
+        ValueError: If the TAVILY_API_KEY env var is not set.
+    """
    try:
        from tavily import TavilyClient
    except ImportError:

@@ -18,6 +31,96 @@ def run_web_search(query, num_results=5, domain_filter=""):
    results = client.search(query, **params)
    return results
 
+# ---------------------------------------------------------------------------
+# Extended helper functions for credible research and extraction.
+# ---------------------------------------------------------------------------
+
+import re
+from typing import List, Dict, Optional
+
+# Import DB helpers from sibling module. Note: db.py resides in the same package directory.
+from db import get_resource, upsert_resource
+
+def web_search(query: str, max_results: int = 5, allowed_domains: Optional[List[str]] = None) -> List[Dict]:
+    """
+    Perform a web search and return a list of result dictionaries, filtering by allowed domains.
+
+    Args:
+        query: Search string.
+        max_results: Maximum number of results to return.
+        allowed_domains: Optional list of domains to permit. If provided, only results with URLs
+            containing one of these domains will be included.
+
+    Returns:
+        A list of search results (dicts with at least 'url' and 'title' keys).
+    """
+    raw_results = run_web_search(query, num_results=max_results)
+    # Tavily can return either a list or a dict with 'results'
+    results_list = raw_results.get("results", []) if isinstance(raw_results, dict) else raw_results or []
+    # Filter out results that do not meet allowed domains, if specified
+    filtered: List[Dict] = []
+    for item in results_list:
+        if not isinstance(item, dict):
+            continue
+        url = item.get("url", "")
+        # Basic domain filtering: allow if allowed_domains is None or URL's domain ends with allowed domain
+        if allowed_domains:
+            try:
+                from urllib.parse import urlparse
+                domain = urlparse(url).netloc.lower()
+                if not any(domain.endswith(ad.lower()) for ad in allowed_domains):
+                    continue
+            except Exception:
+                continue
+        filtered.append(item)
+        if len(filtered) >= max_results:
+            break
+    return filtered
+
+def fetch_and_extract(url: str, timeout: int = 15) -> Optional[Dict]:
+    """
+    Fetch a web page and extract its main textual content. Caches results in the database.
+
+    Args:
+        url: The URL to fetch.
+        timeout: HTTP timeout in seconds.
+
+    Returns:
+        A dictionary with keys: url, title, source, excerpt, meta, or None on failure.
+    """
+    # Return cached record if present
+    cached = get_resource(url)
+    if cached:
+        return cached
+    # Attempt to fetch page
+    try:
+        import requests
+        from bs4 import BeautifulSoup
+    except ImportError:
+        raise ImportError("Please install requests and beautifulsoup4")
+    try:
+        resp = requests.get(url, timeout=timeout, headers={"User-Agent": "CourseCreatorBot/1.0"})
+        resp.raise_for_status()
+    except Exception:
+        return None
+    # Parse HTML
+    soup = BeautifulSoup(resp.text, "html.parser")
+    # Title: fall back to URL if missing
+    title = (soup.title.string.strip() if soup.title and soup.title.string else url)[:200]
+    # Extract paragraphs
+    paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p") if p.get_text(strip=True)]
+    content_text = "\n".join(paragraphs)
+    excerpt = content_text[:2000]
+    # Domain as source
+    try:
+        from urllib.parse import urlparse
+        domain = urlparse(url).netloc
+    except Exception:
+        domain = ""
+    # Store in DB
+    upsert_resource(url, title, domain, excerpt, meta={"length": len(content_text)})
+    return get_resource(url)
+
 # New function to extract content from a given URL using Tavily Extract API.
 def extract_web_content(url):
     """Extract the main content of a web page via Tavily Extract.
|