Spaces:

Moai633
/

StemGraph_AI

Running

File size: 5,969 Bytes

from __future__ import annotations
import re
from langchain_core.tools import tool
from tavily import TavilyClient  # type:ignore
from youtube_transcript_api import YouTubeTranscriptApi #type:ignore
from config import TAVILY_API_KEY


# ── Web Search (Tavily) ───────────────────────────────────────
def run_web_search(
    query: str,
    api_key: str = "",
    *,
    search_depth: str = "advanced",
    topic: str = "general",
    max_results: int = 5,
    include_answer: bool = True,
) -> str:
    """
    Query the Tavily search API and render the response as plain text.

    Key resolution: the caller-supplied `api_key` (the user's own key) wins;
    when it is empty the module-level `TAVILY_API_KEY` is used instead. With
    neither available, a "web search is unavailable" message is returned and
    no request is made.

    The keyword-only arguments are forwarded unchanged to
    `TavilyClient.search`.

    Returns:
        One of:
        * an explanatory message when no key is configured,
        * a "temporarily unavailable" message (including the error text) when
          the client raises,
        * "No search results found." when the response carries neither
          results nor an answer,
        * otherwise an optional "Answer: ..." paragraph followed by one card
          per result (title with optional relevance score, URL, snippet).
    """
    effective_key = api_key if api_key else TAVILY_API_KEY
    if not effective_key:
        return (
            "Web search is unavailable: no Tavily API key configured. "
            "Add a Tavily API key in Settings to enable web search."
        )

    try:
        search_options = {
            "search_depth": search_depth,
            "topic": topic,
            "max_results": max_results,
            "include_answer": include_answer,
            "include_raw_content": False,
            "chunks_per_source": 3,
        }
        resp = TavilyClient(api_key=effective_key).search(query, **search_options)
    except Exception as err:
        # Any client/network failure is reported to the caller as text
        # rather than propagated.
        print(f"[TAVILY SEARCH ERROR] {err}", flush=True)
        return f"Web search is temporarily unavailable. (Error: {err})"

    # Anything other than a dict response is treated as "nothing came back".
    is_mapping = isinstance(resp, dict)
    hits = resp.get("results", []) if is_mapping else []
    summary = resp.get("answer") if is_mapping else None

    if not hits and not summary:
        return "No search results found."

    sections = [f"Answer: {summary}\n"] if summary else []

    for hit in hits:
        heading = hit.get("title", "No Title")
        link = hit.get("url", "")
        relevance = hit.get("score", "")
        snippet = hit.get("content", "")
        # The score is only shown when it is numeric.
        relevance_note = ""
        if isinstance(relevance, (int, float)):
            relevance_note = f" (relevance: {relevance:.2f})"
        sections.append(
            f"Title: {heading}{relevance_note}\nURL: {link}\nSnippet: {snippet}\n"
        )

    return "\n".join(sections)


@tool
def web_search(query: str) -> str:
    """
    Search the internet for current information. Use when the student asks
    about recent events, specific facts, or anything not covered by the NCERT
    curriculum context. Input: a concise search query string.
    """
    # NOTE(review): the docstring above is presumably what the `@tool`
    # decorator exposes as the tool description, so it is kept verbatim;
    # confirm before rewording it.
    # No api_key is passed here, so run_web_search uses its default ("") and
    # falls back to the module-level TAVILY_API_KEY.
    search_output = run_web_search(query)
    return search_output


# ── YouTube Transcript ────────────────────────────────────────
def _extract_video_id(url_or_id: str) -> str | None:
    """
    Extract the 11-character YouTube video ID from a URL or a bare ID.

    Recognised forms (searched anywhere in the input, so surrounding text is
    tolerated):
        * ``...watch?v=<id>`` (any ``v=`` parameter)
        * ``youtu.be/<id>``
        * ``.../embed/<id>``, ``.../shorts/<id>``, ``.../live/<id>``
        * a bare ``<id>`` making up the whole (whitespace-stripped) input

    The ID must be exactly 11 characters from ``[A-Za-z0-9_-]``. A longer run
    of ID characters is rejected instead of being cut to its first 11
    characters, because a truncated ID would point at the wrong video.

    Returns:
        The video ID, or ``None`` when the input is not a string or no ID
        can be found.
    """
    # Defensive: callers may hand over None / non-text; treat as "no ID".
    if not isinstance(url_or_id, str):
        return None

    # Strip once instead of on every loop iteration.
    candidate = url_or_id.strip()
    if not candidate:
        return None

    # Exactly 11 ID characters, not followed by a further ID character.
    id_token = r"([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])"
    patterns = (
        r"(?:v=|youtu\.be/|embed/|shorts/|live/)" + id_token,
        r"^([A-Za-z0-9_-]{11})$",
    )
    for pat in patterns:
        m = re.search(pat, candidate)
        if m:
            return m.group(1)
    return None


@tool
def yt_transcript(youtube_url: str) -> str:
    """
    Fetch the full transcript of a YouTube video.
    """
    # NOTE(review): the docstring above is presumably what the `@tool`
    # decorator exposes as the tool description, so it is kept verbatim;
    # confirm before rewording it.
    # All of the work is delegated to the plain helper fetch_yt_transcript.
    transcript_text = fetch_yt_transcript(youtube_url)
    return transcript_text


def fetch_yt_transcript(youtube_url: str) -> str:
    """
    Return the transcript text for a YouTube URL or bare video ID.

    The video ID is obtained via `_extract_video_id`; transcript segments are
    requested in the languages "en", "hi", "en-IN", "en-US" and joined with
    single spaces.

    Fetch failures are not raised to the caller. Every failure path returns a
    string starting with ``TRANSCRIPT_UNAVAILABLE:`` followed by the reason
    (no ID found, empty transcript, captions disabled/missing, rate limiting,
    or the raw error text). Progress and errors are also printed to stdout.
    """
    vid = _extract_video_id(youtube_url)
    if not vid:
        print(f"[YT TRANSCRIPT] No video ID found in input: {youtube_url[:80]}", flush=True)
        return "TRANSCRIPT_UNAVAILABLE: Could not extract a valid YouTube video ID from the message."

    preferred_languages = ["en", "hi", "en-IN", "en-US"]

    try:
        # Two library generations are supported: the older static
        # `get_transcript` (dict segments) and the newer instance method
        # `.fetch()` (snippet objects).
        if not hasattr(YouTubeTranscriptApi, "get_transcript"):
            snippets = YouTubeTranscriptApi().fetch(vid, languages=preferred_languages)
            pieces = []
            for snippet in snippets:
                # Prefer the `.text` attribute; fall back to dict access,
                # and to "" when neither yields text.
                piece = getattr(snippet, "text", None)
                if not piece:
                    piece = snippet.get("text", "") if isinstance(snippet, dict) else ""
                pieces.append(piece)
            full_text = " ".join(pieces)
        else:
            segments = YouTubeTranscriptApi.get_transcript(vid, languages=preferred_languages)  # type:ignore[attr-defined]
            full_text = " ".join(segment["text"] for segment in segments)

        if full_text.strip():
            print(f"[YT TRANSCRIPT] OK — {len(full_text)} chars for video_id={vid}", flush=True)
            return full_text

        print(f"[YT TRANSCRIPT] Empty transcript for video_id={vid}", flush=True)
        return "TRANSCRIPT_UNAVAILABLE: Transcript is empty for this video."
    except Exception as exc:
        print(f"[YT TRANSCRIPT EXCEPTION] video_id={vid} | error={exc}", flush=True)
        # Failures are classified by substrings of the lower-cased message.
        reason = str(exc).lower()

        missing_caption_markers = ("disabled", "no transcript", "no element")
        if any(marker in reason for marker in missing_caption_markers):
            return (
                "TRANSCRIPT_UNAVAILABLE: This video has no transcript available "
                "(subtitles are disabled or no captions exist for this video)."
            )

        rate_limit_markers = ("too many requests", "429")
        if any(marker in reason for marker in rate_limit_markers):
            return (
                "TRANSCRIPT_UNAVAILABLE: YouTube is rate-limiting transcript requests right now. "
                "Please try again in a few minutes."
            )

        return f"TRANSCRIPT_UNAVAILABLE: Could not retrieve transcript. Reason: {exc}"


# ── Exported list ─────────────────────────────────────────────
# The two `@tool`-decorated callables defined in this module, collected in
# one list. NOTE(review): presumably imported by the agent setup to register
# the available tools — confirm against the caller.
TOOLS = [web_search, yt_transcript]