import os
from typing import List

from langchain_chroma import Chroma
from langchain_core.documents.base import Document
from langchain_core.tools import tool
from langchain_huggingface import HuggingFaceEmbeddings
from pydantic import SecretStr

# Initialize RAG vector store for strategy retrieval
CHROMA_PATH = "./chroma_gaia_db"
_embeddings = None
_vector_store = None


def _get_vector_store():
    """Lazy load vector store."""
    global _embeddings, _vector_store
    if _vector_store is None:
        _embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2"
        )
        _vector_store = Chroma(
            persist_directory=CHROMA_PATH, embedding_function=_embeddings
        )
    return _vector_store
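

# A hedged sketch (not in the original module) of how the strategy store is
# expected to be populated: each document embeds the question and carries
# "Steps to solve:" / "Tools needed:" sections, which get_solving_strategy
# parses below. The helper name and signature are illustrative assumptions.
def _add_strategy_example(question: str, steps: str, tools: str) -> None:
    """Add one solved-question example to the Chroma strategy store."""
    _get_vector_store().add_documents(
        [
            Document(
                page_content=(
                    f"Question: {question}\n"
                    f"Steps to solve: {steps}\n"
                    f"Tools needed: {tools}"
                ),
                metadata={"tools": tools},
            )
        ]
    )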


@tool
def get_solving_strategy(question: str) -> str:
    """Search for similar solved questions and get the solving strategy.
    Use this FIRST to understand how to approach a problem before using other tools.

    Args:
        question: The question you need to solve."""
    print(f"\n[GET_SOLVING_STRATEGY] Searching for: {question[:80]}...")
    try:
        vector_store = _get_vector_store()
        similar_docs = vector_store.similarity_search(question, k=1)
        print(f"[GET_SOLVING_STRATEGY] Found {len(similar_docs)} similar questions")

        if similar_docs:
            doc = similar_docs[0]
            steps = (
                doc.page_content.split("Steps to solve:")[-1]
                .split("Tools needed:")[0]
                .strip()
            )
            tools_raw = doc.metadata.get("tools", "")
            # Format the tools metadata as a bulleted list, one tool per line
            tools = tools_raw.replace("\n", "\n- ").strip()
            if tools and not tools.startswith("-"):
                tools = "- " + tools

            set_current_strategy(steps)

            return f"""Similar question found!

## Strategy to solve (follow these steps exactly):
{steps}

## Rules (must be followed strictly):
1. Use EXACT wording from sources. Do not paraphrase or shorten.
2. For lists: sort items alphabetically, separate with comma and space.
3. Use tools to find information. Do not guess.
4. When you find the answer, call `submit_answer` immediately. Do not keep searching.

"""
        else:
            return "No similar questions found. Use your best judgment."
    except Exception as e:
        return f"Error searching for strategy: {e}"


def _get_llm():
    """Get LLM for post-processing."""
    from langchain_openai import ChatOpenAI

    if os.getenv("ZAI_API_KEY"):
        api_base = "https://api.z.ai/api/paas/v4/"
        if os.getenv("ZAI_USE_CODING_PLAN", "f") == "t":
            api_base = "https://api.z.ai/api/coding/paas/v4/"
        return ChatOpenAI(
            model="GLM-4.5-Air",
            temperature=0,
            base_url=api_base,
            api_key=SecretStr(os.getenv("ZAI_API_KEY", "")),
        )
    else:
        return ChatOpenAI(model="gpt-4o-mini", temperature=0)


def _fetch_url_with_tables(url: str) -> str:
    """Fetch URL content including tables using Jina reader."""
    import requests

    try:
        # Use Jina to get full page content including tables
        api_key = os.getenv("JINA_API_KEY", "")
        headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}

        response = requests.get(f"https://r.jina.ai/{url}", headers=headers, timeout=30)
        return response.text
    except Exception:
        return ""


@tool
def wiki_search(query: str) -> str:
    """Search Wikipedia for a query and return relevant content including tables.

    Args:
        query: The search query."""
    import wikipedia

    try:
        # Search for pages
        search_results = wikipedia.search(query, results=3)
        if not search_results:
            return "No Wikipedia results found."

        formatted_parts = []
        for title in search_results[:2]:
            try:
                page = wikipedia.page(title, auto_suggest=False)
                url = page.url

                # Fetch the page via Jina to get full content including tables
                content = _fetch_url_with_tables(url)

                if not content:
                    # Fallback to wikipedia API content
                    content = page.content

                # Use smart section extraction
                extracted = _extract_relevant_content(content, query)
                formatted_parts.append(
                    f'<Document source="{url}" title="{title}">\n{extracted}\n</Document>'
                )
            except Exception:
                # Covers DisambiguationError, PageError, and any other
                # per-page failure; skip to the next result
                continue

        return (
            "\n\n---\n\n".join(formatted_parts)
            if formatted_parts
            else "No results found."
        )
    except Exception as e:
        return f"Wikipedia search error: {e}"


_zai_mcp_tools = None


async def _get_zai_mcp_tools():
    """Lazy load Z.AI MCP tools."""
    global _zai_mcp_tools
    if _zai_mcp_tools is None:
        from langchain_mcp_adapters.client import MultiServerMCPClient

        api_key = os.getenv("ZAI_API_KEY", "")
        client = MultiServerMCPClient(
            {
                "web-search": {
                    "transport": "streamable_http",
                    "url": "https://api.z.ai/api/mcp/web_search_prime/mcp",
                    "headers": {"Authorization": f"Bearer {api_key}"},
                },
                "web-reader": {
                    "transport": "streamable_http",
                    "url": "https://api.z.ai/api/mcp/web_reader/mcp",
                    "headers": {"Authorization": f"Bearer {api_key}"},
                },
                "zai-mcp": {
                    "transport": "stdio",
                    "command": "npx",
                    "args": ["-y", "@z_ai/mcp-server"],
                    "env": {
                        "Z_AI_API_KEY": api_key,
                        "Z_AI_MODE": "ZAI",
                    },
                },
            }
        )
        _zai_mcp_tools = await client.get_tools()
    return _zai_mcp_tools


@tool
def jina_search(query: str) -> str:
    """Search the web using Jina AI and return clean results.

    Args:
        query: The search query."""
    import requests

    api_key = os.getenv("JINA_API_KEY", "")
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}

    response = requests.get(f"https://s.jina.ai/{query}", headers=headers, timeout=30)
    return response.text


def _extract_section_by_marker(
    content: str, section_marker: str, context_lines: int = 50
) -> str:
    """Extract a section starting from a marker found in strategy steps.

    This is the SMART extraction - uses strategy steps like "scrolled down to Studio albums"
    to find the exact section we need.
    """
    import re

    lines = content.split("\n")
    marker_lower = section_marker.lower().strip()

    print(f"[EXTRACT_SECTION] Looking for section marker: '{section_marker}'")

    # Find the line containing the section marker
    start_idx = None
    for i, line in enumerate(lines):
        if marker_lower in line.lower():
            start_idx = i
            print(f"[EXTRACT_SECTION] Found marker at line {i}: {line[:80]}")
            break

    if start_idx is None:
        # Try partial matching (e.g., "Studio albums" might be "Studio Albums" or "Discography")
        # Check if most words from the marker appear in a line; the word list
        # is constant across lines, so compute it once.
        marker_words = [w for w in re.findall(r"\b\w+\b", marker_lower) if len(w) > 2]
        for i, line in enumerate(lines):
            line_lower = line.lower()
            matches = sum(1 for w in marker_words if w in line_lower)
            # Guard against an empty word list, which would match every line
            if marker_words and matches >= len(marker_words) * 0.6:  # 60% match threshold
                start_idx = i
                print(f"[EXTRACT_SECTION] Found partial match at line {i}: {line[:80]}")
                break

    if start_idx is None:
        print(f"[EXTRACT_SECTION] Section marker not found")
        return ""

    # Extract from marker line + context_lines after it
    end_idx = min(start_idx + context_lines, len(lines))
    section = "\n".join(lines[start_idx:end_idx])

    print(f"[EXTRACT_SECTION] Extracted {end_idx - start_idx} lines from section")
    return section


def _parse_section_markers_from_strategy(strategy: str) -> list:
    """Parse strategy steps to extract section markers.

    Looks for phrases like:
    - "scrolled down to Studio albums" -> "Studio albums"
    - "found the Discography section" -> "Discography"
    - "went to Studio albums" -> "Studio albums"
    """
    import re

    markers = []

    # Patterns that indicate a section name
    patterns = [
        r'scrolled?\s+(?:down\s+)?to\s+["\']?([^"\',.]+)["\']?',  # scrolled down to X
        r'went\s+to\s+(?:the\s+)?["\']?([^"\',.]+)["\']?\s+section',  # went to X section
        r'found\s+(?:the\s+)?["\']?([^"\',.]+)["\']?\s+section',  # found X section
        r'clicked\s+on\s+["\']?([^"\',.]+)["\']?',  # clicked on X
        r'looked\s+(?:at|under)\s+["\']?([^"\',.]+)["\']?',  # looked at/under X
        r'(?:in|under)\s+(?:the\s+)?["\']?([^"\',.]+)["\']?\s+section',  # in/under X section
    ]

    for pattern in patterns:
        matches = re.findall(pattern, strategy.lower())
        for match in matches:
            cleaned = match.strip()
            if cleaned and len(cleaned) > 2 and len(cleaned) < 50:
                markers.append(cleaned)

    # Also look for quoted section names
    quoted = re.findall(r'"([^"]+)"', strategy)
    for q in quoted:
        if len(q) > 2 and len(q) < 50 and q.lower() not in ["wikipedia", "google"]:
            markers.append(q)

    print(f"[PARSE_STRATEGY] Extracted section markers: {markers}")
    return markers


# Global variable to store current strategy for smart extraction
_current_strategy = None


def set_current_strategy(strategy: str):
    """Store the current strategy for use by content extraction."""
    global _current_strategy
    _current_strategy = strategy
    print(f"[STRATEGY] Updated current strategy")


@tool
def jina_read(url: str, question: str = "") -> str:
    """Read a webpage and extract content relevant to the question.

    Args:
        url: The URL to read.
        question: The question to extract relevant info for."""
    import requests

    api_key = os.getenv("JINA_API_KEY", "")
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}

    response = requests.get(f"https://r.jina.ai/{url}", headers=headers, timeout=30)
    content = response.text

    # Use smart extraction with strategy section markers when a question is
    # provided; otherwise return the truncated raw content
    if question:
        return _extract_relevant_content(content, question)
    return content[:10000]


@tool
def web_search(query: str) -> str:
    """Search the web and return summarized results with URLs."""
    if os.getenv("TAVILY_API_KEY"):
        from langchain_tavily import TavilySearch

        web_search_tool = TavilySearch(
            max_results=5,
            include_answer=False,
        )
    else:
        from langchain_community.tools import DuckDuckGoSearchResults

        web_search_tool = DuckDuckGoSearchResults()

    search_docs = web_search_tool.invoke(query)

    if isinstance(search_docs, str):
        return search_docs
    elif isinstance(search_docs, dict) and "results" in search_docs:
        results = search_docs["results"]
    elif isinstance(search_docs, list):
        results = search_docs
    else:
        return str(search_docs)

    formatted_search_docs = "\n\n---\n\n".join(
        [
            f'<Document source="{doc.get("url", "")}"/>\n{doc.get("content", "")}\n</Document>'
            for doc in results
        ]
    )
    return formatted_search_docs


@tool
def arxiv_search(query: str) -> str:
    """Search arXiv for a query and return maximum 2 results.

    Args:
        query: The search query."""
    from langchain_community.document_loaders import ArxivLoader

    search_docs = ArxivLoader(query=query, load_max_docs=2).load()
    parts = []
    for doc in search_docs:
        source = doc.metadata.get("source", "")
        parts.append(f"Source: {source}\n{doc.page_content}")
    return "\n\n---\n\n".join(parts)


@tool
def analyze_text(text: str, question: str) -> str:
    """Analyze text and extract the answer to a specific question. Use after fetching a webpage or PDF."""
    llm = _get_llm()
    response = llm.invoke(
        f"Given this text:\n\n{text[:8000]}\n\n"
        f"Answer this question: {question}\n\n"
        f"Be specific and list any relevant data points (numbers, dates, names). "
        f"If counting items, list each one explicitly before giving the count."
    )
    return response.content


@tool
def read_excel(file_path: str) -> str:
    """Read and extract data from an Excel file (.xlsx, .xls).

    Args:
        file_path: Path to the Excel file."""
    import pandas as pd

    try:
        # Read all sheets
        xlsx = pd.ExcelFile(file_path)
        results = []
        for sheet_name in xlsx.sheet_names:
            df = pd.read_excel(xlsx, sheet_name=sheet_name)
            results.append(f"=== Sheet: {sheet_name} ===\n{df.to_string()}")
        return "\n\n".join(results)[:15000]
    except Exception as e:
        return f"Error reading Excel: {e}"


@tool
def read_csv(file_path: str) -> str:
    """Read and extract data from a CSV file.

    Args:
        file_path: Path to the CSV file."""
    import pandas as pd

    try:
        df = pd.read_csv(file_path)
        return df.to_string()[:15000]
    except Exception as e:
        return f"Error reading CSV: {e}"


@tool
def read_docx(file_path: str) -> str:
    """Read and extract text from a Word document (.docx).

    Args:
        file_path: Path to the Word document."""
    try:
        from docx import Document

        doc = Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs])
        return text[:15000]
    except Exception as e:
        return f"Error reading Word doc: {e}"


@tool
def read_pptx(file_path: str) -> str:
    """Read and extract text from a PowerPoint presentation (.pptx).

    Args:
        file_path: Path to the PowerPoint file."""
    try:
        from pptx import Presentation

        prs = Presentation(file_path)
        text_parts = []
        for slide_num, slide in enumerate(prs.slides, 1):
            slide_text = [f"=== Slide {slide_num} ==="]
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    slide_text.append(shape.text)
            text_parts.append("\n".join(slide_text))
        return "\n\n".join(text_parts)[:15000]
    except Exception as e:
        return f"Error reading PowerPoint: {e}"


@tool
def extract_zip(file_path: str) -> str:
    """Extract a zip file and list its contents.

    Args:
        file_path: Path to the zip file."""
    import zipfile
    from pathlib import Path

    try:
        extract_dir = Path(file_path).parent / Path(file_path).stem
        extract_dir.mkdir(exist_ok=True)

        with zipfile.ZipFile(file_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
            file_list = zip_ref.namelist()

        return f"Extracted to: {extract_dir}\nContents:\n" + "\n".join(file_list)
    except Exception as e:
        return f"Error extracting zip: {e}"


@tool
def analyze_image(file_path: str, question: str) -> str:
    """Analyze an image and answer a question about it using vision model.

    Args:
        file_path: Path to the image file (png, jpg, etc.)
        question: Question to answer about the image."""
    import base64

    from langchain_openai import ChatOpenAI

    try:
        with open(file_path, "rb") as f:
            image_data = base64.b64encode(f.read()).decode("utf-8")

        # Determine mime type
        ext = file_path.lower().split(".")[-1]
        mime_type = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg"}.get(
            ext, "image/png"
        )

        # Use GPT-4o for vision
        llm = ChatOpenAI(model="gpt-4o", temperature=0)
        response = llm.invoke(
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_data}"
                            },
                        },
                    ],
                }
            ]
        )
        return response.content
    except Exception as e:
        return f"Error analyzing image: {e}"


@tool
def submit_answer(answer: str) -> str:
    """Submit your final answer. Use this when you have found the answer.

    Args:
        answer: The final answer to submit."""
    print(f"[SUBMIT_ANSWER] {answer}")
    return f"FINAL ANSWER: {answer}"


async def get_tools() -> list:
    """Retrieve the list of available tools for the agent."""
    base_tools = [
        get_solving_strategy,  # Use FIRST to get approach
        submit_answer,
        # wiki_search,
        download_file,
        read_pdf,
        read_excel,
        read_csv,
        read_docx,
        read_pptx,
        extract_zip,
        analyze_image,
        py_calc_tool,
        youtube_transcript_tool,
        transcribe_audio,
        arxiv_search,
    ]
    # Add Z.AI MCP tools (webSearchPrime, webReader)
    zai_tools = await _get_zai_mcp_tools()
    return base_tools + zai_tools


@tool
def py_calc_tool(expression: str) -> str:
    """Evaluate a Python expression safely."""
    try:
        allowed_builtins = {"__builtins__": {}}
        result = eval(expression, allowed_builtins, {})
        return str(result)
    except Exception as e:
        return f"Error evaluating expression: {e}"


@tool
def download_file(url: str) -> str:
    """Download a file (PDF, etc.) from URL and save locally. Returns the local file path."""
    import hashlib
    from pathlib import Path

    import requests

    try:
        # Create downloads directory
        downloads_dir = Path("downloads")
        downloads_dir.mkdir(exist_ok=True)

        # Generate filename from URL hash + extension
        ext = Path(url).suffix or ".bin"
        filename = hashlib.md5(url.encode()).hexdigest()[:12] + ext
        filepath = downloads_dir / filename

        # Download if not already cached
        if not filepath.exists():
            response = requests.get(url, timeout=60)
            response.raise_for_status()
            filepath.write_bytes(response.content)

        return f"Downloaded to: {filepath}"

    except Exception as e:
        return f"Error downloading: {e}"


@tool
def read_pdf(file_path: str) -> str:
    """Read and extract text from a local PDF file."""
    try:
        from pypdf import PdfReader

        reader = PdfReader(file_path)
        text = "\n".join(page.extract_text() or "" for page in reader.pages)
        return text[:15000]  # Limit to 15k chars
    except Exception as e:
        return f"Error reading PDF: {e}"


@tool
def fetch_webpage(url: str) -> str:
    """Fetch and read content from a webpage URL. For PDFs, use download_file then read_pdf instead."""
    import requests

    # Reject PDF URLs
    if url.lower().endswith(".pdf"):
        return "Error: This is a PDF file. Use download_file(url) first, then read_pdf(filepath) to read it."

    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
        response = requests.get(url, timeout=30, headers=headers)
        response.raise_for_status()

        if "application/pdf" in response.headers.get("content-type", ""):
            return "Error: This is a PDF file. Use download_file(url) first, then read_pdf(filepath) to read it."

        import html2text

        h = html2text.HTML2Text()
        h.ignore_links = False
        h.ignore_images = True
        h.ignore_emphasis = False
        h.body_width = 0  # No wrapping

        markdown = h.handle(response.text)

        return markdown[:10000] if markdown else "No content found"

    except Exception as e:
        return f"Error fetching URL: {e}"


@tool
def transcribe_audio(file_path: str) -> str:
    """Transcribe an audio file to text using OpenAI Whisper.

    Args:
        file_path: Path to the audio file (mp3, wav)."""
    from openai import OpenAI

    client = OpenAI()

    with open(file_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
        )

    print(f"[TRANSCRIPTION]: {transcription.text}")
    return transcription.text


@tool
def youtube_transcript_tool(video_url: str) -> List[Document]:
    """Fetch the transcript of a YouTube video given its URL."""

    from langchain_community.document_loaders import YoutubeLoader

    loader = YoutubeLoader.from_youtube_url(video_url, add_video_info=False)

    return loader.load()
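

# A hedged usage sketch (not part of the original module): load the tool list
# and print what the agent would be handed. Requires ZAI_API_KEY for the MCP
# servers; the rest is illustrative.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        tools = await get_tools()
        print(f"Loaded {len(tools)} tools: {[t.name for t in tools]}")

    asyncio.run(_demo())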