prthm11 committed on
Commit
d03eb21
·
verified ·
1 Parent(s): c5bf922

Delete utils/agent.py

Browse files
Files changed (1) hide show
  1. utils/agent.py +0 -1647
utils/agent.py DELETED
@@ -1,1647 +0,0 @@
1
- #─── Basic imports ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
2
- import os
3
- import math
4
- import sqlite3
5
- import fitz # PyMuPDF for PDF parsing
6
- import re
7
-
8
- from dotenv import load_dotenv
9
- # Load environment variables from .env file
10
- load_dotenv() # This line ensures .env variables are loaded
11
-
12
- from langgraph.graph import START, StateGraph, MessagesState, END
13
- from langgraph.prebuilt import tools_condition
14
- from langgraph.prebuilt import ToolNode
15
- from langgraph.constants import START
16
- from langchain_core.tools import tool
17
- from langchain.schema import SystemMessage
18
- #from langchain.chat_models import init_chat_model
19
- #from langgraph.prebuilt import create_react_agent
20
-
21
- from langchain.embeddings import HuggingFaceEmbeddings
22
- #from langchain.vectorstores import Pinecone
23
- from langchain.tools.retriever import create_retriever_tool
24
- #import pinecone
25
- #from pinecone import Pinecone as PineconeClient, ServerlessSpec
26
- #from pinecone import Index # the blocking‐call client constructor
27
- #from pinecone import Pinecone as PineconeClient, ServerlessSpec
28
- from langchain.embeddings import HuggingFaceEmbeddings
29
- from langchain_community.vectorstores.pinecone import Pinecone as LC_Pinecone
30
-
31
- # ─── Langchain Frameworks ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
32
- #from langchain.tools import Tool
33
- from langchain.chat_models import ChatOpenAI
34
- from langchain_groq import ChatGroq
35
- from langchain_mistralai import ChatMistralAI
36
- from langchain.agents import initialize_agent, AgentType
37
- from langchain.schema import Document
38
- from langchain.chains import RetrievalQA
39
- from langchain.embeddings import OpenAIEmbeddings
40
- from langchain_community.embeddings import HuggingFaceEmbeddings
41
- from langchain.vectorstores import FAISS
42
- from langchain.text_splitter import RecursiveCharacterTextSplitter
43
- from langchain.prompts import PromptTemplate
44
- from langchain_community.document_loaders import TextLoader, PyMuPDFLoader
45
- from langchain_community.document_loaders.wikipedia import WikipediaLoader
46
- from langchain_community.document_loaders.arxiv import ArxivLoader
47
- from langchain_experimental.tools.python.tool import PythonREPLTool
48
-
49
-
50
- # ─── Memory ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
51
- from langchain.agents import initialize_agent, AgentType
52
- from langchain.tools import Tool
53
- from typing import List, Callable
54
- from langchain.schema import BaseMemory, AIMessage, HumanMessage, SystemMessage
55
- from langchain.schema import HumanMessage, SystemMessage
56
- from langchain.llms.base import LLM
57
- from langchain.memory.chat_memory import BaseChatMemory
58
- from pydantic import PrivateAttr
59
- from langchain_core.messages import get_buffer_string
60
-
61
- # ─── Image Processing ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
62
-
63
- from PIL import Image
64
- import pytesseract
65
- from transformers import pipeline
66
- from groq import Groq
67
- import requests
68
- from io import BytesIO
69
- from transformers import pipeline, TrOCRProcessor, VisionEncoderDecoderModel
70
- import requests
71
- import base64
72
- from PIL import UnidentifiedImageError
73
-
74
- # ─── Browser var ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
75
- from typing import List, Dict
76
- import json
77
- from io import BytesIO
78
- #from langchain.tools import tool # or langchain_core.tools
79
- from playwright.sync_api import sync_playwright
80
- from duckduckgo_search import DDGS
81
- import time
82
- import random
83
- import logging
84
- from functools import lru_cache, wraps
85
- import requests
86
- from playwright.sync_api import sync_playwright
87
- from bs4 import BeautifulSoup
88
- import tenacity
89
- from tenacity import retry, stop_after_attempt, wait_exponential
90
-
91
- # Initialize logger
92
- logger = logging.getLogger(__name__)
93
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
94
-
95
- # Additional imports for new functionality
96
- import pandas as pd
97
- from PyPDF2 import PdfReader
98
- import docx
99
- import pytesseract
100
- import speech_recognition as sr
101
- from pydub import AudioSegment
102
- from pytube import YouTube
103
- from newspaper import Article
104
- from langchain.document_loaders import ArxivLoader
105
- from langchain_community.document_loaders.youtube import YoutubeLoader, TranscriptFormat
106
-
107
- from playwright.sync_api import sync_playwright
108
- # Attempt to import Playwright for dynamic page rendering
109
- try:
110
- from playwright.sync_api import sync_playwright
111
- _playwright_available = True
112
- except ImportError:
113
- _playwright_available = False
114
-
115
- # Define forbidden keywords for basic NSFW filtering
116
- _forbidden = ["porn", "sex", "xxx", "nude", "erotic"]
117
-
118
- # ─── LLM Setup ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
119
- # Load OpenAI API key from environment (required for LLM and embeddings)
120
-
121
- # API Keys from .env file
122
# API keys: a value already present in the environment (e.g. loaded from .env)
# always wins; otherwise a placeholder string is stored under the same name.
os.environ.setdefault("OPENAI_API_KEY", "<YOUR_OPENAI_KEY>")  # Set your own key or env var
os.environ.setdefault("GROQ_API_KEY", "default_key_or_placeholder")
os.environ.setdefault("MISTRAL_API_KEY", "default_key_or_placeholder")

# Tavily API Key
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "default_key_or_placeholder")

# Keywords used for basic NSFW filtering of scraped content.
_forbidden = ["nsfw", "porn", "sex", "explicit"]

# Manual switch: set to False to disable Playwright even when it is installed.
# The result of the import probe is kept instead of being overwritten with True.
_PLAYWRIGHT_ENABLED = True
_playwright_available = _PLAYWRIGHT_ENABLED and bool(
    globals().get("_playwright_available", True)
)

# Globals for RAG system
vector_store = None
rag_chain = None
DB_PATH = None   # will be set when a .db is uploaded
DOC_PATH = None  # will be set when a document is uploaded
IMG_PATH = None  # will be set when an image is uploaded
OTH_PATH = None  # will be set when an other file is uploaded
138
-
139
-
140
- # ─── LLMS ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
141
- #llm = ChatOpenAI(model_name="gpt-3.5-turbo", streaming=True, temperature=0)
142
- from tenacity import retry, stop_after_attempt, wait_exponential
143
-
144
- # Import the RetryingChatGroq client
145
- from retry_groq import RetryingChatGroq
146
-
147
- # Use the retrying version instead
148
- llm = RetryingChatGroq(model="deepseek-r1-distill-llama-70b", streaming=False, temperature=0)
149
- #llm = ChatMistralAI(model="mistral-large-latest", streaming=True, temperature=0)
150
-
151
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
152
- # ─────────────────────────────────────────────── Tool for multiply ──────────────────────────────────────────────────────────────────────
153
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
154
@tool(parse_docstring=True)
def multiply(a: int, b: int) -> int:
    """
    Compute the product of two numbers.

    Args:
        a (int): The first factor.
        b (int): The second factor.

    Returns:
        int: The value of a times b.
    """
    try:
        # Plain arithmetic; a failure is reported as text instead of being raised.
        return a * b
    except Exception as exc:
        return f"Error in multiplication: {str(exc)}"
172
-
173
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
174
- # ─────────────────────────────────────────────── Tool for add ──────────────────────────────────────────────────────────────────────────
175
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
176
@tool(parse_docstring=True)
def add(a: int, b: int) -> int:
    """
    Compute the sum of two numbers.

    Args:
        a (int): The first addend.
        b (int): The second addend.

    Returns:
        int: The value of a plus b.
    """
    try:
        # Plain arithmetic; a failure is reported as text instead of being raised.
        return a + b
    except Exception as exc:
        return f"Error in addition: {str(exc)}"
194
-
195
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
196
- # ─────────────────────────────────────────────── Tool for subtract ──────────────────────────────────────────────────────────────────────
197
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
198
@tool(parse_docstring=True)
def subtract(a: int, b: int) -> int:
    """
    Compute the difference of two numbers.

    Args:
        a (int): The value to subtract from.
        b (int): The value to subtract.

    Returns:
        int: The value of a minus b.
    """
    try:
        # Plain arithmetic; a failure is reported as text instead of being raised.
        return a - b
    except Exception as exc:
        return f"Error in subtraction: {str(exc)}"
216
-
217
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
218
- # ─────────────────────────────────────────────── Tool for divide ──────────────────────────────────────────────────────────────────────
219
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
220
@tool(parse_docstring=True)
def divide(a: int, b: int) -> Union[float, str]:
    """
    Divide two numbers using true division.

    Args:
        a (int): The numerator.
        b (int): The denominator.

    Returns:
        float: The result of a divided by b. When b is zero (or the division
        fails for any other reason) an error message string is returned
        instead; no exception is raised.
    """
    # Guard first so the caller gets a clear message instead of ZeroDivisionError.
    if b == 0:
        return "Error: Cannot divide by zero."
    try:
        return a / b
    except Exception as e:
        return f"Error in division: {str(e)}"
243
-
244
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
245
- # ─────────────────────────────────────────────── Tool for modulus ──────────────────────────────────────────────────────────────────────
246
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
247
@tool(parse_docstring=True)
def modulus(a: int, b: int) -> int:
    """
    Compute the remainder of dividing one number by another.

    Args:
        a (int): The dividend.
        b (int): The divisor.

    Returns:
        int: The remainder when a is divided by b.
    """
    try:
        if b != 0:
            return a % b
        # A zero divisor is reported as text instead of being raised.
        return "Error: Cannot calculate modulus with zero divisor."
    except Exception as exc:
        return f"Error in modulus calculation: {str(exc)}"
267
-
268
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
269
- # ─────────────────────────────────────────────── Tool for browsing ──────────────────────────────────────────────────────────────────────
270
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
271
def with_retry(max_attempts: int = 3, backoff_base: int = 2):
    """
    Build a decorator that retries the wrapped callable when it raises.

    Between attempts the wrapper sleeps for ``backoff_base ** attempt`` seconds
    plus up to one second of random jitter. Once every attempt has failed the
    error is logged and an empty list is returned instead of raising.
    """
    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            attempt = 0
            while attempt < max_attempts:
                try:
                    return fn(*args, **kwargs)
                except Exception as exc:
                    delay = backoff_base ** attempt + random.uniform(0, 1)
                    logger.warning(f"{fn.__name__} failed (attempt {attempt+1}/{max_attempts}): {exc}")
                    # No point sleeping after the final attempt.
                    if attempt != max_attempts - 1:
                        time.sleep(delay)
                attempt += 1
            logger.error(f"{fn.__name__} failed after {max_attempts} attempts.")
            return []
        return wrapper
    return decorator
290
-
291
@with_retry()
@lru_cache(maxsize=128)
def tavily_search(query: str, top_k: int = 3) -> List[Dict]:
    """
    Call the Tavily search API and return a list of result dicts.

    Args:
        query: The search query string.
        top_k: Maximum number of results requested from Tavily.

    Returns:
        A list of dicts with "title", "url", "content" (first 200 characters)
        and "source" ("Tavily"). An empty list is returned when no real API
        key is configured. HTTP and network errors propagate to the retry
        decorator.
    """
    # The module installs a placeholder when the key is missing; treat it as unset
    # so we never send a bogus credential.
    if not TAVILY_API_KEY or TAVILY_API_KEY == "default_key_or_placeholder":
        logger.info("[Tavily] No API key set. Skipping Tavily search.")
        return []

    resp = requests.post(
        "https://api.tavily.com/search",
        headers={
            "Authorization": f"Bearer {TAVILY_API_KEY}",
            "Content-Type": "application/json",
        },
        # Tavily's result-count parameter is `max_results`.
        json={"query": query, "max_results": top_k},
        timeout=10,
    )
    resp.raise_for_status()

    return [
        {
            "title": item.get("title") or "",
            "url": item.get("url") or "",
            # `or ""` guards against an explicit null in the payload.
            "content": (item.get("content") or "")[:200],
            "source": "Tavily",
        }
        for item in (resp.json().get("results") or [])
    ]
316
-
317
@with_retry()
@lru_cache(maxsize=128)
def duckduckgo_search(query: str, top_k: int = 3) -> List[Dict]:
    """
    Query DuckDuckGo and return up to top_k raw SERP hits.

    Args:
        query: The search query string.
        top_k: Maximum number of hits to return.

    Returns:
        A list of dicts with "title", "url", "content" and "source"
        ("DuckDuckGo").

    Errors are deliberately NOT swallowed here: letting them propagate allows
    the surrounding retry decorator to retry and finally return an empty list,
    and keeps ``lru_cache`` from permanently caching a failed (empty) lookup.
    """
    hits = []
    # The request timeout is configured on the client; `text()` takes no
    # timeout argument.
    with DDGS(timeout=15) as ddgs:
        for hit in ddgs.text(query, safesearch="on", max_results=top_k):
            hits.append({
                "title": hit.get("title", ""),
                "url": hit.get("href") or hit.get("url", ""),
                "content": hit.get("body", ""),
                "source": "DuckDuckGo",
            })
            if len(hits) >= top_k:
                break
    return hits
338
-
339
- # Additional fallback search alternative
340
def simple_google_search(query: str, top_k: int = 3) -> List[Dict]:
    """
    Scrape a Google results page as a last-resort fallback search.

    Args:
        query: The search query string.
        top_k: Maximum number of result containers to inspect.

    Returns:
        A list of dicts with "title", "url", "content" and "source"
        ("Google"). Any failure is logged and yields an empty list.
    """
    try:
        import urllib.parse
        import bs4

        url = f"https://www.google.com/search?q={urllib.parse.quote(query)}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://www.google.com/",
            "Connection": "keep-alive",
        }

        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()

        soup = bs4.BeautifulSoup(response.text, "html.parser")
        results = []

        for result in soup.select("div.g")[:top_k]:
            title_elem = result.select_one("h3")
            link_elem = result.select_one("a")
            snippet_elem = result.select_one("div.VwiC3b")

            if not (title_elem and link_elem and snippet_elem and "href" in link_elem.attrs):
                continue

            href = link_elem["href"]
            if href.startswith("/url?q="):
                # Redirect links carry the target percent-encoded; decode it so
                # the URL can be fetched directly later.
                href = urllib.parse.unquote(href.split("/url?q=")[1].split("&")[0])

            if href.startswith("http"):
                results.append({
                    "title": title_elem.get_text(),
                    "url": href,
                    "content": snippet_elem.get_text(),
                    "source": "Google",
                })

        return results

    except Exception as e:
        logger.warning(f"Simple Google search failed: {e}")
        return []
388
-
389
def hybrid_search(query: str, top_k: int = 3) -> List[Dict]:
    """
    Gather search hits from several providers, falling back in order.

    Tavily is queried first when a real API key is configured. DuckDuckGo and
    then Google are only consulted while fewer than top_k hits have been
    collected. If nothing is found at all, a single "Search Failed"
    placeholder entry is produced. At most top_k entries are returned.
    """
    collected = []

    # Primary provider: only used with a real (non-placeholder) key.
    has_tavily_key = bool(TAVILY_API_KEY) and TAVILY_API_KEY != "default_key_or_placeholder"
    if has_tavily_key:
        try:
            tavily_results = tavily_search(query, top_k)
            collected.extend(tavily_results)
            logger.info(f"Retrieved {len(tavily_results)} results from Tavily")
        except Exception as exc:
            logger.warning(f"Tavily search failed: {exc}")

    # Fallback providers, each asked only for the number of hits still missing.
    fallbacks = (
        ("DuckDuckGo", duckduckgo_search),
        ("Google", simple_google_search),
    )
    for label, search_fn in fallbacks:
        if len(collected) >= top_k:
            continue
        try:
            extra = search_fn(query, top_k - len(collected))
            collected.extend(extra)
            logger.info(f"Retrieved {len(extra)} results from {label}")
        except Exception as exc:
            logger.warning(f"{label} search failed: {exc}")

    # Nothing from any provider: hand back an explanatory placeholder.
    if not collected:
        collected.append({
            "title": "Search Failed",
            "url": "",
            "content": f"Sorry, I couldn't find results for '{query}'. Please try refining your search terms or check your internet connection.",
            "source": "No results"
        })

    return collected[:top_k]
431
-
432
def format_search_docs(search_docs: List[Dict]) -> Dict[str, str]:
    """
    Render a list of {source, page, content} dicts as one string.

    Each entry becomes an XML-like ``<Document source=... page=...>`` block
    wrapping its content; blocks are separated by a ``---`` line. Missing
    "source", "page" or "content" keys are rendered as empty strings.

    Args:
        search_docs: The documents to render.

    Returns:
        A dict with the single key "web_results" holding the rendered string
        (empty when search_docs is empty).
    """
    blocks = [
        # The opening tag must not be self-closing, since a closing tag follows.
        f'<Document source="{doc.get("source", "")}" page="{doc.get("page", "")}">\n'
        f'{doc.get("content", "")}\n'
        f'</Document>'
        for doc in search_docs
    ]
    return {"web_results": "\n\n---\n\n".join(blocks)}
446
-
447
-
448
def _fetch_page_excerpt(url: str, snippet: str) -> str:
    """Return up to 200 words scraped from url, or snippet when scraping fails or yields too little."""
    # A randomly chosen user agent makes blocking less likely.
    headers = {
        "User-Agent": random.choice([
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.62"
        ]),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive"
    }
    content = snippet
    try:
        resp = requests.get(url, timeout=15, headers=headers)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, "html.parser")
            # Prefer a main-content container; otherwise use the whole page.
            container = (
                soup.find('main')
                or soup.find('article')
                or soup.find('div', class_='content')
                or soup
            )
            words = container.get_text(separator=" ", strip=True).split()
            content = " ".join(words[:200])
            # Too little page text: the search snippet is more useful.
            if len(content) < 50:
                content = snippet[:200]
        # Random delay between 0.5-1.5 seconds to avoid rate limits.
        time.sleep(0.5 + random.random())
    except requests.exceptions.HTTPError as e:
        logger.warning(f"HTTP error when scraping {url}: {e}")
    except requests.exceptions.RequestException as e:
        logger.warning(f"Request error when scraping {url}: {e}")
    except Exception as e:
        logger.warning(f"Unexpected error when scraping {url}: {e}")
    return content


@tool(parse_docstring=True)
def web_search(query: str, top_k: int = 3) -> Dict[str, str]:
    """
    Perform a hybrid web search combining multiple search engines with robust fallbacks.

    Args:
        query: The search query string to look up.
        top_k: The maximum number of search results to return (default is 3).

    Returns:
        A dictionary with a single "web_results" key whose value is a string of
        XML-like <Document> blocks separated by "---". Each block carries the
        page URL as source, an empty page attribute, and up to the first 200
        words of the page text (or the search snippet when scraping fails).
    """
    try:
        results = []
        for hit in hybrid_search(query, top_k):
            snippet = hit.get("content") or ""
            url = hit.get("url")

            if url:
                source = url
                content = _fetch_page_excerpt(url, snippet)
            elif snippet:
                # URL-less hits (e.g. the "Search Failed" placeholder) still
                # carry a message worth returning, so keep them.
                source = hit.get("source") or "No results"
                content = snippet
            else:
                continue

            # Filter out inappropriate content.
            lowered = content.lower()
            if any(word in lowered for word in _forbidden):
                continue

            results.append({"source": source, "page": "", "content": content})

        return format_search_docs(results[:top_k])
    except Exception as e:
        logger.error(f"Web search failed: {e}")
        return format_search_docs([{
            "source": "Error",
            "page": "",
            "content": f"Search failed with error: {e}. Please try again with different search terms."
        }])
553
-
554
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
555
- # ─────────────────────────────────────────────── Tool for File System ───────────────────────────────────────────────────────────────────
556
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
557
@tool(parse_docstring=True)
def download_file(url: str, dest_path: str) -> str:
    """
    Download a file from a given URL and save it locally.

    Args:
        url: The direct URL of the file to download.
        dest_path: The local path to save the downloaded file.

    Returns:
        The destination path where the file was saved.
    """
    # The context manager releases the streamed connection even on error, and
    # the timeout prevents an unresponsive server from hanging the agent.
    with requests.get(url, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(dest_path, 'wb') as f:
            for chunk in r.iter_content(8192):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
    return dest_path
575
-
576
@tool(parse_docstring=True)
def process_excel_to_text(file_path: str) -> str:
    """
    Read an Excel workbook and render it as CSV text.

    Args:
        file_path: Path to the Excel (.xlsx) file.

    Returns:
        The CSV-formatted content of the workbook, or an error message string
        when the file is missing or cannot be parsed by any engine.
    """
    try:
        import os
        if not os.path.exists(file_path):
            return f"Error: Excel file '{file_path}' does not exist."

        # Explicit engines first; None lets pandas auto-detect.
        for engine in ('openpyxl', 'xlrd', None):
            reader_kwargs = {"engine": engine} if engine else {}
            try:
                return pd.read_excel(file_path, **reader_kwargs).to_csv(index=False)
            except Exception as exc:
                print(f"Excel engine {engine} failed: {exc}")
                last_error = exc

        # Every engine failed; report the most recent failure.
        return f"Error processing Excel file: {str(last_error)}"
    except Exception as e:
        return f"Error with Excel file: {str(e)}"
613
-
614
@tool(parse_docstring=True)
def read_text_from_pdf(file_path: str, question: str = None) -> str:
    """
    Pull the text out of a PDF file, narrowing large documents to the relevant parts.

    Args:
        file_path: Path to the PDF file.
        question: Optional question to help retrieve relevant parts of long documents.

    Returns:
        The extracted text, or the output of the large-document processing when
        a question is given and the text exceeds 5000 characters. On failure an
        error message string is returned.
    """
    try:
        import os
        if not os.path.exists(file_path):
            return f"Error: PDF file '{file_path}' does not exist."

        # Pages without extractable text contribute an empty string.
        page_texts = []
        for page in PdfReader(file_path).pages:
            page_texts.append(page.extract_text() or "")
        full_text = "\n".join(page_texts)

        needs_retrieval = bool(question) and len(full_text) > 5000
        if needs_retrieval:
            return process_large_document(full_text, question)
        return full_text
    except Exception as exc:
        return f"Error reading PDF: {str(exc)}"
642
-
643
@tool(parse_docstring=True)
def read_text_from_docx(file_path: str, question: str = None) -> str:
    """
    Pull the text out of a DOCX (Word) document, narrowing large documents to the relevant parts.

    Args:
        file_path: Path to the DOCX file.
        question: Optional question to help retrieve relevant parts of long documents.

    Returns:
        The extracted text, or the output of the large-document processing when
        a question is given and the text exceeds 5000 characters. On failure an
        error message string is returned.
    """
    try:
        import os
        if not os.path.exists(file_path):
            return f"Error: File '{file_path}' does not exist."

        try:
            document = docx.Document(file_path)
            full_text = "\n".join([para.text for para in document.paragraphs])
        except Exception as docx_err:
            # Only the "Package not found" failure gets the raw-XML fallback.
            if "Package not found" not in str(docx_err):
                return f"Error reading DOCX file: {str(docx_err)}"
            try:
                import zipfile
                from xml.etree.ElementTree import XML

                WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
                PARA = WORD_NAMESPACE + 'p'
                TEXT = WORD_NAMESPACE + 't'

                with zipfile.ZipFile(file_path) as archive:
                    with archive.open('word/document.xml') as xml_part:
                        tree = XML(xml_part.read())

                # Join the text runs of each paragraph, skipping empty ones.
                paragraphs = []
                for paragraph in tree.iter(PARA):
                    runs = [node.text for node in paragraph.iter(TEXT) if node.text]
                    if runs:
                        paragraphs.append(''.join(runs))
                full_text = '\n'.join(paragraphs)
            except Exception as e:
                return f"Error reading DOCX file: {str(e)}"

        if question and len(full_text) > 5000:
            return process_large_document(full_text, question)
        return full_text
    except Exception as e:
        return f"Error reading DOCX file: {str(e)}"
697
-
698
-
699
@tool(parse_docstring=True)
def transcribe_audio(file_path: str) -> str:
    """
    Transcribe speech from a local audio file to text.

    Args:
        file_path: Path to the audio file.

    Returns:
        Transcribed text using Google Web Speech API, or an error message
        string when the file is missing, cannot be converted, or cannot be
        transcribed.
    """
    import os
    import tempfile

    temp_wav = None  # path of the converted copy, if one was created
    try:
        if not os.path.exists(file_path):
            return f"Error: Audio file '{file_path}' does not exist."

        # For non-WAV files, convert to WAV first.
        if not file_path.lower().endswith('.wav'):
            try:
                from pydub import AudioSegment
                # A unique temp file avoids overwriting a sibling file in the
                # source directory and works when that directory is read-only.
                fd, temp_wav = tempfile.mkstemp(suffix=".wav")
                os.close(fd)
                AudioSegment.from_file(file_path).export(temp_wav, format="wav")
                file_path = temp_wav
            except Exception as e:
                return f"Failed to convert audio to WAV format: {str(e)}"

        recognizer = sr.Recognizer()
        with sr.AudioFile(file_path) as src:
            audio = recognizer.record(src)
        return recognizer.recognize_google(audio)
    except Exception as e:
        if "Audio file could not be read" in str(e):
            return f"Error: Audio format not supported. Try converting to WAV, MP3, OGG, or FLAC."
        return f"Error transcribing audio: {str(e)}"
    finally:
        # Always remove the temporary conversion so repeated calls do not
        # accumulate files on disk.
        if temp_wav and os.path.exists(temp_wav):
            try:
                os.remove(temp_wav)
            except OSError:
                pass  # best-effort cleanup; the transcription result still stands
735
-
736
@tool(parse_docstring=True)
def youtube_audio_processing(youtube_url: str) -> str:
    """
    Download and transcribe audio from a YouTube video.

    Args:
        youtube_url: URL of the YouTube video.

    Returns:
        Transcription text extracted from the video's audio, or an error
        message string when the video exposes no audio-only stream.
    """
    import os
    import tempfile

    yt = YouTube(youtube_url)
    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        return "Error: No audio stream available for this video."

    # Work in a private temp directory so concurrent calls do not collide on
    # fixed filenames and nothing is left behind in the working directory.
    with tempfile.TemporaryDirectory() as workdir:
        out_file = audio_stream.download(output_path=workdir, filename='yt_audio')
        wav_path = os.path.join(workdir, 'yt_audio.wav')
        AudioSegment.from_file(out_file).export(wav_path, format='wav')
        # transcribe_audio is a tool object; invoke() is the supported call
        # path (calling the tool directly is deprecated).
        return transcribe_audio.invoke({"file_path": wav_path})
753
-
754
@tool(parse_docstring=True)
def extract_article_text(url: str, question: str = None) -> str:
    """
    Fetch a webpage and pull out its main article text, narrowing large articles to the relevant parts.

    Args:
        url: The URL of the article to extract.
        question: Optional question to help retrieve relevant parts of long articles.

    Returns:
        The article text, or the output of the large-document processing when a
        question is given and the text exceeds 5000 characters. On failure an
        error message string is returned.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        body = article.text

        is_large = len(body) > 5000 if question else False
        if is_large:
            return process_large_document(body, question)
        return body
    except Exception as exc:
        return f"Error extracting article: {str(exc)}"
779
-
780
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
781
- # ───────────────────────────────────────────────────────────── Tool for ArXiv ────────────────────────────────────────────────────────────
782
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
783
-
784
@tool(parse_docstring=True)
def arvix_search(query: str) -> Dict[str, str]:
    """
    Look up academic papers on ArXiv.

    Args:
        query: The search term to look for in ArXiv.

    Returns:
        A dictionary whose "arvix_results" key holds a JSON string listing up
        to 3 papers, each with "source", "id" and a "summary" of at most 1000
        characters.
    """
    def to_entry(paper) -> Dict[str, str]:
        # Metadata keys vary, hence the chained fallbacks.
        try:
            meta = paper.metadata
            origin = meta.get("source", "ArXiv")
            paper_id = meta.get("id", meta.get("entry_id", ""))
            if hasattr(paper, "page_content"):
                excerpt = paper.page_content[:1000]
            else:
                excerpt = str(paper)[:1000]
            return {"source": origin, "id": paper_id, "summary": excerpt}
        except Exception as exc:
            # A malformed paper becomes an error entry instead of aborting the search.
            return {
                "source": "ArXiv Error",
                "id": "error",
                "summary": f"Error processing paper: {str(exc)}"
            }

    papers = ArxivLoader(query=query, load_max_docs=3).load()
    return {"arvix_results": json.dumps([to_entry(paper) for paper in papers])}
817
-
818
@tool(parse_docstring=True)
def answer_youtube_video_question(
    youtube_url: str,
    question: str,
    chunk_size_seconds: int = 30
) -> str:
    """
    Answer a question based on a YouTube video's transcript.

    Args:
        youtube_url: URL of the YouTube video.
        question: The question to be answered using video content.
        chunk_size_seconds: Duration of each transcript chunk.

    Returns:
        The answer to the question generated from the video transcript.
    """
    # 1. Load the transcript as chunks of `chunk_size_seconds`.
    transcript_chunks = YoutubeLoader.from_youtube_url(
        youtube_url,
        add_video_info=True,
        transcript_format=TranscriptFormat.CHUNKS,
        chunk_size_seconds=chunk_size_seconds,
    ).load()

    # 2. Index the chunks in an in-memory FAISS store.
    index = FAISS.from_documents(
        transcript_chunks,
        HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'),
    )

    # 3. Run a retrieval-QA chain over the index with the Groq-hosted model.
    chain = RetrievalQA.from_chain_type(
        llm=RetryingChatGroq(model="deepseek-r1-distill-llama-70b", streaming=False),
        retriever=index.as_retriever(),
    )
    return chain.run(question)
847
-
848
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
849
- # ───────────────────────────────────────────────────────────── Tool for Python REPL tool ────────────────────────────────────────────────
850
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
851
-
852
# Module-level PythonREPLTool instance, constructed with default arguments.
# NOTE(review): a second instance named `python_repl_tool` is created later in
# this file; confirm whether both names are still needed.
python_repl = PythonREPLTool()
853
-
854
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
855
- # ───────────────────────────────────────────────────────────── Tool for Wiki ───────────────────────────────────────────────────────────
856
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
857
-
858
@tool(parse_docstring=True)
def wiki_search(query: str) -> Dict:
    """
    Search Wikipedia for information on a given topic.

    Args:
        query: The search term for Wikipedia.

    Returns:
        A dictionary whose "wiki_results" entry holds up to 3 formatted page summaries.
    """
    # At most 3 pages are loaded (load_max_docs=3).
    pages = WikipediaLoader(query=query, load_max_docs=3).load()

    # Both metadata fields are read with .get so that a page lacking "source"
    # yields an empty value instead of failing the whole search with KeyError.
    results: List[Dict] = [
        {
            "source": doc.metadata.get("source", ""),
            "page": doc.metadata.get("page", ""),
            "content": doc.page_content[:1000],  # cap each summary at 1000 chars
        }
        for doc in pages
    ]
    # The return value is a dict (not a bare string), matching the annotation.
    return {"wiki_results": format_search_docs(results)}
879
-
880
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
881
- # ───────────────────────────────────── Tool for Image (understanding, captioning & classification) ─────────────────────────────────────────
882
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
883
-
884
def _load_image(img_path: str, resize_to=(512, 512)) -> Image.Image:
    """
    Load, verify, convert, and resize an image.

    Args:
        img_path: Path of the image file to open.
        resize_to: Target size passed to ``Image.resize``; defaults to (512, 512).

    Returns:
        A new RGB image resized to ``resize_to``.

    Raises:
        ValueError: If no path is given, the file is not a recognisable image,
            or any other error occurs while loading. The original exception is
            attached as ``__cause__``.
    """
    if not img_path:
        raise ValueError("No image path provided.")
    try:
        # First pass: integrity check only. The file must be reopened after
        # verify() before the image can actually be used.
        with Image.open(img_path) as probe:
            probe.verify()
        # Second pass: convert() produces an independent in-memory image, so
        # the file handle can be released as soon as the conversion is done.
        with Image.open(img_path) as raw:
            rgb = raw.convert("RGB")
        return rgb.resize(resize_to)
    except UnidentifiedImageError as err:
        raise ValueError(f"File at {img_path} is not a valid image.") from err
    except Exception as e:
        raise ValueError(f"Failed to load image at {img_path}: {e}") from e
901
-
902
def _encode_image_to_base64(img_path: str) -> str:
    """
    Return the image at ``img_path`` as base64 text of its optimized PNG bytes.

    The image is obtained through ``_load_image`` with its default arguments,
    written to an in-memory buffer as PNG, and the buffer is base64-encoded.
    """
    png_buffer = BytesIO()
    _load_image(img_path).save(png_buffer, format="PNG", optimize=True)
    encoded_bytes = base64.b64encode(png_buffer.getvalue())
    return encoded_bytes.decode("utf-8")
910
-
911
@tool
def image_processing(prompt: str, img_path: str) -> str:
    """Process an image using a vision LLM, with OCR fallback.

    Args:
        prompt: Instruction or question related to the image.
        img_path: Path to the image file.

    Returns:
        The model's response or fallback OCR result.
    """
    try:
        import os

        # Bail out early when the path does not point at anything.
        if not os.path.exists(img_path):
            return f"Error: Image file '{img_path}' does not exist."

        try:
            # Stage 1: send the prompt plus the inline base64 PNG to the vision model.
            encoded = _encode_image_to_base64(img_path)
            vision_request = HumanMessage(
                content=f"{prompt}\n\n![](data:image/png;base64,{encoded})"
            )
            vision_llm = RetryingChatGroq(
                model="meta-llama/llama-4-maverick-17b-128e-instruct",
                streaming=False,
                temperature=0,
            )
            try:
                reply = vision_llm.invoke([vision_request])
                if hasattr(reply, 'content'):
                    return reply.content.strip()
                # Plain strings are stripped; anything else is stringified.
                return reply.strip() if isinstance(reply, str) else str(reply)
            except Exception as invoke_err:
                print(f"[LLM invoke error] {invoke_err}")
                # Re-raise so the enclosing handler switches to OCR.
                raise ValueError("LLM invocation failed")
        except Exception as llama_err:
            # Stage 2: OCR fallback on the same image.
            print(f"[LLM vision failed] {llama_err}")
            try:
                fallback_img = _load_image(img_path)
                return pytesseract.image_to_string(fallback_img).strip()
            except Exception as ocr_err:
                print(f"[OCR fallback failed] {ocr_err}")
                return "Unable to process the image. Please check the file and try again."
    except Exception as e:
        # Anything that escaped the two stages above.
        print(f"[image_processing error] {e}")
        return f"Error processing image: {e}"
960
-
961
# PythonREPLTool instance that build_graph lists among its execution tools.
python_repl_tool = PythonREPLTool()
962
-
963
# NOTE(review): the @tool decorator presumably derives the tool's description
# from the docstring below — confirm before rewording it.
@tool
def echo(text: str) -> str:
    """Echo back the input text.

    Args:
        text: The string to be echoed.

    Returns:
        The same text that was provided as input.
    """
    # Identity function: the argument is returned unchanged.
    return text
974
-
975
- # ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
976
- # ─────────────────────────────────────────────── Langgraph Agent ───────────────────────────────────────────────────────────────────────
977
- # ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
978
-
979
-
980
- # Build graph function
981
- from langchain_core.tools import tool
982
- from langchain.chat_models import ChatOpenAI
983
- from langgraph.prebuilt.chat_agent_executor import create_react_agent, AgentState
984
- from langchain.chat_models import init_chat_model
985
-
986
-
987
-
988
def build_graph(provider: str = "groq"):
    """Construct and compile the multi‑agent GAIA workflow StateGraph.

    This graph wires together three React‑style agents into a streamlined pipeline:
    PerceptionAgent → ActionAgent → EvaluationAgent (with appropriate entry/exit points)

    The agents have the following responsibilities:
    - PerceptionAgent: Handles web searches, Wikipedia, ArXiv, and image processing
    - ActionAgent: Performs calculations, file operations, and code analysis
    - EvaluationAgent: Reviews results and ensures the final answer is properly formatted

    All three agents share one LLM instance. Failures at any stage are logged
    and re-raised; because the handlers are nested, a single failure can be
    logged at more than one level.

    Args:
        provider: The name of the LLM provider. Must be "groq".

    Returns:
        CompiledGraph: A compiled LangGraph state machine ready for invocation.

    Raises:
        ValueError: If `provider` is anything other than "groq", or if the
            GROQ_API_KEY environment variable is missing or a placeholder.
    """
    try:
        if provider != "groq":
            raise ValueError("Invalid provider. Expected 'groq'.")

        # Initialize LLM
        try:
            logger.info("Initializing LLM with model: deepseek-r1-distill-llama-70b")
            api_key = os.getenv("GROQ_API_KEY")
            # The key is only validated here; it is not passed to the client explicitly.
            if not api_key or api_key == "default_key_or_placeholder":
                logger.error("GROQ_API_KEY is not set or is using placeholder value")
                raise ValueError("GROQ_API_KEY environment variable is not set properly. Please set a valid API key.")

            llm = RetryingChatGroq(model="deepseek-r1-distill-llama-70b", temperature=0)
            logger.info("LLM initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing LLM: {str(e)}")
            raise

        # General system message for agents (used by PerceptionAgent and ActionAgent)
        sys_msg = SystemMessage(content="""
You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template:

FINAL ANSWER: [YOUR FINAL ANSWER]

YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma-separated list of numbers and/or strings.

If you are asked for a number, don't use commas or units (e.g., $, %, kg) unless specified otherwise.

If you are asked for a string, don't use articles (a, an, the), and don't use abbreviations (e.g., for states).

If you are asked for a comma-separated list, apply the above rules to each element in the list.
""".strip())

        # Special system message for the evaluation agent with stricter formatting requirements
        eval_sys_msg = SystemMessage(content="""
You are a specialized evaluation agent. Your job is to review the work done by other agents
and provide a final, properly formatted answer.

IMPORTANT: You MUST ALWAYS format your answer using this exact template:

FINAL ANSWER: [concise answer]

Rules for formatting the answer:
1. The answer must be extremely concise - use as few words as possible
2. For numeric answers, provide only the number without units unless units are specifically requested
3. For text answers, avoid articles (a, an, the) and unnecessary words
4. For list answers, use a comma-separated format
5. NEVER explain your reasoning in the FINAL ANSWER section
6. NEVER skip the "FINAL ANSWER:" prefix

Example good answers:
FINAL ANSWER: 42
FINAL ANSWER: Paris
FINAL ANSWER: 1912, 1945, 1989

Example bad answers (don't do these):
- Based on my analysis, the answer is 42.
- I think it's Paris because that's the capital of France.
- The years were 1912, 1945, and 1989.

Remember: ALWAYS include "FINAL ANSWER:" followed by the most concise answer possible.
""".strip())

        # Define tools for each agent.
        # These are module-level names looked up when this function runs; several
        # (news_article_search, analyze_code, read_code_file, analyze_python_function)
        # are defined further down in this file than build_graph itself.
        logger.info("Setting up agent tools")
        perception_tools = [web_search, wiki_search, news_article_search, arvix_search, image_processing, echo]
        execution_tools = [
            multiply, add, subtract, divide, modulus,
            download_file, process_excel_to_text,
            read_text_from_pdf, read_text_from_docx,
            transcribe_audio, youtube_audio_processing,
            extract_article_text, answer_youtube_video_question,
            python_repl_tool, analyze_code, read_code_file, analyze_python_function
        ]

        # ─────────────── Agent Creation ───────────────
        logger.info("Creating agents")
        try:
            # Create agents with proper error handling
            PerceptionAgent = create_react_agent(
                model=llm,
                tools=perception_tools,
                prompt=sys_msg,
                state_schema=AgentState,
                name="PerceptionAgent"
            )
            logger.info("Created PerceptionAgent successfully")

            # Combined Planning and Execution agent for better efficiency
            ActionAgent = create_react_agent(
                model=llm,
                tools=execution_tools,  # Has access to all execution tools
                prompt=sys_msg,
                state_schema=AgentState,
                name="ActionAgent"
            )
            logger.info("Created ActionAgent successfully")

            # Evaluation agent with stricter prompt
            EvaluationAgent = create_react_agent(
                model=llm,
                tools=[],  # No tools needed for evaluation
                prompt=eval_sys_msg,  # Use the specialized evaluation prompt
                state_schema=AgentState,
                name="EvaluationAgent"
            )
            logger.info("Created EvaluationAgent successfully")
        except Exception as e:
            logger.error(f"Error creating agent: {str(e)}")
            import traceback
            logger.error(f"Traceback: {traceback.format_exc()}")
            raise

        # Build the StateGraph
        logger.info("Building StateGraph")
        try:
            builder = StateGraph(AgentState)

            # Add agent nodes first
            builder.add_node("PerceptionAgent", PerceptionAgent)
            builder.add_node("ActionAgent", ActionAgent)
            builder.add_node("EvaluationAgent", EvaluationAgent)

            # Define the flow with a starting edge
            builder.set_entry_point("PerceptionAgent")

            # Add the edges for the simpler linear flow (no conditional routing)
            builder.add_edge("PerceptionAgent", "ActionAgent")
            builder.add_edge("ActionAgent", "EvaluationAgent")

            # Set EvaluationAgent as the end node
            builder.set_finish_point("EvaluationAgent")

            logger.info("Compiling StateGraph")
            return builder.compile()
        except Exception as e:
            logger.error(f"Error building graph: {str(e)}")
            import traceback
            logger.error(f"Traceback: {traceback.format_exc()}")
            raise
    except Exception as e:
        # Outermost handler: also sees errors already logged by the inner handlers.
        logger.error(f"Overall error in build_graph: {str(e)}")
        import traceback
        logger.error(f"Traceback: {traceback.format_exc()}")
        raise
1153
-
1154
def get_final_answer(text):
    """Extract just the FINAL ANSWER from the model's response.

    Strategies are tried in order until one yields a non-empty result:
      1. A line starting with "FINAL ANSWER:" (case-insensitive); everything
         after the prefix up to the end of the text is returned.
      2. A known prefix variant anywhere in a line, scanning from the last
         line upwards; the remainder of that line, or the following line.
      3. A phrase such as "The answer is", up to the end of that sentence.
      4. The last paragraph that does not start with "I " or "to ".
      5. The last line longer than three characters.

    Args:
        text: The full text response from the LLM. ``None`` and the empty
            string are both treated as "no response".

    Returns:
        str: The extracted answer without the "FINAL ANSWER:" prefix, or
        "No answer provided." when there is nothing to extract from.
    """
    # Guard first: the debug log below slices `text`, which would raise on None.
    if not text:
        logger.warning("Empty response received")
        return "No answer provided."

    # Log the raw text for debugging if needed
    logger.debug(f"Extracting answer from: {text[:200]}...")

    # Method 1: Look for "FINAL ANSWER:" at the start of a line
    pattern = r'(?:^|\n)FINAL ANSWER:\s*(.*?)(?:\n\s*$|$)'
    match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
    if match:
        logger.debug("Found answer using pattern 1")
        return match.group(1).strip()

    # Method 2: Try looking for variations on the final answer format
    lines = text.split('\n')  # split once; the variants all scan the same lines
    for variant in ("FINAL ANSWER:", "FINAL_ANSWER:", "Final Answer:", "Answer:"):
        for i, line in enumerate(reversed(lines)):
            if variant not in line:
                continue
            logger.debug(f"Found answer using variant: {variant}")
            # Extract everything after the variant text
            answer = line[line.find(variant) + len(variant):].strip()
            if answer:
                return answer
            # If the answer is on the next line, return that.
            # `i` counts from the end, so the following line is lines[len - i].
            if i > 0:
                next_line = lines[len(lines) - i]
                if next_line.strip():
                    return next_line.strip()

    # A period only ends a sentence when followed by whitespace or the end of
    # the text; this keeps decimal numbers such as "3.14" in one piece.
    period_end = re.compile(r'\.(?=\s|$)')

    # Method 3: Look for phrases that suggest an answer
    for phrase in ("The answer is", "The result is", "We get", "Therefore,", "In conclusion,"):
        phrase_pos = text.find(phrase)
        if phrase_pos == -1:
            continue
        end_match = period_end.search(text, phrase_pos)
        if end_match:
            logger.debug(f"Found answer using phrase: {phrase}")
            return text[phrase_pos + len(phrase):end_match.start()].strip()

    # Method 4: Fall back to taking the last paragraph with actual content
    for para in reversed(text.strip().split('\n\n')):
        para = para.strip()
        if para and not para.startswith("I ") and not para.lower().startswith("to "):
            logger.debug("Using last meaningful paragraph")
            # If paragraph is very long, try to extract a concise answer
            if len(para) > 100:
                # Same boundary rule as above so decimals are not split.
                sentences = re.split(r'[.!?](?=\s|$)', para)
                for sentence in reversed(sentences):
                    sent = sentence.strip()
                    if sent and len(sent) > 5 and not sent.startswith("I "):
                        return sent
            return para

    # Method 5: Last resort - just return the last line with content
    for line in reversed(text.strip().split('\n')):
        line = line.strip()
        if line and len(line) > 3:
            logger.debug("Using last line with content")
            return line

    # If everything fails, warn and return the truncated response
    logger.warning("Could not find a properly formatted answer")
    return text[:100] + "..." if len(text) > 100 else text
1230
-
1231
# Manual smoke test: build the graph and run one sample question through it.
# NOTE(review): this guard sits above the definitions of analyze_code,
# read_code_file, analyze_python_function and news_article_search, all of which
# build_graph references. When the file is executed as a script those names are
# not yet bound at this point — consider moving this block to the end of the file.
if __name__ == "__main__":
    question = "When was a picture of St. Thomas Aquinas first added to the Wikipedia page on the Principle of double effect?"
    # Build the graph
    graph = build_graph(provider="groq")
    # Run the graph
    messages = [HumanMessage(content=question)]
    messages = graph.invoke({"messages": messages})
    # The invoke result is indexed by "messages"; print each entry.
    for m in messages["messages"]:
        m.pretty_print()
1241
-
1242
- # ─────────────────────────────────────────────── Tool for Code Analysis ───────────────────────────────────────────────────────────────
1243
@tool
def analyze_code(code_string: str) -> str:
    """Analyze a string of code to understand its structure, functionality, and potential issues.

    Args:
        code_string: The code to analyze as a string.

    Returns:
        A structured analysis of the code including functions, classes, and key operations.
    """
    try:
        import ast

        # Try to parse with Python's AST module
        try:
            parsed = ast.parse(code_string)
        except SyntaxError:
            # Not valid Python: guess the language from crude substring checks.
            if "{" in code_string and "}" in code_string:
                if "function" in code_string or "=>" in code_string:
                    language = "JavaScript/TypeScript"
                elif "func" in code_string or "struct" in code_string:
                    language = "Go or Rust"
                elif "public" in code_string or "private" in code_string or "class" in code_string:
                    language = "Java/C#/C++"
                else:
                    language = "Unknown C-like language"
            elif "<" in code_string and ">" in code_string and (
                "/>" in code_string or "</" in code_string
            ):
                # Both closing-tag forms must actually occur in the input; a
                # bare "</" literal would always be truthy.
                language = "HTML/XML/JSX"
            else:
                language = "Unknown"

            return f"Non-Python code detected ({language}). Basic code structure analysis not available."

        # Walk the tree once and reuse the node list for every query below.
        nodes = list(ast.walk(parsed))

        # Extract functions (sync and async) and classes
        functions = [n.name for n in nodes
                     if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))]
        classes = [n.name for n in nodes if isinstance(n, ast.ClassDef)]

        # Record every alias of `import a, b`, not just the first one;
        # plain imports are listed before from-imports.
        imports = [alias.name
                   for n in nodes if isinstance(n, ast.Import)
                   for alias in n.names]
        imports.extend(f"{n.module}.{alias.name}" if n.module else alias.name
                       for n in nodes if isinstance(n, ast.ImportFrom)
                       for alias in n.names)

        # Count various node types for complexity assessment
        num_loops = sum(isinstance(n, (ast.For, ast.AsyncFor, ast.While)) for n in nodes)
        num_conditionals = sum(isinstance(n, (ast.If, ast.IfExp)) for n in nodes)

        analysis = {
            "language": "Python",
            "functions": functions,
            "classes": classes,
            "imports": imports,
            "complexity": {
                "functions": len(functions),
                "classes": len(classes),
                "loops": num_loops,
                "conditionals": num_conditionals
            }
        }
        return str(analysis)
    except Exception as e:
        return f"Error analyzing code: {str(e)}"
1306
-
1307
@tool
def read_code_file(file_path: str) -> str:
    """Read a code file and return its contents with proper syntax detection.

    Args:
        file_path: Path to the code file.

    Returns:
        The file contents and detected language.
    """
    try:
        import os

        # A missing path is reported as a message rather than raised.
        if not os.path.exists(file_path):
            return f"Error: File '{file_path}' does not exist."

        with open(file_path, 'r', encoding='utf-8') as handle:
            source_text = handle.read()

        # The language is inferred purely from the lower-cased file extension.
        suffix = os.path.splitext(file_path)[1].lower()
        known_languages = {
            '.py': 'Python', '.js': 'JavaScript', '.ts': 'TypeScript',
            '.html': 'HTML', '.css': 'CSS', '.java': 'Java',
            '.c': 'C', '.cpp': 'C++', '.cs': 'C#',
            '.go': 'Go', '.rs': 'Rust', '.php': 'PHP',
            '.rb': 'Ruby', '.sh': 'Shell', '.bat': 'Batch',
            '.ps1': 'PowerShell', '.sql': 'SQL', '.json': 'JSON',
            '.xml': 'XML', '.yaml': 'YAML', '.yml': 'YAML',
        }
        detected = known_languages[suffix] if suffix in known_languages else 'Unknown'

        return f"File content ({detected}):\n\n{source_text}"
    except Exception as err:
        return f"Error reading file: {err}"
1358
-
1359
@tool
def analyze_python_function(function_name: str, code_string: str) -> str:
    """Extract and analyze a specific function from Python code.

    Args:
        function_name: The name of the function to analyze.
        code_string: The complete code containing the function.

    Returns:
        Analysis of the function including parameters, return type, and docstring.
    """
    try:
        import ast

        def _dotted_name(node, fallback):
            # Render a Name or a dotted Attribute chain (e.g. a.b.C) as text;
            # any other annotation shape yields `fallback`.
            if isinstance(node, ast.Name):
                return node.id
            if isinstance(node, ast.Attribute):
                owner = _dotted_name(node.value, None)
                return f"{owner}.{node.attr}" if owner is not None else fallback
            return fallback

        # Parse the code string
        parsed = ast.parse(code_string)

        # Find the first (sync or async) function definition with a matching name
        function_def = None
        for node in ast.walk(parsed):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == function_name:
                function_def = node
                break

        if not function_def:
            return f"Function '{function_name}' not found in the provided code."

        # Extract positional parameters, appending the annotation when present
        params = []
        for arg in function_def.args.args:
            if arg.annotation:
                params.append(f"{arg.arg}: {_dotted_name(arg.annotation, 'complex_type')}")
            else:
                params.append(arg.arg)

        # Extract return type if it exists
        return_type = None
        if function_def.returns:
            return_type = _dotted_name(function_def.returns, "complex_return_type")

        # Extract docstring
        docstring = ast.get_docstring(function_def)

        # Create an explicit string representation that ensures key terms are included.
        # Note: the "Line count" figure is the number of top-level statements in
        # the function body (len(function_def.body)), not a count of source lines.
        result = f"Function '{function_name}' analysis:\n"
        result += f"- Parameters: {', '.join(params)}\n"
        result += f"- Return type: {return_type or 'None specified'}\n"
        result += f"- Docstring: {docstring or 'None'}\n"
        result += f"- Line count: {len(function_def.body)}"

        return result
    except Exception as e:
        return f"Error analyzing function: {str(e)}"
1437
-
1438
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
1439
- # ─────────────────────────────────────────────── Tool for News Article Retrieval ──────────────────────────────────────────────────────────────────────
1440
- # ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
1441
-
1442
@tool
def news_article_search(query: str, top_k: int = 3) -> Dict[str, str]:
    """Search for and retrieve news articles with robust error handling for news sites.

    Args:
        query: The news topic or keywords to search for.
        top_k: Maximum number of articles to retrieve.

    Returns:
        A dictionary with search results formatted as XML-like document entries.
    """
    # First, get URLs from DuckDuckGo with "news" focus
    results = []
    # Domains the search query is restricted to via "site:" filters.
    news_sources = [
        "bbc.com", "reuters.com", "apnews.com", "nasa.gov",
        "space.com", "universetoday.com", "nature.com", "science.org",
        "scientificamerican.com", "nytimes.com", "theguardian.com"
    ]

    # Find news from reliable sources
    try:
        with DDGS() as ddgs:
            # Produces "<query> site:a OR site:b OR ..."
            search_query = f"{query} site:{' OR site:'.join(news_sources)}"
            # Twice as many hits as needed are requested, since hits without a URL are skipped.
            for hit in ddgs.text(search_query, safesearch="On", max_results=top_k*2):
                url = hit.get("href") or hit.get("url", "")
                if not url:
                    continue

                # Add the search snippet first as a fallback
                result = {
                    "source": url,
                    "page": "",
                    "content": hit.get("body", "")[:250],
                    "title": hit.get("title", "")
                }

                # Try to get better content via a more robust method
                try:
                    # These headers are only used by the requests-based fallback
                    # below; Article(url) is constructed without them.
                    headers = {
                        "User-Agent": random.choice([
                            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
                            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
                            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
                        ]),
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                        "Accept-Language": "en-US,en;q=0.5",
                        "Referer": "https://www.google.com/",
                        "DNT": "1",
                        "Connection": "keep-alive",
                        "Upgrade-Insecure-Requests": "1"
                    }

                    # Add a short delay between requests (1 to 2 seconds)
                    time.sleep(1 + random.random())

                    # Try to use newspaper3k for more reliable article extraction
                    from newspaper import Article
                    article = Article(url)
                    article.download()
                    article.parse()

                    # If we got meaningful content, update the result
                    if article.text and len(article.text) > 100:
                        # Summary = first 300 chars of paragraph one, plus the
                        # first 200 chars of paragraph two when there is one.
                        paragraphs = article.text.split('\n\n')
                        first_para = paragraphs[0] if paragraphs else ""
                        summary = first_para[:300]
                        if len(paragraphs) > 1:
                            summary += "... " + paragraphs[1][:200]

                        result["content"] = summary
                        if article.title:
                            result["title"] = article.title

                except Exception as article_err:
                    logger.warning(f"Article extraction failed for {url}: {article_err}")
                    # Fallback to simple requests-based extraction
                    try:
                        resp = requests.get(url, timeout=12, headers=headers)
                        resp.raise_for_status()
                        soup = BeautifulSoup(resp.text, "html.parser")

                        # Try to get main content
                        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')

                        if main_content:
                            # Keep the first 250 whitespace-separated words.
                            content = " ".join(main_content.get_text(separator=" ", strip=True).split()[:250])
                            result["content"] = content
                    except Exception as req_err:
                        logger.warning(f"Fallback extraction failed for {url}: {req_err}")
                        # Keep the original snippet as fallback

                results.append(result)
                if len(results) >= top_k:
                    break

    except Exception as e:
        # A failure of the search itself is returned as a single error document.
        logger.error(f"News search failed: {e}")
        return format_search_docs([{
            "source": "Error",
            "page": "",
            "content": f"Failed to retrieve news articles for '{query}': {str(e)}"
        }])

    if not results:
        # Fallback to regular web search
        logger.info(f"No news results found, falling back to web_search for {query}")
        # NOTE(review): web_search is also handed to the agents as a tool; if it
        # is a @tool-wrapped object, confirm that calling it positionally like
        # this is supported.
        return web_search(query, top_k)

    return format_search_docs(results[:top_k])
1552
-
1553
- # ───────────────────────────────────────────────────────────── Document Chunking Utilities ──────────────────────────────────────────────────────────
1554
def chunk_document(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """
    Split a large document into smaller chunks with overlap to maintain context across chunks.

    Chunks end on the latest sentence boundary (". ", "? " or "! ") inside the
    current window when one exists far enough into the window; otherwise the
    window is cut at ``chunk_size`` characters. Every chunk after the first
    starts ``overlap`` characters before the end of the previous one.

    Args:
        text: The document text to split into chunks
        chunk_size: Maximum size of each chunk in characters
        overlap: Number of characters to overlap between chunks. Values outside
            ``0 .. chunk_size - 1`` are clamped into that range.

    Returns:
        List of text chunks. Text no longer than ``chunk_size`` is returned as
        a single-element list.

    Raises:
        ValueError: If ``chunk_size`` is not positive while the text still
            needs splitting.
    """
    # If text is smaller than chunk_size, return it as is
    if len(text) <= chunk_size:
        return [text]
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # The overlap must be strictly smaller than a chunk, otherwise the start
    # position could never move forward.
    overlap = max(0, min(overlap, chunk_size - 1))

    text_len = len(text)
    chunks = []
    start = 0

    while start < text_len:
        end = min(start + chunk_size, text_len)

        # Try to find a sentence boundary for cleaner breaks
        if end < text_len:
            window = text[start:end]
            boundary = max(window.rfind(sep) for sep in ('. ', '? ', '! '))
            # Only honour a boundary that leaves the chunk longer than the
            # overlap; a shorter chunk would make the next start stall or
            # move backwards.
            if boundary != -1 and boundary + 2 > overlap:
                end = start + boundary + 2  # +2 keeps the punctuation and space

        chunks.append(text[start:end])

        if end >= text_len:
            break
        # Step back by the overlap; guaranteed to be greater than `start`.
        start = end - overlap

    return chunks
1593
-
1594
- # Document processing utility that uses chunking
1595
def process_large_document(text: str, question: str, llm=None) -> str:
    """
    Process a large document by chunking it and using retrieval to find relevant parts.

    The text is split with ``chunk_document``; when that yields more than one
    chunk, the chunks are embedded into an in-memory FAISS index and the two
    chunks most similar to ``question`` are returned, joined by a blank line.
    No language model is called.

    Args:
        text: The document text to process
        question: The question being asked about the document
        llm: Accepted for backward compatibility and currently unused; no
            client is constructed when it is omitted.

    Returns:
        The original text when it fits in a single chunk, otherwise the two
        most relevant chunks. If retrieval fails, the first chunk.
    """
    # Split document into chunks
    chunks = chunk_document(text)

    # If document is small enough, don't bother with retrieval
    if len(chunks) <= 1:
        return text

    # For larger documents, create embeddings to find relevant chunks
    try:
        from langchain_community.embeddings import HuggingFaceEmbeddings
        from langchain.vectorstores import FAISS
        from langchain.schema import Document

        # Create documents with chunk content
        documents = [
            Document(page_content=chunk, metadata={"chunk_id": i})
            for i, chunk in enumerate(chunks)
        ]

        # Create embeddings and vector store
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        vectorstore = FAISS.from_documents(documents, embeddings)

        # Get the top 2 most relevant chunks and return them directly
        relevant_chunks = vectorstore.similarity_search(question, k=2)
        return "\n\n".join(doc.page_content for doc in relevant_chunks)

    except Exception as e:
        # Fall back to first chunk if retrieval fails
        logger.warning(f"Retrieval failed: {e}. Falling back to first chunk.")
        return chunks[0]