Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -49,6 +49,7 @@ from langgraph.graph import START, END, StateGraph
|
|
| 49 |
from langchain_groq import ChatGroq
|
| 50 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 51 |
|
|
|
|
| 52 |
# RAG
|
| 53 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 54 |
from langchain_community.vectorstores import FAISS
|
|
@@ -763,7 +764,7 @@ class SearchInput(BaseModel):
|
|
| 763 |
@tool(args_schema=SearchInput)
|
| 764 |
@retry_with_backoff(max_retries=3)
|
| 765 |
def search_tool(query: str) -> str:
|
| 766 |
-
"""Web search with caching"""
|
| 767 |
start_time = time.time()
|
| 768 |
|
| 769 |
try:
|
|
@@ -785,7 +786,15 @@ def search_tool(query: str) -> str:
|
|
| 785 |
|
| 786 |
print(f"π Searching: {query}")
|
| 787 |
|
|
|
|
|
|
|
|
|
|
| 788 |
search = DuckDuckGoSearchRun()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 789 |
result = search.run(query)
|
| 790 |
|
| 791 |
if not result or len(result) < 50:
|
|
@@ -1212,110 +1221,122 @@ class ChessAnalysisInput(BaseModel):
|
|
| 1212 |
description: str = Field(description="Context about position", default="")
|
| 1213 |
|
| 1214 |
@tool(args_schema=ChessAnalysisInput)
|
| 1215 |
-
def analyze_chess_position(
|
| 1216 |
"""
|
| 1217 |
-
Analyze chess position using Stockfish.
|
| 1218 |
-
|
| 1219 |
"""
|
| 1220 |
start_time = time.time()
|
| 1221 |
|
| 1222 |
try:
|
| 1223 |
-
print(f"βοΈ Analyzing chess: {
|
| 1224 |
|
| 1225 |
-
# Find
|
| 1226 |
-
|
| 1227 |
-
if not
|
| 1228 |
-
|
| 1229 |
|
| 1230 |
-
if not
|
| 1231 |
-
raise FileNotFoundError(f"
|
| 1232 |
|
| 1233 |
-
# Extract FEN using Gemini Vision
|
| 1234 |
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
|
| 1235 |
if not GOOGLE_API_KEY:
|
| 1236 |
raise ValueError("GEMINI_API_KEY not set")
|
| 1237 |
|
| 1238 |
-
|
| 1239 |
-
|
| 1240 |
-
|
| 1241 |
|
| 1242 |
-
|
| 1243 |
-
|
| 1244 |
-
img_base64 = base64.b64encode(buffered.getvalue()).decode()
|
| 1245 |
-
|
| 1246 |
-
vision_llm = ChatGoogleGenerativeAI(
|
| 1247 |
model="gemini-2.5-flash",
|
| 1248 |
google_api_key=GOOGLE_API_KEY,
|
| 1249 |
temperature=0
|
| 1250 |
)
|
| 1251 |
|
| 1252 |
-
fen_prompt = """Analyze this chess board and provide FEN notation.
|
| 1253 |
-
Return ONLY the FEN string, nothing else.
|
| 1254 |
-
Format: piece_placement active_color castling en_passant halfmove fullmove"""
|
| 1255 |
-
|
| 1256 |
message = HumanMessage(
|
| 1257 |
content=[
|
| 1258 |
-
{
|
| 1259 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1260 |
]
|
| 1261 |
)
|
| 1262 |
|
| 1263 |
-
response =
|
| 1264 |
fen = response.content.strip()
|
| 1265 |
|
| 1266 |
-
# Clean FEN
|
| 1267 |
-
for line in fen.split('\n'):
|
| 1268 |
-
line = line.strip().replace('```', '').replace('fen', '')
|
| 1269 |
-
if '/' in line and ' ' in line:
|
| 1270 |
-
fen = line
|
| 1271 |
-
break
|
| 1272 |
-
|
| 1273 |
print(f"β FEN: {fen}")
|
| 1274 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1275 |
# Analyze with Stockfish
|
| 1276 |
try:
|
| 1277 |
-
|
| 1278 |
-
|
| 1279 |
-
|
| 1280 |
-
|
| 1281 |
-
|
| 1282 |
-
|
| 1283 |
-
|
| 1284 |
-
"/usr/games/stockfish"
|
| 1285 |
-
"/usr/local/bin/stockfish",
|
| 1286 |
-
"/usr/bin/stockfish",
|
| 1287 |
-
"stockfish"
|
| 1288 |
-
]
|
| 1289 |
-
|
| 1290 |
-
stockfish_path = None
|
| 1291 |
-
for path in stockfish_paths:
|
| 1292 |
-
if os.path.exists(path):
|
| 1293 |
-
stockfish_path = path
|
| 1294 |
-
break
|
| 1295 |
|
| 1296 |
-
|
| 1297 |
-
raise FileNotFoundError("Stockfish binary not found. Install: apt-get install stockfish")
|
| 1298 |
|
| 1299 |
-
|
| 1300 |
-
|
|
|
|
|
|
|
| 1301 |
|
| 1302 |
-
|
| 1303 |
-
|
| 1304 |
-
raise ValueError("No legal move found")
|
| 1305 |
|
| 1306 |
-
# Convert to
|
| 1307 |
-
|
| 1308 |
-
uci_move = chess.Move.from_uci(best_move_uci)
|
| 1309 |
-
san_move = board.san(uci_move)
|
| 1310 |
|
| 1311 |
-
print(f"β Best move: {
|
| 1312 |
|
| 1313 |
telemetry.record_call("analyze_chess_position", time.time() - start_time, True)
|
| 1314 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1315 |
|
| 1316 |
except Exception as e:
|
| 1317 |
telemetry.record_call("analyze_chess_position", time.time() - start_time, False)
|
| 1318 |
-
raise ToolError("analyze_chess_position", e, "Check
|
| 1319 |
|
| 1320 |
class ImageAnalysisInput(BaseModel):
|
| 1321 |
file_path: str = Field(description="Image file path")
|
|
@@ -1636,80 +1657,149 @@ class ScrapeInput(BaseModel):
|
|
| 1636 |
@tool(args_schema=ScrapeInput)
|
| 1637 |
@retry_with_backoff(max_retries=3)
|
| 1638 |
def scrape_and_retrieve(url: str, query: str) -> str:
|
| 1639 |
-
"""
|
|
|
|
|
|
|
| 1640 |
start_time = time.time()
|
| 1641 |
|
| 1642 |
try:
|
| 1643 |
-
|
| 1644 |
-
is_valid, msg = validate_tool_inputs("scrape_and_retrieve", {"url": url})
|
| 1645 |
if not is_valid:
|
| 1646 |
raise ValueError(msg)
|
| 1647 |
|
| 1648 |
-
if not rag_manager.is_ready():
|
| 1649 |
-
rag_manager.initialize()
|
| 1650 |
-
|
| 1651 |
-
if not rag_manager.is_ready():
|
| 1652 |
-
raise RuntimeError("RAG not available")
|
| 1653 |
-
|
| 1654 |
print(f"π Scraping: {url}")
|
| 1655 |
-
print(f" Looking for: {query[:
|
| 1656 |
|
| 1657 |
-
|
| 1658 |
-
|
| 1659 |
-
|
| 1660 |
-
|
| 1661 |
-
|
| 1662 |
-
|
| 1663 |
-
|
| 1664 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1665 |
|
| 1666 |
-
#
|
| 1667 |
-
for tag in soup(["script", "style", "nav", "footer", "aside", "header", "iframe"]):
|
| 1668 |
-
tag.extract()
|
| 1669 |
|
| 1670 |
-
|
|
|
|
| 1671 |
|
| 1672 |
-
|
| 1673 |
-
|
|
|
|
| 1674 |
|
| 1675 |
-
text =
|
| 1676 |
-
lines = [l.strip() for l in text.splitlines() if l.strip()]
|
| 1677 |
-
text = '\n'.join(lines)
|
| 1678 |
|
| 1679 |
-
if len(text) <
|
| 1680 |
-
raise ValueError(f"
|
| 1681 |
|
| 1682 |
print(f"β Extracted {len(text)} characters")
|
| 1683 |
|
| 1684 |
# RAG retrieval
|
| 1685 |
-
|
| 1686 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1687 |
|
| 1688 |
-
|
| 1689 |
|
| 1690 |
-
|
| 1691 |
-
|
| 1692 |
-
|
|
|
|
| 1693 |
|
| 1694 |
-
|
| 1695 |
-
del db
|
| 1696 |
-
del retriever
|
| 1697 |
-
import gc
|
| 1698 |
-
gc.collect()
|
| 1699 |
|
| 1700 |
-
|
| 1701 |
-
|
|
|
|
|
|
|
|
|
|
| 1702 |
|
| 1703 |
-
|
| 1704 |
|
| 1705 |
-
|
|
|
|
|
|
|
| 1706 |
|
| 1707 |
telemetry.record_call("scrape_and_retrieve", time.time() - start_time, True)
|
| 1708 |
-
return truncate_if_needed(
|
| 1709 |
|
| 1710 |
-
except requests.Timeout:
|
| 1711 |
-
telemetry.record_call("scrape_and_retrieve", time.time() - start_time, False)
|
| 1712 |
-
raise ToolError("scrape_and_retrieve", TimeoutError("Request timed out"), "Check URL or try later")
|
| 1713 |
except Exception as e:
|
| 1714 |
telemetry.record_call("scrape_and_retrieve", time.time() - start_time, False)
|
| 1715 |
raise ToolError("scrape_and_retrieve", e)
|
|
@@ -1746,34 +1836,37 @@ def analyze_video(file_path: str, query: str) -> str:
|
|
| 1746 |
if not GOOGLE_API_KEY:
|
| 1747 |
raise ValueError("GEMINI_API_KEY not set")
|
| 1748 |
|
| 1749 |
-
#
|
| 1750 |
-
print(f"
|
| 1751 |
-
|
| 1752 |
-
|
| 1753 |
-
|
| 1754 |
-
video_file = genai.upload_file(path=str(video_path))
|
| 1755 |
|
| 1756 |
-
|
| 1757 |
-
while video_file.state.name == "PROCESSING":
|
| 1758 |
-
time.sleep(2)
|
| 1759 |
-
video_file = genai.get_file(video_file.name)
|
| 1760 |
-
|
| 1761 |
-
if video_file.state.name == "FAILED":
|
| 1762 |
-
raise RuntimeError("Video processing failed")
|
| 1763 |
-
|
| 1764 |
-
# Analyze with Gemini
|
| 1765 |
print(f" Analyzing with Gemini...")
|
| 1766 |
-
|
| 1767 |
-
|
| 1768 |
-
|
| 1769 |
-
|
| 1770 |
-
|
| 1771 |
-
])
|
| 1772 |
|
| 1773 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1774 |
|
| 1775 |
-
|
| 1776 |
-
|
| 1777 |
|
| 1778 |
print(f"β Analysis complete: {len(result)} chars")
|
| 1779 |
|
|
@@ -1810,6 +1903,7 @@ defined_tools = [
|
|
| 1810 |
create_plan,
|
| 1811 |
reflect_on_progress,
|
| 1812 |
validate_answer,
|
|
|
|
| 1813 |
|
| 1814 |
# Core tools
|
| 1815 |
search_tool,
|
|
@@ -2071,16 +2165,17 @@ Turn 5: final_answer_tool("3")
|
|
| 2071 |
REMEMBER: wikipedia_search() wants just the SUBJECT NAME!
|
| 2072 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2073 |
|
| 2074 |
-
**YOUTUBE VIDEO
|
| 2075 |
-
β οΈ YouTube URLs
|
| 2076 |
-
|
| 2077 |
-
|
| 2078 |
-
β
|
|
|
|
| 2079 |
|
| 2080 |
Example:
|
| 2081 |
-
|
| 2082 |
-
|
| 2083 |
-
β
CORRECT:
|
| 2084 |
β WRONG: get_youtube_transcript("https://youtube.com/...")
|
| 2085 |
|
| 2086 |
|
|
@@ -2200,6 +2295,36 @@ REMEMBER: One tool per turn. No reasoning without tools. Exact answer format.
|
|
| 2200 |
# Start with Groq
|
| 2201 |
self.llm_with_tools = self.groq_llm
|
| 2202 |
self.current_llm = "groq"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2203 |
|
| 2204 |
# Build agent graph
|
| 2205 |
def agent_node(state: AgentState):
|
|
@@ -2208,6 +2333,8 @@ REMEMBER: One tool per turn. No reasoning without tools. Exact answer format.
|
|
| 2208 |
print(f"\n{'='*70}")
|
| 2209 |
print(f"π€ AGENT TURN {current_turn}/{config.MAX_TURNS}")
|
| 2210 |
print('='*70)
|
|
|
|
|
|
|
| 2211 |
|
| 2212 |
if current_turn > config.MAX_TURNS:
|
| 2213 |
return {
|
|
|
|
| 49 |
from langchain_groq import ChatGroq
|
| 50 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 51 |
|
| 52 |
+
|
| 53 |
# RAG
|
| 54 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 55 |
from langchain_community.vectorstores import FAISS
|
|
|
|
| 764 |
@tool(args_schema=SearchInput)
|
| 765 |
@retry_with_backoff(max_retries=3)
|
| 766 |
def search_tool(query: str) -> str:
|
| 767 |
+
"""Web search with caching and language filtering"""
|
| 768 |
start_time = time.time()
|
| 769 |
|
| 770 |
try:
|
|
|
|
| 786 |
|
| 787 |
print(f"π Searching: {query}")
|
| 788 |
|
| 789 |
+
# DuckDuckGo doesn't support these params directly,
|
| 790 |
+
# but we can filter by adding language hints
|
| 791 |
+
# For English results, add hint to query
|
| 792 |
search = DuckDuckGoSearchRun()
|
| 793 |
+
|
| 794 |
+
# Add language hint to force English results
|
| 795 |
+
if not any(keyword in query.lower() for keyword in ['lang:', 'region:']):
|
| 796 |
+
query = f"{query} lang:en"
|
| 797 |
+
|
| 798 |
result = search.run(query)
|
| 799 |
|
| 800 |
if not result or len(result) < 50:
|
|
|
|
| 1221 |
description: str = Field(description="Context about position", default="")
|
| 1222 |
|
| 1223 |
def _extract_fen(raw_reply: str) -> str:
    """Return the FEN line from a model reply that may carry fences or labels."""
    for line in raw_reply.splitlines():
        candidate = line.replace("`", "").strip()
        # Tolerate a leading "FEN:" label or a ```fen fence tag.
        if candidate.lower().startswith("fen"):
            candidate = candidate[3:].lstrip(" :")
        # Every FEN piece-placement field contains rank separators.
        if "/" in candidate:
            return candidate
    return raw_reply.strip()


def _find_stockfish() -> str:
    """Return the path of a Stockfish binary, or raise FileNotFoundError."""
    import shutil

    for candidate in (
        "/usr/games/stockfish",
        "/usr/local/bin/stockfish",
        "/usr/bin/stockfish",
    ):
        if os.path.exists(candidate):
            return candidate

    on_path = shutil.which("stockfish")
    if on_path:
        return on_path

    raise FileNotFoundError("Stockfish binary not found. Install: apt-get install stockfish")


@tool(args_schema=ChessAnalysisInput)
def analyze_chess_position(file_path: str, description: str = "") -> str:
    """
    Analyze chess position from image using Gemini Vision + Stockfish.
    Extracts FEN, analyzes best move.

    Args:
        file_path: Path (or bare name) of the board image.
        description: Optional context about the position (e.g. whose turn).

    Returns:
        "<SAN move> (<side> to move, from FEN: <fen>)".

    Raises:
        ToolError: wraps any failure (missing file/key, invalid FEN,
            missing engine, no legal move).
    """
    import mimetypes

    start_time = time.time()

    try:
        print(f"♟️ Analyzing chess: {file_path}")

        # Find file
        image_path = find_file(file_path)
        if not image_path and os.path.exists(file_path):
            image_path = Path(file_path)

        if not image_path or not image_path.exists():
            raise FileNotFoundError(f"Image not found: {file_path}")

        GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
        if not GOOGLE_API_KEY:
            raise ValueError("GEMINI_API_KEY not set")

        # Read image as base64; label it with its real MIME type instead of
        # always claiming PNG (falls back to PNG when the type is unknown).
        with open(image_path, "rb") as f:
            image_data = base64.b64encode(f.read()).decode("utf-8")
        mime_type = mimetypes.guess_type(str(image_path))[0] or "image/png"

        # Use Gemini to extract FEN
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.5-flash",
            google_api_key=GOOGLE_API_KEY,
            temperature=0
        )

        fen_prompt = """Analyze this chess position and provide the FEN notation.

CRITICAL: The FEN string MUST include whose turn it is:
- If White to move: end with "w - - 0 1"
- If Black to move: end with "b - - 0 1"

Look at the board carefully to determine whose turn it is based on:
1. Any text in the image indicating whose turn
2. The position context
3. If unclear, look at piece positions

Respond with ONLY the FEN string, nothing else."""
        # The schema exposes `description`; forward it so the model can use
        # hints such as "Black to move" that are not visible in the image.
        if description:
            fen_prompt += f"\n\nAdditional context about the position: {description}"

        message = HumanMessage(
            content=[
                {"type": "text", "text": fen_prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64,{image_data}"},
                },
            ]
        )

        response = llm.invoke([message])
        # Models frequently wrap the answer in code fences or add a label.
        fen = _extract_fen(response.content)

        # Default to White to move when only the piece placement came back.
        if len(fen.split()) < 2:
            fen = f"{fen} w - - 0 1"

        print(f"✓ FEN: {fen}")

        # Analyze with Stockfish
        try:
            board = chess.Board(fen)
        except ValueError as e:
            raise ValueError(f"Invalid FEN from Gemini: {fen}. Error: {e}") from e

        # Trust the parsed board for the side to move, not raw string fields.
        turn_text = "White" if board.turn == chess.WHITE else "Black"
        print(f"✓ Turn: {turn_text}")

        engine = chess.engine.SimpleEngine.popen_uci(_find_stockfish())
        try:
            # Depth 20 so tactical/mate puzzles are resolved reliably.
            info = engine.analyse(board, chess.engine.Limit(depth=20))
        finally:
            # Always reap the engine process, even when analysis fails.
            engine.quit()

        pv = info.get("pv")
        if not pv:
            # No principal variation: position is already mate/stalemate.
            raise ValueError("No legal move found")
        best_move = pv[0]

        # Convert to algebraic notation
        move_san = board.san(best_move)

        print(f"✓ Best move: {move_san}")

        telemetry.record_call("analyze_chess_position", time.time() - start_time, True)
        return f"{move_san} ({turn_text} to move, from FEN: {fen})"

    except Exception as e:
        telemetry.record_call("analyze_chess_position", time.time() - start_time, False)
        raise ToolError("analyze_chess_position", e, "Check image quality and Stockfish installation")
|
| 1340 |
|
| 1341 |
class ImageAnalysisInput(BaseModel):
|
| 1342 |
file_path: str = Field(description="Image file path")
|
|
|
|
| 1657 |
def _wikipedia_404_fallback(url: str, headers: dict):
    """Recover from a 404 on a Wikipedia URL; return (response, resolved_url).

    Tries alternative URL spellings first, then the Wikipedia opensearch API.
    Raises ToolError when nothing resolves.
    """
    import re
    from urllib.parse import unquote

    candidates = []

    # Example: .../Wikipedia:Featured_articles/2016_November -> parent page.
    # (A "#2016" anchor variant is not tried separately: URL fragments are
    # never sent to the server, so it would repeat the same request.)
    year_match = re.search(r"/(20\d{2})", url)
    if year_match and "_" in url:
        candidates.append(url[:year_match.start()])

    # Try with underscores replaced by spaces (URL encoded)
    if "_" in url:
        candidates.append(url.replace("_", "%20"))

    for candidate in candidates:
        print(f"   Trying fallback: {candidate}")
        try:
            response = requests.get(candidate, timeout=15, headers=headers)
            response.raise_for_status()
        except requests.RequestException:
            # Only network/HTTP failures are expected here; anything else
            # (including KeyboardInterrupt) must propagate.
            continue
        print("   ✓ Fallback succeeded!")
        return response, candidate

    print("   All URL fallbacks failed, trying Wikipedia search...")

    # Derive search terms from the last path segment of the URL.
    search_terms = unquote(url.rstrip("/").split("/")[-1]).replace("_", " ")

    # Pass the query via `params` so requests URL-encodes it correctly.
    search_response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "opensearch",
            "search": search_terms,
            "limit": 1,
            "format": "json",
        },
        timeout=10,
        headers=headers,
    )
    search_response.raise_for_status()
    search_data = search_response.json()

    # opensearch replies [query, titles, descriptions, urls].
    if len(search_data) > 3 and search_data[3]:
        wiki_url = search_data[3][0]
        print(f"   ✓ Found via search: {wiki_url}")
        response = requests.get(wiki_url, timeout=15, headers=headers)
        response.raise_for_status()
        return response, wiki_url

    raise ToolError(
        "scrape_and_retrieve",
        Exception(f"404 and all fallbacks failed for {url}"),
        "Try using wikipedia_search tool to find the correct article first"
    )


@tool(args_schema=ScrapeInput)
@retry_with_backoff(max_retries=3)
def scrape_and_retrieve(url: str, query: str) -> str:
    """
    Scrape webpage and retrieve relevant sections using RAG with smart fallbacks.

    Args:
        url: Page to fetch.
        query: What to look for; drives the similarity search over the page.

    Returns:
        The five most relevant text chunks, prefixed with the resolved URL.

    Raises:
        ToolError: on invalid input, fetch failure, too little content, or
            an unavailable RAG backend.
    """
    import gc  # local import: a module-level import is not visible in this view

    start_time = time.time()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        is_valid, msg = validate_tool_inputs("scrape_and_retrieve", {"url": url, "query": query})
        if not is_valid:
            raise ValueError(msg)

        # The embeddings used below come from rag_manager; make sure it is
        # initialised before doing any network work.
        if not rag_manager.is_ready():
            rag_manager.initialize()
        if not rag_manager.is_ready():
            raise RuntimeError("RAG not available")

        print(f"🌐 Scraping: {url}")
        print(f"   Looking for: {query[:50]}...")

        try:
            response = requests.get(url, timeout=15, headers=headers)
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code if e.response is not None else None
            # Only Wikipedia 404s have a recovery strategy.
            if status != 404 or "wikipedia.org" not in url:
                raise
            print("   ⚠ 404 error, trying fallbacks...")
            response, url = _wikipedia_404_fallback(url, headers)

        # Parse content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'header', 'footer']):
            element.decompose()

        text = soup.get_text(separator='\n', strip=True)

        if len(text) < 100:
            raise ValueError(f"Insufficient content extracted from {url}")

        print(f"✓ Extracted {len(text)} characters")

        # RAG retrieval
        docs = [Document(page_content=text, metadata={"source": url})]
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP
        )
        chunks = text_splitter.split_documents(docs)

        print(f"✓ Created {len(chunks)} chunks")

        # Search for relevant chunks
        vectorstore = FAISS.from_documents(chunks, rag_manager.embeddings)
        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
        relevant_docs = retriever.invoke(query)

        print(f"✓ Found {len(relevant_docs)} relevant chunks")

        # Format results
        sections = [
            f"[Section {i}]\n{doc.page_content.strip()}"
            for i, doc in enumerate(relevant_docs, 1)
        ]
        result = f"From {url}:\n\n" + "\n\n".join(sections)

        # Cleanup: the retriever also references the index, so drop both.
        del retriever, vectorstore
        gc.collect()

        telemetry.record_call("scrape_and_retrieve", time.time() - start_time, True)
        return truncate_if_needed(result)

    except ToolError:
        # Already carries a tool name and hint; do not wrap it a second time.
        telemetry.record_call("scrape_and_retrieve", time.time() - start_time, False)
        raise
    except requests.Timeout:
        telemetry.record_call("scrape_and_retrieve", time.time() - start_time, False)
        raise ToolError("scrape_and_retrieve", TimeoutError("Request timed out"), "Check URL or try later")
    except Exception as e:
        telemetry.record_call("scrape_and_retrieve", time.time() - start_time, False)
        raise ToolError("scrape_and_retrieve", e)
|
|
|
|
| 1836 |
if not GOOGLE_API_KEY:
|
| 1837 |
raise ValueError("GEMINI_API_KEY not set")
|
| 1838 |
|
| 1839 |
+
# Read video as base64
|
| 1840 |
+
print(f" Reading video file...")
|
| 1841 |
+
with open(video_path, "rb") as f:
|
| 1842 |
+
video_data = base64.b64encode(f.read()).decode("utf-8")
|
|
|
|
|
|
|
| 1843 |
|
| 1844 |
+
# Use Gemini via LangChain
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1845 |
print(f" Analyzing with Gemini...")
|
| 1846 |
+
llm = ChatGoogleGenerativeAI(
|
| 1847 |
+
model="gemini-2.5-flash",
|
| 1848 |
+
google_api_key=GOOGLE_API_KEY,
|
| 1849 |
+
temperature=0
|
| 1850 |
+
)
|
|
|
|
| 1851 |
|
| 1852 |
+
# Create message with video
|
| 1853 |
+
message = HumanMessage(
|
| 1854 |
+
content=[
|
| 1855 |
+
{
|
| 1856 |
+
"type": "text",
|
| 1857 |
+
"text": query
|
| 1858 |
+
},
|
| 1859 |
+
{
|
| 1860 |
+
"type": "video_url",
|
| 1861 |
+
"video_url": {
|
| 1862 |
+
"url": f"data:video/mp4;base64,{video_data}"
|
| 1863 |
+
}
|
| 1864 |
+
}
|
| 1865 |
+
]
|
| 1866 |
+
)
|
| 1867 |
|
| 1868 |
+
response = llm.invoke([message])
|
| 1869 |
+
result = response.content
|
| 1870 |
|
| 1871 |
print(f"β Analysis complete: {len(result)} chars")
|
| 1872 |
|
|
|
|
| 1903 |
create_plan,
|
| 1904 |
reflect_on_progress,
|
| 1905 |
validate_answer,
|
| 1906 |
+
analyze_data_file,
|
| 1907 |
|
| 1908 |
# Core tools
|
| 1909 |
search_tool,
|
|
|
|
| 2165 |
REMEMBER: wikipedia_search() wants just the SUBJECT NAME!
|
| 2166 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2167 |
|
| 2168 |
+
**YOUTUBE VIDEO HANDLING:**
|
| 2169 |
+
⚠️ YouTube URLs are BLOCKED on HuggingFace Spaces!
|
| 2170 |
+
|
| 2171 |
+
IF question mentions YouTube URL AND local video file exists:
|
| 2172 |
+
→ Use analyze_video tool on the local .mp4 file instead
|
| 2173 |
+
→ The local file contains the same video content
|
| 2174 |
|
| 2175 |
Example:
|
| 2176 |
+
Question: "In video https://youtube.com/watch?v=abc, how many birds?"
|
| 2177 |
+
File: files/task_123.mp4
|
| 2178 |
+
✅ CORRECT: analyze_video("files/task_123.mp4", "count bird species")
|
| 2179 |
❌ WRONG: get_youtube_transcript("https://youtube.com/...")
|
| 2180 |
|
| 2181 |
|
|
|
|
| 2295 |
# Start with Groq
|
| 2296 |
self.llm_with_tools = self.groq_llm
|
| 2297 |
self.current_llm = "groq"
|
| 2298 |
+
|
| 2299 |
+
def prune_context_if_needed(state: AgentState, max_messages: int = 20) -> AgentState:
    """
    Prune conversation history if it's getting too long.
    Keeps system message + recent history to stay under token limits.

    Args:
        state: Agent state; only its "messages" entry is read.
        max_messages: Upper bound on messages kept, system message included.

    Returns:
        The same state object when no pruning is needed, otherwise a shallow
        copy whose "messages" holds the pruned history. The input is never
        mutated.
    """
    # NOTE(review): everything except a leading SystemMessage is eligible for
    # removal, including the original user question — confirm that is intended.
    messages = state.get("messages", [])
    if len(messages) <= max_messages:
        return state

    print(f"⚠️ Context pruning: {len(messages)} messages → {max_messages}")

    # Always keep the system message (if it exists).
    head = []
    rest = messages
    if isinstance(messages[0], SystemMessage):
        head = [messages[0]]
        rest = messages[1:]

    # Fill the remaining budget with the most recent messages. Guard the
    # zero case explicitly: rest[-0:] would return the whole list.
    budget = max_messages - len(head)
    recent = list(rest[-budget:]) if budget > 0 else []

    # A tool result whose originating tool-call message was cut off is
    # rejected by chat APIs; drop such orphans from the front of the window.
    while recent and getattr(recent[0], "type", None) == "tool":
        recent.pop(0)

    return {**state, "messages": head + recent}
|
| 2328 |
|
| 2329 |
# Build agent graph
|
| 2330 |
def agent_node(state: AgentState):
|
|
|
|
| 2333 |
print(f"\n{'='*70}")
|
| 2334 |
print(f"π€ AGENT TURN {current_turn}/{config.MAX_TURNS}")
|
| 2335 |
print('='*70)
|
| 2336 |
+
|
| 2337 |
+
state = prune_context_if_needed(state)
|
| 2338 |
|
| 2339 |
if current_turn > config.MAX_TURNS:
|
| 2340 |
return {
|