Spaces:
Running
Running
Commit ·
cc2fef3
1
Parent(s): 21a19e7
high-priority call to the database (load_messages) to pull the absolute truth
Browse files- Dockerfile +4 -20
- app/agents/adk_mathminds.py +27 -10
- app/api/main.py +6 -2
- app/core/settings.py +9 -0
- app/tools/web_scraper.py +0 -210
- app/worker/tasks.py +9 -13
- cloudbuild.yaml +6 -2
- frontend/app.py +9 -10
- requirements.txt +5 -3
Dockerfile
CHANGED
|
@@ -1,42 +1,26 @@
|
|
| 1 |
-
# Use official Python runtime as a parent image
|
| 2 |
FROM python:3.12-slim
|
| 3 |
|
| 4 |
-
# Set environment variables
|
| 5 |
ENV PYTHONDONTWRITEBYTECODE=1
|
| 6 |
ENV PYTHONUNBUFFERED=1
|
| 7 |
-
ENV PORT=8000
|
| 8 |
|
| 9 |
-
# Set working directory
|
| 10 |
WORKDIR /app
|
| 11 |
|
| 12 |
-
# Install system dependencies
|
| 13 |
-
# libgl1 and libglib2.0-0 are for OpenCV/Computer Vision
|
| 14 |
-
# curl is for health checks
|
| 15 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 16 |
build-essential \
|
| 17 |
curl \
|
| 18 |
-
|
|
|
|
| 19 |
libglib2.0-0 \
|
| 20 |
&& rm -rf /var/lib/apt/lists/*
|
| 21 |
|
| 22 |
-
# Install Python dependencies
|
| 23 |
COPY requirements.txt .
|
| 24 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 25 |
|
| 26 |
-
# Install Playwright browser and its dependencies
|
| 27 |
-
RUN playwright install chromium
|
| 28 |
-
RUN playwright install-deps chromium
|
| 29 |
-
|
| 30 |
-
# Copy the rest of the application
|
| 31 |
COPY . .
|
| 32 |
|
| 33 |
-
# Create a non-root user and switch to it for security
|
| 34 |
-
# Ensure the user has access to playwright browsers
|
| 35 |
RUN useradd -m appuser && chown -R appuser /app
|
| 36 |
USER appuser
|
| 37 |
|
| 38 |
-
|
| 39 |
-
EXPOSE 8000
|
| 40 |
|
| 41 |
-
|
| 42 |
-
CMD exec gunicorn --bind :$PORT --workers 1 --worker-class uvicorn.workers.UvicornWorker --timeout 0 app.api.main:app
|
|
|
|
|
|
|
| 1 |
FROM python:3.12-slim
|
| 2 |
|
|
|
|
| 3 |
ENV PYTHONDONTWRITEBYTECODE=1
|
| 4 |
ENV PYTHONUNBUFFERED=1
|
|
|
|
| 5 |
|
|
|
|
| 6 |
WORKDIR /app
|
| 7 |
|
|
|
|
|
|
|
|
|
|
| 8 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 9 |
build-essential \
|
| 10 |
curl \
|
| 11 |
+
git \
|
| 12 |
+
libgl1 \
|
| 13 |
libglib2.0-0 \
|
| 14 |
&& rm -rf /var/lib/apt/lists/*
|
| 15 |
|
|
|
|
| 16 |
COPY requirements.txt .
|
| 17 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
COPY . .
|
| 20 |
|
|
|
|
|
|
|
| 21 |
RUN useradd -m appuser && chown -R appuser /app
|
| 22 |
USER appuser
|
| 23 |
|
| 24 |
+
EXPOSE 8080
|
|
|
|
| 25 |
|
| 26 |
+
CMD exec gunicorn --bind :$PORT --workers 1 --worker-class uvicorn.workers.UvicornWorker --timeout 0 app.api.main:app
|
|
|
app/agents/adk_mathminds.py
CHANGED
|
@@ -15,7 +15,6 @@ from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_excep
|
|
| 15 |
|
| 16 |
from app.core.settings import settings
|
| 17 |
from app.core.llm_guard import check_and_increment
|
| 18 |
-
from app.tools.web_scraper import WebScraper
|
| 19 |
from app.tools.symbolic_solver import SymbolicSolver
|
| 20 |
from app.tools.similarity_search import SimilarProblemFinder
|
| 21 |
from app.tools.python_executor import PythonInterpreter
|
|
@@ -43,7 +42,6 @@ class MathMindsADKAgent:
|
|
| 43 |
logger.warning("No Google API Key found. Agent will fail.")
|
| 44 |
|
| 45 |
# Tool instances
|
| 46 |
-
self.web_scraper = WebScraper(headless=True)
|
| 47 |
self.symbolic_solver = SymbolicSolver()
|
| 48 |
self.normalizer = MathQueryNormalizer()
|
| 49 |
self.similar_finder = SimilarProblemFinder()
|
|
@@ -51,17 +49,31 @@ class MathMindsADKAgent:
|
|
| 51 |
self.advanced_ocr = AdvancedOCR()
|
| 52 |
self.vision_analyzer = VisionAnalyzer()
|
| 53 |
|
| 54 |
-
#
|
| 55 |
async def web_search(query: str) -> str:
|
| 56 |
"""
|
| 57 |
Search the internet for current data: prices, news, weather, facts.
|
| 58 |
Args:
|
| 59 |
query: The search query.
|
| 60 |
"""
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
async def math_solver(problem: str) -> str:
|
| 67 |
"""
|
|
@@ -157,9 +169,14 @@ class MathMindsADKAgent:
|
|
| 157 |
"It uses specialized object detection (YOLO) for accurate quantification."
|
| 158 |
"\n3. For GRAPHS, PLOTS, COORDINATE GEOMETRY, or LOG DIAGRAMS: DO NOT use specialized tools. "
|
| 159 |
"Rely on your NATIVE MULTIMODAL VISION to interpret coordinates, slopes, and trends directly."
|
| 160 |
-
"\
|
| 161 |
-
"`execute_python` to
|
| 162 |
-
"\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
)
|
| 164 |
)
|
| 165 |
|
|
|
|
| 15 |
|
| 16 |
from app.core.settings import settings
|
| 17 |
from app.core.llm_guard import check_and_increment
|
|
|
|
| 18 |
from app.tools.symbolic_solver import SymbolicSolver
|
| 19 |
from app.tools.similarity_search import SimilarProblemFinder
|
| 20 |
from app.tools.python_executor import PythonInterpreter
|
|
|
|
| 42 |
logger.warning("No Google API Key found. Agent will fail.")
|
| 43 |
|
| 44 |
# Tool instances
|
|
|
|
| 45 |
self.symbolic_solver = SymbolicSolver()
|
| 46 |
self.normalizer = MathQueryNormalizer()
|
| 47 |
self.similar_finder = SimilarProblemFinder()
|
|
|
|
| 49 |
self.advanced_ocr = AdvancedOCR()
|
| 50 |
self.vision_analyzer = VisionAnalyzer()
|
| 51 |
|
| 52 |
+
# Tool definitions
|
| 53 |
async def web_search(query: str) -> str:
|
| 54 |
"""
|
| 55 |
Search the internet for current data: prices, news, weather, facts.
|
| 56 |
Args:
|
| 57 |
query: The search query.
|
| 58 |
"""
|
| 59 |
+
from google import genai
|
| 60 |
+
from google.genai import types
|
| 61 |
+
|
| 62 |
+
# Using a lightweight flash model for the grounded search
|
| 63 |
+
search_client = genai.Client(api_key=self.api_key)
|
| 64 |
+
try:
|
| 65 |
+
response = search_client.models.generate_content(
|
| 66 |
+
model="gemini-2.5-flash",
|
| 67 |
+
contents=f"Find the latest information for: {query}",
|
| 68 |
+
config=types.GenerateContentConfig(
|
| 69 |
+
tools=[types.Tool(google_search=types.GoogleSearchRetrieval())],
|
| 70 |
+
temperature=0.0
|
| 71 |
+
)
|
| 72 |
+
)
|
| 73 |
+
return response.text or "No specific information found."
|
| 74 |
+
except Exception as e:
|
| 75 |
+
logger.error(f"Native Grounding failed: {e}")
|
| 76 |
+
return f"Error searching web: {str(e)}"
|
| 77 |
|
| 78 |
async def math_solver(problem: str) -> str:
|
| 79 |
"""
|
|
|
|
| 169 |
"It uses specialized object detection (YOLO) for accurate quantification."
|
| 170 |
"\n3. For GRAPHS, PLOTS, COORDINATE GEOMETRY, or LOG DIAGRAMS: DO NOT use specialized tools. "
|
| 171 |
"Rely on your NATIVE MULTIMODAL VISION to interpret coordinates, slopes, and trends directly."
|
| 172 |
+
"\n\nSOLVING & INTERPRETATION GUIDELINES:"
|
| 173 |
+
"\n1. Once you have machine-readable data, use `math_solver` or `execute_python` to solve."
|
| 174 |
+
"\n2. IF `math_solver` FAILS or returns an empty result: Immediately attempt the problem using `execute_python`. "
|
| 175 |
+
"In Python, you can use specialized libraries like `numpy`, `scipy`, or `sympy` for numerical and symbolic solutions."
|
| 176 |
+
"\n3. INTERPRET LATEX: Tool outputs (especially from SymPy) are often in raw LaTeX. "
|
| 177 |
+
"NEVER just display the raw LaTeX to the user. Always explain the steps in clear English. "
|
| 178 |
+
"Wrap LaTeX in `$ ... $` for inline or `$$ ... $$` for blocks so the UI renders it properly."
|
| 179 |
+
"\n\nCRITICAL: Always explain your reasoning before and after using tools. If a tool fails, explain WHY and try a different approach."
|
| 180 |
)
|
| 181 |
)
|
| 182 |
|
app/api/main.py
CHANGED
|
@@ -3,7 +3,7 @@ os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
|
|
| 3 |
from typing import Any, Dict, Optional, List
|
| 4 |
import sys
|
| 5 |
import asyncio
|
| 6 |
-
|
| 7 |
if sys.platform == 'win32':
|
| 8 |
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
|
| 9 |
|
|
@@ -72,6 +72,9 @@ app = FastAPI(
|
|
| 72 |
version="1.0.0",
|
| 73 |
lifespan=lifespan
|
| 74 |
)
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
# CORS Configuration
|
| 77 |
app.add_middleware(
|
|
@@ -427,7 +430,8 @@ async def update_profile(
|
|
| 427 |
|
| 428 |
if __name__ == "__main__":
|
| 429 |
import uvicorn
|
| 430 |
-
|
|
|
|
| 431 |
# ── Auth Endpoints (DECOMMISSIONED - Use Firebase) ──────────────────────────
|
| 432 |
|
| 433 |
@app.post("/auth/signup")
|
|
|
|
| 3 |
from typing import Any, Dict, Optional, List
|
| 4 |
import sys
|
| 5 |
import asyncio
|
| 6 |
+
|
| 7 |
if sys.platform == 'win32':
|
| 8 |
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
|
| 9 |
|
|
|
|
| 72 |
version="1.0.0",
|
| 73 |
lifespan=lifespan
|
| 74 |
)
|
| 75 |
+
@app.get("/")
|
| 76 |
+
async def root():
|
| 77 |
+
return {"message": "MathMinds API running"}
|
| 78 |
|
| 79 |
# CORS Configuration
|
| 80 |
app.add_middleware(
|
|
|
|
| 430 |
|
| 431 |
if __name__ == "__main__":
|
| 432 |
import uvicorn
|
| 433 |
+
port = int(os.environ.get("PORT", 8080))
|
| 434 |
+
uvicorn.run(app, host="0.0.0.0", port=port)
|
| 435 |
# ── Auth Endpoints (DECOMMISSIONED - Use Firebase) ──────────────────────────
|
| 436 |
|
| 437 |
@app.post("/auth/signup")
|
app/core/settings.py
CHANGED
|
@@ -10,6 +10,11 @@ class Settings(BaseSettings):
|
|
| 10 |
# Core API Keys (Required)
|
| 11 |
GOOGLE_API_KEY: str
|
| 12 |
GOOGLE_CLOUD_PROJECT: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
# Environment
|
|
@@ -61,6 +66,10 @@ class Settings(BaseSettings):
|
|
| 61 |
raise ValueError("REDIS_URL must be set in production environment")
|
| 62 |
if not self.FIREBASE_CREDENTIALS_PATH:
|
| 63 |
raise ValueError("FIREBASE_CREDENTIALS_PATH must be set in production environment")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
# Set Defaults for Development
|
| 66 |
else:
|
|
|
|
| 10 |
# Core API Keys (Required)
|
| 11 |
GOOGLE_API_KEY: str
|
| 12 |
GOOGLE_CLOUD_PROJECT: Optional[str] = None
|
| 13 |
+
|
| 14 |
+
# Vertex AI Search (Grounding/Discovery Engine)
|
| 15 |
+
VERTEX_SEARCH_PROJECT_ID: Optional[str] = None
|
| 16 |
+
VERTEX_SEARCH_LOCATION: str = "global"
|
| 17 |
+
VERTEX_SEARCH_DATA_STORE_ID: Optional[str] = None
|
| 18 |
|
| 19 |
|
| 20 |
# Environment
|
|
|
|
| 66 |
raise ValueError("REDIS_URL must be set in production environment")
|
| 67 |
if not self.FIREBASE_CREDENTIALS_PATH:
|
| 68 |
raise ValueError("FIREBASE_CREDENTIALS_PATH must be set in production environment")
|
| 69 |
+
if not self.VERTEX_SEARCH_DATA_STORE_ID:
|
| 70 |
+
# We allow it to be empty if the user wants to fallback to scraping,
|
| 71 |
+
# but for 100% production readiness we should warn.
|
| 72 |
+
logger.warning("VERTEX_SEARCH_DATA_STORE_ID is not set. Scraper will use fallback logic.")
|
| 73 |
|
| 74 |
# Set Defaults for Development
|
| 75 |
else:
|
app/tools/web_scraper.py
DELETED
|
@@ -1,210 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import asyncio
|
| 3 |
-
import re
|
| 4 |
-
import random
|
| 5 |
-
from typing import Optional, Dict, Any
|
| 6 |
-
from playwright.sync_api import sync_playwright
|
| 7 |
-
from playwright_stealth import Stealth
|
| 8 |
-
from bs4 import BeautifulSoup
|
| 9 |
-
from fake_useragent import UserAgent
|
| 10 |
-
|
| 11 |
-
logger = logging.getLogger(__name__)
|
| 12 |
-
|
| 13 |
-
def run_playwright_sync(query: str, headless: bool, extraction_focus: Optional[str] = None) -> Dict[str, Any]:
|
| 14 |
-
"""
|
| 15 |
-
Refined Scraper with Stealth Mode:
|
| 16 |
-
1. Stealth Integration: Uses playwright-stealth to bypass bot detection.
|
| 17 |
-
2. Dynamic Search: Simulates human-like interaction on search engines.
|
| 18 |
-
3. Table Preservation: Converts HTML tables to structured text.
|
| 19 |
-
4. Anti-Detection: Enhanced headers, randomized delays, and metadata masking.
|
| 20 |
-
"""
|
| 21 |
-
ua = UserAgent()
|
| 22 |
-
user_agent = ua.chrome
|
| 23 |
-
|
| 24 |
-
try:
|
| 25 |
-
with sync_playwright() as p:
|
| 26 |
-
# P1: Browser Launch
|
| 27 |
-
browser = p.chromium.launch(headless=headless)
|
| 28 |
-
|
| 29 |
-
# Create context with realistic window size and headers
|
| 30 |
-
context = browser.new_context(
|
| 31 |
-
user_agent=user_agent,
|
| 32 |
-
viewport={"width": 1920, "height": 1080},
|
| 33 |
-
device_scale_factor=1,
|
| 34 |
-
has_touch=False,
|
| 35 |
-
is_mobile=False,
|
| 36 |
-
extra_http_headers={
|
| 37 |
-
"Accept-Language": "en-US,en;q=0.9",
|
| 38 |
-
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
| 39 |
-
"Sec-Fetch-Site": "none",
|
| 40 |
-
"Sec-Fetch-Mode": "navigate",
|
| 41 |
-
"Sec-Fetch-Dest": "document",
|
| 42 |
-
"Upgrade-Insecure-Requests": "1"
|
| 43 |
-
}
|
| 44 |
-
)
|
| 45 |
-
|
| 46 |
-
page = context.new_page()
|
| 47 |
-
|
| 48 |
-
# --- P0: APPLY STEALTH ---
|
| 49 |
-
Stealth().apply_stealth_sync(page)
|
| 50 |
-
|
| 51 |
-
# --- P1: IMPROVED NAVIGATION LOGIC ---
|
| 52 |
-
target_url = None
|
| 53 |
-
if query.startswith("http"):
|
| 54 |
-
target_url = query
|
| 55 |
-
else:
|
| 56 |
-
try:
|
| 57 |
-
logger.info("Performing stealth search on DuckDuckGo...")
|
| 58 |
-
# Go to home page first to establish cookies/session
|
| 59 |
-
page.goto("https://duckduckgo.com/", wait_until="networkidle", timeout=30000)
|
| 60 |
-
|
| 61 |
-
# Human-like delay before typing
|
| 62 |
-
page.wait_for_timeout(random.randint(500, 1500))
|
| 63 |
-
|
| 64 |
-
search_input = 'input[name="q"]'
|
| 65 |
-
page.wait_for_selector(search_input, timeout=10000)
|
| 66 |
-
|
| 67 |
-
# Simulate realistic typing speed
|
| 68 |
-
page.type(search_input, query, delay=random.randint(50, 200))
|
| 69 |
-
page.wait_for_timeout(random.randint(300, 700))
|
| 70 |
-
page.press(search_input, "Enter")
|
| 71 |
-
|
| 72 |
-
# Wait for results
|
| 73 |
-
page.wait_for_load_state("networkidle", timeout=20000)
|
| 74 |
-
|
| 75 |
-
# Robust search for result links
|
| 76 |
-
selectors = [
|
| 77 |
-
'a[data-testid="result-title-a"]',
|
| 78 |
-
'#links .result__a',
|
| 79 |
-
'h2 a',
|
| 80 |
-
'.result__url' # Backup
|
| 81 |
-
]
|
| 82 |
-
|
| 83 |
-
for selector in selectors:
|
| 84 |
-
try:
|
| 85 |
-
logger.info(f"Trying selector: {selector}")
|
| 86 |
-
first_link = page.wait_for_selector(selector, timeout=5000)
|
| 87 |
-
if first_link:
|
| 88 |
-
target_url = first_link.get_attribute("href")
|
| 89 |
-
if target_url:
|
| 90 |
-
logger.info(f"Found target_url: {target_url} using {selector}")
|
| 91 |
-
break
|
| 92 |
-
except Exception:
|
| 93 |
-
continue
|
| 94 |
-
|
| 95 |
-
except Exception as e:
|
| 96 |
-
logger.warning(f"Stealth DDG search failed: {e}. Trying secondary fallback.")
|
| 97 |
-
# Use a clean direct search URL if interactive fails
|
| 98 |
-
target_url = f"https://duckduckgo.com/?q={query}"
|
| 99 |
-
|
| 100 |
-
# CRITICAL: Ensure we have a URL to navigate to
|
| 101 |
-
if not target_url:
|
| 102 |
-
target_url = f"https://duckduckgo.com/?q={query}"
|
| 103 |
-
logger.info(f"Final fallback to DDG search page: {target_url}")
|
| 104 |
-
|
| 105 |
-
logger.info(f"Navigating to final target: {target_url}")
|
| 106 |
-
|
| 107 |
-
# --- P0: PAGE NAVIGATION & CONTENT LOAD ---
|
| 108 |
-
try:
|
| 109 |
-
# Some sites need more time to execute JS after networkidle
|
| 110 |
-
page.goto(target_url, timeout=45000, wait_until="networkidle")
|
| 111 |
-
page.wait_for_timeout(random.randint(1000, 2000))
|
| 112 |
-
except Exception as e:
|
| 113 |
-
logger.warning(f"Deep network idle timeout on {target_url}, using DOM snapshot.")
|
| 114 |
-
# If target_url is None, this will fail. Handled above.
|
| 115 |
-
try:
|
| 116 |
-
page.goto(target_url, wait_until="domcontentloaded", timeout=15000)
|
| 117 |
-
except Exception:
|
| 118 |
-
pass
|
| 119 |
-
|
| 120 |
-
# --- P0: STRUCTURED CONTENT EXTRACTION ---
|
| 121 |
-
html_content = page.content()
|
| 122 |
-
soup = BeautifulSoup(html_content, "html.parser")
|
| 123 |
-
|
| 124 |
-
# Remove purely visual/scripting elements
|
| 125 |
-
for element in soup(["script", "style", "nav", "footer", "header", "noscript", "svg", "iframe", "button"]):
|
| 126 |
-
element.decompose()
|
| 127 |
-
|
| 128 |
-
# PRESERVE TABLES (Enhanced)
|
| 129 |
-
for table in soup.find_all("table"):
|
| 130 |
-
table_text = []
|
| 131 |
-
# Handle headers if present
|
| 132 |
-
thead = table.find("thead")
|
| 133 |
-
if thead:
|
| 134 |
-
headers = [h.get_text(strip=True) for h in thead.find_all(["th", "td"])]
|
| 135 |
-
if any(headers):
|
| 136 |
-
table_text.append(" | ".join(headers))
|
| 137 |
-
table_text.append("-" * len(" | ".join(headers)))
|
| 138 |
-
|
| 139 |
-
# Handle rows
|
| 140 |
-
for row in table.find_all("tr"):
|
| 141 |
-
# Skip rows that are already handled in thead
|
| 142 |
-
if thead and row.parent == thead:
|
| 143 |
-
continue
|
| 144 |
-
cells = [cell.get_text(strip=True) for cell in row.find_all(["td", "th"])]
|
| 145 |
-
if any(cells): # Avoid empty rows
|
| 146 |
-
table_text.append(" | ".join(cells))
|
| 147 |
-
|
| 148 |
-
if table_text:
|
| 149 |
-
table.replace_with("\n[TABLE START]\n" + "\n".join(table_text) + "\n[TABLE END]\n")
|
| 150 |
-
|
| 151 |
-
# Get clean, structured text
|
| 152 |
-
text = soup.get_text(separator="\n", strip=True)
|
| 153 |
-
|
| 154 |
-
# Content Filtering
|
| 155 |
-
if extraction_focus:
|
| 156 |
-
lines = text.split("\n")
|
| 157 |
-
pattern = re.compile(re.escape(extraction_focus), re.IGNORECASE)
|
| 158 |
-
# Capture context (line before and after)
|
| 159 |
-
relevant_content = []
|
| 160 |
-
for i, line in enumerate(lines):
|
| 161 |
-
if pattern.search(line):
|
| 162 |
-
if i > 0: relevant_content.append(lines[i-1])
|
| 163 |
-
relevant_content.append(line)
|
| 164 |
-
if i < len(lines) - 1: relevant_content.append(lines[i+1])
|
| 165 |
-
relevant_content.append("-" * 10)
|
| 166 |
-
|
| 167 |
-
if relevant_content:
|
| 168 |
-
final_content = "\n".join(relevant_content[:60]) # Larger snippet for context
|
| 169 |
-
else:
|
| 170 |
-
final_content = text[:5000] + "\n[Note: Extraction focus term not found]"
|
| 171 |
-
else:
|
| 172 |
-
final_content = text[:10000] # Increased for better LLM performance
|
| 173 |
-
|
| 174 |
-
browser.close()
|
| 175 |
-
|
| 176 |
-
return {
|
| 177 |
-
"source": "web_scraper",
|
| 178 |
-
"url": str(target_url), # Ensure string for JSON safety
|
| 179 |
-
"content": final_content,
|
| 180 |
-
"status": "success"
|
| 181 |
-
}
|
| 182 |
-
|
| 183 |
-
except Exception as e:
|
| 184 |
-
logger.error(f"Stealth scraping failed: {e}")
|
| 185 |
-
return {"source": "web_scraper", "error": str(e), "status": "error"}
|
| 186 |
-
|
| 187 |
-
class WebScraper:
|
| 188 |
-
def __init__(self, headless: bool = True):
|
| 189 |
-
self.headless = headless
|
| 190 |
-
|
| 191 |
-
async def scrape(self, query: str, extraction_focus: Optional[str] = None) -> Dict[str, Any]:
|
| 192 |
-
"""Dispatches the scraping task to the backround worker (Celery)"""
|
| 193 |
-
logger.info(f"WebScraper: Initializing stealth scrape for: {query}")
|
| 194 |
-
|
| 195 |
-
try:
|
| 196 |
-
from app.worker.tasks import scrape_task
|
| 197 |
-
task = scrape_task.delay(query, self.headless, extraction_focus)
|
| 198 |
-
|
| 199 |
-
# Wait for result with a generous timeout (stealth takes longer)
|
| 200 |
-
max_retries = 90
|
| 201 |
-
for _ in range(max_retries):
|
| 202 |
-
if task.ready():
|
| 203 |
-
return task.result
|
| 204 |
-
await asyncio.sleep(1)
|
| 205 |
-
|
| 206 |
-
return {"source": "web_scraper", "error": "Worker timeout (Page was likely too heavy)", "status": "error"}
|
| 207 |
-
|
| 208 |
-
except Exception as e:
|
| 209 |
-
logger.warning(f"Worker unavailable, falling back to local thread: {e}")
|
| 210 |
-
return await asyncio.to_thread(run_playwright_sync, query, self.headless, extraction_focus)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/worker/tasks.py
CHANGED
|
@@ -1,22 +1,18 @@
|
|
| 1 |
import logging
|
| 2 |
from app.worker.celery_app import celery_app
|
| 3 |
-
from app.tools.web_scraper import run_playwright_sync
|
| 4 |
|
| 5 |
logger = logging.getLogger(__name__)
|
| 6 |
|
| 7 |
@celery_app.task(name="app.worker.tasks.scrape_task", bind=True)
|
| 8 |
def scrape_task(self, query: str, headless: bool = True, extraction_focus: str = None):
|
| 9 |
"""
|
| 10 |
-
|
|
|
|
| 11 |
"""
|
| 12 |
-
logger.info(f"
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
"source": "web_scraper",
|
| 20 |
-
"error": str(e),
|
| 21 |
-
"status": "error"
|
| 22 |
-
}
|
|
|
|
| 1 |
import logging
|
| 2 |
from app.worker.celery_app import celery_app
|
|
|
|
| 3 |
|
| 4 |
logger = logging.getLogger(__name__)
|
| 5 |
|
| 6 |
@celery_app.task(name="app.worker.tasks.scrape_task", bind=True)
|
| 7 |
def scrape_task(self, query: str, headless: bool = True, extraction_focus: str = None):
|
| 8 |
"""
|
| 9 |
+
Legacy scraper task. Web search has been migrated to Native Gemini Grounding.
|
| 10 |
+
This task is now a placeholder to maintain backward compatibility with any remaining calls.
|
| 11 |
"""
|
| 12 |
+
logger.info(f"Scrape task {self.request.id} called for: {query} (Legacy/No-Op)")
|
| 13 |
+
return {
|
| 14 |
+
"source": "web_scraper_legacy",
|
| 15 |
+
"content": "Notice: Web search has been migrated to Native Gemini Grounding. Use the web_search tool directly.",
|
| 16 |
+
"status": "success",
|
| 17 |
+
"url": "N/A"
|
| 18 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
cloudbuild.yaml
CHANGED
|
@@ -43,10 +43,14 @@ steps:
|
|
| 43 |
- 'run'
|
| 44 |
- 'deploy'
|
| 45 |
- 'mathminds-worker'
|
| 46 |
-
- '--
|
| 47 |
-
- '--
|
| 48 |
- '--command=celery,-A,app.worker.celery_app,worker,--loglevel=info,--pool=solo'
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
images:
|
| 51 |
- 'gcr.io/$PROJECT_ID/mathminds-backend:latest'
|
| 52 |
- 'gcr.io/$PROJECT_ID/mathminds-frontend:latest'
|
|
|
|
| 43 |
- 'run'
|
| 44 |
- 'deploy'
|
| 45 |
- 'mathminds-worker'
|
| 46 |
+
- '--set-env-vars=ENV=production,REDIS_URL=$_REDIS_URL,MONGO_URI=$_MONGO_URI'
|
| 47 |
+
- '--min-instances=1'
|
| 48 |
- '--command=celery,-A,app.worker.celery_app,worker,--loglevel=info,--pool=solo'
|
| 49 |
|
| 50 |
+
substitutions:
|
| 51 |
+
_REDIS_URL: "redis://localhost:6379/0"
|
| 52 |
+
_MONGO_URI: "mongodb://localhost:27017/mathminds"
|
| 53 |
+
|
| 54 |
images:
|
| 55 |
- 'gcr.io/$PROJECT_ID/mathminds-backend:latest'
|
| 56 |
- 'gcr.io/$PROJECT_ID/mathminds-frontend:latest'
|
frontend/app.py
CHANGED
|
@@ -442,6 +442,12 @@ def chat_interface():
|
|
| 442 |
active_sess = get_active_session()
|
| 443 |
st.title(active_sess["title"] if active_sess else "Chat")
|
| 444 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
# ── 1. Render history ─────────────────────────────────────────────────────
|
| 446 |
for msg in st.session_state.messages:
|
| 447 |
role = msg["role"]
|
|
@@ -592,23 +598,16 @@ def chat_interface():
|
|
| 592 |
|
| 593 |
status_msg.update(label="Solved!", state="complete", expanded=False)
|
| 594 |
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
full_answer if full_answer else "Processed. Check reasoning steps.",
|
| 599 |
-
reasoning="\n".join(logic_trace),
|
| 600 |
-
metadata={"source": "agent"}
|
| 601 |
-
)
|
| 602 |
load_sessions() # Refresh titles
|
| 603 |
elif r.status_code == 401:
|
| 604 |
_clear_user_state()
|
| 605 |
st.session_state.user = None
|
| 606 |
st.error("Session expired. Please log in again.")
|
| 607 |
-
st.rerun()
|
| 608 |
else:
|
| 609 |
st.error(f"Error: {r.status_code}")
|
| 610 |
-
st.session_state.is_processing = False
|
| 611 |
-
st.rerun()
|
| 612 |
|
| 613 |
except Exception as e:
|
| 614 |
st.error(f"Connection error: {e}")
|
|
|
|
| 442 |
active_sess = get_active_session()
|
| 443 |
st.title(active_sess["title"] if active_sess else "Chat")
|
| 444 |
|
| 445 |
+
# ✅ SELF-HEALING: Reset processing lock if assistant has already replied
|
| 446 |
+
if st.session_state.is_processing and st.session_state.messages:
|
| 447 |
+
if st.session_state.messages[-1]["role"] == "assistant":
|
| 448 |
+
st.session_state.is_processing = False
|
| 449 |
+
# No rerun needed here, just continue to render
|
| 450 |
+
|
| 451 |
# ── 1. Render history ─────────────────────────────────────────────────────
|
| 452 |
for msg in st.session_state.messages:
|
| 453 |
role = msg["role"]
|
|
|
|
| 598 |
|
| 599 |
status_msg.update(label="Solved!", state="complete", expanded=False)
|
| 600 |
|
| 601 |
+
# Force sync with database to ensure UI has the latest persisted state
|
| 602 |
+
if st.session_state.active_session_id:
|
| 603 |
+
load_messages(st.session_state.active_session_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 604 |
load_sessions() # Refresh titles
|
| 605 |
elif r.status_code == 401:
|
| 606 |
_clear_user_state()
|
| 607 |
st.session_state.user = None
|
| 608 |
st.error("Session expired. Please log in again.")
|
|
|
|
| 609 |
else:
|
| 610 |
st.error(f"Error: {r.status_code}")
|
|
|
|
|
|
|
| 611 |
|
| 612 |
except Exception as e:
|
| 613 |
st.error(f"Connection error: {e}")
|
requirements.txt
CHANGED
|
@@ -17,14 +17,15 @@ gunicorn
|
|
| 17 |
paddlepaddle
|
| 18 |
paddleocr
|
| 19 |
Pillow
|
| 20 |
-
|
|
|
|
| 21 |
wolframalpha
|
| 22 |
sympy
|
| 23 |
ultralytics
|
| 24 |
streamlit-drawable-canvas
|
| 25 |
pydantic-settings
|
| 26 |
-
beautifulsoup4
|
| 27 |
-
fake-useragent
|
| 28 |
ollama
|
| 29 |
firebase-admin
|
| 30 |
supabase
|
|
@@ -39,4 +40,5 @@ PyJWT
|
|
| 39 |
|
| 40 |
selenium
|
| 41 |
webdriver-manager
|
|
|
|
| 42 |
google-adk @ git+https://github.com/google/adk-python.git@main
|
|
|
|
| 17 |
paddlepaddle
|
| 18 |
paddleocr
|
| 19 |
Pillow
|
| 20 |
+
# Scraping retired (Switched to Native Grounding)
|
| 21 |
+
# playwright
|
| 22 |
wolframalpha
|
| 23 |
sympy
|
| 24 |
ultralytics
|
| 25 |
streamlit-drawable-canvas
|
| 26 |
pydantic-settings
|
| 27 |
+
# beautifulsoup4
|
| 28 |
+
# fake-useragent
|
| 29 |
ollama
|
| 30 |
firebase-admin
|
| 31 |
supabase
|
|
|
|
| 40 |
|
| 41 |
selenium
|
| 42 |
webdriver-manager
|
| 43 |
+
# google-cloud-discoveryengine
|
| 44 |
google-adk @ git+https://github.com/google/adk-python.git@main
|