ghadgemadhuri92 committed on
Commit
c43db9b
·
1 Parent(s): d1ec696

Fixed WebScraper and standardized URLs

Browse files
app/core/settings.py CHANGED
@@ -24,7 +24,7 @@ class Settings(BaseSettings):
24
 
25
  # API Config
26
  API_HOST: str = "0.0.0.0"
27
- API_PORT: int = 8000
28
  LOG_LEVEL: str = "INFO"
29
  TIMEOUT_SECONDS: int = 120
30
 
 
24
 
25
  # API Config
26
  API_HOST: str = "0.0.0.0"
27
+ PORT: int = 8000 # Standard Render/Cloud Run env var
28
  LOG_LEVEL: str = "INFO"
29
  TIMEOUT_SECONDS: int = 120
30
 
app/memory/cache.py CHANGED
@@ -2,6 +2,7 @@ import json
2
  import logging
3
  import os
4
  from typing import Any, Dict, Optional
 
5
 
6
  import redis
7
  from redis.exceptions import RedisError
@@ -23,7 +24,7 @@ class CacheManager:
23
  redis_url: Redis connection string (used if pool not provided).
24
  connection_pool: Existing Redis connection pool.
25
  """
26
- self.redis_url = redis_url or os.getenv("REDIS_URL", "redis://localhost:6379/0")
27
  self.redis_client = None
28
 
29
  try:
 
2
  import logging
3
  import os
4
  from typing import Any, Dict, Optional
5
+ from app.core.settings import settings
6
 
7
  import redis
8
  from redis.exceptions import RedisError
 
24
  redis_url: Redis connection string (used if pool not provided).
25
  connection_pool: Existing Redis connection pool.
26
  """
27
+ self.redis_url = redis_url or settings.REDIS_URL
28
  self.redis_client = None
29
 
30
  try:
app/memory/database.py CHANGED
@@ -1,7 +1,7 @@
1
  import logging
2
  import os
3
- from datetime import datetime, timezone
4
  from typing import Any, Dict, Optional, List
 
5
 
6
  import pymongo
7
  from pymongo import IndexModel, ASCENDING
@@ -24,7 +24,7 @@ class DatabaseManager:
24
  mongo_uri: MongoDB connection string.
25
  client: Existing PyMongo client (shared pool).
26
  """
27
- self.mongo_uri = mongo_uri or os.getenv("MONGO_URI", "mongodb://localhost:27017/")
28
  self.client = None
29
  self.db = None
30
  self.collection = None
@@ -45,9 +45,11 @@ class DatabaseManager:
45
  self.client.server_info()
46
 
47
  # Setup DB and collection
48
- db_name = "mathminds_ai"
49
  try:
50
- uri_db = pymongo.uri_parser.parse_uri(self.mongo_uri).get('database')
 
 
51
  if uri_db:
52
  db_name = uri_db
53
  except Exception:
 
1
  import logging
2
  import os
 
3
  from typing import Any, Dict, Optional, List
4
+ from app.core.settings import settings
5
 
6
  import pymongo
7
  from pymongo import IndexModel, ASCENDING
 
24
  mongo_uri: MongoDB connection string.
25
  client: Existing PyMongo client (shared pool).
26
  """
27
+ self.mongo_uri = mongo_uri or settings.MONGO_URI
28
  self.client = None
29
  self.db = None
30
  self.collection = None
 
45
  self.client.server_info()
46
 
47
  # Setup DB and collection
48
+ db_name = settings.MONGO_DB_NAME
49
  try:
50
+ # If URI contains a DB name, it will override the setting default
51
+ parsed_uri = pymongo.uri_parser.parse_uri(self.mongo_uri)
52
+ uri_db = parsed_uri.get('database')
53
  if uri_db:
54
  db_name = uri_db
55
  except Exception:
app/worker/celery_app.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from celery import Celery
2
+ import os
3
+ from app.core.settings import settings
4
+
5
# Celery application object for the "mathminds" project.
# The same Redis URL (settings.REDIS_URL) is used both as the message broker
# and as the result backend.
celery_app = Celery(
    "mathminds",
    broker=settings.REDIS_URL,
    backend=settings.REDIS_URL,
    # Task modules listed here are imported when the app/worker starts.
    include=["app.worker.tasks"],
)

# Runtime options, supplied as a single mapping.
celery_app.conf.update(
    {
        # JSON everywhere: payloads and results are serialized as JSON only.
        "task_serializer": "json",
        "accept_content": ["json"],
        "result_serializer": "json",
        # Timestamps are handled in UTC.
        "timezone": "UTC",
        "enable_utc": True,
        # Report the STARTED state in addition to the default states.
        "task_track_started": True,
        # Upper bound on a single task's run time: 300 s (5 minutes max).
        "task_time_limit": 300,
    }
)

if __name__ == "__main__":
    celery_app.start()
app/worker/tasks.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from app.worker.celery_app import celery_app
3
+ from app.tools.web_scraper import run_playwright_sync
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
@celery_app.task(name="app.worker.tasks.scrape_task", bind=True)
def scrape_task(self, query: str, headless: bool = True, extraction_focus: str = None):
    """Run the web scraper for ``query`` as a Celery task.

    Args:
        self: Bound task instance (``bind=True``); only its request id is
            read, for logging.
        query: Passed to ``run_playwright_sync`` as the first argument.
        headless: Passed to ``run_playwright_sync`` as the second argument.
        extraction_focus: Passed to ``run_playwright_sync`` as the third
            argument; defaults to ``None``.

    Returns:
        The value returned by ``run_playwright_sync``. If it raises, a dict
        ``{"source": "web_scraper", "error": <message>, "status": "error"}``
        is returned instead; the exception is not re-raised, so callers must
        inspect ``status`` to detect a failure.
    """
    logger.info("Task %s started for query: %s", self.request.id, query)
    try:
        return run_playwright_sync(query, headless, extraction_focus)
    except Exception as e:
        # Deliberate best-effort boundary: the failure is converted into an
        # error payload. logger.exception records the traceback as well, which
        # would otherwise be lost because the exception is swallowed here.
        logger.exception("Task failed: %s", e)
        return {
            "source": "web_scraper",
            "error": str(e),
            "status": "error",
        }
check_redis.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import redis
2
+ import os
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+ redis_url = os.getenv("REDIS_URL", "redis://localhost:6379/0")
7
+
8
def check_redis():
    """Ping the Redis server at the module-level ``redis_url`` and print the outcome.

    Returns:
        bool: ``True`` when the server answered the PING, ``False`` when
        connecting or pinging raised. Exceptions are reported on stdout and
        never propagated.
    """
    # Function-scope import so the block does not depend on new file-level imports.
    from urllib.parse import urlsplit, urlunsplit

    # Redis URLs can embed credentials (redis://user:secret@host:port/db).
    # Mask the password before echoing the URL so it never ends up in
    # terminal scrollback or deployment logs.
    try:
        parts = urlsplit(redis_url)
        userinfo, at_sign, host = parts.netloc.rpartition("@")
        if at_sign:
            username = userinfo.partition(":")[0]
            display_url = urlunsplit(parts._replace(netloc=f"{username}:***@{host}"))
        else:
            display_url = redis_url
    except ValueError:
        display_url = "<unparseable URL>"

    print(f"Checking Redis at: {display_url}")
    client = None
    try:
        # Bounded timeouts: an unreachable or firewalled host should be
        # reported as DOWN quickly instead of hanging on the OS TCP timeout.
        client = redis.from_url(
            redis_url,
            socket_connect_timeout=5,
            socket_timeout=5,
        )
        client.ping()
        print("✅ Redis is UP!")
        return True
    except Exception as e:
        print(f"❌ Redis is DOWN or unreachable: {e}")
        return False
    finally:
        # Release the connection pool's sockets whether or not the ping worked.
        if client is not None:
            client.close()


if __name__ == "__main__":
    check_redis()
debug_celery_worker.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import logging
3
+ import sys
4
+ import os
5
+
6
+ # Add the current directory to sys.path so we can import 'app'
7
+ sys.path.append(os.getcwd())
8
+
9
+ from app.worker.tasks import scrape_task
10
+ import time
11
+
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger(__name__)
14
+
15
async def debug_scrape():
    """Dispatch ``scrape_task`` with a fixed query and poll for its result.

    Polls ``result.ready()`` every 2 seconds for up to 60 seconds, printing
    the task status while waiting. When the result is ready, prints its status
    and the first 200 characters of its content, then returns. Any exception
    raised while dispatching or polling is printed, not propagated.
    """
    print("Triggering Celery Scrape Task...")
    query = "gold rate in india today"

    try:
        # Dispatch task
        result = scrape_task.delay(query)
        print(f"Task ID: {result.id}")

        # Wait for result. A monotonic clock is used for the deadline so that
        # wall-clock adjustments (NTP sync, manual changes) cannot shorten or
        # extend the wait.
        max_wait = 60  # seconds
        deadline = time.monotonic() + max_wait

        while time.monotonic() < deadline:
            if result.ready():
                print("Task Ready!")
                print("Result Status:", result.status)
                # Safely handle potential encoding issues when printing to console
                try:
                    res_content = str(result.result)
                    print(
                        "Result Content (partial):",
                        res_content[:200].encode('ascii', 'ignore').decode('ascii'),
                    )
                except Exception as e:
                    print(f"Result received, but print failed: {e}")
                return

            print(f"Waiting... (status: {result.status})")
            await asyncio.sleep(2)

        print("Task timed out. Is the worker running?")

    except Exception as e:
        print(f"Dispatch failed: {e}")


if __name__ == "__main__":
    asyncio.run(debug_scrape())
frontend/Dockerfile ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Streamlit frontend image.
# NOTE(review): the paths used below (requirements.txt, frontend/app.py) are
# relative to the build context, so this file appears to expect the repository
# root as context (e.g. `docker build -f frontend/Dockerfile .`) - confirm
# against the deployment configuration.

# Use official Python runtime as a parent image
FROM python:3.12-slim

# Set environment variables
# - no .pyc files, unbuffered stdout/stderr so logs show up immediately
# - PORT is the port Streamlit binds to; it can be overridden at run time
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PORT=8501

# Set working directory
WORKDIR /app

# Install system dependencies (curl for health checks)
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
# We reuse the root requirements.txt for simplicity, or we could have a specific one.
# Copied before the rest of the sources so this layer stays cached until the
# requirements change.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . .

# Create a non-root user and switch to it for security
RUN useradd -m appuser && chown -R appuser /app
USER appuser

# Expose Streamlit's default port
EXPOSE 8501

# Command to run the Streamlit app.
# A shell is still needed to expand $PORT, but `exec` replaces that shell with
# the Streamlit process so it runs as PID 1 and receives SIGTERM directly on
# `docker stop` / platform shutdown instead of being killed after the grace
# period.
CMD ["sh", "-c", "exec streamlit run frontend/app.py --server.port=${PORT} --server.address=0.0.0.0"]
frontend/app.py CHANGED
@@ -107,8 +107,8 @@ st.markdown("""
107
  # ====================================================
108
  # Config
109
  # ====================================================
110
- BASE_API_URL = os.getenv("API_BASE_URL", "http://localhost:8000")
111
- API_URL = f"{BASE_API_URL}/solve"
112
 
113
 
114
  # ====================================================
@@ -154,7 +154,7 @@ def load_sessions():
154
  """Fetch THIS user's chat sessions from the backend and populate state."""
155
  try:
156
  headers = get_auth_headers()
157
- response = requests.get(f"{BASE_API_URL}/chat/sessions", headers=headers, timeout=30)
158
  if response.status_code == 200:
159
  st.session_state.chat_sessions = response.json()
160
  # Mark that we've successfully loaded data for this specific user
@@ -197,7 +197,7 @@ def load_messages(session_id):
197
  try:
198
  headers = get_auth_headers()
199
  response = requests.get(
200
- f"{BASE_API_URL}/chat/sessions/{session_id}/messages",
201
  headers=headers, timeout=30
202
  )
203
  if response.status_code == 200:
@@ -232,7 +232,7 @@ def add_message(role, content, sent_to_api=False, **kwargs):
232
  def new_chat():
233
  try:
234
  headers = get_auth_headers()
235
- response = requests.post(f"{BASE_API_URL}/chat/sessions", headers=headers, timeout=30)
236
  if response.status_code == 200:
237
  new_s = response.json()
238
  st.session_state.active_session_id = new_s["session_id"]
@@ -248,7 +248,7 @@ def new_chat():
248
  def delete_chat(sid):
249
  try:
250
  headers = get_auth_headers()
251
- response = requests.delete(f"{BASE_API_URL}/chat/sessions/{sid}", headers=headers, timeout=30)
252
  if response.status_code == 200:
253
  if st.session_state.active_session_id == sid:
254
  st.session_state.active_session_id = None
@@ -265,7 +265,7 @@ def rename_chat(sid, new_title):
265
  try:
266
  headers = get_auth_headers()
267
  response = requests.patch(
268
- f"{BASE_API_URL}/chat/sessions/{sid}",
269
  headers=headers, json={"title": new_title}, timeout=30
270
  )
271
  if response.status_code == 200:
@@ -387,7 +387,7 @@ def profile_interface():
387
 
388
  if "profile_data" not in st.session_state:
389
  try:
390
- r = requests.get(f"{BASE_API_URL}/users/profile", headers=headers, timeout=30)
391
  st.session_state.profile_data = r.json() if r.status_code == 200 else {}
392
  except Exception:
393
  st.session_state.profile_data = {}
@@ -410,7 +410,7 @@ def profile_interface():
410
  if st.form_submit_button("Save Profile", use_container_width=True, type="primary"):
411
  payload = {"display_name": display_name, "math_level": math_level, "interests": interests}
412
  try:
413
- r = requests.post(f"{BASE_API_URL}/users/profile", json=payload, headers=headers)
414
  if r.status_code == 200:
415
  st.success("Profile updated!")
416
  st.session_state.profile_data = payload
 
107
  # ====================================================
108
  # Config
109
  # ====================================================
110
+ BACKEND_URL = os.getenv("BACKEND_URL", "http://localhost:8000")
111
+ API_URL = f"{BACKEND_URL}/solve"
112
 
113
 
114
  # ====================================================
 
154
  """Fetch THIS user's chat sessions from the backend and populate state."""
155
  try:
156
  headers = get_auth_headers()
157
+ response = requests.get(f"{BACKEND_URL}/chat/sessions", headers=headers, timeout=30)
158
  if response.status_code == 200:
159
  st.session_state.chat_sessions = response.json()
160
  # Mark that we've successfully loaded data for this specific user
 
197
  try:
198
  headers = get_auth_headers()
199
  response = requests.get(
200
+ f"{BACKEND_URL}/chat/sessions/{session_id}/messages",
201
  headers=headers, timeout=30
202
  )
203
  if response.status_code == 200:
 
232
  def new_chat():
233
  try:
234
  headers = get_auth_headers()
235
+ response = requests.post(f"{BACKEND_URL}/chat/sessions", headers=headers, timeout=30)
236
  if response.status_code == 200:
237
  new_s = response.json()
238
  st.session_state.active_session_id = new_s["session_id"]
 
248
  def delete_chat(sid):
249
  try:
250
  headers = get_auth_headers()
251
+ response = requests.delete(f"{BACKEND_URL}/chat/sessions/{sid}", headers=headers, timeout=30)
252
  if response.status_code == 200:
253
  if st.session_state.active_session_id == sid:
254
  st.session_state.active_session_id = None
 
265
  try:
266
  headers = get_auth_headers()
267
  response = requests.patch(
268
+ f"{BACKEND_URL}/chat/sessions/{sid}",
269
  headers=headers, json={"title": new_title}, timeout=30
270
  )
271
  if response.status_code == 200:
 
387
 
388
  if "profile_data" not in st.session_state:
389
  try:
390
+ r = requests.get(f"{BACKEND_URL}/users/profile", headers=headers, timeout=30)
391
  st.session_state.profile_data = r.json() if r.status_code == 200 else {}
392
  except Exception:
393
  st.session_state.profile_data = {}
 
410
  if st.form_submit_button("Save Profile", use_container_width=True, type="primary"):
411
  payload = {"display_name": display_name, "math_level": math_level, "interests": interests}
412
  try:
413
+ r = requests.post(f"{BACKEND_URL}/users/profile", json=payload, headers=headers)
414
  if r.status_code == 200:
415
  st.success("Profile updated!")
416
  st.session_state.profile_data = payload
frontend/firebase_utils.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import os
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
+ FIREBASE_WEB_API_KEY = os.getenv("FIREBASE_WEB_API_KEY")
8
+
9
# Upper bound, in seconds, for a single Firebase Auth HTTP call. Without a
# timeout `requests` waits indefinitely, which would freeze the UI on a
# stalled connection.
_FIREBASE_TIMEOUT_SECONDS = 30


def _firebase_auth_request(endpoint, email, password):
    """POST email/password credentials to a Firebase Auth REST ``accounts:`` method.

    Args:
        endpoint: Method name appended after ``accounts:`` in the URL
            (``"signInWithPassword"`` or ``"signUp"``).
        email: Value sent as ``email`` in the JSON payload.
        password: Value sent as ``password`` in the JSON payload.

    Returns:
        A 4-tuple ``(id_token, local_id, email, error_message)``. On HTTP 200
        the first three come from the response JSON and ``error_message`` is
        ``None``; otherwise the first three are ``None`` and
        ``error_message`` describes the failure. Never raises.
    """
    if not FIREBASE_WEB_API_KEY:
        return None, None, None, "FIREBASE_WEB_API_KEY is not set in .env"

    url = (
        "https://identitytoolkit.googleapis.com/v1/"
        f"accounts:{endpoint}?key={FIREBASE_WEB_API_KEY}"
    )
    payload = {
        "email": email,
        "password": password,
        "returnSecureToken": True,
    }

    try:
        response = requests.post(url, json=payload, timeout=_FIREBASE_TIMEOUT_SECONDS)
        data = response.json()

        if response.status_code == 200:
            return data["idToken"], data["localId"], data["email"], None

        error_msg = data.get("error", {}).get("message", "Unknown error")
        return None, None, None, error_msg
    except Exception as e:
        # Deliberate catch-all: network errors, timeouts, non-JSON bodies and
        # unexpected response shapes are all reported through the error slot
        # of the tuple rather than raised.
        return None, None, None, str(e)


def sign_in_with_email(email, password):
    """
    Signs in a user using Firebase Auth REST API.
    Returns (id_token, local_id, email, error_message)
    """
    return _firebase_auth_request("signInWithPassword", email, password)


def sign_up_with_email(email, password):
    """
    Registers a new user using Firebase Auth REST API.
    Returns (id_token, local_id, email, error_message)
    """
    return _firebase_auth_request("signUp", email, password)
test_scraper_local.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import sys
3
+ import os
4
+
5
+ # Add the current directory to sys.path so we can import 'app'
6
+ sys.path.append(os.getcwd())
7
+
8
+ from app.tools.web_scraper import run_playwright_sync
9
+
10
+ logging.basicConfig(level=logging.INFO)
11
+ logger = logging.getLogger(__name__)
12
+
13
def test_direct_scrape():
    """Call ``run_playwright_sync`` directly with a fixed query and print the outcome.

    On a result whose ``status`` is ``"success"`` the URL, the content length
    and the first 500 characters of the content are printed; otherwise the
    result's ``error`` value is printed. Any exception is reported as
    ``Crashed: ...`` instead of propagating.
    """
    print("--- Testing run_playwright_sync DIRECTLY (Subprocess-safe) ---")
    query = "current gold rate in mumbai"

    try:
        # We run it synchronously as it's designed
        result = run_playwright_sync(query, headless=True)

        print("\n[RESULT]")
        if result.get("status") == "success":
            # Normalise once: a missing or None 'content' must not crash the
            # length/sample lines (slicing None raises TypeError).
            content = result.get("content") or ""
            print(f"URL: {result.get('url')}")
            print(f"Content Length: {len(content)}")
            print(f"Sample: {content[:500]}...")
        else:
            print(f"Error: {result.get('error')}")

    except Exception as e:
        print(f"Crashed: {e}")


if __name__ == "__main__":
    test_direct_scrape()