umer6016 committed on
Commit
438349d
·
1 Parent(s): 96cf5ed

Improve scraping depth to 20 pages, increase context window to 60k chars, and relax the chatbot prompt rules to allow clearly labeled general knowledge

Browse files
backend/app/services/scrape_pipeline.py CHANGED
@@ -94,7 +94,7 @@ IMPORTANT_PAGE_KEYWORDS = [
94
  'connect', 'social', 'links',
95
  ]
96
 
97
- MAX_PAGES_TO_SCRAPE = 10
98
  REQUEST_TIMEOUT = 15
99
  MAX_RETRIES = 3
100
  RETRY_DELAY = 1.0 # seconds between retries
@@ -1324,15 +1324,15 @@ async def run_full_research_new(url: str, force_refresh: bool = False, progress=
1324
  system_prompt = f"""You are an AI assistant for {raw_name} ({url}).
1325
 
1326
  RULES:
1327
- 1. Answer ONLY from the knowledge base below - never make things up.
1328
- 2. Search the knowledge carefully before saying "I don't know".
1329
- 3. For bio questions, check the HOMEPAGE section first.
1330
- 4. Give partial info if available (e.g., "The site mentions X but not Y...").
1331
- 5. Keep answers concise and helpful.
1332
 
1333
  === KNOWLEDGE BASE ===
1334
 
1335
- {chatbot_context[:10000]}
1336
 
1337
  === END ===
1338
  """
@@ -1470,15 +1470,15 @@ RULES:
1470
  system_prompt = f"""You are an AI assistant for {raw_name} ({url}).
1471
 
1472
  RULES:
1473
- 1. Answer ONLY from the knowledge base below - never make things up.
1474
- 2. Search the knowledge carefully before saying "I don't know".
1475
- 3. For bio questions, check the HOMEPAGE section first.
1476
- 4. Give partial info if available (e.g., "The site mentions X but not Y...").
1477
- 5. Keep answers concise and helpful.
1478
 
1479
  === KNOWLEDGE BASE ===
1480
 
1481
- {chatbot_context[:10000]}
1482
 
1483
  === END ===
1484
  """
 
94
  'connect', 'social', 'links',
95
  ]
96
 
97
+ MAX_PAGES_TO_SCRAPE = 20
98
  REQUEST_TIMEOUT = 15
99
  MAX_RETRIES = 3
100
  RETRY_DELAY = 1.0 # seconds between retries
 
1324
  system_prompt = f"""You are an AI assistant for {raw_name} ({url}).
1325
 
1326
  RULES:
1327
+ 1. Base your answers primarily on the KNOWLEDGE BASE below.
1328
+ 2. If the knowledge base doesn't have the answer, you may use your general knowledge to explain standard concepts or fill minor gaps, but MUST clarify that this info comes from general knowledge, not the specific website.
1329
+ 3. For bio/overview questions, check the HOMEPAGE section first.
1330
+ 4. Keep answers concise, helpful, and use Markdown (bullet points, bold text) for readability.
1331
+ 5. If the user asks about something completely unrelated to the site (e.g. "who is batman"), politely bring them back to the topic of {raw_name}.
1332
 
1333
  === KNOWLEDGE BASE ===
1334
 
1335
+ {chatbot_context[:60000]}
1336
 
1337
  === END ===
1338
  """
 
1470
  system_prompt = f"""You are an AI assistant for {raw_name} ({url}).
1471
 
1472
  RULES:
1473
+ 1. Base your answers primarily on the KNOWLEDGE BASE below.
1474
+ 2. If the knowledge base doesn't have the answer, you may use your general knowledge to explain standard concepts or fill minor gaps, but MUST clarify that this info comes from general knowledge, not the specific website.
1475
+ 3. For bio/overview questions, check the HOMEPAGE section first.
1476
+ 4. Keep answers concise, helpful, and use Markdown (bullet points, bold text) for readability.
1477
+ 5. If the user asks about something completely unrelated to the site (e.g. "who is batman"), politely bring them back to the topic of {raw_name}.
1478
 
1479
  === KNOWLEDGE BASE ===
1480
 
1481
+ {chatbot_context[:60000]}
1482
 
1483
  === END ===
1484
  """