BenjaminKaindu0506 committed on
Commit
1be7393
·
0 Parent(s):

Initial commit: UA Student Navigator Chatbot with OpenRouter integration

Browse files
Files changed (11) hide show
  1. .gitignore +33 -0
  2. README_HF.md +67 -0
  3. app.py +407 -0
  4. crawl.py +55 -0
  5. extract.py +124 -0
  6. fetch.py +267 -0
  7. llm.py +506 -0
  8. rank.py +85 -0
  9. rate_limit.py +126 -0
  10. requirements.txt +6 -0
  11. search.py +259 -0
.gitignore ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ venv/
8
+ env/
9
+ ENV/
10
+ .venv
11
+
12
+ # Environment variables
13
+ .env
14
+ .env.local
15
+
16
+ # IDE
17
+ .vscode/
18
+ .idea/
19
+ *.swp
20
+ *.swo
21
+ *~
22
+
23
+ # OS
24
+ .DS_Store
25
+ Thumbs.db
26
+
27
+ # Project specific
28
+ searxng-data/
29
+ *.log
30
+
31
+ # Hugging Face
32
+ .huggingface/
33
+
README_HF.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: UA Student Navigator Chatbot
3
+ emoji: 🎓
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.28.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # 🎓 UA Student Navigator Chatbot
14
+
15
+ A web-retrieval + summarization agent that answers University of Arizona (UA) student questions using only official UA webpages and always shows sources.
16
+
17
+ ## Features
18
+
19
+ - 🔍 **SearXNG Integration**: Searches UA domains (arizona.edu and subdomains)
20
+ - 📄 **Full Page Reading**: Fetches and extracts complete content from webpages
21
+ - 🤖 **OpenRouter Integration**: Access to 30+ free, open-source models
22
+ - 🕰️ **Wayback Machine**: Automatic fallback for blocked pages
23
+ - 🕷️ **Controlled Crawling**: Optionally crawls internal links from user-provided URLs
24
+ - 📊 **Evidence-Based**: All answers are grounded in extracted evidence with citations
25
+ - 🚫 **No Hallucinations**: Explicitly states when information is not found
26
+ - ⚡ **Rate Limit Management**: Smart rate limiting with partial results
27
+
28
+ ## Getting Started
29
+
30
+ 1. **Get an OpenRouter API Key** (free at https://openrouter.ai)
31
+ 2. **Enter your API key** in the sidebar
32
+ 3. **Select a model** from the dropdown (30+ free models available)
33
+ 4. **Ask questions** about University of Arizona!
34
+
35
+ ## Usage
36
+
37
+ ### Search Query
38
+ Ask questions like:
39
+ - "What campus resources are available?"
40
+ - "What are the housing deadlines?"
41
+ - "What majors are in the College of Science?"
42
+
43
+ ### Website URL
44
+ Analyze specific UA webpages by providing a URL and what you're looking for.
45
+
46
+ ## Available Models
47
+
48
+ The app automatically loads free, open-source models including:
49
+ - `xiaomi/mimo-v2-flash:free` - 262K context
50
+ - `deepseek/deepseek-r1-0528:free` - 163K context, reasoning
51
+ - `openai/gpt-oss-20b:free` - 131K context
52
+ - And many more!
53
+
54
+ ## Rate Limits
55
+
56
+ The app automatically manages rate limits and will show partial results if limits are reached. Try different models if one is rate-limited.
57
+
58
+ ## Privacy
59
+
60
+ - All API keys are stored locally in your session
61
+ - No data is stored or logged
62
+ - All searches are limited to arizona.edu domains
63
+
64
+ ## License
65
+
66
+ MIT License - See LICENSE file for details
67
+
app.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ UA Student Navigator Chatbot - Main Streamlit App
3
+ Now with OpenRouter integration for free open-source models!
4
+ """
5
+ import streamlit as st
6
+ import os
7
+ import time
8
+ from dotenv import load_dotenv
9
+ from typing import List, Dict, Optional
10
+
11
+ # Load environment variables
12
+ load_dotenv()
13
+
14
+ # Import our modules
15
+ from search import ua_search
16
+ from fetch import fetch_page
17
+ from extract import extract_evidence
18
+ from rank import rank_pages, rank_snippets
19
+ from llm import get_openrouter_models, generate_answer_openrouter, check_ollama_available, check_model_exists
20
+ from crawl import crawl_website
21
+ from rate_limit import get_rate_limit_tracker
22
+
23
+
24
# Page configuration
st.set_page_config(
    page_title="UA Student Navigator",
    page_icon="🎓",
    layout="wide"
)

# Initialize per-session state slots on first load so later code can
# read them unconditionally.
_SESSION_DEFAULTS = {
    'search_results': [],
    'evidence': None,
    'available_models': [],
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
38
+
39
+
40
def get_available_models(api_key: str):
    """Return the cached OpenRouter model list, fetching it on first use.

    The list is kept in ``st.session_state`` so the (slow) API call only
    happens once per session; the sidebar's "Refresh Models" button
    clears the cache to force a re-fetch.
    """
    if not st.session_state.available_models:
        with st.spinner("Loading available models..."):
            st.session_state.available_models = get_openrouter_models(api_key)
    return st.session_state.available_models
47
+
48
+
49
def process_search_query(query: str, max_results: int = 8, model: Optional[str] = None, api_key: Optional[str] = None):
    """Run a UA-restricted web search and build an evidence bundle.

    Returns ``(evidence, error_message)`` — exactly one of the two is
    None. When *model* and *api_key* are given, the search and fetch
    budget is clamped to stay under the model's rate limits, and the
    loop stops early (returning partial results) once the limit is hit.
    """
    # Clamp the search budget to what the model's rate limit allows.
    if model and api_key:
        rate_tracker = get_rate_limit_tracker()
        max_searches = rate_tracker.calculate_max_searches(model, api_key)
        if max_searches < max_results:
            st.warning(f"⚠️ Rate limit: Reducing search results from {max_results} to {max_searches} to avoid rate limits")
            max_results = max_searches

    with st.spinner("Searching UA websites..."):
        search_results = ua_search(query, max_results=max_results)

    if not search_results:
        return None, """No search results found. This could be because:
1. SearXNG is not accessible (check if it's running or try a different instance)
2. No UA pages matched your query

**You can still use the app:**
- Try the "🌐 Website URL" tab to analyze a specific UA webpage directly"""

    st.info(f"Found {len(search_results)} search results. Fetching pages...")

    pages = []
    rate_limited = False
    for i, result in enumerate(search_results):
        # Re-check the rate limit before every fetch; stop early with
        # whatever we already have instead of blowing past the limit.
        if model and api_key:
            rate_tracker = get_rate_limit_tracker()
            can_request, rate_limit_msg = rate_tracker.can_make_request(model, api_key)
            if not can_request:
                st.warning(f"⚠️ Rate limit reached after processing {i} pages. Showing partial results.")
                rate_limited = True
                break

        with st.spinner(f"Fetching: {result['url']}"):
            page = fetch_page(result['url'])
        if page:
            pages.append(page)

    if not pages:
        return None, "Could not fetch any pages. They may be blocked or unavailable."

    evidence = extract_evidence(pages, query)
    evidence['pages'] = rank_pages(pages, query)
    evidence['snippets'] = rank_snippets(evidence['snippets'], query)[:25]
    evidence['rate_limited'] = rate_limited

    return evidence, None
101
+
102
+
103
def process_url_query(url: str, query: str, max_pages: int = 12, max_depth: int = 1):
    """Fetch a user-supplied URL (optionally crawling its internal links)
    and build an evidence bundle for *query*.

    Returns ``(evidence, error_message)`` — exactly one of the two is None.
    Crawling is skipped entirely when *max_pages* <= 1 or *max_depth* == 0.
    """
    with st.spinner(f"Fetching and analyzing: {url}"):
        page = fetch_page(url)
    if not page:
        return None, f"Could not fetch the page at {url}. Please check the URL."

    pages = [page]

    if max_pages > 1 and max_depth > 0:
        st.info(f"Crawling internal links (max {max_pages} pages, depth {max_depth})...")
        # De-duplicate against the start page (its final URL may differ
        # from what the crawler yields).
        seen_urls = {page['url']}
        for crawled in crawl_website(url, query, max_pages=max_pages, max_depth=max_depth):
            if crawled['url'] not in seen_urls:
                seen_urls.add(crawled['url'])
                pages.append(crawled)

    evidence = extract_evidence(pages, query)
    evidence['pages'] = rank_pages(pages, query)
    evidence['snippets'] = rank_snippets(evidence['snippets'], query)[:25]

    return evidence, None
129
+
130
+
131
def display_results(evidence: Dict, query: str, model: str, api_key: str, use_openrouter: bool = True):
    """Render the full results view for one answered query.

    Sections, in order: generated answer (via OpenRouter or local
    Ollama, selected by *use_openrouter*), key findings, extracted
    dates, and the ranked source list.
    """
    if not evidence:
        return

    snippets = evidence.get('snippets', [])
    source_urls = list(set([s.get('source_url', '') for s in snippets]))

    if snippets:
        st.info(f"📊 Using {len(snippets)} evidence snippets from {len(source_urls)} sources to generate answer...")

    # Progress UI for the (potentially slow) generation call.
    progress_bar = st.progress(0)
    status_text = st.empty()

    try:
        status_text.text("🔄 Connecting to model...")
        progress_bar.progress(10)

        if use_openrouter:
            status_text.text("🤖 Generating answer with OpenRouter...")
            progress_bar.progress(30)
            # Tell the generator when it is working from partial evidence.
            partial_evidence = evidence if evidence.get('rate_limited') else None
            answer, was_rate_limited, rate_limit_msg = generate_answer_openrouter(
                query, snippets, source_urls, model, api_key, partial_evidence
            )

            if was_rate_limited:
                st.warning(f"⚠️ **Rate Limit Reached**\n\n{rate_limit_msg or 'Maximum requests per minute reached for this model.'}")
                st.info("💡 **Tip:** Try a different model or wait a moment before trying again.")
        else:
            status_text.text("🖥️ Generating answer with Ollama...")
            progress_bar.progress(30)
            from llm import generate_answer
            ollama_host = os.getenv('OLLAMA_HOST', 'http://localhost:11434')
            answer = generate_answer(query, snippets, source_urls, ollama_host, model)
            was_rate_limited = False
            rate_limit_msg = None

        progress_bar.progress(100)
        status_text.text("✅ Answer generated!")

    except Exception as e:
        progress_bar.progress(0)
        status_text.text(f"❌ Error: {str(e)}")
        answer = f"Error generating answer: {str(e)}"
        was_rate_limited = False
        rate_limit_msg = None

    finally:
        # Let the final status flash briefly, then clear the progress UI.
        time.sleep(0.5)
        progress_bar.empty()
        status_text.empty()

    st.header("📝 Final Answer")
    st.markdown(answer)

    st.header("🔍 Key Findings")
    findings = evidence.get('snippets', [])[:10]
    if findings:
        for i, finding in enumerate(findings, 1):
            st.markdown(f"**{i}.** {finding.get('text', '')}")
            st.caption(f"Source: [{finding.get('source_title', 'Link')}]({finding.get('source_url', '')})")
    else:
        st.info("No specific findings extracted.")

    dates = evidence.get('dates', [])
    if dates:
        st.header("📅 Dates Found")
        for date_info in dates[:10]:
            st.markdown(f"**{date_info.get('date', 'Unknown')}**")
            st.markdown(f"*Context:* {date_info.get('context', '')}")
            st.caption(f"Source: [{date_info.get('source_title', 'Link')}]({date_info.get('source_url', '')})")

    st.header("📚 Sources")
    for i, page in enumerate(evidence.get('pages', [])[:10], 1):
        suffix = " (via Wayback Machine)" if page.get('source') == 'wayback_machine' else ""
        st.markdown(f"**{i}.** [{page.get('title', 'No title')}]({page.get('url', '')}){suffix}")
        st.caption(f"Relevance score: {page.get('relevance_score', 0):.2f}")
221
+
222
+
223
# Label used by the provider radio button. It was previously repeated as a
# literal in four separate comparisons; one constant keeps them in sync.
OPENROUTER_LABEL = "OpenRouter (Free Open-Source Models)"

# Main UI
st.title("🎓 UA Student Navigator Chatbot")
st.markdown("Ask questions about University of Arizona using official UA webpages only.")

# Sidebar for configuration
with st.sidebar:
    st.header("⚙️ Configuration")

    # OpenRouter API Key
    default_key = os.getenv('OPENROUTER_API_KEY', '')
    api_key = st.text_input(
        "OpenRouter API Key",
        value=default_key,
        type="password",
        help="Your OpenRouter API key"
    )

    if not api_key or api_key.strip() == '':
        st.warning("⚠️ Please enter your OpenRouter API key")
        st.stop()

    api_key = api_key.strip()  # Remove any whitespace

    # Model Selection
    st.subheader("🤖 Model Selection")
    use_openrouter = st.radio(
        "LLM Provider",
        [OPENROUTER_LABEL, "Ollama (Local)"],
        index=0,
        help="Choose between OpenRouter cloud models or local Ollama"
    )
    # Single flag replaces repeated string comparisons below.
    is_openrouter = use_openrouter == OPENROUTER_LABEL

    if is_openrouter:
        # Get available models (cached in session state)
        models = get_available_models(api_key)

        if models:
            model_options = {f"{m['name']} ({m['context_length']:,} tokens)": m['id'] for m in models[:15]}
            selected_model_name = st.selectbox(
                "Select Model",
                options=list(model_options.keys()),
                index=0,
                help="Free open-source models available via OpenRouter"
            )
            selected_model = model_options[selected_model_name]

            # Show model info and rate limits
            selected_model_info = next((m for m in models if m['id'] == selected_model), None)
            if selected_model_info:
                rate_tracker = get_rate_limit_tracker()
                model_limits = rate_tracker.get_model_rate_limit(selected_model, api_key)
                remaining = rate_tracker.get_remaining_requests(selected_model, api_key)
                max_searches = rate_tracker.calculate_max_searches(selected_model, api_key)

                with st.expander("Model Details"):
                    st.write(f"**ID:** {selected_model_info['id']}")
                    st.write(f"**Context Length:** {selected_model_info['context_length']:,} tokens")
                    st.write(f"**Description:** {selected_model_info['description']}")
                    st.write("---")
                    st.write("**Rate Limits:**")
                    st.write(f"- Requests per minute: {model_limits.get('requests_per_minute', 'N/A')}")
                    st.write(f"- Remaining requests: {remaining}/{model_limits.get('requests_per_minute', 'N/A')}")
                    st.write(f"- Max searches recommended: {max_searches}")
        else:
            st.error("Could not load models. Check your API key.")
            st.stop()
    else:
        # Ollama configuration
        ollama_host = st.text_input(
            "Ollama Host",
            value=os.getenv('OLLAMA_HOST', 'http://localhost:11434'),
            help="Ollama API endpoint"
        )
        os.environ['OLLAMA_HOST'] = ollama_host

        ollama_model = st.text_input(
            "Ollama Model",
            value=os.getenv('OLLAMA_MODEL', 'qwen2.5:7b'),
            help="Model name (must be pulled locally)"
        )
        os.environ['OLLAMA_MODEL'] = ollama_model

        selected_model = ollama_model

        # Check Ollama status
        available, error = check_ollama_available(ollama_host)
        if available:
            st.success("✓ Ollama is running")
            exists, error, suggested = check_model_exists(ollama_model, ollama_host)
            if exists:
                st.success(f"✓ Model '{ollama_model}' is available")
            else:
                st.error(f"✗ {error}")
        else:
            st.error("✗ Ollama not running")

    # SearXNG Configuration
    st.subheader("🔍 SearXNG Settings")
    searxng_url = st.text_input(
        "SearXNG URL",
        value=os.getenv('SEARXNG_URL', 'http://localhost:8080'),
        help="URL of your SearXNG instance"
    )
    os.environ['SEARXNG_URL'] = searxng_url

    st.markdown("---")
    if st.button("🔄 Refresh Models"):
        st.session_state.available_models = []
        st.rerun()


# Main interface
tab1, tab2 = st.tabs(["🔍 Search Query", "🌐 Website URL"])

with tab1:
    st.header("Ask a Question")
    query = st.text_input(
        "Enter your question about UA:",
        placeholder="e.g., What are the deadlines for housing?",
        key="search_query"
    )

    max_results = st.slider("Max search results", 3, 15, 8)

    if st.button("Search", type="primary"):
        if not query:
            st.warning("Please enter a question.")
        else:
            # Only pass the model for rate-limit management when using OpenRouter.
            selected_model_for_search = selected_model if is_openrouter else None
            evidence, error = process_search_query(query, max_results=max_results, model=selected_model_for_search, api_key=api_key)

            if error:
                st.error(error)
            elif evidence:
                st.session_state.evidence = evidence
                display_results(evidence, query, selected_model, api_key, is_openrouter)
            else:
                st.error("No evidence extracted.")

with tab2:
    st.header("Analyze a UA Webpage")
    url = st.text_input(
        "Enter a UA webpage URL:",
        placeholder="https://global.arizona.edu/...",
        key="url_input"
    )

    url_query = st.text_input(
        "What information are you looking for?",
        placeholder="e.g., What do international students need to do for CPT?",
        key="url_query"
    )

    col1, col2 = st.columns(2)
    with col1:
        max_pages = st.number_input("Max pages to crawl", 1, 20, 12)
    with col2:
        max_depth = st.number_input("Max crawl depth", 0, 2, 1)

    if st.button("Analyze Website", type="primary"):
        if not url:
            st.warning("Please enter a URL.")
        elif not url_query:
            st.warning("Please enter what you're looking for.")
        else:
            evidence, error = process_url_query(url, url_query, max_pages=max_pages, max_depth=max_depth)

            if error:
                st.error(error)
            elif evidence:
                st.session_state.evidence = evidence
                display_results(evidence, url_query, selected_model, api_key, is_openrouter)
            else:
                st.error("No evidence extracted.")

# Display cached results if available
if st.session_state.evidence:
    st.sidebar.markdown("---")
    if st.sidebar.button("Clear Results"):
        st.session_state.evidence = None
        st.rerun()
407
+
crawl.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Controlled website crawler for user-provided URLs.
3
+ """
4
+ from typing import List, Dict, Set
5
+ from urllib.parse import urlparse, urljoin
6
+ from fetch import fetch_page, get_internal_links
7
+ import time
8
+
9
+
10
def crawl_website(start_url: str, query: str, max_pages: int = 12, max_depth: int = 1) -> List[Dict[str, str]]:
    """Breadth-first crawl of a site starting from *start_url*.

    Follows internal (same-domain) links up to *max_depth* hops away and
    stops once *max_pages* pages have been fetched successfully.

    Args:
        start_url: Page to start crawling from.
        query: User query (not used for link selection here; kept for
            interface compatibility with callers that pass it).
        max_pages: Maximum number of pages to return.
        max_depth: Maximum link depth to follow (0 = only the start page).

    Returns:
        List of page dicts as produced by ``fetch_page``.
    """
    pages: List[Dict[str, str]] = []
    visited: Set[str] = set()
    to_visit: List[tuple[str, int]] = [(start_url, 0)]

    while to_visit and len(pages) < max_pages:
        current_url, depth = to_visit.pop(0)

        if current_url in visited or depth > max_depth:
            continue
        # Mark as visited *before* fetching so a URL whose fetch fails is
        # never re-queued and re-fetched through another link path (the
        # original only marked successful fetches as visited).
        visited.add(current_url)

        print(f"Fetching (depth {depth}): {current_url}")
        page = fetch_page(current_url)

        if page:
            pages.append(page)

            if depth < max_depth and len(pages) < max_pages:
                # NOTE: fetch_page does not expose the raw HTML, so the page
                # is fetched a second time here just to harvest its links.
                try:
                    import httpx
                    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
                    with httpx.Client(timeout=10.0, follow_redirects=True) as client:
                        response = client.get(current_url, headers=headers)
                        if response.status_code == 200 and 'text/html' in response.headers.get('content-type', '').lower():
                            links = get_internal_links(response.text, current_url, same_domain_only=True)
                            for link in links:
                                if link not in visited:
                                    to_visit.append((link, depth + 1))
                except Exception as e:
                    print(f"Error getting links from {current_url}: {e}")

        # Politeness delay between requests.
        time.sleep(0.5)

    return pages
55
+
extract.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Extract evidence, dates, and relevant snippets from content.
3
+ """
4
+ import re
5
+ from typing import List, Dict
6
+ from datetime import datetime
7
+
8
+
9
def extract_dates(text: str) -> List[Dict[str, str]]:
    """Extract dates from *text* in several common formats.

    Recognized formats:
      - "January 2, 2024" (month name, comma optional)
      - "1/2/2024" or "1-2-2024"
      - ISO-like "2024-1-2" (normalized to "1/2/2024")
      - Semester names such as "Fall 2024"

    Returns:
        List of dicts with 'date' (normalized string), 'context'
        (roughly ±50 characters around the match) and 'raw' keys,
        de-duplicated by normalized date string (first occurrence wins).
    """

    def record(match: re.Match, date_str: str) -> Dict[str, str]:
        # Shared context-slicing: previously duplicated for all four patterns.
        start = max(0, match.start() - 50)
        end = min(len(text), match.end() + 50)
        return {'date': date_str, 'context': text[start:end].strip(), 'raw': match.group()}

    dates: List[Dict[str, str]] = []

    # "Month D, YYYY"
    pattern1 = r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),?\s+(\d{4})\b'
    for match in re.finditer(pattern1, text, re.IGNORECASE):
        month, day, year = match.groups()
        dates.append(record(match, f"{month} {day}, {year}"))

    # "M/D/YYYY" or "M-D-YYYY"
    pattern2 = r'\b(\d{1,2})[/-](\d{1,2})[/-](\d{4})\b'
    for match in re.finditer(pattern2, text):
        month, day, year = match.groups()
        dates.append(record(match, f"{month}/{day}/{year}"))

    # ISO-like "YYYY-M-D", normalized to "M/D/YYYY"
    pattern3 = r'\b(\d{4})-(\d{1,2})-(\d{1,2})\b'
    for match in re.finditer(pattern3, text):
        year, month, day = match.groups()
        dates.append(record(match, f"{month}/{day}/{year}"))

    # Semester references like "Fall 2025"
    pattern4 = r'\b(Spring|Summer|Fall|Winter)\s+(\d{4})\b'
    for match in re.finditer(pattern4, text, re.IGNORECASE):
        season, year = match.groups()
        dates.append(record(match, f"{season} {year}"))

    # Keep only the first occurrence of each normalized date string.
    seen = set()
    unique_dates = []
    for d in dates:
        if d['date'] not in seen:
            seen.add(d['date'])
            unique_dates.append(d)

    return unique_dates
57
+
58
+
59
def extract_relevant_snippets(content: str, query: str, max_snippets: int = 15, snippet_length: int = 300) -> List[str]:
    """Pick the sentences of *content* most relevant to *query*.

    Sentences are scored by how many query keywords (words longer than
    three characters) they contain; ties keep document order. Each
    returned snippet is truncated to *snippet_length* characters, and
    snippets of 50 characters or fewer are skipped. When fewer than
    *max_snippets* keyword matches exist, remaining slots are filled
    with unscored sentences in document order.
    """
    keywords = [word.lower() for word in query.split() if len(word) > 3]
    sentences = re.split(r'[.!?]+', content)

    # Without usable keywords, just return the leading sentences.
    if not keywords:
        return [s.strip()[:snippet_length] for s in sentences[:max_snippets] if s.strip()]

    hits = []
    for sentence in sentences:
        lowered = sentence.lower()
        matched = sum(keyword in lowered for keyword in keywords)
        if matched:
            hits.append((matched, sentence.strip()))
    # Stable sort: equal scores stay in document order.
    hits.sort(key=lambda pair: pair[0], reverse=True)

    chosen: List[str] = []
    seen = set()

    def take(candidate: str) -> None:
        # Truncate, then keep only sufficiently long, unseen snippets.
        clipped = candidate[:snippet_length]
        if len(clipped) > 50 and clipped not in seen:
            seen.add(clipped)
            chosen.append(clipped)

    for _, sentence in hits[:max_snippets * 2]:
        take(sentence)
        if len(chosen) >= max_snippets:
            break

    # Top up with unscored sentences if keyword matches were scarce.
    if len(chosen) < max_snippets:
        for sentence in sentences:
            take(sentence.strip())
            if len(chosen) >= max_snippets:
                break

    return chosen
97
+
98
+
99
def extract_evidence(pages: List[Dict[str, str]], query: str) -> Dict:
    """Collect query-relevant snippets and dates from a set of pages.

    Pages with no content are skipped. Every snippet/date is tagged
    with the URL and title of the page it came from.

    Returns:
        Dict with 'snippets', 'dates' and 'pages' keys; 'pages' is the
        input list, passed through unchanged.
    """
    snippets_out: List[Dict[str, str]] = []
    dates_out: List[Dict[str, str]] = []

    for page in pages:
        body = page.get('content', '')
        if not body:
            continue

        url = page['url']
        title = page.get('title', 'No title')

        for text in extract_relevant_snippets(body, query, max_snippets=8, snippet_length=300):
            snippets_out.append({'text': text, 'source_url': url, 'source_title': title})

        for date_info in extract_dates(body):
            date_info['source_url'] = url
            date_info['source_title'] = title
            dates_out.append(date_info)

    return {'snippets': snippets_out, 'dates': dates_out, 'pages': pages}
124
+
fetch.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Fetch and extract content from web pages.
3
+ Includes Wayback Machine fallback for 403 errors.
4
+ """
5
+ import httpx
6
+ from bs4 import BeautifulSoup
7
+ from typing import Optional, Dict
8
+ from urllib.parse import urljoin, urlparse
9
+ import re
10
+ import json
11
+ import time
12
+
13
+
14
def clean_text(text: str) -> str:
    """Collapse every run of whitespace to a single space and trim the ends."""
    return re.sub(r'\s+', ' ', text).strip()
19
+
20
+
21
def extract_main_content(html: str, url: str, max_chars: int = 120000) -> str:
    """Extract the readable main content of an HTML page.

    Strips scripts/styles, structural chrome (nav/header/footer/aside)
    and anything whose class or id looks like navigation, ads, banners
    or social widgets; then pulls text from the first matching
    main-content container, falling back to <body>. The result is
    whitespace-normalized and capped at *max_chars* characters.
    """
    soup = BeautifulSoup(html, 'lxml')

    # Drop non-content tags outright.
    for tag in soup(['script', 'style', 'noscript']):
        tag.decompose()
    for tag in soup.find_all(['nav', 'header', 'footer', 'aside']):
        tag.decompose()

    # Remove elements whose class or id suggests page chrome.
    junk_patterns = ['nav', 'navigation', 'menu', 'sidebar', 'footer', 'header', 'cookie', 'banner', 'advertisement', 'ad-', 'social-']
    junk_re = re.compile('|'.join(junk_patterns), re.I)
    for element in soup.find_all(class_=junk_re):
        element.decompose()
    for element in soup.find_all(id=junk_re):
        element.decompose()

    # Prefer an explicit main-content container; fall back to <body>.
    main_selectors = ['main', 'article', '[role="main"]', '.content', '.main-content', '.post-content', '#content', '#main-content', '#main']
    container = None
    for selector in main_selectors:
        container = soup.select_one(selector)
        if container:
            break
    if not container:
        container = soup.find('body')
    if not container:
        return ""

    text = clean_text(container.get_text(separator=' ', strip=True))
    if len(text) > max_chars:
        text = text[:max_chars] + "... [truncated]"
    return text
60
+
61
+
62
def get_wayback_snapshot(url: str, timeout: float = 20.0) -> Optional[str]:
    """Look up the most recent Wayback Machine snapshot of *url*.

    Queries the CDX API, retrying once (with a 1s pause) on timeout or
    any other error. Returns a ``web.archive.org`` replay URL, or None
    when no snapshot exists or the lookup ultimately fails.
    """
    api_url = f"https://web.archive.org/cdx/search/cdx?url={url}&output=json&limit=1&collapse=urlkey"
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'}

    try:
        for attempt in range(2):
            try:
                with httpx.Client(timeout=timeout, headers=headers) as client:
                    response = client.get(api_url)
                    response.raise_for_status()
                    rows = response.json()
                # First row is the CDX header; a second row means a snapshot exists.
                if len(rows) > 1:
                    timestamp = rows[1][1]
                    return f"https://web.archive.org/web/{timestamp}/{url}"
                break
            except httpx.TimeoutException:
                if attempt == 0:
                    print("Wayback API timeout, retrying...")
                    time.sleep(1)
                else:
                    raise
            except Exception:
                if attempt == 0:
                    print("Wayback API error, retrying...")
                    time.sleep(1)
                else:
                    raise

        return None
    except Exception as e:
        print(f"Error getting Wayback snapshot for {url}: {e}")
        return None
98
+
99
+
100
def extract_wayback_content(html: str) -> Optional[str]:
    """Unwrap the original page HTML from a Wayback Machine replay page.

    Tries, in order: a div with id "webpage", a div with class
    "webpage", any div whose id looks like content/main, then the
    <body> with Wayback chrome elements removed. Falls back to
    returning *html* unchanged.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')

        candidates = (
            soup.find_all('div', id='webpage')
            or soup.find_all('div', class_='webpage')
            or soup.find_all('div', {'id': re.compile('content|main', re.I)})
        )
        if candidates:
            return str(candidates[0])

        body = soup.find('body')
        if body:
            # Strip the archive's own toolbar / injected elements.
            for elem in body.find_all(['div', 'script', 'style'], class_=re.compile('wm-|wayback', re.I)):
                elem.decompose()
            return str(body)

        return html
    except Exception as e:
        print(f"Error extracting Wayback content: {e}")
        return html
123
+
124
+
125
def fetch_page(url: str, timeout: float = 30.0, use_wayback_fallback: bool = True) -> Optional[Dict[str, str]]:
    """Fetch *url* and return ``{'url', 'title', 'content'}``, or None on failure.

    Sends browser-like headers. On an HTTP 403 (and when
    *use_wayback_fallback* is set) the page is retried via the Wayback
    Machine; non-HTML responses are skipped.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0',
    }

    try:
        with httpx.Client(timeout=timeout, follow_redirects=True, headers=headers) as client:
            response = client.get(url, headers=headers)
            response.raise_for_status()

            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' not in content_type:
                print(f"Skipping non-HTML content: {content_type}")
                return None

            html = response.text

        title_tag = BeautifulSoup(html, 'lxml').find('title')
        title = title_tag.get_text(strip=True) if title_tag else 'No title'

        content = extract_main_content(html, url)
        if not content or len(content) < 100:
            print(f"Warning: Very little content extracted from {url}")

        return {'url': url, 'title': title, 'content': content}

    except httpx.TimeoutException:
        print(f"Timeout fetching {url}")
        return None
    except httpx.HTTPStatusError as e:
        if e.response.status_code == 403 and use_wayback_fallback:
            print(f"HTTP 403 error fetching {url}, trying Wayback Machine...")
            return fetch_from_wayback(url, timeout)
        print(f"HTTP error {e.response.status_code} fetching {url}")
        return None
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None
174
+
175
+
176
def fetch_from_wayback(url: str, timeout: float = 30.0) -> Optional[Dict[str, str]]:
    """Fetch an archived copy of *url* from the Wayback Machine.

    Locates the latest snapshot, unwraps the archive chrome, and cleans
    the page title of Wayback / Internet Archive branding (falling back
    to <h1>, og:title, and finally a title derived from the URL). The
    returned dict carries a ``'source': 'wayback_machine'`` marker.
    """
    try:
        wayback_url = get_wayback_snapshot(url, timeout=10.0)
        if not wayback_url:
            print(f"No Wayback Machine snapshot found for {url}")
            return None

        print(f"Fetching from Wayback Machine: {wayback_url}")
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'}

        with httpx.Client(timeout=timeout, follow_redirects=True, headers=headers) as client:
            response = client.get(wayback_url)
            response.raise_for_status()
            html = response.text

        unwrapped = extract_wayback_content(html)
        if unwrapped:
            html = unwrapped

        soup = BeautifulSoup(html, 'lxml')
        title_tag = soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else ''

        # Strip archive branding that the replay page injects into <title>.
        title = re.sub(r'^.*?Wayback Machine\s*[:\-]\s*', '', title, flags=re.I)
        title = re.sub(r'\s*[:\-]\s*Wayback Machine.*?$', '', title, flags=re.I)
        title = re.sub(r'^.*?Internet Archive\s*[:\-]\s*', '', title, flags=re.I)

        # If only branding remained, fall back to <h1>, then og:title.
        if not title or title.lower() in ['wayback machine', 'internet archive']:
            h1 = soup.find('h1')
            if h1:
                title = h1.get_text(strip=True)
            else:
                meta_title = soup.find('meta', property='og:title')
                if meta_title:
                    title = meta_title.get('content', '')

        # Last resort: derive a human-readable title from the URL itself.
        if not title or title.lower() in ['wayback machine', 'internet archive', 'no title']:
            parsed = urlparse(url)
            title = parsed.path.strip('/').replace('/', ' - ').replace('-', ' ').title()
            if not title:
                title = parsed.netloc.replace('.', ' ').title()

        if not title:
            title = 'No title'

        content = extract_main_content(html, url)
        if not content or len(content) < 100:
            print(f"Warning: Very little content extracted from Wayback snapshot for {url}")

        return {'url': url, 'title': title, 'content': content, 'source': 'wayback_machine'}

    except httpx.TimeoutException:
        print(f"Timeout fetching from Wayback Machine: {url}")
        return None
    except httpx.HTTPStatusError as e:
        print(f"HTTP error {e.response.status_code} fetching from Wayback Machine: {url}")
        return None
    except Exception as e:
        print(f"Error fetching from Wayback Machine {url}: {e}")
        return None
237
+
238
+
239
def get_internal_links(html: str, base_url: str, same_domain_only: bool = True) -> list:
    """Collect unique outgoing links from *html*, resolved against *base_url*.

    URL fragments are dropped (scheme://host/path plus any query string
    is kept). With *same_domain_only*, links to other hosts are
    excluded. The base URL itself is never returned, and order of first
    appearance is preserved.
    """
    soup = BeautifulSoup(html, 'lxml')
    base_domain = urlparse(base_url).netloc.lower()

    seen = set()
    links = []

    for anchor in soup.find_all('a', href=True):
        resolved = urljoin(base_url, anchor['href'])
        parts = urlparse(resolved)

        # Normalize: drop the fragment, keep any query string.
        normalized = f"{parts.scheme}://{parts.netloc}{parts.path}"
        if parts.query:
            normalized += f"?{parts.query}"

        if same_domain_only and parts.netloc.lower() != base_domain:
            continue
        if normalized in seen or normalized == base_url:
            continue

        seen.add(normalized)
        links.append(normalized)

    return links
267
+
llm.py ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM integration for answer generation.
3
+ Supports both OpenRouter (free open-source models) and Ollama (local models).
4
+ """
5
+ import httpx
6
+ import os
7
+ import time
8
+ from typing import List, Dict, Optional, Tuple
9
+ from rate_limit import get_rate_limit_tracker
10
+
11
+
12
def get_openrouter_models(api_key: str) -> List[Dict[str, str]]:
    """
    Get the list of free, open-source models from OpenRouter.

    Queries the OpenRouter ``/models`` endpoint and keeps models that:
      * charge nothing for both prompt and completion tokens,
      * look open source (keyword match on id/name/description) or carry
        the ``:free`` id suffix,
      * have a context window of at least 32k tokens,
      * are not known-problematic (e.g. require data-policy configuration),
      * are not pure coding models (devstral is kept as a good generalist).

    Args:
        api_key: OpenRouter API key used for the Bearer auth header.

    Returns:
        List of model dicts with 'id', 'name', 'context_length' and a
        truncated 'description', sorted by context length (descending).
        Empty list on any error.
    """
    # Substrings suggesting an open-source / open-weight model.
    # (Previously this list contained 'open-source' twice.)
    open_source_keywords = ('open', 'oss', 'apache', 'mit', 'open-source',
                            'open source', 'openweight', 'mimo', 'deepseek-r1')
    # Models that are free but unusable without extra account configuration.
    blocked_substrings = ('gpt-oss-120b',)

    def _is_free_price(value) -> bool:
        # OpenRouter reports prices as strings; treat '0', 0 and None as free.
        return value in ('0', 0, None)

    try:
        headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        }

        with httpx.Client(timeout=15.0) as client:
            response = client.get('https://openrouter.ai/api/v1/models', headers=headers)
            response.raise_for_status()
            data = response.json()

        free_open_source_models = []

        for model in data.get('data', []):
            pricing = model.get('pricing', {})
            if not (_is_free_price(pricing.get('prompt', '0'))
                    and _is_free_price(pricing.get('completion', '0'))):
                continue  # must be free on both sides

            model_id = model.get('id', '')
            name = model.get('name', '')
            description = model.get('description', '').lower()

            is_open_source = any(
                keyword in model_id.lower() or keyword in name.lower() or keyword in description
                for keyword in open_source_keywords
            )

            context_len = model.get('context_length', 0)
            # Require enough context for the evidence-heavy prompts we build.
            if not ((is_open_source or ':free' in model_id.lower()) and context_len >= 32000):
                continue

            # Skip known-problematic models (require data policy configuration).
            # This single check replaces the redundant post-filter pass the
            # old code did over the finished list.
            if any(bad in model_id.lower() for bad in blocked_substrings):
                continue

            # Skip pure coding models except devstral.
            if 'coder' in model_id.lower() and 'devstral' not in model_id.lower():
                continue

            free_open_source_models.append({
                'id': model_id,
                'name': name,
                'context_length': context_len,
                'description': model.get('description', '')[:200]
            })

        # Largest context first - more evidence fits in the prompt.
        free_open_source_models.sort(key=lambda x: x['context_length'], reverse=True)
        return free_open_source_models

    except Exception as e:
        print(f"Error fetching OpenRouter models: {e}")
        return []
90
+
91
+
92
def generate_answer_openrouter(
    query: str,
    evidence_snippets: List[Dict[str, str]],
    source_urls: List[str],
    model: str,
    api_key: str,
    partial_evidence: Optional[Dict] = None
) -> Tuple[str, bool, Optional[str]]:
    """
    Generate an answer using the OpenRouter API with free open-source models.

    Flow: build the evidence prompt, consult the local rate-limit tracker,
    then POST to OpenRouter with up to two attempts. A timeout on the first
    attempt retries with a trimmed evidence payload; HTTP 429 backs off
    exponentially. Whenever the API cannot be used and partial_evidence is
    available, a locally formatted partial answer is returned instead of a
    bare error string.

    Args:
        query: User question
        evidence_snippets: List of dicts with 'text' and 'source_url' keys
        source_urls: List of source URLs
        model: Model ID from OpenRouter
        api_key: OpenRouter API key
        partial_evidence: Optional partial evidence dict if rate limited

    Returns:
        Tuple of (answer_text, was_rate_limited, rate_limit_message)
    """
    # Build evidence text - limit to top 15 snippets to balance context vs speed
    evidence_text = "\n\n".join([
        f"[Source: {snippet.get('source_url', 'unknown')}]\n{snippet.get('text', '')}"
        for snippet in evidence_snippets[:15]
    ])

    # Counts are reported to the model so it knows how much evidence exists.
    evidence_count = len(evidence_snippets)
    source_count = len(source_urls)

    # Build system prompt
    system_prompt = """You are a professional research assistant for University of Arizona students. Your job is to ensure students obtain comprehensive information about the University of Arizona and everything it has to offer, including but not limited to:

1. Student life
2. Academic and research opportunities
3. Clubs and student organizations
4. Off-campus resources provided by the University of Arizona
5. Campus resources
6. Staff, professors, and entertainment spots
7. Updates on the University of Arizona that affect students

Your mission is to make student life and student research much easier. You have access to research tools (SearXNG) that allow you to search across arizona.edu domains and read through official UA websites.

WORK STRUCTURE:
• The user asks a question
• Research has been conducted across arizona.edu domains
• Websites and URLs have been scraped and read through
• Key facts and dates matching the user's query have been extracted
• You now have evidence from official UA sources

YOUR RESPONSIBILITIES:
1. **USE THE EVIDENCE PROVIDED**: The evidence below comes from official UA webpages. You MUST use this evidence to answer the question.
2. **BE COMPREHENSIVE**: Extract and present all relevant information from the evidence that answers or relates to the user's question.
3. **CITE SOURCES**: Every key claim must include the source URL in your response.
4. **STRUCTURE CLEARLY**: Display your findings in an ordered, structured manner using day-to-day English.
5. **BE HELPFUL**: If the evidence contains relevant information (even if not a complete answer), present it. Do not say "I couldn't find this" if there is ANY relevant evidence.
6. **HONEST ABOUT GAPS**: Only if you have NO relevant evidence at all should you indicate what information might be missing. But first, thoroughly review ALL the evidence provided.

CRITICAL: The evidence has been extracted from official UA sources. Your job is to USE this evidence to help the student. Do not dismiss it. Analyze it carefully and extract all relevant information."""

    # Build user prompt
    user_prompt = f"""Question: {query}

EVIDENCE FROM UA SOURCES (from {evidence_count} snippets across {source_count} sources):
{evidence_text}

SOURCE URLs:
{chr(10).join(f"- {url}" for url in source_urls[:15])}

INSTRUCTIONS:
1. Carefully review ALL the evidence provided above
2. Extract and present ALL information from the evidence that is relevant to the question
3. Structure your answer clearly with bullet points or numbered lists where appropriate
4. Include source URLs for each key piece of information
5. If the evidence contains relevant information (even if partial), present it - do not say "I couldn't find this"
6. Only indicate missing information if you have reviewed ALL evidence and found NOTHING relevant

Now provide a comprehensive answer based on the evidence above:"""

    try:
        headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json',
            'HTTP-Referer': 'https://github.com',  # Optional: for OpenRouter analytics
            'X-Title': 'UA Student Navigator'  # Optional: for OpenRouter analytics
        }

        payload = {
            'model': model,
            'messages': [
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_prompt}
            ],
            'temperature': 0.7,
            'max_tokens': 1500  # Reduced to prevent timeouts
        }

        # Check local sliding-window rate limit before spending a request.
        rate_tracker = get_rate_limit_tracker()
        can_request, rate_limit_msg = rate_tracker.can_make_request(model, api_key)

        if not can_request:
            # Degrade gracefully: show whatever evidence was gathered so far.
            if partial_evidence:
                partial_answer = _generate_partial_answer(query, partial_evidence)
                return partial_answer, True, rate_limit_msg
            return "Rate limit reached. Please wait a moment and try again.", True, rate_limit_msg

        # Record request once - retries below reuse the same budget slot.
        rate_tracker.record_request(model)

        # Longer timeout and retry logic for reliability with large models.
        max_retries = 2
        timeout = 120.0

        for attempt in range(max_retries):
            try:
                with httpx.Client(timeout=timeout) as client:
                    response = client.post(
                        'https://openrouter.ai/api/v1/chat/completions',
                        json=payload,
                        headers=headers
                    )

                    # Server-side rate limit (429): back off and retry, or fail.
                    if response.status_code == 429:
                        error_data = {}
                        try:
                            error_data = response.json()
                        except Exception:
                            pass
                        error_msg = error_data.get('error', {}).get('message', 'Rate limit exceeded')

                        # Exponential backoff: 2s, then 4s.
                        wait_time = (2 ** attempt) * 2
                        if attempt < max_retries - 1:
                            time.sleep(wait_time)
                            continue
                        if partial_evidence:
                            partial_answer = _generate_partial_answer(query, partial_evidence)
                            return partial_answer, True, f"Rate limit reached: {error_msg}"
                        return "Rate limit reached. Please wait a moment and try again.", True, error_msg

                    response.raise_for_status()
                    result = response.json()

                    # Extract answer from the standard chat-completions shape.
                    if 'choices' in result and len(result['choices']) > 0:
                        answer = result['choices'][0]['message']['content']
                        return answer.strip(), False, None
                    return "Error: Unexpected response format from OpenRouter", False, None

            except httpx.TimeoutException:
                if attempt < max_retries - 1:
                    # BUGFIX: always trim the evidence before retrying. The old
                    # code gated this on `attempt == 1`, which is unreachable in
                    # the retry branch (the only retry happens at attempt 0), so
                    # the payload was never actually reduced on retry.
                    evidence_text = "\n\n".join([
                        f"[Source: {snippet.get('source_url', 'unknown')}]\n{snippet.get('text', '')}"
                        for snippet in evidence_snippets[:10]  # Reduce to 10 on retry
                    ])
                    user_prompt = f"""Question: {query}

EVIDENCE FROM UA SOURCES (from {min(len(evidence_snippets), 10)} snippets):
{evidence_text}

SOURCE URLs:
{chr(10).join(f"- {url}" for url in source_urls[:10])}

INSTRUCTIONS:
1. Review the evidence above
2. Extract and present ALL relevant information
3. Include source URLs for key claims
4. If evidence contains relevant information, present it

Provide a comprehensive answer:"""
                    payload['messages'][1]['content'] = user_prompt
                    continue
                # Final attempt also timed out.
                if partial_evidence:
                    partial_answer = _generate_partial_answer(query, partial_evidence)
                    return partial_answer, False, "Request timed out. Showing partial results."
                return "Error: Request to OpenRouter timed out after multiple attempts. The model may be slow or overloaded. Try a different model.", False, None

        # Defensive: every loop path above returns or retries, but preserve
        # the (str, bool, Optional[str]) contract if that ever changes.
        return "Error: OpenRouter request failed after retries.", False, None

    except httpx.TimeoutException:
        # Timeout raised outside the per-attempt handler (e.g. during setup).
        if partial_evidence:
            partial_answer = _generate_partial_answer(query, partial_evidence)
            return partial_answer, False, "Request timed out. Showing partial results."
        return "Error: Request to OpenRouter timed out. Try a different model or reduce the amount of evidence.", False, None
    except httpx.HTTPStatusError as e:
        error_data = {}
        try:
            error_data = e.response.json()
        except Exception:
            pass

        error_msg = error_data.get('error', {}).get('message', f'HTTP {e.response.status_code}')
        if 'rate' in error_msg.lower() or '429' in str(e.response.status_code):
            if partial_evidence:
                partial_answer = _generate_partial_answer(query, partial_evidence)
                return partial_answer, True, f"Rate limit reached: {error_msg}"
            return "Rate limit reached. Please try a different model or wait a moment.", True, error_msg
        return f"Error: OpenRouter API returned: {error_msg}", False, None
    except Exception as e:
        if partial_evidence:
            partial_answer = _generate_partial_answer(query, partial_evidence)
            return partial_answer, False, f"Error occurred: {str(e)}. Showing partial results."
        return f"Error generating answer: {str(e)}", False, None
307
+
308
+
309
+ def _generate_partial_answer(query: str, partial_evidence: Dict) -> str:
310
+ """Generate a summary from partial evidence when rate limited."""
311
+ pages = partial_evidence.get('pages', [])
312
+ snippets = partial_evidence.get('snippets', [])
313
+ dates = partial_evidence.get('dates', [])
314
+
315
+ answer_parts = []
316
+ answer_parts.append("⚠️ **Rate Limit Reached - Partial Results**\n\n")
317
+ answer_parts.append("I've reached the maximum number of requests for this model. Here's what I found so far:\n\n")
318
+
319
+ if snippets:
320
+ answer_parts.append("**Key Findings:**\n")
321
+ for i, snippet in enumerate(snippets[:10], 1):
322
+ answer_parts.append(f"{i}. {snippet.get('text', '')[:200]}\n")
323
+ answer_parts.append(f" *Source: [{snippet.get('source_title', 'Link')}]({snippet.get('source_url', '')})*\n\n")
324
+
325
+ if dates:
326
+ answer_parts.append("\n**Important Dates:**\n")
327
+ for date_info in dates[:5]:
328
+ answer_parts.append(f"- **{date_info.get('date', 'Unknown')}**: {date_info.get('context', '')[:150]}\n")
329
+ answer_parts.append(f" *Source: [{date_info.get('source_title', 'Link')}]({date_info.get('source_url', '')})*\n")
330
+
331
+ if pages:
332
+ answer_parts.append("\n\n**Sources Analyzed:**\n")
333
+ for i, page in enumerate(pages[:10], 1):
334
+ source_indicator = " (via Wayback Machine)" if page.get('source') == 'wayback_machine' else ""
335
+ answer_parts.append(f"{i}. [{page.get('title', 'No title')}]({page.get('url', '')}){source_indicator}\n")
336
+
337
+ answer_parts.append("\n\n**Note:** To see complete results, please wait a moment and try again, or select a different model.")
338
+
339
+ return "".join(answer_parts)
340
+
341
+
342
def generate_answer(
    query: str,
    evidence_snippets: List[Dict[str, str]],
    source_urls: List[str],
    ollama_host: Optional[str] = None,
    model: Optional[str] = None
) -> str:
    """
    Generate answer using Ollama LLM based on evidence (for local models).

    Mirrors generate_answer_openrouter but targets a local Ollama server
    and returns a plain string (no rate-limit tuple).

    Args:
        query: User question
        evidence_snippets: List of dicts with 'text' and 'source_url' keys
        source_urls: List of source URLs
        ollama_host: Ollama host URL (defaults to env OLLAMA_HOST, then
            http://localhost:11434)
        model: Model name (defaults to env OLLAMA_MODEL, then qwen2.5:7b)

    Returns:
        Generated answer text, or a human-readable "Error: ..." string on
        timeout / HTTP failure / any other exception.
    """
    if ollama_host is None:
        ollama_host = os.getenv('OLLAMA_HOST', 'http://localhost:11434')

    if model is None:
        model = os.getenv('OLLAMA_MODEL', 'qwen2.5:7b')

    # Build evidence text; top 20 snippets only (local models get a slightly
    # larger budget than the OpenRouter path's 15).
    evidence_text = "\n\n".join([
        f"[Source: {snippet.get('source_url', 'unknown')}]\n{snippet.get('text', '')}"
        for snippet in evidence_snippets[:20]
    ])

    # Counts are reported to the model so it knows how much evidence exists.
    evidence_count = len(evidence_snippets)
    source_count = len(source_urls)

    # Build system prompt (kept identical to the OpenRouter path).
    system_prompt = """You are a professional research assistant for University of Arizona students. Your job is to ensure students obtain comprehensive information about the University of Arizona and everything it has to offer, including but not limited to:

1. Student life
2. Academic and research opportunities
3. Clubs and student organizations
4. Off-campus resources provided by the University of Arizona
5. Campus resources
6. Staff, professors, and entertainment spots
7. Updates on the University of Arizona that affect students

Your mission is to make student life and student research much easier. You have access to research tools (SearXNG) that allow you to search across arizona.edu domains and read through official UA websites.

WORK STRUCTURE:
• The user asks a question
• Research has been conducted across arizona.edu domains
• Websites and URLs have been scraped and read through
• Key facts and dates matching the user's query have been extracted
• You now have evidence from official UA sources

YOUR RESPONSIBILITIES:
1. **USE THE EVIDENCE PROVIDED**: The evidence below comes from official UA webpages. You MUST use this evidence to answer the question.
2. **BE COMPREHENSIVE**: Extract and present all relevant information from the evidence that answers or relates to the user's question.
3. **CITE SOURCES**: Every key claim must include the source URL in your response.
4. **STRUCTURE CLEARLY**: Display your findings in an ordered, structured manner using day-to-day English.
5. **BE HELPFUL**: If the evidence contains relevant information (even if not a complete answer), present it. Do not say "I couldn't find this" if there is ANY relevant evidence.
6. **HONEST ABOUT GAPS**: Only if you have NO relevant evidence at all should you indicate what information might be missing. But first, thoroughly review ALL the evidence provided.

CRITICAL: The evidence has been extracted from official UA sources. Your job is to USE this evidence to help the student. Do not dismiss it. Analyze it carefully and extract all relevant information."""

    # Build user prompt
    user_prompt = f"""Question: {query}

EVIDENCE FROM UA SOURCES (from {evidence_count} snippets across {source_count} sources):
{evidence_text}

SOURCE URLs:
{chr(10).join(f"- {url}" for url in source_urls[:15])}

INSTRUCTIONS:
1. Carefully review ALL the evidence provided above
2. Extract and present ALL information from the evidence that is relevant to the question
3. Structure your answer clearly with bullet points or numbered lists where appropriate
4. Include source URLs for each key piece of information
5. If the evidence contains relevant information (even if partial), present it - do not say "I couldn't find this"
6. Only indicate missing information if you have reviewed ALL evidence and found NOTHING relevant

Now provide a comprehensive answer based on the evidence above:"""

    try:
        with httpx.Client(timeout=60.0) as client:
            # Ollama's non-streaming chat endpoint; "stream": False makes it
            # return the whole completion in one JSON body.
            response = client.post(
                f"{ollama_host.rstrip('/')}/api/chat",
                json={
                    "model": model,
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt}
                    ],
                    "stream": False
                }
            )
            response.raise_for_status()
            result = response.json()
            # Chat response shape: {"message": {"content": ...}, ...}
            answer = result.get('message', {}).get('content', '')
            return answer.strip()

    except httpx.TimeoutException:
        return "Error: Request to Ollama timed out. Please try again."
    except httpx.HTTPStatusError as e:
        return f"Error: Ollama API returned status {e.response.status_code}. Please check your Ollama setup."
    except Exception as e:
        return f"Error generating answer: {str(e)}"
450
+
451
+
452
+ # Keep Ollama functions for backward compatibility
453
def check_ollama_available(ollama_host: Optional[str] = None) -> Tuple[bool, Optional[str]]:
    """Return (True, None) when an Ollama server answers at the given host.

    Falls back to the OLLAMA_HOST environment variable (default
    http://localhost:11434) when no host is supplied; on any failure
    returns (False, <error description>).
    """
    if ollama_host is None:
        ollama_host = os.getenv('OLLAMA_HOST', 'http://localhost:11434')

    base = ollama_host.rstrip('/')
    try:
        # /api/tags is a cheap endpoint that any running Ollama serves.
        with httpx.Client(timeout=5.0) as client:
            client.get(f"{base}/api/tags").raise_for_status()
    except Exception as e:
        return False, f"Ollama is not running or not accessible at {ollama_host}: {str(e)}"
    return True, None
465
+
466
+
467
def check_model_exists(model: str, ollama_host: Optional[str] = None) -> Tuple[bool, Optional[str], Optional[str]]:
    """Check whether an Ollama model is installed locally.

    Returns an (exists, error_message, model_name) triple. When the model
    is missing, model_name carries a suggested installed alternative
    (preferring a small list of known-good models, then anything installed)
    and error_message explains how to obtain it.
    """
    if ollama_host is None:
        ollama_host = os.getenv('OLLAMA_HOST', 'http://localhost:11434')

    try:
        with httpx.Client(timeout=10.0) as client:
            response = client.get(f"{ollama_host.rstrip('/')}/api/tags")
            response.raise_for_status()
            model_names = [entry.get('name', '') for entry in response.json().get('models', [])]

        # Exact tag or tag-prefix match counts as present.
        if model in model_names or any(name.startswith(model) for name in model_names):
            return True, None, model

        # Pick a replacement: first preferred family that is installed...
        suggested = None
        for preferred in ('qwen2.5:7b', 'gemma3:latest', 'phi3:latest', 'llama3.2:3b'):
            family = preferred.split(':')[0]
            if preferred in model_names or any(name.startswith(family) for name in model_names):
                for name in model_names:
                    if name == preferred or name.startswith(family + ':'):
                        suggested = name
                        break
            if suggested:
                break

        # ...otherwise fall back to whatever is installed first.
        if not suggested and model_names:
            suggested = model_names[0]

        error_msg = f"Model '{model}' not found."
        if suggested:
            error_msg += f" Suggested: '{suggested}' (run: ollama pull {suggested})"
        else:
            error_msg += f" Available: {', '.join(model_names[:5])}"

        return False, error_msg, suggested
    except Exception as e:
        return False, f"Error checking model: {str(e)}", None
506
+
rank.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Rank pages by relevance to query.
3
+ """
4
+ from typing import List, Dict
5
+ import re
6
+
7
+
8
def calculate_relevance_score(page: Dict[str, str], query: str) -> float:
    """Score how relevant a fetched page is to the user's query.

    Heuristic weights: 10 points per query term in the title plus 20 for
    the whole phrase, 5 per term in the URL, 1 per body occurrence (capped
    at 10 per term) plus 15 for a verbatim phrase hit; the total is damped
    for very long pages. Terms shorter than 3 characters are ignored.
    """
    query_lower = query.lower()
    terms = [token.lower() for token in query.split() if len(token) > 2]

    if not terms:
        return 0.0

    title = page.get('title', '').lower()
    content = page.get('content', '').lower()
    url = page.get('url', '').lower()

    # Title hits are the strongest signal.
    score = 10.0 * sum(1 for term in terms if term in title)
    if query_lower in title:
        score += 20.0

    # URL hits are weaker.
    score += 5.0 * sum(1 for term in terms if term in url)

    # Body occurrences, capped per term so repetition can't dominate.
    score += float(sum(min(content.count(term), 10) for term in terms))

    if query_lower in content:
        score += 15.0

    # Normalize so very long pages don't win on raw counts alone.
    if content:
        score = score / (1 + len(content) / 10000)

    return score
47
+
48
+
49
def rank_pages(pages: List[Dict[str, str]], query: str) -> List[Dict[str, str]]:
    """Return shallow copies of *pages* sorted by descending relevance.

    Each returned dict gains a 'relevance_score' key; the input list and
    its dicts are left unmodified.
    """
    annotated = []
    for page in pages:
        entry = dict(page)
        entry['relevance_score'] = calculate_relevance_score(page, query)
        annotated.append(entry)

    # Stable sort, highest score first.
    annotated.sort(key=lambda entry: entry['relevance_score'], reverse=True)
    return annotated
60
+
61
+
62
def rank_snippets(snippets: List[Dict[str, str]], query: str) -> List[Dict[str, str]]:
    """Return shallow copies of *snippets* ordered by descending relevance.

    Scoring: +1 per query term (terms shorter than 3 characters are
    ignored) found in the snippet text, +5 if the full query appears
    verbatim. Each returned dict gains a 'relevance_score' key; inputs
    are left unmodified.
    """
    query_lower = query.lower()
    terms = [token.lower() for token in query.split() if len(token) > 2]

    ranked = []
    for snippet in snippets:
        haystack = snippet.get('text', '').lower()
        score = float(sum(1 for term in terms if term in haystack))
        if query_lower in haystack:
            score += 5.0

        entry = dict(snippet)
        entry['relevance_score'] = score
        ranked.append(entry)

    # Stable sort, highest score first.
    ranked.sort(key=lambda entry: entry['relevance_score'], reverse=True)
    return ranked
85
+
rate_limit.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Rate limit tracking and management for OpenRouter API.
3
+ """
4
+ import httpx
5
+ import time
6
+ from typing import Dict, Optional, Tuple
7
+ from collections import defaultdict
8
+ from datetime import datetime, timedelta
9
+
10
+
11
class RateLimitTracker:
    """Track per-model request timestamps and throttle OpenRouter calls.

    Keeps an in-memory sliding one-minute window of request times per model
    and compares it against conservative hard-coded per-model limits, plus a
    cached copy of the account-level limits from the OpenRouter key API.
    """

    def __init__(self):
        # model id -> list of request timestamps (pruned to the last minute)
        self.request_counts: Dict[str, list] = defaultdict(list)
        # model id -> rate limit info (reserved for future per-model caching)
        self.rate_limit_info: Dict[str, Dict] = {}
        # when the account-level limits were last fetched
        self.last_check: Optional[datetime] = None
        # cached response of the OpenRouter key endpoint
        self.global_rate_limit: Optional[Dict] = None

    def _prune_old_requests(self, model: str) -> None:
        """Drop recorded timestamps older than one minute."""
        cutoff = datetime.now() - timedelta(minutes=1)
        self.request_counts[model] = [
            ts for ts in self.request_counts[model] if ts > cutoff
        ]

    def get_rate_limits(self, api_key: str) -> Dict:
        """Fetch account-level limits from OpenRouter, cached for 5 minutes."""
        if self.global_rate_limit and self.last_check:
            if datetime.now() - self.last_check < timedelta(minutes=5):
                return self.global_rate_limit

        try:
            with httpx.Client(timeout=10.0) as client:
                response = client.get(
                    'https://openrouter.ai/api/v1/key',
                    headers={'Authorization': f'Bearer {api_key}'},
                )
                if response.status_code == 200:
                    payload = response.json()
                    self.global_rate_limit = payload
                    self.last_check = datetime.now()
                    return payload
        except Exception as e:
            print(f"Error fetching rate limits: {e}")

        # Conservative fallback when the key endpoint is unavailable.
        return {
            'limit': {
                'requests_per_minute': 20,
                'requests_per_day': 1000
            }
        }

    def get_model_rate_limit(self, model: str, api_key: str) -> Dict:
        """Return the request limits to enforce for *model*.

        Conservative defaults for free-tier models, with a few per-model
        overrides. (*api_key* is accepted for interface symmetry but is not
        currently consulted.)
        """
        defaults = {
            'requests_per_minute': 10,
            'requests_per_day': 200,
            'tokens_per_minute': 100000
        }
        overrides = {
            'xiaomi/mimo-v2-flash:free': {'requests_per_minute': 15, 'requests_per_day': 300},
            'deepseek/deepseek-r1-0528:free': {'requests_per_minute': 10, 'requests_per_day': 200},
            'openai/gpt-oss-20b:free': {'requests_per_minute': 5, 'requests_per_day': 100},
            'nvidia/nemotron-3-nano-30b-a3b:free': {'requests_per_minute': 10, 'requests_per_day': 200},
        }

        if model in overrides:
            return {**defaults, **overrides[model]}
        return defaults

    def can_make_request(self, model: str, api_key: str) -> Tuple[bool, Optional[str]]:
        """Return (True, None) if another request fits under the per-minute cap,
        otherwise (False, <human-readable wait message>)."""
        limits = self.get_model_rate_limit(model, api_key)
        self._prune_old_requests(model)

        recent = self.request_counts[model]
        requests_last_minute = len(recent)
        if requests_last_minute >= limits.get('requests_per_minute', 10):
            # Time until the oldest in-window request ages out.
            wait_time = 60 - (datetime.now() - recent[0]).total_seconds()
            return False, f"Rate limit reached: {requests_last_minute}/{limits.get('requests_per_minute')} requests per minute. Wait {int(wait_time)}s"

        return True, None

    def record_request(self, model: str):
        """Note that a request to *model* was just issued."""
        self.request_counts[model].append(datetime.now())

    def get_remaining_requests(self, model: str, api_key: str) -> int:
        """Number of additional requests allowed in the current minute."""
        limits = self.get_model_rate_limit(model, api_key)
        self._prune_old_requests(model)

        used = len(self.request_counts[model])
        return max(0, limits.get('requests_per_minute', 10) - used)

    def calculate_max_searches(self, model: str, api_key: str) -> int:
        """How many search/LLM calls the pipeline may plan right now.

        Uses 80% of the remaining per-minute budget minus one request
        reserved for final answer generation, clamped to [1, 10].
        """
        remaining = self.get_remaining_requests(model, api_key)
        planned = max(1, int(remaining * 0.8) - 1)
        return min(planned, 10)
117
+
118
+
119
# Global rate limit tracker instance
# (module-level singleton: every caller shares one request history, so the
# sliding-window accounting is consistent across the whole process)
_rate_limit_tracker = RateLimitTracker()


def get_rate_limit_tracker() -> RateLimitTracker:
    """Get the global rate limit tracker."""
    return _rate_limit_tracker
126
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit>=1.28.0
2
+ httpx>=0.25.0
3
+ beautifulsoup4>=4.12.0
4
+ lxml>=4.9.0
5
+ python-dotenv>=1.0.0
6
+
search.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SearXNG integration for searching UA domains with DuckDuckGo fallback.
3
+ """
4
+ import httpx
5
+ import os
6
+ from typing import List, Dict, Optional
7
+ from urllib.parse import urlparse, quote
8
+ import re
9
+ from bs4 import BeautifulSoup
10
+
11
+
12
def is_ua_domain(url: str) -> bool:
    """Return True if *url* points at the University of Arizona web estate.

    Accepts arizona.edu itself and any subdomain (e.g. catalog.arizona.edu).
    Uses ``urlparse(...).hostname`` rather than ``netloc`` so an explicit
    port (``https://arizona.edu:443/...``) or mixed-case host does not defeat
    the check, while look-alike hosts such as ``arizona.edu.evil.com`` are
    still rejected.

    Args:
        url: Absolute URL to test.

    Returns:
        True for UA-owned hosts, False otherwise (including unparsable URLs).
    """
    try:
        # .hostname strips any port and userinfo; may raise ValueError on
        # malformed ports, hence the guard.
        host = urlparse(url).hostname or ''
    except Exception:
        return False
    host = host.lower()
    return host == 'arizona.edu' or host.endswith('.arizona.edu')
20
+
21
+
22
def ua_search(query: str, max_results: int = 10, searxng_url: Optional[str] = None) -> List[Dict[str, str]]:
    """
    Search using SearXNG, filtering results to UA domains only.

    Tries each configured SearXNG instance in order — the JSON API first,
    then an HTML scrape of the same endpoint — and falls back to DuckDuckGo
    when every instance fails or yields no UA results.

    Args:
        query: Search query.
        max_results: Maximum number of results to return.
        searxng_url: SearXNG instance URL (defaults to SEARXNG_URL env var).

    Returns:
        List of dicts with 'title', 'url', 'snippet' keys, UA domains only.
    """
    if searxng_url is None:
        searxng_url = os.getenv('SEARXNG_URL', 'http://localhost:8080')

    # Bias the engines toward UA pages; results are still re-filtered below.
    enhanced_query = f"site:arizona.edu {query}"

    # Configured instance first, then public fallbacks; dedupe while
    # preserving order in case SEARXNG_URL matches one of the fallbacks.
    candidates = [
        searxng_url,
        'https://searx.be',
        'https://search.sapti.me',
        'https://searx.tiekoetter.com',
    ]
    instances = list(dict.fromkeys(candidates))

    last_error = None

    for instance_url in instances:
        base = instance_url.rstrip('/')
        # Browser-like headers to avoid bot detection.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,application/json;q=0.8,*/*;q=0.7',
            'Accept-Language': 'en-US,en;q=0.9',
            'Referer': base,
            'Origin': base,
        }
        try:
            # Context manager guarantees the client is closed even when a
            # later step raises (the previous version leaked it on some
            # error paths that bypassed the manual close() calls).
            with httpx.Client(timeout=30.0, follow_redirects=True, headers=headers) as client:
                # Visit the landing page first to pick up session cookies.
                try:
                    client.get(base, timeout=5.0)
                except Exception:
                    pass  # best-effort warm-up only

                api_url = f"{base}/search"
                data = None

                # Attempt 1: JSON API (often disabled or bot-blocked).
                try:
                    response = client.get(api_url, params={'q': enhanced_query, 'format': 'json'})
                    if response.status_code != 200:
                        raise ValueError(f"HTTP {response.status_code}")
                    data = response.json()
                except Exception:
                    data = None  # fall through to HTML scraping

                # Attempt 2: HTML search via POST, parsed with BeautifulSoup.
                if data is None:
                    try:
                        response = client.post(api_url, data={'q': enhanced_query}, timeout=30.0)
                    except Exception as exc:
                        last_error = f"Instance {instance_url} error: {exc}"
                        continue
                    if response.status_code != 200:
                        last_error = f"Instance {instance_url} returned HTTP {response.status_code}"
                        continue
                    data = {'results': _parse_searxng_html(response.text)}

            # Filter the raw hits down to deduplicated UA-domain results.
            results = []
            seen_urls = set()
            for result in data.get('results', []):
                url = result.get('url', '') or result.get('link', '')
                if not url or not is_ua_domain(url) or url in seen_urls:
                    continue
                seen_urls.add(url)
                title = result.get('title', '') or result.get('name', 'No title')
                snippet = (result.get('content', '') or result.get('snippet', '')
                           or result.get('description', ''))
                results.append({'title': title, 'url': url, 'snippet': snippet[:500]})
                if len(results) >= max_results:
                    break

            if results:
                return results
            # No UA hits from this instance; try the next one.

        except httpx.TimeoutException:
            last_error = f"Instance {instance_url} timed out"
        except httpx.RequestError as e:
            last_error = f"Instance {instance_url} request error: {str(e)}"
        except httpx.HTTPStatusError as e:
            last_error = f"Instance {instance_url} HTTP error: {e.response.status_code}"
        except Exception as e:
            last_error = f"Instance {instance_url} error: {str(e)}"

    # Every SearXNG instance failed or produced nothing usable.
    print(f"SearXNG search failed on all instances. Last error: {last_error}")
    print("Trying DuckDuckGo as fallback...")

    return duckduckgo_fallback_search(enhanced_query, max_results)


def _parse_searxng_html(html: str) -> List[Dict[str, str]]:
    """Extract raw result dicts from a SearXNG HTML results page.

    SearXNG renders each hit as an <article>; dicts are shaped like the
    JSON API ('url', 'title', 'content') so both code paths in ua_search
    share a single filtering loop.
    """
    parsed = []
    soup = BeautifulSoup(html, 'html.parser')
    for article in soup.find_all('article'):
        try:
            # Primary link: first anchor, or the h3 > a structure.
            link = article.find('a', href=True)
            if not link:
                h3 = article.find('h3')
                link = h3.find('a', href=True) if h3 else None
            if not link:
                continue
            url = link.get('href', '')
            if not url:
                continue

            title = link.get_text(strip=True)
            if not title:
                h3 = article.find('h3')
                if h3:
                    title = h3.get_text(strip=True)

            # Snippet lives in a <p> or a content <div>, theme-dependent.
            snippet_elem = (article.find('p')
                            or article.find('div', class_='content')
                            or article.find('div'))
            snippet = snippet_elem.get_text(strip=True) if snippet_elem else ''

            parsed.append({'url': url, 'title': title or 'No title', 'content': snippet})
        except Exception:
            continue
    return parsed
214
+
215
+
216
def duckduckgo_fallback_search(query: str, max_results: int = 10) -> List[Dict[str, str]]:
    """Fallback search that scrapes DuckDuckGo's HTML endpoint.

    Returns dicts with 'title', 'url', 'snippet' keys, filtered to UA
    domains. html.duckduckgo.com commonly wraps result links in
    '//duckduckgo.com/l/?uddg=<encoded-target>' redirects, so each href is
    unwrapped before the UA-domain filter runs — otherwise redirect-wrapped
    hits would be silently discarded.
    """
    try:
        search_url = f"https://html.duckduckgo.com/html/?q={quote(query)}"
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

        with httpx.Client(timeout=15.0, follow_redirects=True) as client:
            response = client.get(search_url, headers=headers)
            response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        results = []
        seen_urls = set()

        for result in soup.find_all('div', class_='result'):
            try:
                link_elem = result.find('a', class_='result__a')
                if not link_elem:
                    continue
                url = _unwrap_ddg_redirect(link_elem.get('href', ''))
                if not url or not is_ua_domain(url):
                    continue
                if url in seen_urls:
                    continue
                seen_urls.add(url)

                title = link_elem.get_text(strip=True) or 'No title'
                snippet_elem = result.find('a', class_='result__snippet')
                snippet = snippet_elem.get_text(strip=True) if snippet_elem else ''

                results.append({'title': title, 'url': url, 'snippet': snippet[:500]})
                if len(results) >= max_results:
                    break
            except Exception:
                continue

        if results:
            print(f"✓ DuckDuckGo fallback found {len(results)} results")
            return results
    except Exception as e:
        print(f"DuckDuckGo fallback also failed: {e}")

    return []


def _unwrap_ddg_redirect(href: str) -> str:
    """Resolve a DuckDuckGo '/l/?uddg=<target>' redirect link to its target.

    Non-redirect hrefs are returned unchanged. Protocol-relative links
    ('//duckduckgo.com/...') are normalized so urlparse sees the host.
    """
    # Local import: the module top imports only urlparse/quote from
    # urllib.parse; keep this helper self-contained.
    from urllib.parse import parse_qs

    if not href:
        return href
    candidate = 'https:' + href if href.startswith('//') else href
    parsed = urlparse(candidate)
    if parsed.netloc.lower().endswith('duckduckgo.com') and parsed.path == '/l/':
        target = parse_qs(parsed.query).get('uddg')
        if target:
            return target[0]  # parse_qs already percent-decodes the value
    return href
259
+